src/librustdoc/passes/html_tags.rs

   1 //! Detects invalid HTML (like an unclosed `<span>`) in doc comments.
   2 use super::Pass;
   3 use crate::clean::*;
   4 use crate::core::DocContext;
   5 use crate::html::markdown::main_body_opts;
   6 use crate::visit::DocVisitor;
   7
   8 use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag};
   9
  10 use std::iter::Peekable;
  11 use std::ops::Range;
  12 use std::str::CharIndices;
  13
  14 pub(crate) const CHECK_INVALID_HTML_TAGS: Pass = Pass {
  15     name: "check-invalid-html-tags",
  16     run: check_invalid_html_tags,
  17     description: "detects invalid HTML tags in doc comments",
  18 };
  19
  20 struct InvalidHtmlTagsLinter<'a, 'tcx> {
  21     cx: &'a mut DocContext<'tcx>,
  22 }
  23
  24 pub(crate) fn check_invalid_html_tags(krate: Crate, cx: &mut DocContext<'_>) -> Crate {
  25     if cx.tcx.sess.is_nightly_build() {
  26         let mut coll = InvalidHtmlTagsLinter { cx };
  27         coll.visit_crate(&krate);
  28     }
  29     krate
  30 }
  31
  32 const ALLOWED_UNCLOSED: &[&str] = &[
  33     "area", "base", "br", "col", "embed", "hr", "img", "input", "keygen", "link", "meta", "param",
  34     "source", "track", "wbr",
  35 ];
  36
  37 fn drop_tag(
  38     tags: &mut Vec<(String, Range<usize>)>,
  39     tag_name: String,
  40     range: Range<usize>,
  41     f: &impl Fn(&str, &Range<usize>, bool),
  42 ) {
  43     let tag_name_low = tag_name.to_lowercase();
  44     if let Some(pos) = tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) {
  45         // If the tag is nested inside a "<script>" or a "<style>" tag, no warning should
  46         // be emitted.
  47         let should_not_warn = tags.iter().take(pos + 1).any(|(at, _)| {
  48             let at = at.to_lowercase();
  49             at == "script" || at == "style"
  50         });
  51         for (last_tag_name, last_tag_span) in tags.drain(pos + 1..) {
  52             if should_not_warn {
  53                 continue;
  54             }
  55             let last_tag_name_low = last_tag_name.to_lowercase();
  56             if ALLOWED_UNCLOSED.contains(&last_tag_name_low.as_str()) {
  57                 continue;
  58             }
  59             // `tags` is used as a queue, meaning that everything after `pos` is included inside it.
  60             // So `<h2><h3></h2>` will look like `["h2", "h3"]`. So when closing `h2`, we will still
  61             // have `h3`, meaning the tag wasn't closed as it should have.
  62             f(&format!("unclosed HTML tag `{}`", last_tag_name), &last_tag_span, true);
  63         }
  64         // Remove the `tag_name` that was originally closed
  65         tags.pop();
  66     } else {
  67         // It can happen for example in this case: `<h2></script></h2>` (the `h2` tag isn't required
  68         // but it helps for the visualization).
  69         f(&format!("unopened HTML tag `{}`", tag_name), &range, false);
  70     }
  71 }
  72
  73 fn extract_path_backwards(text: &str, end_pos: usize) -> Option<usize> {
  74     use rustc_lexer::{is_id_continue, is_id_start};
  75     let mut current_pos = end_pos;
  76     loop {
  77         if current_pos >= 2 && text[..current_pos].ends_with("::") {
  78             current_pos -= 2;
  79         }
  80         let new_pos = text[..current_pos]
  81             .char_indices()
  82             .rev()
  83             .take_while(|(_, c)| is_id_start(*c) || is_id_continue(*c))
  84             .reduce(|_accum, item| item)
  85             .and_then(|(new_pos, c)| is_id_start(c).then_some(new_pos));
  86         if let Some(new_pos) = new_pos {
  87             if current_pos != new_pos {
  88                 current_pos = new_pos;
  89                 continue;
  90             }
  91         }
  92         break;
  93     }
  94     if current_pos == end_pos { None } else { Some(current_pos) }
  95 }
  96
  97 fn extract_path_forward(text: &str, start_pos: usize) -> Option<usize> {
  98     use rustc_lexer::{is_id_continue, is_id_start};
  99     let mut current_pos = start_pos;
 100     loop {
 101         if current_pos < text.len() && text[current_pos..].starts_with("::") {
 102             current_pos += 2;
 103         } else {
 104             break;
 105         }
 106         let mut chars = text[current_pos..].chars();
 107         if let Some(c) = chars.next() {
 108             if is_id_start(c) {
 109                 current_pos += c.len_utf8();
 110             } else {
 111                 break;
 112             }
 113         }
 114         while let Some(c) = chars.next() {
 115             if is_id_continue(c) {
 116                 current_pos += c.len_utf8();
 117             } else {
 118                 break;
 119             }
 120         }
 121     }
 122     if current_pos == start_pos { None } else { Some(current_pos) }
 123 }
 124
 125 fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool {
 126     // https://spec.commonmark.org/0.30/#raw-html
 127     //
 128     // > A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or
 129     // > hyphens (-).
 130     c.is_ascii_alphabetic() || !is_empty && (c == '-' || c.is_ascii_digit())
 131 }
 132
 133 fn extract_html_tag(
 134     tags: &mut Vec<(String, Range<usize>)>,
 135     text: &str,
 136     range: &Range<usize>,
 137     start_pos: usize,
 138     iter: &mut Peekable<CharIndices<'_>>,
 139     f: &impl Fn(&str, &Range<usize>, bool),
 140 ) {
 141     let mut tag_name = String::new();
 142     let mut is_closing = false;
 143     let mut prev_pos = start_pos;
 144
 145     loop {
 146         let (pos, c) = match iter.peek() {
 147             Some((pos, c)) => (*pos, *c),
 148             // In case we reached the of the doc comment, we want to check that it's an
 149             // unclosed HTML tag. For example "/// <h3".
 150             None => (prev_pos, '\0'),
 151         };
 152         prev_pos = pos;
 153         // Checking if this is a closing tag (like `</a>` for `<a>`).
 154         if c == '/' && tag_name.is_empty() {
 155             is_closing = true;
 156         } else if is_valid_for_html_tag_name(c, tag_name.is_empty()) {
 157             tag_name.push(c);
 158         } else {
 159             if !tag_name.is_empty() {
 160                 let mut r = Range { start: range.start + start_pos, end: range.start + pos };
 161                 if c == '>' {
 162                     // In case we have a tag without attribute, we can consider the span to
 163                     // refer to it fully.
 164                     r.end += 1;
 165                 }
 166                 if is_closing {
 167                     // In case we have "</div >" or even "</div         >".
 168                     if c != '>' {
 169                         if !c.is_whitespace() {
 170                             // It seems like it's not a valid HTML tag.
 171                             break;
 172                         }
 173                         let mut found = false;
 174                         for (new_pos, c) in text[pos..].char_indices() {
 175                             if !c.is_whitespace() {
 176                                 if c == '>' {
 177                                     r.end = range.start + new_pos + 1;
 178                                     found = true;
 179                                 }
 180                                 break;
 181                             }
 182                         }
 183                         if !found {
 184                             break;
 185                         }
 186                     }
 187                     drop_tag(tags, tag_name, r, f);
 188                 } else {
 189                     tags.push((tag_name, r));
 190                 }
 191             }
 192             break;
 193         }
 194         iter.next();
 195     }
 196 }
 197
 198 fn extract_tags(
 199     tags: &mut Vec<(String, Range<usize>)>,
 200     text: &str,
 201     range: Range<usize>,
 202     is_in_comment: &mut Option<Range<usize>>,
 203     f: &impl Fn(&str, &Range<usize>, bool),
 204 ) {
 205     let mut iter = text.char_indices().peekable();
 206
 207     while let Some((start_pos, c)) = iter.next() {
 208         if is_in_comment.is_some() {
 209             if text[start_pos..].starts_with("-->") {
 210                 *is_in_comment = None;
 211             }
 212         } else if c == '<' {
 213             if text[start_pos..].starts_with("<!--") {
 214                 // We skip the "!--" part. (Once `advance_by` is stable, might be nice to use it!)
 215                 iter.next();
 216                 iter.next();
 217                 iter.next();
 218                 *is_in_comment = Some(Range {
 219                     start: range.start + start_pos,
 220                     end: range.start + start_pos + 3,
 221                 });
 222             } else {
 223                 extract_html_tag(tags, text, &range, start_pos, &mut iter, f);
 224             }
 225         }
 226     }
 227 }
 228
 229 impl<'a, 'tcx> DocVisitor for InvalidHtmlTagsLinter<'a, 'tcx> {
 230     fn visit_item(&mut self, item: &Item) {
 231         let tcx = self.cx.tcx;
 232         let Some(hir_id) = DocContext::as_local_hir_id(tcx, item.item_id)
 233         // If non-local, no need to check anything.
 234         else { return };
 235         let dox = item.attrs.collapsed_doc_value().unwrap_or_default();
 236         if !dox.is_empty() {
 237             let report_diag = |msg: &str, range: &Range<usize>, is_open_tag: bool| {
 238                 let sp = match super::source_span_for_markdown_range(tcx, &dox, range, &item.attrs)
 239                 {
 240                     Some(sp) => sp,
 241                     None => item.attr_span(tcx),
 242                 };
 243                 tcx.struct_span_lint_hir(crate::lint::INVALID_HTML_TAGS, hir_id, sp, |lint| {
 244                     use rustc_lint_defs::Applicability;
 245                     let mut diag = lint.build(msg);
 246                     // If a tag looks like `<this>`, it might actually be a generic.
 247                     // We don't try to detect stuff `<like, this>` because that's not valid HTML,
 248                     // and we don't try to detect stuff `<like this>` because that's not valid Rust.
 249                     let mut generics_end = range.end;
 250                     if let Some(Some(mut generics_start)) = (is_open_tag
 251                         && dox[..generics_end].ends_with('>'))
 252                     .then(|| extract_path_backwards(&dox, range.start))
 253                     {
 254                         while generics_start != 0
 255                             && generics_end < dox.len()
 256                             && dox.as_bytes()[generics_start - 1] == b'<'
 257                             && dox.as_bytes()[generics_end] == b'>'
 258                         {
 259                             generics_end += 1;
 260                             generics_start -= 1;
 261                             if let Some(new_start) = extract_path_backwards(&dox, generics_start) {
 262                                 generics_start = new_start;
 263                             }
 264                             if let Some(new_end) = extract_path_forward(&dox, generics_end) {
 265                                 generics_end = new_end;
 266                             }
 267                         }
 268                         if let Some(new_end) = extract_path_forward(&dox, generics_end) {
 269                             generics_end = new_end;
 270                         }
 271                         let generics_sp = match super::source_span_for_markdown_range(
 272                             tcx,
 273                             &dox,
 274                             &(generics_start..generics_end),
 275                             &item.attrs,
 276                         ) {
 277                             Some(sp) => sp,
 278                             None => item.attr_span(tcx),
 279                         };
 280                         // Sometimes, we only extract part of a path. For example, consider this:
 281                         //
 282                         //     <[u32] as IntoIter<u32>>::Item
 283                         //                       ^^^^^ unclosed HTML tag `u32`
 284                         //
 285                         // We don't have any code for parsing fully-qualified trait paths.
 286                         // In theory, we could add it, but doing it correctly would require
 287                         // parsing the entire path grammar, which is problematic because of
 288                         // overlap between the path grammar and Markdown.
 289                         //
 290                         // The example above shows that ambiguity. Is `[u32]` intended to be an
 291                         // intra-doc link to the u32 primitive, or is it intended to be a slice?
 292                         //
 293                         // If the below conditional were removed, we would suggest this, which is
 294                         // not what the user probably wants.
 295                         //
 296                         //     <[u32] as `IntoIter<u32>`>::Item
 297                         //
 298                         // We know that the user actually wants to wrap the whole thing in a code
 299                         // block, but the only reason we know that is because `u32` does not, in
 300                         // fact, implement IntoIter. If the example looks like this:
 301                         //
 302                         //     <[Vec<i32>] as IntoIter<i32>::Item
 303                         //
 304                         // The ideal fix would be significantly different.
 305                         if (generics_start > 0 && dox.as_bytes()[generics_start - 1] == b'<')
 306                             || (generics_end < dox.len() && dox.as_bytes()[generics_end] == b'>')
 307                         {
 308                             diag.emit();
 309                             return;
 310                         }
 311                         // multipart form is chosen here because ``Vec<i32>`` would be confusing.
 312                         diag.multipart_suggestion(
 313                             "try marking as source code",
 314                             vec![
 315                                 (generics_sp.shrink_to_lo(), String::from("`")),
 316                                 (generics_sp.shrink_to_hi(), String::from("`")),
 317                             ],
 318                             Applicability::MaybeIncorrect,
 319                         );
 320                     }
 321                     diag.emit()
 322                 });
 323             };
 324
 325             let mut tags = Vec::new();
 326             let mut is_in_comment = None;
 327             let mut in_code_block = false;
 328
 329             let link_names = item.link_names(&self.cx.cache);
 330
 331             let mut replacer = |broken_link: BrokenLink<'_>| {
 332                 if let Some(link) =
 333                     link_names.iter().find(|link| *link.original_text == *broken_link.reference)
 334                 {
 335                     Some((link.href.as_str().into(), link.new_text.as_str().into()))
 336                 } else if matches!(
 337                     &broken_link.link_type,
 338                     LinkType::Reference | LinkType::ReferenceUnknown
 339                 ) {
 340                     // If the link is shaped [like][this], suppress any broken HTML in the [this] part.
 341                     // The `broken_intra_doc_links` will report typos in there anyway.
 342                     Some((
 343                         broken_link.reference.to_string().into(),
 344                         broken_link.reference.to_string().into(),
 345                     ))
 346                 } else {
 347                     None
 348                 }
 349             };
 350
 351             let p =
 352                 Parser::new_with_broken_link_callback(&dox, main_body_opts(), Some(&mut replacer))
 353                     .into_offset_iter();
 354
 355             for (event, range) in p {
 356                 match event {
 357                     Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
 358                     Event::Html(text) if !in_code_block => {
 359                         extract_tags(&mut tags, &text, range, &mut is_in_comment, &report_diag)
 360                     }
 361                     Event::End(Tag::CodeBlock(_)) => in_code_block = false,
 362                     _ => {}
 363                 }
 364             }
 365
 366             for (tag, range) in tags.iter().filter(|(t, _)| {
 367                 let t = t.to_lowercase();
 368                 !ALLOWED_UNCLOSED.contains(&t.as_str())
 369             }) {
 370                 report_diag(&format!("unclosed HTML tag `{}`", tag), range, true);
 371             }
 372
 373             if let Some(range) = is_in_comment {
 374                 report_diag("Unclosed HTML comment", &range, false);
 375             }
 376         }
 377
 378         self.visit_item_recur(item)
 379     }
 380 }