]> git.lizzy.rs Git - rust.git/blob - src/librustdoc/passes/lint/html_tags.rs
Merge commit '4bdfb0741dbcecd5279a2635c3280726db0604b5' into clippyup
[rust.git] / src / librustdoc / passes / lint / html_tags.rs
1 //! Detects invalid HTML (like an unclosed `<span>`) in doc comments.
2 use crate::clean::*;
3 use crate::core::DocContext;
4 use crate::html::markdown::main_body_opts;
5 use crate::passes::source_span_for_markdown_range;
6
7 use pulldown_cmark::{BrokenLink, Event, LinkType, Parser, Tag};
8
9 use std::iter::Peekable;
10 use std::ops::Range;
11 use std::str::CharIndices;
12
13 pub(crate) fn visit_item(cx: &DocContext<'_>, item: &Item) {
14     let tcx = cx.tcx;
15     let Some(hir_id) = DocContext::as_local_hir_id(tcx, item.item_id)
16     // If non-local, no need to check anything.
17     else { return };
18     let dox = item.attrs.collapsed_doc_value().unwrap_or_default();
19     if !dox.is_empty() {
20         let report_diag = |msg: &str, range: &Range<usize>, is_open_tag: bool| {
21             let sp = match source_span_for_markdown_range(tcx, &dox, range, &item.attrs) {
22                 Some(sp) => sp,
23                 None => item.attr_span(tcx),
24             };
25             tcx.struct_span_lint_hir(crate::lint::INVALID_HTML_TAGS, hir_id, sp, msg, |lint| {
26                 use rustc_lint_defs::Applicability;
27                 // If a tag looks like `<this>`, it might actually be a generic.
28                 // We don't try to detect stuff `<like, this>` because that's not valid HTML,
29                 // and we don't try to detect stuff `<like this>` because that's not valid Rust.
30                 let mut generics_end = range.end;
31                 if let Some(Some(mut generics_start)) = (is_open_tag
32                     && dox[..generics_end].ends_with('>'))
33                 .then(|| extract_path_backwards(&dox, range.start))
34                 {
35                     while generics_start != 0
36                         && generics_end < dox.len()
37                         && dox.as_bytes()[generics_start - 1] == b'<'
38                         && dox.as_bytes()[generics_end] == b'>'
39                     {
40                         generics_end += 1;
41                         generics_start -= 1;
42                         if let Some(new_start) = extract_path_backwards(&dox, generics_start) {
43                             generics_start = new_start;
44                         }
45                         if let Some(new_end) = extract_path_forward(&dox, generics_end) {
46                             generics_end = new_end;
47                         }
48                     }
49                     if let Some(new_end) = extract_path_forward(&dox, generics_end) {
50                         generics_end = new_end;
51                     }
52                     let generics_sp = match source_span_for_markdown_range(
53                         tcx,
54                         &dox,
55                         &(generics_start..generics_end),
56                         &item.attrs,
57                     ) {
58                         Some(sp) => sp,
59                         None => item.attr_span(tcx),
60                     };
61                     // Sometimes, we only extract part of a path. For example, consider this:
62                     //
63                     //     <[u32] as IntoIter<u32>>::Item
64                     //                       ^^^^^ unclosed HTML tag `u32`
65                     //
66                     // We don't have any code for parsing fully-qualified trait paths.
67                     // In theory, we could add it, but doing it correctly would require
68                     // parsing the entire path grammar, which is problematic because of
69                     // overlap between the path grammar and Markdown.
70                     //
71                     // The example above shows that ambiguity. Is `[u32]` intended to be an
72                     // intra-doc link to the u32 primitive, or is it intended to be a slice?
73                     //
74                     // If the below conditional were removed, we would suggest this, which is
75                     // not what the user probably wants.
76                     //
77                     //     <[u32] as `IntoIter<u32>`>::Item
78                     //
79                     // We know that the user actually wants to wrap the whole thing in a code
80                     // block, but the only reason we know that is because `u32` does not, in
81                     // fact, implement IntoIter. If the example looks like this:
82                     //
83                     //     <[Vec<i32>] as IntoIter<i32>::Item
84                     //
85                     // The ideal fix would be significantly different.
86                     if (generics_start > 0 && dox.as_bytes()[generics_start - 1] == b'<')
87                         || (generics_end < dox.len() && dox.as_bytes()[generics_end] == b'>')
88                     {
89                         return lint;
90                     }
91                     // multipart form is chosen here because ``Vec<i32>`` would be confusing.
92                     lint.multipart_suggestion(
93                         "try marking as source code",
94                         vec![
95                             (generics_sp.shrink_to_lo(), String::from("`")),
96                             (generics_sp.shrink_to_hi(), String::from("`")),
97                         ],
98                         Applicability::MaybeIncorrect,
99                     );
100                 }
101
102                 lint
103             });
104         };
105
106         let mut tags = Vec::new();
107         let mut is_in_comment = None;
108         let mut in_code_block = false;
109
110         let link_names = item.link_names(&cx.cache);
111
112         let mut replacer = |broken_link: BrokenLink<'_>| {
113             if let Some(link) =
114                 link_names.iter().find(|link| *link.original_text == *broken_link.reference)
115             {
116                 Some((link.href.as_str().into(), link.new_text.as_str().into()))
117             } else if matches!(
118                 &broken_link.link_type,
119                 LinkType::Reference | LinkType::ReferenceUnknown
120             ) {
121                 // If the link is shaped [like][this], suppress any broken HTML in the [this] part.
122                 // The `broken_intra_doc_links` will report typos in there anyway.
123                 Some((
124                     broken_link.reference.to_string().into(),
125                     broken_link.reference.to_string().into(),
126                 ))
127             } else {
128                 None
129             }
130         };
131
132         let p = Parser::new_with_broken_link_callback(&dox, main_body_opts(), Some(&mut replacer))
133             .into_offset_iter();
134
135         for (event, range) in p {
136             match event {
137                 Event::Start(Tag::CodeBlock(_)) => in_code_block = true,
138                 Event::Html(text) if !in_code_block => {
139                     extract_tags(&mut tags, &text, range, &mut is_in_comment, &report_diag)
140                 }
141                 Event::End(Tag::CodeBlock(_)) => in_code_block = false,
142                 _ => {}
143             }
144         }
145
146         for (tag, range) in tags.iter().filter(|(t, _)| {
147             let t = t.to_lowercase();
148             !ALLOWED_UNCLOSED.contains(&t.as_str())
149         }) {
150             report_diag(&format!("unclosed HTML tag `{}`", tag), range, true);
151         }
152
153         if let Some(range) = is_in_comment {
154             report_diag("Unclosed HTML comment", &range, false);
155         }
156     }
157 }
158
/// Tag names that may appear without a matching closing tag.
///
/// All entries are lowercase, so callers should lowercase a tag name before looking it up.
const ALLOWED_UNCLOSED: &[&str] = &[
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "keygen",
    "link",
    "meta",
    "param",
    "source",
    "track",
    "wbr",
];
163
164 fn drop_tag(
165     tags: &mut Vec<(String, Range<usize>)>,
166     tag_name: String,
167     range: Range<usize>,
168     f: &impl Fn(&str, &Range<usize>, bool),
169 ) {
170     let tag_name_low = tag_name.to_lowercase();
171     if let Some(pos) = tags.iter().rposition(|(t, _)| t.to_lowercase() == tag_name_low) {
172         // If the tag is nested inside a "<script>" or a "<style>" tag, no warning should
173         // be emitted.
174         let should_not_warn = tags.iter().take(pos + 1).any(|(at, _)| {
175             let at = at.to_lowercase();
176             at == "script" || at == "style"
177         });
178         for (last_tag_name, last_tag_span) in tags.drain(pos + 1..) {
179             if should_not_warn {
180                 continue;
181             }
182             let last_tag_name_low = last_tag_name.to_lowercase();
183             if ALLOWED_UNCLOSED.contains(&last_tag_name_low.as_str()) {
184                 continue;
185             }
186             // `tags` is used as a queue, meaning that everything after `pos` is included inside it.
187             // So `<h2><h3></h2>` will look like `["h2", "h3"]`. So when closing `h2`, we will still
188             // have `h3`, meaning the tag wasn't closed as it should have.
189             f(&format!("unclosed HTML tag `{}`", last_tag_name), &last_tag_span, true);
190         }
191         // Remove the `tag_name` that was originally closed
192         tags.pop();
193     } else {
194         // It can happen for example in this case: `<h2></script></h2>` (the `h2` tag isn't required
195         // but it helps for the visualization).
196         f(&format!("unopened HTML tag `{}`", tag_name), &range, false);
197     }
198 }
199
200 fn extract_path_backwards(text: &str, end_pos: usize) -> Option<usize> {
201     use rustc_lexer::{is_id_continue, is_id_start};
202     let mut current_pos = end_pos;
203     loop {
204         if current_pos >= 2 && text[..current_pos].ends_with("::") {
205             current_pos -= 2;
206         }
207         let new_pos = text[..current_pos]
208             .char_indices()
209             .rev()
210             .take_while(|(_, c)| is_id_start(*c) || is_id_continue(*c))
211             .reduce(|_accum, item| item)
212             .and_then(|(new_pos, c)| is_id_start(c).then_some(new_pos));
213         if let Some(new_pos) = new_pos {
214             if current_pos != new_pos {
215                 current_pos = new_pos;
216                 continue;
217             }
218         }
219         break;
220     }
221     if current_pos == end_pos { None } else { Some(current_pos) }
222 }
223
224 fn extract_path_forward(text: &str, start_pos: usize) -> Option<usize> {
225     use rustc_lexer::{is_id_continue, is_id_start};
226     let mut current_pos = start_pos;
227     loop {
228         if current_pos < text.len() && text[current_pos..].starts_with("::") {
229             current_pos += 2;
230         } else {
231             break;
232         }
233         let mut chars = text[current_pos..].chars();
234         if let Some(c) = chars.next() {
235             if is_id_start(c) {
236                 current_pos += c.len_utf8();
237             } else {
238                 break;
239             }
240         }
241         while let Some(c) = chars.next() {
242             if is_id_continue(c) {
243                 current_pos += c.len_utf8();
244             } else {
245                 break;
246             }
247         }
248     }
249     if current_pos == start_pos { None } else { Some(current_pos) }
250 }
251
/// Tells whether `c` may be appended to an HTML tag name.
///
/// `is_empty` says whether the name collected so far is empty, i.e. whether `c` would be
/// its first character. Per <https://spec.commonmark.org/0.30/#raw-html>:
///
/// > A tag name consists of an ASCII letter followed by zero or more ASCII letters, digits, or
/// > hyphens (-).
fn is_valid_for_html_tag_name(c: char, is_empty: bool) -> bool {
    match c {
        'a'..='z' | 'A'..='Z' => true,
        // Digits and hyphens are fine anywhere except in first position.
        '0'..='9' | '-' => !is_empty,
        _ => false,
    }
}
259
260 fn extract_html_tag(
261     tags: &mut Vec<(String, Range<usize>)>,
262     text: &str,
263     range: &Range<usize>,
264     start_pos: usize,
265     iter: &mut Peekable<CharIndices<'_>>,
266     f: &impl Fn(&str, &Range<usize>, bool),
267 ) {
268     let mut tag_name = String::new();
269     let mut is_closing = false;
270     let mut prev_pos = start_pos;
271
272     loop {
273         let (pos, c) = match iter.peek() {
274             Some((pos, c)) => (*pos, *c),
275             // In case we reached the of the doc comment, we want to check that it's an
276             // unclosed HTML tag. For example "/// <h3".
277             None => (prev_pos, '\0'),
278         };
279         prev_pos = pos;
280         // Checking if this is a closing tag (like `</a>` for `<a>`).
281         if c == '/' && tag_name.is_empty() {
282             is_closing = true;
283         } else if is_valid_for_html_tag_name(c, tag_name.is_empty()) {
284             tag_name.push(c);
285         } else {
286             if !tag_name.is_empty() {
287                 let mut r = Range { start: range.start + start_pos, end: range.start + pos };
288                 if c == '>' {
289                     // In case we have a tag without attribute, we can consider the span to
290                     // refer to it fully.
291                     r.end += 1;
292                 }
293                 if is_closing {
294                     // In case we have "</div >" or even "</div         >".
295                     if c != '>' {
296                         if !c.is_whitespace() {
297                             // It seems like it's not a valid HTML tag.
298                             break;
299                         }
300                         let mut found = false;
301                         for (new_pos, c) in text[pos..].char_indices() {
302                             if !c.is_whitespace() {
303                                 if c == '>' {
304                                     r.end = range.start + new_pos + 1;
305                                     found = true;
306                                 }
307                                 break;
308                             }
309                         }
310                         if !found {
311                             break;
312                         }
313                     }
314                     drop_tag(tags, tag_name, r, f);
315                 } else {
316                     let mut is_self_closing = false;
317                     let mut quote_pos = None;
318                     if c != '>' {
319                         let mut quote = None;
320                         let mut after_eq = false;
321                         for (i, c) in text[pos..].char_indices() {
322                             if !c.is_whitespace() {
323                                 if let Some(q) = quote {
324                                     if c == q {
325                                         quote = None;
326                                         quote_pos = None;
327                                         after_eq = false;
328                                     }
329                                 } else if c == '>' {
330                                     break;
331                                 } else if c == '/' && !after_eq {
332                                     is_self_closing = true;
333                                 } else {
334                                     if is_self_closing {
335                                         is_self_closing = false;
336                                     }
337                                     if (c == '"' || c == '\'') && after_eq {
338                                         quote = Some(c);
339                                         quote_pos = Some(pos + i);
340                                     } else if c == '=' {
341                                         after_eq = true;
342                                     }
343                                 }
344                             } else if quote.is_none() {
345                                 after_eq = false;
346                             }
347                         }
348                     }
349                     if let Some(quote_pos) = quote_pos {
350                         let qr = Range { start: quote_pos, end: quote_pos };
351                         f(
352                             &format!("unclosed quoted HTML attribute on tag `{}`", tag_name),
353                             &qr,
354                             false,
355                         );
356                     }
357                     if is_self_closing {
358                         // https://html.spec.whatwg.org/#parse-error-non-void-html-element-start-tag-with-trailing-solidus
359                         let valid = ALLOWED_UNCLOSED.contains(&&tag_name[..])
360                             || tags.iter().take(pos + 1).any(|(at, _)| {
361                                 let at = at.to_lowercase();
362                                 at == "svg" || at == "math"
363                             });
364                         if !valid {
365                             f(&format!("invalid self-closing HTML tag `{}`", tag_name), &r, false);
366                         }
367                     } else {
368                         tags.push((tag_name, r));
369                     }
370                 }
371             }
372             break;
373         }
374         iter.next();
375     }
376 }
377
378 fn extract_tags(
379     tags: &mut Vec<(String, Range<usize>)>,
380     text: &str,
381     range: Range<usize>,
382     is_in_comment: &mut Option<Range<usize>>,
383     f: &impl Fn(&str, &Range<usize>, bool),
384 ) {
385     let mut iter = text.char_indices().peekable();
386
387     while let Some((start_pos, c)) = iter.next() {
388         if is_in_comment.is_some() {
389             if text[start_pos..].starts_with("-->") {
390                 *is_in_comment = None;
391             }
392         } else if c == '<' {
393             if text[start_pos..].starts_with("<!--") {
394                 // We skip the "!--" part. (Once `advance_by` is stable, might be nice to use it!)
395                 iter.next();
396                 iter.next();
397                 iter.next();
398                 *is_in_comment = Some(Range {
399                     start: range.start + start_pos,
400                     end: range.start + start_pos + 3,
401                 });
402             } else {
403                 extract_html_tag(tags, text, &range, start_pos, &mut iter, f);
404             }
405         }
406     }
407 }