]> git.lizzy.rs Git - rust.git/blob - compiler/rustc_lint/src/non_ascii_idents.rs
Auto merge of #99814 - aliemjay:patch-2, r=jackh726
[rust.git] / compiler / rustc_lint / src / non_ascii_idents.rs
1 use crate::{EarlyContext, EarlyLintPass, LintContext};
2 use rustc_ast as ast;
3 use rustc_data_structures::fx::FxHashMap;
4 use rustc_errors::fluent;
5 use rustc_span::symbol::Symbol;
6
7 declare_lint! {
8     /// The `non_ascii_idents` lint detects non-ASCII identifiers.
9     ///
10     /// ### Example
11     ///
12     /// ```rust,compile_fail
13     /// # #![allow(unused)]
14     /// #![deny(non_ascii_idents)]
15     /// fn main() {
16     ///     let föö = 1;
17     /// }
18     /// ```
19     ///
20     /// {{produces}}
21     ///
22     /// ### Explanation
23     ///
24     /// Projects that wish to restrict identifiers to ASCII characters (for
25     /// example to ease collaboration or for security reasons) can set this
26     /// lint to "forbid".
27     /// See [RFC 2457] for more details.
28     ///
29     /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
30     pub NON_ASCII_IDENTS,
31     Allow,
32     "detects non-ASCII identifiers",
33     crate_level_only
34 }
35
36 declare_lint! {
37     /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
38     /// identifiers.
39     ///
40     /// ### Example
41     ///
42     /// ```rust
43     /// # #![allow(unused)]
44     /// const µ: f64 = 0.000001;
45     /// ```
46     ///
47     /// {{produces}}
48     ///
49     /// ### Explanation
50     ///
51     /// This lint warns about using characters which are not commonly used, and may
52     /// cause visual confusion.
53     ///
54     /// This lint is triggered by identifiers that contain a codepoint that is
55     /// not part of the set of "Allowed" codepoints as described by [Unicode®
56     /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
57     /// Security Profile for Identifiers][TR39Allowed].
58     ///
59     /// Note that the set of uncommon codepoints may change over time. Beware
60     /// that if you "forbid" this lint, existing code may fail in the
61     /// future.
62     ///
63     /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
64     pub UNCOMMON_CODEPOINTS,
65     Warn,
66     "detects uncommon Unicode codepoints in identifiers",
67     crate_level_only
68 }
69
70 declare_lint! {
71     /// The `confusable_idents` lint detects visually confusable pairs between
72     /// identifiers.
73     ///
74     /// ### Example
75     ///
76     /// ```rust
77     /// // Latin Capital Letter E With Caron
78     /// pub const Ě: i32 = 1;
79     /// // Latin Capital Letter E With Breve
80     /// pub const Ĕ: i32 = 2;
81     /// ```
82     ///
83     /// {{produces}}
84     ///
85     /// ### Explanation
86     ///
87     /// This lint warns when different identifiers may appear visually similar,
88     /// which can cause confusion.
89     ///
90     /// The confusable detection algorithm is based on [Unicode® Technical
91     /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
92     /// Detection][TR39Confusable]. For every distinct identifier X, execute
93     /// the function `skeleton(X)`. If there exist two distinct identifiers X
94     /// and Y in the same crate where `skeleton(X) = skeleton(Y)`, report it.
95     /// The compiler uses the same mechanism to check if an identifier is too
96     /// similar to a keyword.
97     ///
98     /// Note that the set of confusable characters may change over time.
99     /// Beware that if you "forbid" this lint, existing code may fail in
100     /// the future.
101     ///
102     /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
103     pub CONFUSABLE_IDENTS,
104     Warn,
105     "detects visually confusable pairs between identifiers",
106     crate_level_only
107 }
108
109 declare_lint! {
110     /// The `mixed_script_confusables` lint detects visually confusable
111     /// characters in identifiers between different [scripts].
112     ///
113     /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
114     ///
115     /// ### Example
116     ///
117     /// ```rust
118     /// // The Japanese katakana character エ can be confused with the Han character 工.
119     /// const エ: &'static str = "アイウ";
120     /// ```
121     ///
122     /// {{produces}}
123     ///
124     /// ### Explanation
125     ///
126     /// This lint warns when characters between different scripts may appear
127     /// visually similar, which can cause confusion.
128     ///
129     /// If the crate contains other identifiers in the same script that have
130     /// non-confusable characters, then this lint will *not* be issued. For
131     /// example, if the example given above has another identifier with
132     /// katakana characters (such as `let カタカナ = 123;`), then this indicates
133     /// that you are intentionally using katakana, and it will not warn about
134     /// it.
135     ///
136     /// Note that the set of confusable characters may change over time.
137     /// Beware that if you "forbid" this lint, existing code may fail in
138     /// the future.
139     pub MIXED_SCRIPT_CONFUSABLES,
140     Warn,
141     "detects Unicode scripts whose mixed script confusables codepoints are solely used",
142     crate_level_only
143 }
144
145 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
146
147 impl EarlyLintPass for NonAsciiIdents {
148     fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
149         use rustc_session::lint::Level;
150         use rustc_span::Span;
151         use std::collections::BTreeMap;
152         use unicode_security::GeneralSecurityProfile;
153
154         let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
155         let check_uncommon_codepoints =
156             cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
157         let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
158         let check_mixed_script_confusables =
159             cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
160
161         if !check_non_ascii_idents
162             && !check_uncommon_codepoints
163             && !check_confusable_idents
164             && !check_mixed_script_confusables
165         {
166             return;
167         }
168
169         let mut has_non_ascii_idents = false;
170         let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock();
171
172         // Sort by `Span` so that error messages make sense with respect to the
173         // order of identifier locations in the code.
174         let mut symbols: Vec<_> = symbols.iter().collect();
175         symbols.sort_by_key(|k| k.1);
176
177         for (symbol, &sp) in symbols.iter() {
178             let symbol_str = symbol.as_str();
179             if symbol_str.is_ascii() {
180                 continue;
181             }
182             has_non_ascii_idents = true;
183             cx.struct_span_lint(
184                 NON_ASCII_IDENTS,
185                 sp,
186                 fluent::lint_identifier_non_ascii_char,
187                 |lint| lint,
188             );
189             if check_uncommon_codepoints
190                 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
191             {
192                 cx.struct_span_lint(
193                     UNCOMMON_CODEPOINTS,
194                     sp,
195                     fluent::lint_identifier_uncommon_codepoints,
196                     |lint| lint,
197                 )
198             }
199         }
200
201         if has_non_ascii_idents && check_confusable_idents {
202             let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
203                 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
204             let mut skeleton_buf = String::new();
205
206             for (&symbol, &sp) in symbols.iter() {
207                 use unicode_security::confusable_detection::skeleton;
208
209                 let symbol_str = symbol.as_str();
210                 let is_ascii = symbol_str.is_ascii();
211
212                 // Get the skeleton as a `Symbol`.
213                 skeleton_buf.clear();
214                 skeleton_buf.extend(skeleton(&symbol_str));
215                 let skeleton_sym = if *symbol_str == *skeleton_buf {
216                     symbol
217                 } else {
218                     Symbol::intern(&skeleton_buf)
219                 };
220
221                 skeleton_map
222                     .entry(skeleton_sym)
223                     .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
224                         if !*existing_is_ascii || !is_ascii {
225                             cx.struct_span_lint(
226                                 CONFUSABLE_IDENTS,
227                                 sp,
228                                 fluent::lint_confusable_identifier_pair,
229                                 |lint| {
230                                     lint.set_arg("existing_sym", *existing_symbol)
231                                         .set_arg("sym", symbol)
232                                         .span_label(*existing_span, fluent::label)
233                                 },
234                             );
235                         }
236                         if *existing_is_ascii && !is_ascii {
237                             *existing_symbol = symbol;
238                             *existing_span = sp;
239                             *existing_is_ascii = is_ascii;
240                         }
241                     })
242                     .or_insert((symbol, sp, is_ascii));
243             }
244         }
245
246         if has_non_ascii_idents && check_mixed_script_confusables {
247             use unicode_security::is_potential_mixed_script_confusable_char;
248             use unicode_security::mixed_script::AugmentedScriptSet;
249
250             #[derive(Clone)]
251             enum ScriptSetUsage {
252                 Suspicious(Vec<char>, Span),
253                 Verified,
254             }
255
256             let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
257                 FxHashMap::default();
258             let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
259             script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
260
261             let mut has_suspicous = false;
262             for (symbol, &sp) in symbols.iter() {
263                 let symbol_str = symbol.as_str();
264                 for ch in symbol_str.chars() {
265                     if ch.is_ascii() {
266                         // all ascii characters are covered by exception.
267                         continue;
268                     }
269                     if !GeneralSecurityProfile::identifier_allowed(ch) {
270                         // this character is covered by `uncommon_codepoints` lint.
271                         continue;
272                     }
273                     let augmented_script_set = AugmentedScriptSet::for_char(ch);
274                     script_states
275                         .entry(augmented_script_set)
276                         .and_modify(|existing_state| {
277                             if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
278                                 if is_potential_mixed_script_confusable_char(ch) {
279                                     ch_list.push(ch);
280                                 } else {
281                                     *existing_state = ScriptSetUsage::Verified;
282                                 }
283                             }
284                         })
285                         .or_insert_with(|| {
286                             if !is_potential_mixed_script_confusable_char(ch) {
287                                 ScriptSetUsage::Verified
288                             } else {
289                                 has_suspicous = true;
290                                 ScriptSetUsage::Suspicious(vec![ch], sp)
291                             }
292                         });
293                 }
294             }
295
296             if has_suspicous {
297                 let verified_augmented_script_sets = script_states
298                     .iter()
299                     .flat_map(|(k, v)| match v {
300                         ScriptSetUsage::Verified => Some(*k),
301                         _ => None,
302                     })
303                     .collect::<Vec<_>>();
304
305                 // we're sorting the output here.
306                 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
307                     BTreeMap::new();
308
309                 'outerloop: for (augment_script_set, usage) in script_states {
310                     let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
311
312                     if augment_script_set.is_all() {
313                         continue;
314                     }
315
316                     for existing in verified_augmented_script_sets.iter() {
317                         if existing.is_all() {
318                             continue;
319                         }
320                         let mut intersect = *existing;
321                         intersect.intersect_with(augment_script_set);
322                         if !intersect.is_empty() && !intersect.is_all() {
323                             continue 'outerloop;
324                         }
325                     }
326
327                     // We sort primitive chars here and can use unstable sort
328                     ch_list.sort_unstable();
329                     ch_list.dedup();
330                     lint_reports.insert((sp, ch_list), augment_script_set);
331                 }
332
333                 for ((sp, ch_list), script_set) in lint_reports {
334                     cx.struct_span_lint(
335                         MIXED_SCRIPT_CONFUSABLES,
336                         sp,
337                         fluent::lint_mixed_script_confusables,
338                         |lint| {
339                             let mut includes = String::new();
340                             for (idx, ch) in ch_list.into_iter().enumerate() {
341                                 if idx != 0 {
342                                     includes += ", ";
343                                 }
344                                 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
345                                 includes += &char_info;
346                             }
347                             lint.set_arg("set", script_set.to_string())
348                                 .set_arg("includes", includes)
349                                 .note(fluent::includes_note)
350                                 .note(fluent::note)
351                         },
352                     );
353                 }
354             }
355         }
356     }
357 }