// Source: git.lizzy.rs Git - rust.git, blob view of src/librustc_lint/non_ascii_idents.rs
// Revision: "Rollup merge of #73959 - GuillaumeGomez:cleanup-e0716, r=Dylan-DPC"
// Path: [rust.git] / src / librustc_lint / non_ascii_idents.rs
1 use crate::{EarlyContext, EarlyLintPass, LintContext};
2 use rustc_ast::ast;
3 use rustc_data_structures::fx::FxHashMap;
4 use rustc_span::symbol::SymbolStr;
5
// Lint reported by `NonAsciiIdents::check_crate` for every recorded symbol whose string is not
// pure ASCII. Declared with level `Allow`.
// NOTE(review): `crate_level_only` presumably restricts where the lint level may be set --
// confirm against the `declare_lint!` definition.
declare_lint! {
    pub NON_ASCII_IDENTS,
    Allow,
    "detects non-ASCII identifiers",
    crate_level_only
}
12
// Lint reported by `NonAsciiIdents::check_crate` for a non-ASCII symbol that contains at least
// one character rejected by `GeneralSecurityProfile::identifier_allowed`. Declared with level
// `Warn`.
declare_lint! {
    pub UNCOMMON_CODEPOINTS,
    Warn,
    "detects uncommon Unicode codepoints in identifiers",
    crate_level_only
}
19
// Lint reported by `NonAsciiIdents::check_crate` when two recorded symbols produce the same
// `unicode_security` skeleton and at least one of the two is not pure ASCII. Declared with
// level `Warn`.
declare_lint! {
    pub CONFUSABLE_IDENTS,
    Warn,
    "detects visually confusable pairs between identifiers",
    crate_level_only
}
26
// Lint reported by `NonAsciiIdents::check_crate` for a script set whose every character used in
// the crate is a potential mixed-script confusable, unless the set is excused by one of the
// "verified" script sets (see the checks in `check_crate`). Declared with level `Warn`.
declare_lint! {
    pub MIXED_SCRIPT_CONFUSABLES,
    Warn,
    "detects Unicode scripts whose mixed script confusables codepoints are solely used",
    crate_level_only
}
33
// Declares the `NonAsciiIdents` pass type together with the four lints it is responsible for;
// the pass logic itself lives in the `EarlyLintPass` impl below.
declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
35
36 impl EarlyLintPass for NonAsciiIdents {
37     fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
38         use rustc_session::lint::Level;
39         use rustc_span::Span;
40         use std::collections::BTreeMap;
41         use unicode_security::GeneralSecurityProfile;
42         use utils::CowBoxSymStr;
43
44         let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
45         let check_uncommon_codepoints =
46             cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
47         let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
48         let check_mixed_script_confusables =
49             cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
50
51         if !check_non_ascii_idents
52             && !check_uncommon_codepoints
53             && !check_confusable_idents
54             && !check_mixed_script_confusables
55         {
56             return;
57         }
58
59         let mut has_non_ascii_idents = false;
60         let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
61         for (symbol, &sp) in symbols.iter() {
62             let symbol_str = symbol.as_str();
63             if symbol_str.is_ascii() {
64                 continue;
65             }
66             has_non_ascii_idents = true;
67             cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
68                 lint.build("identifier contains non-ASCII characters").emit()
69             });
70             if check_uncommon_codepoints
71                 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
72             {
73                 cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
74                     lint.build("identifier contains uncommon Unicode codepoints").emit()
75                 })
76             }
77         }
78
79         if has_non_ascii_idents && check_confusable_idents {
80             let mut skeleton_map: FxHashMap<CowBoxSymStr, (SymbolStr, Span, bool)> =
81                 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
82             let mut str_buf = String::new();
83             for (symbol, &sp) in symbols.iter() {
84                 fn calc_skeleton(symbol_str: &SymbolStr, buffer: &mut String) -> CowBoxSymStr {
85                     use std::mem::replace;
86                     use unicode_security::confusable_detection::skeleton;
87                     buffer.clear();
88                     buffer.extend(skeleton(symbol_str));
89                     if *symbol_str == *buffer {
90                         CowBoxSymStr::Interned(symbol_str.clone())
91                     } else {
92                         let owned = replace(buffer, String::new());
93                         CowBoxSymStr::Owned(owned.into_boxed_str())
94                     }
95                 }
96                 let symbol_str = symbol.as_str();
97                 let is_ascii = symbol_str.is_ascii();
98                 let skeleton = calc_skeleton(&symbol_str, &mut str_buf);
99                 skeleton_map
100                     .entry(skeleton)
101                     .and_modify(|(existing_symbolstr, existing_span, existing_is_ascii)| {
102                         if !*existing_is_ascii || !is_ascii {
103                             cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
104                                 lint.build(&format!(
105                                     "identifier pair considered confusable between `{}` and `{}`",
106                                     existing_symbolstr, symbol_str
107                                 ))
108                                 .span_label(
109                                     *existing_span,
110                                     "this is where the previous identifier occurred",
111                                 )
112                                 .emit();
113                             });
114                         }
115                         if *existing_is_ascii && !is_ascii {
116                             *existing_symbolstr = symbol_str.clone();
117                             *existing_span = sp;
118                             *existing_is_ascii = is_ascii;
119                         }
120                     })
121                     .or_insert((symbol_str, sp, is_ascii));
122             }
123         }
124
125         if has_non_ascii_idents && check_mixed_script_confusables {
126             use unicode_security::is_potential_mixed_script_confusable_char;
127             use unicode_security::mixed_script::AugmentedScriptSet;
128
129             #[derive(Clone)]
130             enum ScriptSetUsage {
131                 Suspicious(Vec<char>, Span),
132                 Verified,
133             }
134
135             let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
136                 FxHashMap::default();
137             let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
138             script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
139
140             let mut has_suspicous = false;
141             for (symbol, &sp) in symbols.iter() {
142                 let symbol_str = symbol.as_str();
143                 for ch in symbol_str.chars() {
144                     if ch.is_ascii() {
145                         // all ascii characters are covered by exception.
146                         continue;
147                     }
148                     if !GeneralSecurityProfile::identifier_allowed(ch) {
149                         // this character is covered by `uncommon_codepoints` lint.
150                         continue;
151                     }
152                     let augmented_script_set = AugmentedScriptSet::for_char(ch);
153                     script_states
154                         .entry(augmented_script_set)
155                         .and_modify(|existing_state| {
156                             if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
157                                 if is_potential_mixed_script_confusable_char(ch) {
158                                     ch_list.push(ch);
159                                 } else {
160                                     *existing_state = ScriptSetUsage::Verified;
161                                 }
162                             }
163                         })
164                         .or_insert_with(|| {
165                             if !is_potential_mixed_script_confusable_char(ch) {
166                                 ScriptSetUsage::Verified
167                             } else {
168                                 has_suspicous = true;
169                                 ScriptSetUsage::Suspicious(vec![ch], sp)
170                             }
171                         });
172                 }
173             }
174
175             if has_suspicous {
176                 let verified_augmented_script_sets = script_states
177                     .iter()
178                     .flat_map(|(k, v)| match v {
179                         ScriptSetUsage::Verified => Some(*k),
180                         _ => None,
181                     })
182                     .collect::<Vec<_>>();
183
184                 // we're sorting the output here.
185                 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
186                     BTreeMap::new();
187
188                 'outerloop: for (augment_script_set, usage) in script_states {
189                     let (mut ch_list, sp) = match usage {
190                         ScriptSetUsage::Verified => continue,
191                         ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp),
192                     };
193
194                     if augment_script_set.is_all() {
195                         continue;
196                     }
197
198                     for existing in verified_augmented_script_sets.iter() {
199                         if existing.is_all() {
200                             continue;
201                         }
202                         let mut intersect = *existing;
203                         intersect.intersect_with(augment_script_set);
204                         if !intersect.is_empty() && !intersect.is_all() {
205                             continue 'outerloop;
206                         }
207                     }
208
209                     ch_list.sort();
210                     ch_list.dedup();
211                     lint_reports.insert((sp, ch_list), augment_script_set);
212                 }
213
214                 for ((sp, ch_list), script_set) in lint_reports {
215                     cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
216                         let message = format!(
217                             "The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
218                             script_set);
219                         let mut note = "The usage includes ".to_string();
220                         for (idx, ch) in ch_list.into_iter().enumerate() {
221                             if idx != 0 {
222                                 note += ", ";
223                             }
224                             let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
225                             note += &char_info;
226                         }
227                         note += ".";
228                         lint.build(&message).note(&note).note("Please recheck to make sure their usages are indeed what you want.").emit()
229                     });
230                 }
231             }
232         }
233     }
234 }
235
236 mod utils {
237     use rustc_span::symbol::SymbolStr;
238     use std::hash::{Hash, Hasher};
239     use std::ops::Deref;
240
241     pub(super) enum CowBoxSymStr {
242         Interned(SymbolStr),
243         Owned(Box<str>),
244     }
245
246     impl Deref for CowBoxSymStr {
247         type Target = str;
248
249         fn deref(&self) -> &str {
250             match self {
251                 CowBoxSymStr::Interned(interned) => interned,
252                 CowBoxSymStr::Owned(ref owned) => owned,
253             }
254         }
255     }
256
257     impl Hash for CowBoxSymStr {
258         #[inline]
259         fn hash<H: Hasher>(&self, state: &mut H) {
260             Hash::hash(&**self, state)
261         }
262     }
263
264     impl PartialEq<CowBoxSymStr> for CowBoxSymStr {
265         #[inline]
266         fn eq(&self, other: &CowBoxSymStr) -> bool {
267             PartialEq::eq(&**self, &**other)
268         }
269     }
270
271     impl Eq for CowBoxSymStr {}
272 }