1 use crate::{EarlyContext, EarlyLintPass, LintContext};
3 use rustc_data_structures::fx::FxHashMap;
4 use rustc_span::symbol::Symbol;
9 "detects non-ASCII identifiers",
14 pub UNCOMMON_CODEPOINTS,
16 "detects uncommon Unicode codepoints in identifiers",
21 pub CONFUSABLE_IDENTS,
23 "detects visually confusable pairs between identifiers",
28 pub MIXED_SCRIPT_CONFUSABLES,
30 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
// Declares the `NonAsciiIdents` lint pass and lists the four lints associated with it:
// `NON_ASCII_IDENTS`, `UNCOMMON_CODEPOINTS`, `CONFUSABLE_IDENTS` and `MIXED_SCRIPT_CONFUSABLES`.
// The pass's behavior is provided by the `EarlyLintPass` impl for `NonAsciiIdents`.
34 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
36 impl EarlyLintPass for NonAsciiIdents {
37 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
38 use rustc_session::lint::Level;
40 use std::collections::BTreeMap;
41 use unicode_security::GeneralSecurityProfile;
43 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
44 let check_uncommon_codepoints =
45 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
46 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
47 let check_mixed_script_confusables =
48 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
50 if !check_non_ascii_idents
51 && !check_uncommon_codepoints
52 && !check_confusable_idents
53 && !check_mixed_script_confusables
58 let mut has_non_ascii_idents = false;
59 let symbols = cx.sess.parse_sess.symbol_gallery.symbols.lock();
61 // Sort by `Span` so that error messages make sense with respect to the
62 // order of identifier locations in the code.
63 let mut symbols: Vec<_> = symbols.iter().collect();
64 symbols.sort_by_key(|k| k.1);
66 for (symbol, &sp) in symbols.iter() {
67 let symbol_str = symbol.as_str();
68 if symbol_str.is_ascii() {
71 has_non_ascii_idents = true;
72 cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
73 lint.build("identifier contains non-ASCII characters").emit()
75 if check_uncommon_codepoints
76 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
78 cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
79 lint.build("identifier contains uncommon Unicode codepoints").emit()
84 if has_non_ascii_idents && check_confusable_idents {
85 let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
86 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
87 let mut skeleton_buf = String::new();
89 for (&symbol, &sp) in symbols.iter() {
90 use unicode_security::confusable_detection::skeleton;
92 let symbol_str = symbol.as_str();
93 let is_ascii = symbol_str.is_ascii();
95 // Get the skeleton as a `Symbol`.
97 skeleton_buf.extend(skeleton(&symbol_str));
98 let skeleton_sym = if *symbol_str == *skeleton_buf {
101 Symbol::intern(&skeleton_buf)
106 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
107 if !*existing_is_ascii || !is_ascii {
108 cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
110 "identifier pair considered confusable between `{}` and `{}`",
111 existing_symbol.as_str(),
116 "this is where the previous identifier occurred",
121 if *existing_is_ascii && !is_ascii {
122 *existing_symbol = symbol;
124 *existing_is_ascii = is_ascii;
127 .or_insert((symbol, sp, is_ascii));
131 if has_non_ascii_idents && check_mixed_script_confusables {
132 use unicode_security::is_potential_mixed_script_confusable_char;
133 use unicode_security::mixed_script::AugmentedScriptSet;
136 enum ScriptSetUsage {
137 Suspicious(Vec<char>, Span),
141 let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
142 FxHashMap::default();
143 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
144 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
146 let mut has_suspicous = false;
147 for (symbol, &sp) in symbols.iter() {
148 let symbol_str = symbol.as_str();
149 for ch in symbol_str.chars() {
151 // all ascii characters are covered by exception.
154 if !GeneralSecurityProfile::identifier_allowed(ch) {
155 // this character is covered by `uncommon_codepoints` lint.
158 let augmented_script_set = AugmentedScriptSet::for_char(ch);
160 .entry(augmented_script_set)
161 .and_modify(|existing_state| {
162 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
163 if is_potential_mixed_script_confusable_char(ch) {
166 *existing_state = ScriptSetUsage::Verified;
171 if !is_potential_mixed_script_confusable_char(ch) {
172 ScriptSetUsage::Verified
174 has_suspicous = true;
175 ScriptSetUsage::Suspicious(vec![ch], sp)
182 let verified_augmented_script_sets = script_states
184 .flat_map(|(k, v)| match v {
185 ScriptSetUsage::Verified => Some(*k),
188 .collect::<Vec<_>>();
190 // we're sorting the output here.
191 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
194 'outerloop: for (augment_script_set, usage) in script_states {
195 let (mut ch_list, sp) = match usage {
196 ScriptSetUsage::Verified => continue,
197 ScriptSetUsage::Suspicious(ch_list, sp) => (ch_list, sp),
200 if augment_script_set.is_all() {
204 for existing in verified_augmented_script_sets.iter() {
205 if existing.is_all() {
208 let mut intersect = *existing;
209 intersect.intersect_with(augment_script_set);
210 if !intersect.is_empty() && !intersect.is_all() {
217 lint_reports.insert((sp, ch_list), augment_script_set);
220 for ((sp, ch_list), script_set) in lint_reports {
221 cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
222 let message = format!(
223 "The usage of Script Group `{}` in this crate consists solely of mixed script confusables",
225 let mut note = "The usage includes ".to_string();
226 for (idx, ch) in ch_list.into_iter().enumerate() {
230 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
234 lint.build(&message).note(¬e).note("Please recheck to make sure their usages are indeed what you want.").emit()