1 use crate::{EarlyContext, EarlyLintPass, LintContext};
3 use rustc_data_structures::fx::FxHashMap;
4 use rustc_errors::fluent;
5 use rustc_span::symbol::Symbol;
8 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
12 /// ```rust,compile_fail
13 /// # #![allow(unused)]
14 /// #![deny(non_ascii_idents)]
24 /// This lint allows projects that wish to retain the limit of only using
25 /// ASCII characters to switch this lint to "forbid" (for example to ease
26 /// collaboration or for security reasons).
27 /// See [RFC 2457] for more details.
29 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
32 "detects non-ASCII identifiers",
37 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
43 /// # #![allow(unused)]
44 /// const µ: f64 = 0.000001;
51 /// This lint warns about using characters which are not commonly used, and may
52 /// cause visual confusion.
54 /// This lint is triggered by identifiers that contain a codepoint that is
55 /// not part of the set of "Allowed" codepoints as described by [Unicode®
56 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
57 /// Security Profile for Identifiers][TR39Allowed].
59 /// Note that the set of uncommon codepoints may change over time. Beware
60 /// that if you "forbid" this lint, existing code may fail in the
63 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
64 pub UNCOMMON_CODEPOINTS,
66 "detects uncommon Unicode codepoints in identifiers",
71 /// The `confusable_idents` lint detects visually confusable pairs between
77 /// // Latin Capital Letter E With Caron
78 /// pub const Ě: i32 = 1;
79 /// // Latin Capital Letter E With Breve
80 /// pub const Ĕ: i32 = 2;
87 /// This lint warns when different identifiers may appear visually similar,
88 /// which can cause confusion.
90 /// The confusable detection algorithm is based on [Unicode® Technical
91 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
92 /// Detection][TR39Confusable]. For every distinct identifier X execute
93 /// the function `skeleton(X)`. If there exist two distinct identifiers X
94 /// and Y in the same crate where `skeleton(X) = skeleton(Y)` report it.
95 /// The compiler uses the same mechanism to check if an identifier is too
96 /// similar to a keyword.
98 /// Note that the set of confusable characters may change over time.
99 /// Beware that if you "forbid" this lint, existing code may fail in
102 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
103 pub CONFUSABLE_IDENTS,
105 "detects visually confusable pairs between identifiers",
110 /// The `mixed_script_confusables` lint detects visually confusable
111 /// characters in identifiers between different [scripts].
113 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
118 /// // The Japanese katakana character エ can be confused with the Han character 工.
119 /// const エ: &'static str = "アイウ";
126 /// This lint warns when characters between different scripts may appear
127 /// visually similar, which can cause confusion.
129 /// If the crate contains other identifiers in the same script that have
130 /// non-confusable characters, then this lint will *not* be issued. For
131 /// example, if the example given above has another identifier with
132 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
133 /// that you are intentionally using katakana, and it will not warn about
136 /// Note that the set of confusable characters may change over time.
137 /// Beware that if you "forbid" this lint, existing code may fail in
139 pub MIXED_SCRIPT_CONFUSABLES,
141 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
// Declares the `NonAsciiIdents` lint pass together with the four lints
// (`NON_ASCII_IDENTS`, `UNCOMMON_CODEPOINTS`, `CONFUSABLE_IDENTS`,
// `MIXED_SCRIPT_CONFUSABLES`) that its `EarlyLintPass::check_crate`
// implementation reports.
145 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
147 impl EarlyLintPass for NonAsciiIdents {
148 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
149 use rustc_session::lint::Level;
150 use rustc_span::Span;
151 use std::collections::BTreeMap;
152 use unicode_security::GeneralSecurityProfile;
154 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
155 let check_uncommon_codepoints =
156 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
157 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
158 let check_mixed_script_confusables =
159 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
161 if !check_non_ascii_idents
162 && !check_uncommon_codepoints
163 && !check_confusable_idents
164 && !check_mixed_script_confusables
169 let mut has_non_ascii_idents = false;
170 let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock();
172 // Sort by `Span` so that error messages make sense with respect to the
173 // order of identifier locations in the code.
174 let mut symbols: Vec<_> = symbols.iter().collect();
175 symbols.sort_by_key(|k| k.1);
177 for (symbol, &sp) in symbols.iter() {
178 let symbol_str = symbol.as_str();
179 if symbol_str.is_ascii() {
182 has_non_ascii_idents = true;
183 cx.struct_span_lint(NON_ASCII_IDENTS, sp, |lint| {
184 lint.build(fluent::lint::identifier_non_ascii_char).emit();
186 if check_uncommon_codepoints
187 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
189 cx.struct_span_lint(UNCOMMON_CODEPOINTS, sp, |lint| {
190 lint.build(fluent::lint::identifier_uncommon_codepoints).emit();
195 if has_non_ascii_idents && check_confusable_idents {
196 let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
197 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
198 let mut skeleton_buf = String::new();
200 for (&symbol, &sp) in symbols.iter() {
201 use unicode_security::confusable_detection::skeleton;
203 let symbol_str = symbol.as_str();
204 let is_ascii = symbol_str.is_ascii();
206 // Get the skeleton as a `Symbol`.
207 skeleton_buf.clear();
208 skeleton_buf.extend(skeleton(&symbol_str));
209 let skeleton_sym = if *symbol_str == *skeleton_buf {
212 Symbol::intern(&skeleton_buf)
217 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
218 if !*existing_is_ascii || !is_ascii {
219 cx.struct_span_lint(CONFUSABLE_IDENTS, sp, |lint| {
220 lint.build(fluent::lint::confusable_identifier_pair)
221 .set_arg("existing_sym", *existing_symbol)
222 .set_arg("sym", symbol)
223 .span_label(*existing_span, fluent::lint::label)
227 if *existing_is_ascii && !is_ascii {
228 *existing_symbol = symbol;
230 *existing_is_ascii = is_ascii;
233 .or_insert((symbol, sp, is_ascii));
237 if has_non_ascii_idents && check_mixed_script_confusables {
238 use unicode_security::is_potential_mixed_script_confusable_char;
239 use unicode_security::mixed_script::AugmentedScriptSet;
242 enum ScriptSetUsage {
243 Suspicious(Vec<char>, Span),
247 let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
248 FxHashMap::default();
249 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
250 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
252 let mut has_suspicous = false;
253 for (symbol, &sp) in symbols.iter() {
254 let symbol_str = symbol.as_str();
255 for ch in symbol_str.chars() {
257 // all ascii characters are covered by exception.
260 if !GeneralSecurityProfile::identifier_allowed(ch) {
261 // this character is covered by `uncommon_codepoints` lint.
264 let augmented_script_set = AugmentedScriptSet::for_char(ch);
266 .entry(augmented_script_set)
267 .and_modify(|existing_state| {
268 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
269 if is_potential_mixed_script_confusable_char(ch) {
272 *existing_state = ScriptSetUsage::Verified;
277 if !is_potential_mixed_script_confusable_char(ch) {
278 ScriptSetUsage::Verified
280 has_suspicous = true;
281 ScriptSetUsage::Suspicious(vec![ch], sp)
288 let verified_augmented_script_sets = script_states
290 .flat_map(|(k, v)| match v {
291 ScriptSetUsage::Verified => Some(*k),
294 .collect::<Vec<_>>();
296 // we're sorting the output here.
297 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
300 'outerloop: for (augment_script_set, usage) in script_states {
301 let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
303 if augment_script_set.is_all() {
307 for existing in verified_augmented_script_sets.iter() {
308 if existing.is_all() {
311 let mut intersect = *existing;
312 intersect.intersect_with(augment_script_set);
313 if !intersect.is_empty() && !intersect.is_all() {
318 // We sort primitive chars here and can use unstable sort
319 ch_list.sort_unstable();
321 lint_reports.insert((sp, ch_list), augment_script_set);
324 for ((sp, ch_list), script_set) in lint_reports {
325 cx.struct_span_lint(MIXED_SCRIPT_CONFUSABLES, sp, |lint| {
326 let mut includes = String::new();
327 for (idx, ch) in ch_list.into_iter().enumerate() {
331 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
332 includes += &char_info;
334 lint.build(fluent::lint::mixed_script_confusables)
335 .set_arg("set", script_set.to_string())
336 .set_arg("includes", includes)
337 .note(fluent::lint::includes_note)
338 .note(fluent::lint::note)