1 #![deny(rustc::untranslatable_diagnostic)]
2 #![deny(rustc::diagnostic_outside_of_impl)]
4 ConfusableIdentifierPair, IdentifierNonAsciiChar, IdentifierUncommonCodepoints,
5 MixedScriptConfusables,
7 use crate::{EarlyContext, EarlyLintPass, LintContext};
9 use rustc_data_structures::fx::FxHashMap;
10 use rustc_span::symbol::Symbol;
13 /// The `non_ascii_idents` lint detects non-ASCII identifiers.
17 /// ```rust,compile_fail
18 /// # #![allow(unused)]
19 /// #![deny(non_ascii_idents)]
29 /// This lint allows projects that wish to retain the limit of only using
30 /// ASCII characters to switch this lint to "forbid" (for example to ease
31 /// collaboration or for security reasons).
32 /// See [RFC 2457] for more details.
34 /// [RFC 2457]: https://github.com/rust-lang/rfcs/blob/master/text/2457-non-ascii-idents.md
37 "detects non-ASCII identifiers",
42 /// The `uncommon_codepoints` lint detects uncommon Unicode codepoints in
48 /// # #![allow(unused)]
49 /// const µ: f64 = 0.000001;
56 /// This lint warns about using characters which are not commonly used, and may
57 /// cause visual confusion.
59 /// This lint is triggered by identifiers that contain a codepoint that is
60 /// not part of the set of "Allowed" codepoints as described by [Unicode®
61 /// Technical Standard #39 Unicode Security Mechanisms Section 3.1 General
62 /// Security Profile for Identifiers][TR39Allowed].
64 /// Note that the set of uncommon codepoints may change over time. Beware
65 /// that if you "forbid" this lint that existing code may fail in the
68 /// [TR39Allowed]: https://www.unicode.org/reports/tr39/#General_Security_Profile
69 pub UNCOMMON_CODEPOINTS,
71 "detects uncommon Unicode codepoints in identifiers",
76 /// The `confusable_idents` lint detects visually confusable pairs between
82 /// // Latin Capital Letter E With Caron
83 /// pub const Ě: i32 = 1;
84 /// // Latin Capital Letter E With Breve
85 /// pub const Ĕ: i32 = 2;
92 /// This lint warns when different identifiers may appear visually similar,
93 /// which can cause confusion.
95 /// The confusable detection algorithm is based on [Unicode® Technical
96 /// Standard #39 Unicode Security Mechanisms Section 4 Confusable
97 /// Detection][TR39Confusable]. For every distinct identifier X, execute
98 /// the function `skeleton(X)`. If there exist two distinct identifiers X
99 /// and Y in the same crate where `skeleton(X) = skeleton(Y)`, report it.
100 /// The compiler uses the same mechanism to check if an identifier is too
101 /// similar to a keyword.
103 /// Note that the set of confusable characters may change over time.
104 /// Beware that if you "forbid" this lint that existing code may fail in
107 /// [TR39Confusable]: https://www.unicode.org/reports/tr39/#Confusable_Detection
108 pub CONFUSABLE_IDENTS,
110 "detects visually confusable pairs between identifiers",
115 /// The `mixed_script_confusables` lint detects visually confusable
116 /// characters in identifiers between different [scripts].
118 /// [scripts]: https://en.wikipedia.org/wiki/Script_(Unicode)
123 /// // The Japanese katakana character エ can be confused with the Han character 工.
124 /// const エ: &'static str = "アイウ";
131 /// This lint warns when characters between different scripts may appear
132 /// visually similar, which can cause confusion.
134 /// If the crate contains other identifiers in the same script that have
135 /// non-confusable characters, then this lint will *not* be issued. For
136 /// example, if the example given above has another identifier with
137 /// katakana characters (such as `let カタカナ = 123;`), then this indicates
138 /// that you are intentionally using katakana, and it will not warn about
141 /// Note that the set of confusable characters may change over time.
142 /// Beware that if you "forbid" this lint that existing code may fail in
144 pub MIXED_SCRIPT_CONFUSABLES,
146 "detects Unicode scripts whose mixed script confusables codepoints are solely used",
// Declares the `NonAsciiIdents` lint pass and lists the four lints it covers:
// `NON_ASCII_IDENTS`, `UNCOMMON_CODEPOINTS`, `CONFUSABLE_IDENTS` and
// `MIXED_SCRIPT_CONFUSABLES`. The pass itself is implemented as an
// `EarlyLintPass` (see the `impl EarlyLintPass for NonAsciiIdents` block).
150 declare_lint_pass!(NonAsciiIdents => [NON_ASCII_IDENTS, UNCOMMON_CODEPOINTS, CONFUSABLE_IDENTS, MIXED_SCRIPT_CONFUSABLES]);
152 impl EarlyLintPass for NonAsciiIdents {
153 fn check_crate(&mut self, cx: &EarlyContext<'_>, _: &ast::Crate) {
154 use rustc_session::lint::Level;
155 use rustc_span::Span;
156 use std::collections::BTreeMap;
157 use unicode_security::GeneralSecurityProfile;
159 let check_non_ascii_idents = cx.builder.lint_level(NON_ASCII_IDENTS).0 != Level::Allow;
160 let check_uncommon_codepoints =
161 cx.builder.lint_level(UNCOMMON_CODEPOINTS).0 != Level::Allow;
162 let check_confusable_idents = cx.builder.lint_level(CONFUSABLE_IDENTS).0 != Level::Allow;
163 let check_mixed_script_confusables =
164 cx.builder.lint_level(MIXED_SCRIPT_CONFUSABLES).0 != Level::Allow;
166 if !check_non_ascii_idents
167 && !check_uncommon_codepoints
168 && !check_confusable_idents
169 && !check_mixed_script_confusables
174 let mut has_non_ascii_idents = false;
175 let symbols = cx.sess().parse_sess.symbol_gallery.symbols.lock();
177 // Sort by `Span` so that error messages make sense with respect to the
178 // order of identifier locations in the code.
179 let mut symbols: Vec<_> = symbols.iter().collect();
180 symbols.sort_by_key(|k| k.1);
182 for (symbol, &sp) in symbols.iter() {
183 let symbol_str = symbol.as_str();
184 if symbol_str.is_ascii() {
187 has_non_ascii_idents = true;
188 cx.emit_spanned_lint(NON_ASCII_IDENTS, sp, IdentifierNonAsciiChar);
189 if check_uncommon_codepoints
190 && !symbol_str.chars().all(GeneralSecurityProfile::identifier_allowed)
192 cx.emit_spanned_lint(UNCOMMON_CODEPOINTS, sp, IdentifierUncommonCodepoints);
196 if has_non_ascii_idents && check_confusable_idents {
197 let mut skeleton_map: FxHashMap<Symbol, (Symbol, Span, bool)> =
198 FxHashMap::with_capacity_and_hasher(symbols.len(), Default::default());
199 let mut skeleton_buf = String::new();
201 for (&symbol, &sp) in symbols.iter() {
202 use unicode_security::confusable_detection::skeleton;
204 let symbol_str = symbol.as_str();
205 let is_ascii = symbol_str.is_ascii();
207 // Get the skeleton as a `Symbol`.
208 skeleton_buf.clear();
209 skeleton_buf.extend(skeleton(&symbol_str));
210 let skeleton_sym = if *symbol_str == *skeleton_buf {
213 Symbol::intern(&skeleton_buf)
218 .and_modify(|(existing_symbol, existing_span, existing_is_ascii)| {
219 if !*existing_is_ascii || !is_ascii {
220 cx.emit_spanned_lint(
223 ConfusableIdentifierPair {
224 existing_sym: *existing_symbol,
226 label: *existing_span,
230 if *existing_is_ascii && !is_ascii {
231 *existing_symbol = symbol;
233 *existing_is_ascii = is_ascii;
236 .or_insert((symbol, sp, is_ascii));
240 if has_non_ascii_idents && check_mixed_script_confusables {
241 use unicode_security::is_potential_mixed_script_confusable_char;
242 use unicode_security::mixed_script::AugmentedScriptSet;
245 enum ScriptSetUsage {
246 Suspicious(Vec<char>, Span),
250 let mut script_states: FxHashMap<AugmentedScriptSet, ScriptSetUsage> =
251 FxHashMap::default();
252 let latin_augmented_script_set = AugmentedScriptSet::for_char('A');
253 script_states.insert(latin_augmented_script_set, ScriptSetUsage::Verified);
255 let mut has_suspicous = false;
256 for (symbol, &sp) in symbols.iter() {
257 let symbol_str = symbol.as_str();
258 for ch in symbol_str.chars() {
260 // All ASCII characters are covered by an exception.
263 if !GeneralSecurityProfile::identifier_allowed(ch) {
264 // this character is covered by `uncommon_codepoints` lint.
267 let augmented_script_set = AugmentedScriptSet::for_char(ch);
269 .entry(augmented_script_set)
270 .and_modify(|existing_state| {
271 if let ScriptSetUsage::Suspicious(ch_list, _) = existing_state {
272 if is_potential_mixed_script_confusable_char(ch) {
275 *existing_state = ScriptSetUsage::Verified;
280 if !is_potential_mixed_script_confusable_char(ch) {
281 ScriptSetUsage::Verified
283 has_suspicous = true;
284 ScriptSetUsage::Suspicious(vec![ch], sp)
291 let verified_augmented_script_sets = script_states
293 .flat_map(|(k, v)| match v {
294 ScriptSetUsage::Verified => Some(*k),
297 .collect::<Vec<_>>();
299 // we're sorting the output here.
300 let mut lint_reports: BTreeMap<(Span, Vec<char>), AugmentedScriptSet> =
303 'outerloop: for (augment_script_set, usage) in script_states {
304 let ScriptSetUsage::Suspicious(mut ch_list, sp) = usage else { continue };
306 if augment_script_set.is_all() {
310 for existing in verified_augmented_script_sets.iter() {
311 if existing.is_all() {
314 let mut intersect = *existing;
315 intersect.intersect_with(augment_script_set);
316 if !intersect.is_empty() && !intersect.is_all() {
321 // We sort primitive chars here and can use unstable sort
322 ch_list.sort_unstable();
324 lint_reports.insert((sp, ch_list), augment_script_set);
327 for ((sp, ch_list), script_set) in lint_reports {
328 let mut includes = String::new();
329 for (idx, ch) in ch_list.into_iter().enumerate() {
333 let char_info = format!("'{}' (U+{:04X})", ch, ch as u32);
334 includes += &char_info;
336 cx.emit_spanned_lint(
337 MIXED_SCRIPT_CONFUSABLES,
339 MixedScriptConfusables { set: script_set.to_string(), includes },