compiler/rustc_span/src/lev_distance.rs

   1 //! Levenshtein distances.
   2 //!
   3 //! The [Levenshtein distance] is a metric for measuring the difference between two strings.
   4 //!
   5 //! [Levenshtein distance]: https://en.wikipedia.org/wiki/Levenshtein_distance
   6
   7 use crate::symbol::Symbol;
   8 use std::cmp;
   9
  10 #[cfg(test)]
  11 mod tests;
  12
  13 /// Finds the Levenshtein distance between two strings.
  14 ///
  15 /// Returns None if the distance exceeds the limit.
  16 pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
  17     let n = a.chars().count();
  18     let m = b.chars().count();
  19     let min_dist = if n < m { m - n } else { n - m };
  20
  21     if min_dist > limit {
  22         return None;
  23     }
  24     if n == 0 || m == 0 {
  25         return (min_dist <= limit).then_some(min_dist);
  26     }
  27
  28     let mut dcol: Vec<_> = (0..=m).collect();
  29
  30     for (i, sc) in a.chars().enumerate() {
  31         let mut current = i;
  32         dcol[0] = current + 1;
  33
  34         for (j, tc) in b.chars().enumerate() {
  35             let next = dcol[j + 1];
  36             if sc == tc {
  37                 dcol[j + 1] = current;
  38             } else {
  39                 dcol[j + 1] = cmp::min(current, next);
  40                 dcol[j + 1] = cmp::min(dcol[j + 1], dcol[j]) + 1;
  41             }
  42             current = next;
  43         }
  44     }
  45
  46     (dcol[m] <= limit).then_some(dcol[m])
  47 }
  48
  49 /// Provides a word similarity score between two words that accounts for substrings being more
  50 /// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
  51 /// 0 is an identical match.
  52 ///
  53 /// Uses the Levenshtein distance between the two strings and removes the cost of the length
  54 /// difference. If this is 0 then it is either a substring match or a full word match, in the
  55 /// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
  56 /// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words
  57 /// is not greater than twice the length of the other. For cases where the words are close in size
  58 /// but not an exact substring then the cost of the length difference is discounted by half.
  59 ///
  60 /// Returns `None` if the distance exceeds the limit.
  61 pub fn lev_distance_with_substrings(a: &str, b: &str, limit: usize) -> Option<usize> {
  62     let n = a.chars().count();
  63     let m = b.chars().count();
  64
  65     // Check one isn't less than half the length of the other. If this is true then there is a
  66     // big difference in length.
  67     let big_len_diff = (n * 2) < m || (m * 2) < n;
  68     let len_diff = if n < m { m - n } else { n - m };
  69     let lev = lev_distance(a, b, limit + len_diff)?;
  70
  71     // This is the crux, subtracting length difference means exact substring matches will now be 0
  72     let score = lev - len_diff;
  73
  74     // If the score is 0 but the words have different lengths then it's a substring match not a full
  75     // word match
  76     let score = if score == 0 && len_diff > 0 && !big_len_diff {
  77         1 // Exact substring match, but not a total word match so return non-zero
  78     } else if !big_len_diff {
  79         // Not a big difference in length, discount cost of length difference
  80         score + (len_diff + 1) / 2
  81     } else {
  82         // A big difference in length, add back the difference in length to the score
  83         score + len_diff
  84     };
  85
  86     (score <= limit).then_some(score)
  87 }
  88
  89 /// Finds the best match for given word in the given iterator where substrings are meaningful.
  90 ///
  91 /// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
  92 /// for word similarity. This takes an optional distance limit which defaults to one-third of the
  93 /// given word.
  94 ///
  95 /// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
  96 /// on an edge case with a lower(upper)case letters mismatch.
  97 pub fn find_best_match_for_name_with_substrings(
  98     candidates: &[Symbol],
  99     lookup: Symbol,
 100     dist: Option<usize>,
 101 ) -> Option<Symbol> {
 102     find_best_match_for_name_impl(true, candidates, lookup, dist)
 103 }
 104
 105 /// Finds the best match for a given word in the given iterator.
 106 ///
 107 /// As a loose rule to avoid the obviously incorrect suggestions, it takes
 108 /// an optional limit for the maximum allowable edit distance, which defaults
 109 /// to one-third of the given word.
 110 ///
 111 /// Besides Levenshtein, we use case insensitive comparison to improve accuracy
 112 /// on an edge case with a lower(upper)case letters mismatch.
 113 pub fn find_best_match_for_name(
 114     candidates: &[Symbol],
 115     lookup: Symbol,
 116     dist: Option<usize>,
 117 ) -> Option<Symbol> {
 118     find_best_match_for_name_impl(false, candidates, lookup, dist)
 119 }
 120
 121 #[cold]
 122 fn find_best_match_for_name_impl(
 123     use_substring_score: bool,
 124     candidates: &[Symbol],
 125     lookup: Symbol,
 126     dist: Option<usize>,
 127 ) -> Option<Symbol> {
 128     let lookup = lookup.as_str();
 129     let lookup_uppercase = lookup.to_uppercase();
 130
 131     // Priority of matches:
 132     // 1. Exact case insensitive match
 133     // 2. Levenshtein distance match
 134     // 3. Sorted word match
 135     if let Some(c) = candidates.iter().find(|c| c.as_str().to_uppercase() == lookup_uppercase) {
 136         return Some(*c);
 137     }
 138
 139     let mut dist = dist.unwrap_or_else(|| cmp::max(lookup.len(), 3) / 3);
 140     let mut best = None;
 141     for c in candidates {
 142         match if use_substring_score {
 143             lev_distance_with_substrings(lookup, c.as_str(), dist)
 144         } else {
 145             lev_distance(lookup, c.as_str(), dist)
 146         } {
 147             Some(0) => return Some(*c),
 148             Some(d) => {
 149                 dist = d - 1;
 150                 best = Some(*c);
 151             }
 152             None => {}
 153         }
 154     }
 155     if best.is_some() {
 156         return best;
 157     }
 158
 159     find_match_by_sorted_words(candidates, lookup)
 160 }
 161
 162 fn find_match_by_sorted_words(iter_names: &[Symbol], lookup: &str) -> Option<Symbol> {
 163     iter_names.iter().fold(None, |result, candidate| {
 164         if sort_by_words(candidate.as_str()) == sort_by_words(lookup) {
 165             Some(*candidate)
 166         } else {
 167             result
 168         }
 169     })
 170 }
 171
 172 fn sort_by_words(name: &str) -> String {
 173     let mut split_words: Vec<&str> = name.split('_').collect();
 174     // We are sorting primitive &strs and can use unstable sort here.
 175     split_words.sort_unstable();
 176     split_words.join("_")
 177 }