src/librustc_lexer/src/unescape.rs

   1 //! Utilities for validating string and char literals and turning them into
   2 //! values they represent.
   3
   4 use std::str::Chars;
   5 use std::ops::Range;
   6
   7 #[cfg(test)]
   8 mod tests;
   9
  10 /// Errors that can occur during string unescaping.
  11 #[derive(Debug, PartialEq, Eq)]
  12 pub enum EscapeError {
  13     /// Expected 1 char, but 0 were found.
  14     ZeroChars,
  15     /// Expected 1 char, but more than 1 were found.
  16     MoreThanOneChar,
  17
  18     /// Escaped '\' character without continuation.
  19     LoneSlash,
  20     /// Invalid escape characted (e.g. '\z').
  21     InvalidEscape,
  22     /// Raw '\r' encountered.
  23     BareCarriageReturn,
  24     /// Raw '\r' encountered in raw string.
  25     BareCarriageReturnInRawString,
  26     /// Unescaped character that was expected to be escaped (e.g. raw '\t').
  27     EscapeOnlyChar,
  28
  29     /// Numeric character escape is too short (e.g. '\x1').
  30     TooShortHexEscape,
  31     /// Invalid character in numeric escape (e.g. '\xz')
  32     InvalidCharInHexEscape,
  33     /// Character code in numeric escape is non-ascii (e.g. '\xFF').
  34     OutOfRangeHexEscape,
  35
  36     /// '\u' not followed by '{'.
  37     NoBraceInUnicodeEscape,
  38     /// Non-hexadecimal value in '\u{..}'.
  39     InvalidCharInUnicodeEscape,
  40     /// '\u{}'
  41     EmptyUnicodeEscape,
  42     /// No closing brace in '\u{..}', e.g. '\u{12'.
  43     UnclosedUnicodeEscape,
  44     /// '\u{_12}'
  45     LeadingUnderscoreUnicodeEscape,
  46     /// More than 6 charactes in '\u{..}', e.g. '\u{10FFFF_FF}'
  47     OverlongUnicodeEscape,
  48     /// Invalid in-bound unicode character code, e.g. '\u{DFFF}'.
  49     LoneSurrogateUnicodeEscape,
  50     /// Out of bounds unicode character code, e.g. '\u{FFFFFF}'.
  51     OutOfRangeUnicodeEscape,
  52
  53     /// Unicode escape code in byte literal.
  54     UnicodeEscapeInByte,
  55     /// Non-ascii character in byte literal.
  56     NonAsciiCharInByte,
  57     /// Non-ascii character in byte string literal.
  58     NonAsciiCharInByteString,
  59 }
  60
  61 /// Takes a contents of a char literal (without quotes), and returns an
  62 /// unescaped char or an error
  63 pub fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
  64     let mut chars = literal_text.chars();
  65     unescape_char_or_byte(&mut chars, Mode::Char)
  66         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
  67 }
  68
  69 /// Takes a contents of a byte literal (without quotes), and returns an
  70 /// unescaped byte or an error.
  71 pub fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
  72     let mut chars = literal_text.chars();
  73     unescape_char_or_byte(&mut chars, Mode::Byte)
  74         .map(byte_from_char)
  75         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
  76 }
  77
  78 /// Takes a contents of a string literal (without quotes) and produces a
  79 /// sequence of escaped characters or errors.
  80 /// Values are returned through invoking of the provided callback.
  81 pub fn unescape_str<F>(literal_text: &str, callback: &mut F)
  82 where
  83     F: FnMut(Range<usize>, Result<char, EscapeError>),
  84 {
  85     unescape_str_or_byte_str(literal_text, Mode::Str, callback)
  86 }
  87
  88 /// Takes a contents of a byte string literal (without quotes) and produces a
  89 /// sequence of bytes or errors.
  90 /// Values are returned through invoking of the provided callback.
  91 pub fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
  92 where
  93     F: FnMut(Range<usize>, Result<u8, EscapeError>),
  94 {
  95     unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
  96         callback(range, char.map(byte_from_char))
  97     })
  98 }
  99
 100 /// Takes a contents of a raw string literal (without quotes) and produces a
 101 /// sequence of characters or errors.
 102 /// Values are returned through invoking of the provided callback.
 103 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 104 /// only translate CRLF to LF and produce errors on bare CR.
 105 pub fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
 106 where
 107     F: FnMut(Range<usize>, Result<char, EscapeError>),
 108 {
 109     unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback)
 110 }
 111
 112 /// Takes a contents of a raw byte string literal (without quotes) and produces a
 113 /// sequence of bytes or errors.
 114 /// Values are returned through invoking of the provided callback.
 115 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 116 /// only translate CRLF to LF and produce errors on bare CR.
 117 pub fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
 118 where
 119     F: FnMut(Range<usize>, Result<u8, EscapeError>),
 120 {
 121     unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
 122         callback(range, char.map(byte_from_char))
 123     })
 124 }
 125
 126 /// What kind of literal do we parse.
 127 #[derive(Debug, Clone, Copy)]
 128 pub enum Mode {
 129     Char,
 130     Str,
 131     Byte,
 132     ByteStr,
 133 }
 134
 135 impl Mode {
 136     pub fn in_single_quotes(self) -> bool {
 137         match self {
 138             Mode::Char | Mode::Byte => true,
 139             Mode::Str | Mode::ByteStr => false,
 140         }
 141     }
 142
 143     pub fn in_double_quotes(self) -> bool {
 144         !self.in_single_quotes()
 145     }
 146
 147     pub fn is_bytes(self) -> bool {
 148         match self {
 149             Mode::Byte | Mode::ByteStr => true,
 150             Mode::Char | Mode::Str => false,
 151         }
 152     }
 153 }
 154
 155
 156 fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
 157     if first_char != '\\' {
 158         // Previous character was not a slash, and we don't expect it to be
 159         // an escape-only character.
 160         return match first_char {
 161             '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
 162             '\r' => Err(EscapeError::BareCarriageReturn),
 163             '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
 164             '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
 165             _ => {
 166                 if mode.is_bytes() && !first_char.is_ascii() {
 167                     // Byte literal can't be a non-ascii character.
 168                     return Err(EscapeError::NonAsciiCharInByte);
 169                 }
 170                 Ok(first_char)
 171             }
 172         };
 173     }
 174
 175     // Previous character is '\\', try to unescape it.
 176
 177     let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
 178
 179     let res = match second_char {
 180         '"' => '"',
 181         'n' => '\n',
 182         'r' => '\r',
 183         't' => '\t',
 184         '\\' => '\\',
 185         '\'' => '\'',
 186         '0' => '\0',
 187
 188         'x' => {
 189             // Parse hexadecimal character code.
 190
 191             let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
 192             let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 193
 194             let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
 195             let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 196
 197             let value = hi * 16 + lo;
 198
 199             // For a byte literal verify that it is within ASCII range.
 200             if !mode.is_bytes() && !is_ascii(value) {
 201                 return Err(EscapeError::OutOfRangeHexEscape);
 202             }
 203             let value = value as u8;
 204
 205             value as char
 206         }
 207
 208         'u' => {
 209             // We've parsed '\u', now we have to parse '{..}'.
 210
 211             if chars.next() != Some('{') {
 212                 return Err(EscapeError::NoBraceInUnicodeEscape);
 213             }
 214
 215             // First characrer must be a hexadecimal digit.
 216             let mut n_digits = 1;
 217             let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
 218                 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
 219                 '}' => return Err(EscapeError::EmptyUnicodeEscape),
 220                 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
 221             };
 222
 223             // First character is valid, now parse the rest of the number
 224             // and closing brace.
 225             loop {
 226                 match chars.next() {
 227                     None => return Err(EscapeError::UnclosedUnicodeEscape),
 228                     Some('_') => continue,
 229                     Some('}') => {
 230                         if n_digits > 6 {
 231                             return Err(EscapeError::OverlongUnicodeEscape);
 232                         }
 233
 234                         // Incorrect syntax has higher priority for error reporting
 235                         // than unallowed value for a literal.
 236                         if mode.is_bytes() {
 237                             return Err(EscapeError::UnicodeEscapeInByte);
 238                         }
 239
 240                         break std::char::from_u32(value).ok_or_else(|| {
 241                             if value > 0x10FFFF {
 242                                 EscapeError::OutOfRangeUnicodeEscape
 243                             } else {
 244                                 EscapeError::LoneSurrogateUnicodeEscape
 245                             }
 246                         })?;
 247                     }
 248                     Some(c) => {
 249                         let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
 250                         n_digits += 1;
 251                         if n_digits > 6 {
 252                             // Stop updating value since we're sure that it's is incorrect already.
 253                             continue;
 254                         }
 255                         let digit = digit as u32;
 256                         value = value * 16 + digit;
 257                     }
 258                 };
 259             }
 260         }
 261         _ => return Err(EscapeError::InvalidEscape),
 262     };
 263     Ok(res)
 264 }
 265
 266 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
 267     let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
 268     let res = scan_escape(first_char, chars, mode)?;
 269     if chars.next().is_some() {
 270         return Err(EscapeError::MoreThanOneChar);
 271     }
 272     Ok(res)
 273 }
 274
 275 /// Takes a contents of a string literal (without quotes) and produces a
 276 /// sequence of escaped characters or errors.
 277 fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
 278 where
 279     F: FnMut(Range<usize>, Result<char, EscapeError>),
 280 {
 281     assert!(mode.in_double_quotes());
 282     let initial_len = src.len();
 283     let mut chars = src.chars();
 284     while let Some(first_char) = chars.next() {
 285         let start = initial_len - chars.as_str().len() - first_char.len_utf8();
 286
 287         let unescaped_char = match first_char {
 288             '\\' => {
 289                 let second_char = chars.clone().next();
 290                 match second_char {
 291                     Some('\n') => {
 292                         // Rust language specification requires us to skip whitespaces
 293                         // if unescaped '\' character is followed by '\n'.
 294                         // For details see [Rust language reference]
 295                         // (https://doc.rust-lang.org/reference/tokens.html#string-literals).
 296                         skip_ascii_whitespace(&mut chars);
 297                         continue;
 298                     }
 299                     _ => scan_escape(first_char, &mut chars, mode),
 300                 }
 301             }
 302             '\n' => Ok('\n'),
 303             '\t' => Ok('\t'),
 304             _ => scan_escape(first_char, &mut chars, mode),
 305         };
 306         let end = initial_len - chars.as_str().len();
 307         callback(start..end, unescaped_char);
 308     }
 309
 310     fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
 311         let str = chars.as_str();
 312         let first_non_space = str
 313             .bytes()
 314             .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
 315             .unwrap_or(str.len());
 316         *chars = str[first_non_space..].chars()
 317     }
 318 }
 319
 320 /// Takes a contents of a string literal (without quotes) and produces a
 321 /// sequence of characters or errors.
 322 /// NOTE: Raw strings do not perform any explicit character escaping, here we
 323 /// only translate CRLF to LF and produce errors on bare CR.
 324 fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
 325 where
 326     F: FnMut(Range<usize>, Result<char, EscapeError>),
 327 {
 328     assert!(mode.in_double_quotes());
 329     let initial_len = literal_text.len();
 330
 331     let mut chars = literal_text.chars();
 332     while let Some(curr) = chars.next() {
 333         let start = initial_len - chars.as_str().len() - curr.len_utf8();
 334
 335         let result = match curr {
 336             '\r' => Err(EscapeError::BareCarriageReturnInRawString),
 337             c if mode.is_bytes() && !c.is_ascii() =>
 338                 Err(EscapeError::NonAsciiCharInByteString),
 339             c => Ok(c),
 340         };
 341         let end = initial_len - chars.as_str().len();
 342
 343         callback(start..end, result);
 344     }
 345 }
 346
 347 fn byte_from_char(c: char) -> u8 {
 348     let res = c as u32;
 349     assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)");
 350     res as u8
 351 }
 352
 353 fn is_ascii(x: u32) -> bool {
 354     x <= 0x7F
 355 }