src/libsyntax/parse/unescape.rs

   1 //! Utilities for validating string, char, byte and byte string literals and
   2 //! turning them into the values they represent.
   3
   4 use std::str::Chars;
   5 use std::ops::Range;
   6
   7 #[derive(Debug, PartialEq, Eq)]
   8 pub(crate) enum EscapeError {
   9     ZeroChars,
  10     MoreThanOneChar,
  11
  12     LoneSlash,
  13     InvalidEscape,
  14     BareCarriageReturn,
  15     EscapeOnlyChar,
  16
  17     TooShortHexEscape,
  18     InvalidCharInHexEscape,
  19     OutOfRangeHexEscape,
  20
  21     NoBraceInUnicodeEscape,
  22     InvalidCharInUnicodeEscape,
  23     EmptyUnicodeEscape,
  24     UnclosedUnicodeEscape,
  25     LeadingUnderscoreUnicodeEscape,
  26     OverlongUnicodeEscape,
  27     LoneSurrogateUnicodeEscape,
  28     OutOfRangeUnicodeEscape,
  29
  30     UnicodeEscapeInByte,
  31     NonAsciiCharInByte,
  32 }
  33
  34 /// Takes a contents of a char literal (without quotes), and returns an
  35 /// unescaped char or an error
  36 pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
  37     let mut chars = literal_text.chars();
  38     unescape_char_or_byte(&mut chars, Mode::Char)
  39         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
  40 }
  41
  42 /// Takes a contents of a string literal (without quotes) and produces a
  43 /// sequence of escaped characters or errors.
  44 pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
  45 where
  46     F: FnMut(Range<usize>, Result<char, EscapeError>),
  47 {
  48     unescape_str_or_byte_str(literal_text, Mode::Str, callback)
  49 }
  50
  51 pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
  52     let mut chars = literal_text.chars();
  53     unescape_char_or_byte(&mut chars, Mode::Byte)
  54         .map(byte_from_char)
  55         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
  56 }
  57
  58 /// Takes a contents of a string literal (without quotes) and produces a
  59 /// sequence of escaped characters or errors.
  60 pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
  61 where
  62     F: FnMut(Range<usize>, Result<u8, EscapeError>),
  63 {
  64     unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
  65         callback(range, char.map(byte_from_char))
  66     })
  67 }
  68
  69 #[derive(Debug, Clone, Copy)]
  70 pub(crate) enum Mode {
  71     Char,
  72     Str,
  73     Byte,
  74     ByteStr,
  75 }
  76
  77 impl Mode {
  78     fn in_single_quotes(self) -> bool {
  79         match self {
  80             Mode::Char | Mode::Byte => true,
  81             Mode::Str | Mode::ByteStr => false,
  82         }
  83     }
  84
  85     pub(crate) fn in_double_quotes(self) -> bool {
  86         !self.in_single_quotes()
  87     }
  88
  89     pub(crate) fn is_bytes(self) -> bool {
  90         match self {
  91             Mode::Byte | Mode::ByteStr => true,
  92             Mode::Char | Mode::Str => false,
  93         }
  94     }
  95 }
  96
  97
  98 fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
  99     if first_char != '\\' {
 100         return match first_char {
 101             '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
 102             '\r' => Err(if chars.clone().next() == Some('\n') {
 103                 EscapeError::EscapeOnlyChar
 104             } else {
 105                 EscapeError::BareCarriageReturn
 106             }),
 107             '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
 108             '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
 109             _ => {
 110                 if mode.is_bytes() && !first_char.is_ascii() {
 111                     return Err(EscapeError::NonAsciiCharInByte);
 112                 }
 113                 Ok(first_char)
 114             }
 115         };
 116     }
 117
 118     let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
 119
 120     let res = match second_char {
 121         '"' => '"',
 122         'n' => '\n',
 123         'r' => '\r',
 124         't' => '\t',
 125         '\\' => '\\',
 126         '\'' => '\'',
 127         '0' => '\0',
 128
 129         'x' => {
 130             let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
 131             let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 132
 133             let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
 134             let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 135
 136             let value = hi * 16 + lo;
 137
 138             if !mode.is_bytes() && !is_ascii(value) {
 139                 return Err(EscapeError::OutOfRangeHexEscape);
 140             }
 141             let value = value as u8;
 142
 143             value as char
 144         }
 145
 146         'u' => {
 147             if chars.next() != Some('{') {
 148                 return Err(EscapeError::NoBraceInUnicodeEscape);
 149             }
 150
 151             let mut n_digits = 1;
 152             let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
 153                 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
 154                 '}' => return Err(EscapeError::EmptyUnicodeEscape),
 155                 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
 156             };
 157
 158             loop {
 159                 match chars.next() {
 160                     None => return Err(EscapeError::UnclosedUnicodeEscape),
 161                     Some('_') => continue,
 162                     Some('}') => {
 163                         if n_digits > 6 {
 164                             return Err(EscapeError::OverlongUnicodeEscape);
 165                         }
 166                         if mode.is_bytes() {
 167                             return Err(EscapeError::UnicodeEscapeInByte);
 168                         }
 169
 170                         break std::char::from_u32(value).ok_or_else(|| {
 171                             if value > 0x10FFFF {
 172                                 EscapeError::OutOfRangeUnicodeEscape
 173                             } else {
 174                                 EscapeError::LoneSurrogateUnicodeEscape
 175                             }
 176                         })?;
 177                     }
 178                     Some(c) => {
 179                         let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
 180                         n_digits += 1;
 181                         if n_digits > 6 {
 182                             continue;
 183                         }
 184                         let digit = digit as u32;
 185                         value = value * 16 + digit;
 186                     }
 187                 };
 188             }
 189         }
 190         _ => return Err(EscapeError::InvalidEscape),
 191     };
 192     Ok(res)
 193 }
 194
 195 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
 196     let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
 197     let res = scan_escape(first_char, chars, mode)?;
 198     if chars.next().is_some() {
 199         return Err(EscapeError::MoreThanOneChar);
 200     }
 201     Ok(res)
 202 }
 203
 204 /// Takes a contents of a string literal (without quotes) and produces a
 205 /// sequence of escaped characters or errors.
 206 fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
 207 where
 208     F: FnMut(Range<usize>, Result<char, EscapeError>),
 209 {
 210     assert!(mode.in_double_quotes());
 211     let initial_len = src.len();
 212     let mut chars = src.chars();
 213     while let Some(first_char) = chars.next() {
 214         let start = initial_len - chars.as_str().len() - first_char.len_utf8();
 215
 216         let unescaped_char = match first_char {
 217             '\\' => {
 218                 let (second_char, third_char) = {
 219                     let mut chars = chars.clone();
 220                     (chars.next(), chars.next())
 221                 };
 222                 match (second_char, third_char) {
 223                     (Some('\n'), _) | (Some('\r'), Some('\n')) => {
 224                         skip_ascii_whitespace(&mut chars);
 225                         continue;
 226                     }
 227                     _ => scan_escape(first_char, &mut chars, mode),
 228                 }
 229             }
 230             '\r' => {
 231                 let second_char = chars.clone().next();
 232                 if second_char == Some('\n') {
 233                     chars.next();
 234                     Ok('\n')
 235                 } else {
 236                     scan_escape(first_char, &mut chars, mode)
 237                 }
 238             }
 239             '\n' => Ok('\n'),
 240             '\t' => Ok('\t'),
 241             _ => scan_escape(first_char, &mut chars, mode),
 242         };
 243         let end = initial_len - chars.as_str().len();
 244         callback(start..end, unescaped_char);
 245     }
 246
 247     fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
 248         let str = chars.as_str();
 249         let first_non_space = str
 250             .bytes()
 251             .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
 252             .unwrap_or(str.len());
 253         *chars = str[first_non_space..].chars()
 254     }
 255 }
 256
 257 fn byte_from_char(c: char) -> u8 {
 258     let res = c as u32;
 259     assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte");
 260     res as u8
 261 }
 262
 263 fn is_ascii(x: u32) -> bool {
 264     x <= 0x7F
 265 }
 266
 267 #[cfg(test)]
 268 mod tests {
 269     use super::*;
 270
 271     #[test]
 272     fn test_unescape_char_bad() {
 273         fn check(literal_text: &str, expected_error: EscapeError) {
 274             let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
 275             assert_eq!(actual_result, Err(expected_error));
 276         }
 277
 278         check("", EscapeError::ZeroChars);
 279         check(r"\", EscapeError::LoneSlash);
 280
 281         check("\n", EscapeError::EscapeOnlyChar);
 282         check("\r\n", EscapeError::EscapeOnlyChar);
 283         check("\t", EscapeError::EscapeOnlyChar);
 284         check("'", EscapeError::EscapeOnlyChar);
 285         check("\r", EscapeError::BareCarriageReturn);
 286
 287         check("spam", EscapeError::MoreThanOneChar);
 288         check(r"\x0ff", EscapeError::MoreThanOneChar);
 289         check(r#"\"a"#, EscapeError::MoreThanOneChar);
 290         check(r"\na", EscapeError::MoreThanOneChar);
 291         check(r"\ra", EscapeError::MoreThanOneChar);
 292         check(r"\ta", EscapeError::MoreThanOneChar);
 293         check(r"\\a", EscapeError::MoreThanOneChar);
 294         check(r"\'a", EscapeError::MoreThanOneChar);
 295         check(r"\0a", EscapeError::MoreThanOneChar);
 296         check(r"\u{0}x", EscapeError::MoreThanOneChar);
 297         check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
 298
 299         check(r"\v", EscapeError::InvalidEscape);
 300         check(r"\💩", EscapeError::InvalidEscape);
 301         check(r"\●", EscapeError::InvalidEscape);
 302
 303         check(r"\x", EscapeError::TooShortHexEscape);
 304         check(r"\x0", EscapeError::TooShortHexEscape);
 305         check(r"\xf", EscapeError::TooShortHexEscape);
 306         check(r"\xa", EscapeError::TooShortHexEscape);
 307         check(r"\xx", EscapeError::InvalidCharInHexEscape);
 308         check(r"\xы", EscapeError::InvalidCharInHexEscape);
 309         check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
 310         check(r"\xtt", EscapeError::InvalidCharInHexEscape);
 311         check(r"\xff", EscapeError::OutOfRangeHexEscape);
 312         check(r"\xFF", EscapeError::OutOfRangeHexEscape);
 313         check(r"\x80", EscapeError::OutOfRangeHexEscape);
 314
 315         check(r"\u", EscapeError::NoBraceInUnicodeEscape);
 316         check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
 317         check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
 318         check(r"\u{", EscapeError::UnclosedUnicodeEscape);
 319         check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
 320         check(r"\u{}", EscapeError::EmptyUnicodeEscape);
 321         check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
 322         check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
 323         check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
 324         check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
 325         check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
 326
 327         check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
 328         check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
 329         check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
 330
 331         check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
 332         check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
 333         check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
 334     }
 335
 336     #[test]
 337     fn test_unescape_char_good() {
 338         fn check(literal_text: &str, expected_char: char) {
 339             let actual_result = unescape_char(literal_text);
 340             assert_eq!(actual_result, Ok(expected_char));
 341         }
 342
 343         check("a", 'a');
 344         check("ы", 'ы');
 345         check("🦀", '🦀');
 346
 347         check(r#"\""#, '"');
 348         check(r"\n", '\n');
 349         check(r"\r", '\r');
 350         check(r"\t", '\t');
 351         check(r"\\", '\\');
 352         check(r"\'", '\'');
 353         check(r"\0", '\0');
 354
 355         check(r"\x00", '\0');
 356         check(r"\x5a", 'Z');
 357         check(r"\x5A", 'Z');
 358         check(r"\x7f", 127 as char);
 359
 360         check(r"\u{0}", '\0');
 361         check(r"\u{000000}", '\0');
 362         check(r"\u{41}", 'A');
 363         check(r"\u{0041}", 'A');
 364         check(r"\u{00_41}", 'A');
 365         check(r"\u{4__1__}", 'A');
 366         check(r"\u{1F63b}", '😻');
 367     }
 368
 369     #[test]
 370     fn test_unescape_str_good() {
 371         fn check(literal_text: &str, expected: &str) {
 372             let mut buf = Ok(String::with_capacity(literal_text.len()));
 373             unescape_str(literal_text, &mut |range, c| {
 374                 if let Ok(b) = &mut buf {
 375                     match c {
 376                         Ok(c) => b.push(c),
 377                         Err(e) => buf = Err((range, e)),
 378                     }
 379                 }
 380             });
 381             let buf = buf.as_ref().map(|it| it.as_ref());
 382             assert_eq!(buf, Ok(expected))
 383         }
 384
 385         check("foo", "foo");
 386         check("", "");
 387         check(" \t\n\r\n", " \t\n\n");
 388
 389         check("hello \\\n     world", "hello world");
 390         check("hello \\\r\n     world", "hello world");
 391         check("thread's", "thread's")
 392     }
 393
 394     #[test]
 395     fn test_unescape_byte_bad() {
 396         fn check(literal_text: &str, expected_error: EscapeError) {
 397             let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
 398             assert_eq!(actual_result, Err(expected_error));
 399         }
 400
 401         check("", EscapeError::ZeroChars);
 402         check(r"\", EscapeError::LoneSlash);
 403
 404         check("\n", EscapeError::EscapeOnlyChar);
 405         check("\r\n", EscapeError::EscapeOnlyChar);
 406         check("\t", EscapeError::EscapeOnlyChar);
 407         check("'", EscapeError::EscapeOnlyChar);
 408         check("\r", EscapeError::BareCarriageReturn);
 409
 410         check("spam", EscapeError::MoreThanOneChar);
 411         check(r"\x0ff", EscapeError::MoreThanOneChar);
 412         check(r#"\"a"#, EscapeError::MoreThanOneChar);
 413         check(r"\na", EscapeError::MoreThanOneChar);
 414         check(r"\ra", EscapeError::MoreThanOneChar);
 415         check(r"\ta", EscapeError::MoreThanOneChar);
 416         check(r"\\a", EscapeError::MoreThanOneChar);
 417         check(r"\'a", EscapeError::MoreThanOneChar);
 418         check(r"\0a", EscapeError::MoreThanOneChar);
 419
 420         check(r"\v", EscapeError::InvalidEscape);
 421         check(r"\💩", EscapeError::InvalidEscape);
 422         check(r"\●", EscapeError::InvalidEscape);
 423
 424         check(r"\x", EscapeError::TooShortHexEscape);
 425         check(r"\x0", EscapeError::TooShortHexEscape);
 426         check(r"\xa", EscapeError::TooShortHexEscape);
 427         check(r"\xf", EscapeError::TooShortHexEscape);
 428         check(r"\xx", EscapeError::InvalidCharInHexEscape);
 429         check(r"\xы", EscapeError::InvalidCharInHexEscape);
 430         check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
 431         check(r"\xtt", EscapeError::InvalidCharInHexEscape);
 432
 433         check(r"\u", EscapeError::NoBraceInUnicodeEscape);
 434         check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
 435         check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
 436         check(r"\u{", EscapeError::UnclosedUnicodeEscape);
 437         check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
 438         check(r"\u{}", EscapeError::EmptyUnicodeEscape);
 439         check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
 440         check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
 441
 442         check("ы", EscapeError::NonAsciiCharInByte);
 443         check("🦀", EscapeError::NonAsciiCharInByte);
 444
 445         check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
 446         check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
 447         check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
 448         check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
 449         check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
 450         check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
 451         check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
 452         check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
 453         check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
 454         check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
 455         check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
 456         check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
 457         check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
 458         check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
 459         check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
 460         check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
 461         check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
 462         check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
 463     }
 464
 465     #[test]
 466     fn test_unescape_byte_good() {
 467         fn check(literal_text: &str, expected_byte: u8) {
 468             let actual_result = unescape_byte(literal_text);
 469             assert_eq!(actual_result, Ok(expected_byte));
 470         }
 471
 472         check("a", b'a');
 473
 474         check(r#"\""#, b'"');
 475         check(r"\n", b'\n');
 476         check(r"\r", b'\r');
 477         check(r"\t", b'\t');
 478         check(r"\\", b'\\');
 479         check(r"\'", b'\'');
 480         check(r"\0", b'\0');
 481
 482         check(r"\x00", b'\0');
 483         check(r"\x5a", b'Z');
 484         check(r"\x5A", b'Z');
 485         check(r"\x7f", 127);
 486         check(r"\x80", 128);
 487         check(r"\xff", 255);
 488         check(r"\xFF", 255);
 489     }
 490
 491     #[test]
 492     fn test_unescape_byte_str_good() {
 493         fn check(literal_text: &str, expected: &[u8]) {
 494             let mut buf = Ok(Vec::with_capacity(literal_text.len()));
 495             unescape_byte_str(literal_text, &mut |range, c| {
 496                 if let Ok(b) = &mut buf {
 497                     match c {
 498                         Ok(c) => b.push(c),
 499                         Err(e) => buf = Err((range, e)),
 500                     }
 501                 }
 502             });
 503             let buf = buf.as_ref().map(|it| it.as_ref());
 504             assert_eq!(buf, Ok(expected))
 505         }
 506
 507         check("foo", b"foo");
 508         check("", b"");
 509         check(" \t\n\r\n", b" \t\n\n");
 510
 511         check("hello \\\n     world", b"hello world");
 512         check("hello \\\r\n     world", b"hello world");
 513         check("thread's", b"thread's")
 514     }
 515 }