1 //! Utilities for validating string and char literals and turning them into
2 //! values they represent.
/// Errors that can be reported while unescaping the contents of a literal.
7 #[derive(Debug, PartialEq, Eq)]
8 pub(crate) enum EscapeError {
/// A raw string contains a `\r` that is not immediately followed by `\n`.
15 BareCarriageReturnInRawString,
/// A character of a `\x` escape is not a hexadecimal digit (e.g. `\xx`).
19 InvalidCharInHexEscape,
/// `\u` is not immediately followed by `{` (e.g. `\u[0123]`).
22 NoBraceInUnicodeEscape,
/// A character inside the braces of a `\u{...}` escape is not a
/// hexadecimal digit (e.g. `\u{0x}`).
23 InvalidCharInUnicodeEscape,
/// The input ended before the closing `}` of a `\u{...}` escape.
25 UnclosedUnicodeEscape,
/// The first character after `\u{` is `_`.
26 LeadingUnderscoreUnicodeEscape,
/// The `\u{...}` escape has too many digits (e.g. `\u{0000000}`).
27 OverlongUnicodeEscape,
/// The `\u{...}` value is at most 0x10FFFF but is still not a valid `char`
/// (e.g. `\u{D800}`).
28 LoneSurrogateUnicodeEscape,
/// The `\u{...}` value is greater than 0x10FFFF.
29 OutOfRangeUnicodeEscape,
/// A raw byte string contains a non-ASCII character.
33 NonAsciiCharInByteString,
36 /// Takes the contents of a char literal (without quotes), and returns the
37 /// unescaped char or an error.
///
/// On failure the error is paired with the number of bytes of `literal_text`
/// that had been consumed when the error was detected.
38 pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
39 let mut chars = literal_text.chars();
// `chars.as_str()` is the unconsumed tail, so the length difference below
// is the byte offset that was reached.
40 unescape_char_or_byte(&mut chars, Mode::Char)
41 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
44 /// Takes the contents of a string literal (without quotes) and produces a
45 /// sequence of unescaped characters or errors.
///
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was produced from.
46 pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
48 F: FnMut(Range<usize>, Result<char, EscapeError>),
50 unescape_str_or_byte_str(literal_text, Mode::Str, callback)
/// Takes the contents of a byte literal (without quotes), and returns the
/// unescaped byte or an error.
///
/// On failure the error is paired with the number of bytes of `literal_text`
/// that had been consumed when the error was detected.
53 pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
54 let mut chars = literal_text.chars();
// `chars.as_str()` is the unconsumed tail, so the length difference below
// is the byte offset that was reached.
55 unescape_char_or_byte(&mut chars, Mode::Byte)
57 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
60 /// Takes the contents of a byte string literal (without quotes) and produces a
61 /// sequence of unescaped bytes or errors.
///
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was produced from.
62 pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
64 F: FnMut(Range<usize>, Result<u8, EscapeError>),
// The shared implementation yields `char`s; successful ones are narrowed to
// bytes with `byte_from_char` before reaching the caller's callback.
66 unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
67 callback(range, char.map(byte_from_char))
71 /// Takes the contents of a raw string literal (without quotes) and produces a
72 /// sequence of characters or errors.
73 /// NOTE: Raw strings do not perform any explicit character escaping, here we
74 /// only translate CRLF to LF and produce errors on bare CR.
///
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was produced from.
75 pub(crate) fn unescape_raw_str<F>(literal_text: &str, callback: &mut F)
77 F: FnMut(Range<usize>, Result<char, EscapeError>),
79 unescape_raw_str_or_byte_str(literal_text, Mode::Str, callback)
82 /// Takes the contents of a raw byte string literal (without quotes) and produces a
83 /// sequence of bytes or errors.
84 /// NOTE: Raw strings do not perform any explicit character escaping, here we
85 /// only translate CRLF to LF and produce errors on bare CR.
///
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was produced from.
86 pub(crate) fn unescape_raw_byte_str<F>(literal_text: &str, callback: &mut F)
88 F: FnMut(Range<usize>, Result<u8, EscapeError>),
// The shared implementation yields `char`s; successful ones are narrowed to
// bytes with `byte_from_char` before reaching the caller's callback.
90 unescape_raw_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
91 callback(range, char.map(byte_from_char))
/// The kind of literal being unescaped: `Char`, `Byte`, `Str` or `ByteStr`.
95 #[derive(Debug, Clone, Copy)]
96 pub(crate) enum Mode {
/// Returns `true` for the single-quoted modes, `Char` and `Byte`.
104 fn in_single_quotes(self) -> bool {
106 Mode::Char | Mode::Byte => true,
107 Mode::Str | Mode::ByteStr => false,
/// Returns `true` for the double-quoted modes, `Str` and `ByteStr`.
111 pub(crate) fn in_double_quotes(self) -> bool {
112 !self.in_single_quotes()
/// Returns `true` for the byte-oriented modes, `Byte` and `ByteStr`.
115 pub(crate) fn is_bytes(self) -> bool {
117 Mode::Byte | Mode::ByteStr => true,
118 Mode::Char | Mode::Str => false,
/// Unescapes one character or escape sequence whose first character,
/// `first_char`, has already been taken from `chars`.
///
/// Any further characters belonging to the escape are consumed from `chars`.
/// `mode` decides which quote character must be written escaped and whether
/// non-ASCII characters and `\u{...}` escapes are rejected.
124 fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
// Not a backslash: the character is either rejected here because it may only
// appear in escaped form, or checked against the byte-mode ASCII restriction.
125 if first_char != '\\' {
126 return match first_char {
127 '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
// Peek on a clone so nothing is consumed: a CR directly followed by LF is
// reported like a newline, a lone CR gets its own error.
128 '\r' => Err(if chars.clone().next() == Some('\n') {
129 EscapeError::EscapeOnlyChar
131 EscapeError::BareCarriageReturn
// Only the quote that delimits the current kind of literal must be escaped.
133 '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
134 '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
136 if mode.is_bytes() && !first_char.is_ascii() {
137 return Err(EscapeError::NonAsciiCharInByte);
// A backslash must be followed by at least one more character.
144 let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
146 let res = match second_char {
// Hex escape: two hexadecimal digits, high nibble first.
156 let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
157 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
159 let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
160 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
162 let value = hi * 16 + lo;
// Outside the byte modes a hex escape must denote an ASCII value.
164 if !mode.is_bytes() && !is_ascii(value) {
165 return Err(EscapeError::OutOfRangeHexEscape);
167 let value = value as u8;
// Unicode escape: the opening brace is mandatory.
173 if chars.next() != Some('{') {
174 return Err(EscapeError::NoBraceInUnicodeEscape);
// The first character inside the braces is handled separately so that a
// leading `_` and an empty `{}` each get a dedicated error.
177 let mut n_digits = 1;
178 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
179 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
180 '}' => return Err(EscapeError::EmptyUnicodeEscape),
181 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
186 None => return Err(EscapeError::UnclosedUnicodeEscape),
// Underscores after the first digit are skipped.
187 Some('_') => continue,
190 return Err(EscapeError::OverlongUnicodeEscape);
193 return Err(EscapeError::UnicodeEscapeInByte);
// `from_u32` rejects both values above 0x10FFFF and surrogates; the closure
// tells the two cases apart.
196 break std::char::from_u32(value).ok_or_else(|| {
197 if value > 0x10FFFF {
198 EscapeError::OutOfRangeUnicodeEscape
200 EscapeError::LoneSurrogateUnicodeEscape
205 let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
// Accumulate the value one hexadecimal digit at a time.
210 let digit = digit as u32;
211 value = value * 16 + digit;
// Any other character after the backslash is not a recognised escape.
216 _ => return Err(EscapeError::InvalidEscape),
/// Unescapes `chars` as the contents of a single char or byte literal.
///
/// Fails with `ZeroChars` when `chars` is empty and with `MoreThanOneChar`
/// when anything is left in `chars` after the first character or escape.
221 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
222 let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
223 let res = scan_escape(first_char, chars, mode)?;
// The literal must contain exactly one character or escape.
224 if chars.next().is_some() {
225 return Err(EscapeError::MoreThanOneChar);
230 /// Takes the contents of a string literal (without quotes) and produces a
231 /// sequence of unescaped characters or errors.
///
/// Each result is handed to `callback` together with the byte range of `src`
/// it was produced from. `mode` must be one of the double-quoted modes;
/// this is asserted.
232 fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
234 F: FnMut(Range<usize>, Result<char, EscapeError>),
236 assert!(mode.in_double_quotes());
237 let initial_len = src.len();
238 let mut chars = src.chars();
239 while let Some(first_char) = chars.next() {
// Byte offset of `first_char` in `src`: the bytes consumed so far minus the
// width of `first_char` itself.
240 let start = initial_len - chars.as_str().len() - first_char.len_utf8();
242 let unescaped_char = match first_char {
// Peek at the next two characters on a clone so that nothing is consumed.
244 let (second_char, third_char) = {
245 let mut chars = chars.clone();
246 (chars.next(), chars.next())
248 match (second_char, third_char) {
// The next character starts a line break (LF or CRLF): skip the whitespace
// that follows.
249 (Some('\n'), _) | (Some('\r'), Some('\n')) => {
250 skip_ascii_whitespace(&mut chars);
253 _ => scan_escape(first_char, &mut chars, mode),
// Peek at the following character to see whether it is an LF.
257 let second_char = chars.clone().next();
258 if second_char == Some('\n') {
262 scan_escape(first_char, &mut chars, mode)
267 _ => scan_escape(first_char, &mut chars, mode),
// Byte offset just past everything consumed for this item.
269 let end = initial_len - chars.as_str().len();
270 callback(start..end, unescaped_char);
/// Advances `chars` past any leading spaces, tabs, line feeds and carriage
/// returns. If only such characters remain, `chars` ends up empty.
273 fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
274 let str = chars.as_str();
// Index of the first character that is not one of the four skipped ones,
// or the end of the remaining input when there is none.
275 let first_non_space = str
277 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
278 .unwrap_or(str.len());
279 *chars = str[first_non_space..].chars()
283 /// Takes the contents of a raw string literal (without quotes) and produces a
284 /// sequence of characters or errors.
285 /// NOTE: Raw strings do not perform any explicit character escaping, here we
286 /// only translate CRLF to LF and produce errors on bare CR.
///
/// In the byte modes a non-ASCII character is reported as an error as well.
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was produced from. `mode` must be one of the
/// double-quoted modes; this is asserted.
287 fn unescape_raw_str_or_byte_str<F>(literal_text: &str, mode: Mode, callback: &mut F)
289 F: FnMut(Range<usize>, Result<char, EscapeError>),
291 assert!(mode.in_double_quotes());
292 let initial_len = literal_text.len();
294 let mut chars = literal_text.chars();
295 while let Some(curr) = chars.next() {
// Byte offset of `curr` in `literal_text`: the bytes consumed so far minus
// the width of `curr` itself.
296 let start = initial_len - chars.as_str().len() - curr.len_utf8();
// Look one character ahead on a clone so that CRLF can be told apart from a
// bare CR without consuming anything.
298 let result = match (curr, chars.clone().next()) {
299 ('\r', Some('\n')) => {
303 ('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
304 (c, _) if mode.is_bytes() && !c.is_ascii() =>
305 Err(EscapeError::NonAsciiCharInByteString),
// Byte offset just past everything consumed for this item.
308 let end = initial_len - chars.as_str().len();
310 callback(start..end, result);
/// Converts `c` to a byte.
///
/// Panics if the value does not fit in a `u8`; per the assertion message this
/// cannot happen for characters produced in the byte modes.
314 fn byte_from_char(c: char) -> u8 {
316 assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte(Str)");
/// Reports whether `x` is an ASCII value (presumably `x <= 0x7F` — confirm).
/// Used to reject `\x` escapes outside the ASCII range in the non-byte modes.
320 fn is_ascii(x: u32) -> bool {
// Inputs that `unescape_char` must reject, with the error expected for each.
329 fn test_unescape_char_bad() {
// Compares only the error kind; the offset in the error tuple is discarded.
330 fn check(literal_text: &str, expected_error: EscapeError) {
331 let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
332 assert_eq!(actual_result, Err(expected_error));
// Empty input and a lone backslash.
335 check("", EscapeError::ZeroChars);
336 check(r"\", EscapeError::LoneSlash);
// Characters that may only appear in escaped form inside a char literal.
338 check("\n", EscapeError::EscapeOnlyChar);
339 check("\r\n", EscapeError::EscapeOnlyChar);
340 check("\t", EscapeError::EscapeOnlyChar);
341 check("'", EscapeError::EscapeOnlyChar);
342 check("\r", EscapeError::BareCarriageReturn);
// A valid first character or escape followed by extra input.
344 check("spam", EscapeError::MoreThanOneChar);
345 check(r"\x0ff", EscapeError::MoreThanOneChar);
346 check(r#"\"a"#, EscapeError::MoreThanOneChar);
347 check(r"\na", EscapeError::MoreThanOneChar);
348 check(r"\ra", EscapeError::MoreThanOneChar);
349 check(r"\ta", EscapeError::MoreThanOneChar);
350 check(r"\\a", EscapeError::MoreThanOneChar);
351 check(r"\'a", EscapeError::MoreThanOneChar);
352 check(r"\0a", EscapeError::MoreThanOneChar);
353 check(r"\u{0}x", EscapeError::MoreThanOneChar);
354 check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
// Backslash followed by a character that does not start a known escape.
356 check(r"\v", EscapeError::InvalidEscape);
357 check(r"\💩", EscapeError::InvalidEscape);
358 check(r"\●", EscapeError::InvalidEscape);
// Malformed or out-of-range `\x` escapes.
360 check(r"\x", EscapeError::TooShortHexEscape);
361 check(r"\x0", EscapeError::TooShortHexEscape);
362 check(r"\xf", EscapeError::TooShortHexEscape);
363 check(r"\xa", EscapeError::TooShortHexEscape);
364 check(r"\xx", EscapeError::InvalidCharInHexEscape);
365 check(r"\xы", EscapeError::InvalidCharInHexEscape);
366 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
367 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
368 check(r"\xff", EscapeError::OutOfRangeHexEscape);
369 check(r"\xFF", EscapeError::OutOfRangeHexEscape);
370 check(r"\x80", EscapeError::OutOfRangeHexEscape);
// Malformed `\u{...}` escapes.
372 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
373 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
374 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
375 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
376 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
377 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
378 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
379 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
380 check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
381 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
// NOTE(review): the next check repeats the previous one verbatim.
382 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
// Values in the range DC00..=DFFF.
384 check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
385 check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
386 check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
// Values in the range D800..=DBFF.
388 check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
389 check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
390 check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
// Inputs that `unescape_char` must accept, with the char expected for each.
394 fn test_unescape_char_good() {
395 fn check(literal_text: &str, expected_char: char) {
396 let actual_result = unescape_char(literal_text);
397 assert_eq!(actual_result, Ok(expected_char));
// Hex escapes at both ends of the ASCII range.
412 check(r"\x00", '\0');
415 check(r"\x7f", 127 as char);
// Unicode escapes, including leading zeros and `_` separators after the
// first digit.
417 check(r"\u{0}", '\0');
418 check(r"\u{000000}", '\0');
419 check(r"\u{41}", 'A');
420 check(r"\u{0041}", 'A');
421 check(r"\u{00_41}", 'A');
422 check(r"\u{4__1__}", 'A');
423 check(r"\u{1F63b}", '😻');
// Inputs that `unescape_str` must accept, with the resulting string for each.
427 fn test_unescape_str_good() {
// Collects the callback results into a `String`; on an error, `buf` is
// replaced by that error together with its range.
428 fn check(literal_text: &str, expected: &str) {
429 let mut buf = Ok(String::with_capacity(literal_text.len()));
430 unescape_str(literal_text, &mut |range, c| {
431 if let Ok(b) = &mut buf {
434 Err(e) => buf = Err((range, e)),
438 let buf = buf.as_ref().map(|it| it.as_ref());
439 assert_eq!(buf, Ok(expected))
// CRLF is translated to LF.
444 check(" \t\n\r\n", " \t\n\n");
// A backslash before a line break removes the break and the whitespace after it.
446 check("hello \\\n world", "hello world");
447 check("hello \\\r\n world", "hello world");
// A single quote needs no escaping inside a string literal.
448 check("thread's", "thread's")
// Inputs that `unescape_byte` must reject, with the error expected for each.
452 fn test_unescape_byte_bad() {
// Compares only the error kind; the offset in the error tuple is discarded.
453 fn check(literal_text: &str, expected_error: EscapeError) {
454 let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
455 assert_eq!(actual_result, Err(expected_error));
// Empty input and a lone backslash.
458 check("", EscapeError::ZeroChars);
459 check(r"\", EscapeError::LoneSlash);
// Characters that may only appear in escaped form inside a byte literal.
461 check("\n", EscapeError::EscapeOnlyChar);
462 check("\r\n", EscapeError::EscapeOnlyChar);
463 check("\t", EscapeError::EscapeOnlyChar);
464 check("'", EscapeError::EscapeOnlyChar);
465 check("\r", EscapeError::BareCarriageReturn);
// A valid first character or escape followed by extra input.
467 check("spam", EscapeError::MoreThanOneChar);
468 check(r"\x0ff", EscapeError::MoreThanOneChar);
469 check(r#"\"a"#, EscapeError::MoreThanOneChar);
470 check(r"\na", EscapeError::MoreThanOneChar);
471 check(r"\ra", EscapeError::MoreThanOneChar);
472 check(r"\ta", EscapeError::MoreThanOneChar);
473 check(r"\\a", EscapeError::MoreThanOneChar);
474 check(r"\'a", EscapeError::MoreThanOneChar);
475 check(r"\0a", EscapeError::MoreThanOneChar);
// Backslash followed by a character that does not start a known escape.
477 check(r"\v", EscapeError::InvalidEscape);
478 check(r"\💩", EscapeError::InvalidEscape);
479 check(r"\●", EscapeError::InvalidEscape);
// Malformed `\x` escapes.
481 check(r"\x", EscapeError::TooShortHexEscape);
482 check(r"\x0", EscapeError::TooShortHexEscape);
483 check(r"\xa", EscapeError::TooShortHexEscape);
484 check(r"\xf", EscapeError::TooShortHexEscape);
485 check(r"\xx", EscapeError::InvalidCharInHexEscape);
486 check(r"\xы", EscapeError::InvalidCharInHexEscape);
487 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
488 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
// Malformed `\u{...}` escapes report their syntax error.
490 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
491 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
492 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
493 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
494 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
495 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
496 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
497 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
// Non-ASCII characters are not allowed in a byte literal.
499 check("ы", EscapeError::NonAsciiCharInByte);
500 check("🦀", EscapeError::NonAsciiCharInByte);
// Syntactically complete `\u{...}` escapes are rejected in a byte literal,
// whatever their value and whatever follows them.
502 check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
503 check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
504 check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
505 check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
506 check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
507 check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
508 check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
509 check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
510 check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
511 check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
512 check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
// NOTE(review): the next check repeats the previous one verbatim.
513 check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
514 check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
515 check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
516 check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
517 check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
518 check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
519 check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
// Inputs that `unescape_byte` must accept, with the byte expected for each.
523 fn test_unescape_byte_good() {
524 fn check(literal_text: &str, expected_byte: u8) {
525 let actual_result = unescape_byte(literal_text);
526 assert_eq!(actual_result, Ok(expected_byte));
// An escaped double quote is accepted in a byte literal.
531 check(r#"\""#, b'"');
// Hex digits are accepted in either case.
539 check(r"\x00", b'\0');
540 check(r"\x5a", b'Z');
541 check(r"\x5A", b'Z');
// Inputs that `unescape_byte_str` must accept, with the resulting bytes for each.
549 fn test_unescape_byte_str_good() {
// Collects the callback results into a `Vec`; on an error, `buf` is
// replaced by that error together with its range.
550 fn check(literal_text: &str, expected: &[u8]) {
551 let mut buf = Ok(Vec::with_capacity(literal_text.len()));
552 unescape_byte_str(literal_text, &mut |range, c| {
553 if let Ok(b) = &mut buf {
556 Err(e) => buf = Err((range, e)),
560 let buf = buf.as_ref().map(|it| it.as_ref());
561 assert_eq!(buf, Ok(expected))
564 check("foo", b"foo");
// CRLF is translated to LF.
566 check(" \t\n\r\n", b" \t\n\n");
// A backslash before a line break removes the break and the whitespace after it.
568 check("hello \\\n world", b"hello world");
569 check("hello \\\r\n world", b"hello world");
// A single quote needs no escaping inside a byte string literal.
570 check("thread's", b"thread's")