1 //! Utilities for validating string and char literals and turning them into
2 //! the values they represent.
/// Reasons why unescaping the contents of a literal can fail.
///
/// Only part of the variant list is visible in this excerpt; the notes on the
/// visible variants describe where `scan_escape` returns them.
7 #[derive(Debug, PartialEq, Eq)]
8 pub(crate) enum EscapeError {
/// One of the two characters of a hex escape fails `to_digit(16)`.
18 InvalidCharInHexEscape,
/// `\u` is not immediately followed by `{`.
21 NoBraceInUnicodeEscape,
/// A character inside the braces of a unicode escape fails `to_digit(16)`.
22 InvalidCharInUnicodeEscape,
/// The input ended before the closing `}` of a unicode escape.
24 UnclosedUnicodeEscape,
/// The first character after `\u{` is `_`.
25 LeadingUnderscoreUnicodeEscape,
/// Too many hex digits. NOTE(review): the guard is on a line not shown here;
/// the tests expect this error for the seven-digit `\u{0000000}`.
26 OverlongUnicodeEscape,
/// `char::from_u32` rejected a value that is not above 0x10FFFF.
27 LoneSurrogateUnicodeEscape,
/// `char::from_u32` rejected a value above 0x10FFFF.
28 OutOfRangeUnicodeEscape,
34 /// Takes the contents of a char literal (without quotes) and returns the
35 /// unescaped char or an error.
///
/// On failure the error is paired with a byte offset into `literal_text`:
/// how far the scanner had consumed the input when it gave up.
36 pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
37 let mut chars = literal_text.chars();
38 unescape_char_or_byte(&mut chars, Mode::Char)
// Bytes consumed so far = total length minus what is still unread in `chars`.
39 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
42 /// Takes the contents of a string literal (without quotes) and produces a
43 /// sequence of unescaped characters or errors.
///
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was decoded from. This is a thin wrapper that runs
/// `unescape_str_or_byte_str` in `Mode::Str`.
44 pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
46 F: FnMut(Range<usize>, Result<char, EscapeError>),
48 unescape_str_or_byte_str(literal_text, Mode::Str, callback)
/// Byte-literal counterpart of `unescape_char`: scans `literal_text` in
/// `Mode::Byte` and returns the unescaped byte or an error.
///
/// On failure the error is paired with a byte offset into `literal_text`:
/// how far the scanner had consumed the input when it gave up.
///
/// NOTE(review): the step that narrows the scanned `char` to the returned
/// `u8` is on a line not shown in this excerpt (presumably `byte_from_char`);
/// confirm against the full file.
51 pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
52 let mut chars = literal_text.chars();
53 unescape_char_or_byte(&mut chars, Mode::Byte)
// Bytes consumed so far = total length minus what is still unread in `chars`.
55 .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
58 /// Takes the contents of a byte string literal (without quotes) and produces a
59 /// sequence of unescaped bytes or errors.
///
/// Each result is handed to `callback` together with the byte range of
/// `literal_text` it was decoded from. The scanning itself is done by
/// `unescape_str_or_byte_str` in `Mode::ByteStr`, which works on `char`s.
60 pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
62 F: FnMut(Range<usize>, Result<u8, EscapeError>),
64 unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
// Successful chars are narrowed to `u8`; errors are forwarded unchanged.
65 callback(range, char.map(byte_from_char))
/// The kind of literal being unescaped. The variant list is not visible in
/// this excerpt; the methods below match on `Char`, `Byte`, `Str` and
/// `ByteStr`.
69 #[derive(Debug, Clone, Copy)]
70 pub(crate) enum Mode {
/// Returns `true` for `Char` and `Byte`, `false` for `Str` and `ByteStr`.
/// `scan_escape` uses this to decide whether a bare `'` must be escaped.
78 fn in_single_quotes(self) -> bool {
80 Mode::Char | Mode::Byte => true,
81 Mode::Str | Mode::ByteStr => false,
/// Exact negation of `in_single_quotes`: `true` for `Str` and `ByteStr`.
85 pub(crate) fn in_double_quotes(self) -> bool {
86 !self.in_single_quotes()
/// Returns `true` for `Byte` and `ByteStr`, the modes whose results are
/// bytes rather than chars.
89 pub(crate) fn is_bytes(self) -> bool {
91 Mode::Byte | Mode::ByteStr => true,
92 Mode::Char | Mode::Str => false,
/// Decodes one (possibly escaped) character. `first_char` has already been
/// taken from `chars`; the rest of the escape, if any, is consumed from
/// `chars`.
///
/// Input that does not start with a backslash is checked for characters that
/// may only appear escaped. Input that does start with a backslash is decoded
/// according to the character that follows it. Several arm headers and guards
/// of this function are on lines not shown in this excerpt, so the inline
/// notes below cover only what is visible.
98 fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
99 if first_char != '\\' {
100 return match first_char {
101 '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
// Peek without consuming: a `\r` directly followed by `\n` gets the same
// error as a bare newline, while a lone `\r` gets its own error.
102 '\r' => Err(if chars.clone().next() == Some('\n') {
103 EscapeError::EscapeOnlyChar
105 EscapeError::BareCarriageReturn
// A quote is only rejected when it matches the delimiter kind of the
// literal being scanned.
107 '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
108 '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
// Byte and byte-string literals may only contain ASCII characters.
110 if mode.is_bytes() && !first_char.is_ascii() {
111 return Err(EscapeError::NonAsciiCharInByte);
// From here on `first_char` is a backslash; the next char selects the escape.
118 let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
120 let res = match second_char {
// Hex escape: exactly two hex digits, high nibble first.
130 let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
131 let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
133 let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
134 let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
136 let value = hi * 16 + lo;
// Outside the byte modes the value must pass `is_ascii`; the byte modes
// accept any two-digit value.
138 if !mode.is_bytes() && !is_ascii(value) {
139 return Err(EscapeError::OutOfRangeHexEscape);
// Two hex digits are at most 0xFF, so this cast cannot truncate.
141 let value = value as u8;
// Unicode escape: the opening brace is mandatory.
147 if chars.next() != Some('{') {
148 return Err(EscapeError::NoBraceInUnicodeEscape);
// The first char after `{` is consumed right below and must be a digit,
// which is why the digit count starts at 1.
151 let mut n_digits = 1;
152 let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
153 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
154 '}' => return Err(EscapeError::EmptyUnicodeEscape),
155 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
// Remaining chars of the escape; `_` separators are skipped.
160 None => return Err(EscapeError::UnclosedUnicodeEscape),
161 Some('_') => continue,
// NOTE(review): the guards of the next two returns are on lines not shown.
// The byte tests expect an overlong escape to be reported in preference to
// `UnicodeEscapeInByte`, which matches the order of these returns.
164 return Err(EscapeError::OverlongUnicodeEscape);
167 return Err(EscapeError::UnicodeEscapeInByte);
// `from_u32` returns `None` for values that are not valid chars; the closure
// picks the error depending on whether the value exceeds 0x10FFFF.
170 break std::char::from_u32(value).ok_or_else(|| {
171 if value > 0x10FFFF {
172 EscapeError::OutOfRangeUnicodeEscape
174 EscapeError::LoneSurrogateUnicodeEscape
// Any other char inside the braces must be a hex digit.
179 let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
// `to_digit` already yields a `u32`, so this cast does not change the value.
184 let digit = digit as u32;
// Accumulate the value one hex digit at a time.
185 value = value * 16 + digit;
// Any other char after the backslash is not a recognised escape.
190 _ => return Err(EscapeError::InvalidEscape),
/// Decodes the contents of a char or byte literal, which must consist of
/// exactly one (possibly escaped) character.
///
/// Fails with `ZeroChars` when `chars` is empty, with whatever `scan_escape`
/// reports for a malformed character, and with `MoreThanOneChar` when input
/// is left over after the first character. The success return is on a line
/// not shown in this excerpt; presumably it yields `res`.
195 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
196 let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
197 let res = scan_escape(first_char, chars, mode)?;
// Anything still unread means the literal held more than one character.
198 if chars.next().is_some() {
199 return Err(EscapeError::MoreThanOneChar);
204 /// Takes the contents of a string literal (without quotes) and produces a
205 /// sequence of unescaped characters or errors.
///
/// Shared implementation behind `unescape_str` and `unescape_byte_str`. Every
/// decoded character or error is passed to `callback` with the byte range of
/// `src` it came from.
///
/// # Panics
///
/// Panics if `mode.in_double_quotes()` is false.
206 fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
208 F: FnMut(Range<usize>, Result<char, EscapeError>),
210 assert!(mode.in_double_quotes());
211 let initial_len = src.len();
212 let mut chars = src.chars();
213 while let Some(first_char) = chars.next() {
// Byte offset of `first_char`: `chars` has already moved past it, so its
// UTF-8 length is subtracted again.
214 let start = initial_len - chars.as_str().len() - first_char.len_utf8();
216 let unescaped_char = match first_char {
// Look ahead two chars on a throwaway clone so nothing is consumed.
218 let (second_char, third_char) = {
219 let mut chars = chars.clone();
220 (chars.next(), chars.next())
222 match (second_char, third_char) {
// The next char is a line break (`\n` or `\r\n`): drop the line break and
// the ASCII whitespace after it. The tests expect a backslash followed by
// a line break to produce no output at all.
223 (Some('\n'), _) | (Some('\r'), Some('\n')) => {
224 skip_ascii_whitespace(&mut chars);
227 _ => scan_escape(first_char, &mut chars, mode),
// NOTE(review): the arm header and the lines handling a following `\n`
// are not shown here. The tests expect a `\r\n` pair to come out as a
// single `\n`; otherwise the char goes through `scan_escape`.
231 let second_char = chars.clone().next();
232 if second_char == Some('\n') {
236 scan_escape(first_char, &mut chars, mode)
241 _ => scan_escape(first_char, &mut chars, mode),
// End offset: everything consumed from `src` so far.
243 let end = initial_len - chars.as_str().len();
244 callback(start..end, unescaped_char);
/// Advances `chars` past any leading run of spaces, tabs, `\n` and `\r`.
/// If the remaining text consists only of such characters, `chars` ends up
/// empty.
247 fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
248 let str = chars.as_str();
// Index of the first byte that is not one of the four whitespace bytes, or
// the length of the remainder when there is none.
249 let first_non_space = str
251 .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
252 .unwrap_or(str.len());
// Only ASCII bytes were skipped, so the index is on a char boundary.
253 *chars = str[first_non_space..].chars()
/// Narrows a `char` produced in a byte mode to a `u8`.
///
/// # Panics
///
/// Panics if the asserted value does not fit into a `u8`.
///
/// NOTE(review): `res` is defined on a line not shown in this excerpt;
/// presumably it is the code point of `c`. Confirm against the full file.
257 fn byte_from_char(c: char) -> u8 {
259 assert!(res <= u8::max_value() as u32, "guaranteed because of Mode::Byte");
/// Predicate `scan_escape` applies to the value of a hex escape outside the
/// byte modes.
///
/// NOTE(review): the body is not shown in this excerpt. The tests expect
/// `\x7f` to be accepted and `\x80` to be rejected in char mode.
263 fn is_ascii(x: u32) -> bool {
// Every malformed char literal below must fail with the given error.
272 fn test_unescape_char_bad() {
// Compares only the error kind; the reported offset is discarded.
273 fn check(literal_text: &str, expected_error: EscapeError) {
274 let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
275 assert_eq!(actual_result, Err(expected_error));
// Empty literal and a backslash with nothing after it.
278 check("", EscapeError::ZeroChars);
279 check(r"\", EscapeError::LoneSlash);
// Characters that may only appear in escaped form.
281 check("\n", EscapeError::EscapeOnlyChar);
282 check("\r\n", EscapeError::EscapeOnlyChar);
283 check("\t", EscapeError::EscapeOnlyChar);
284 check("'", EscapeError::EscapeOnlyChar);
285 check("\r", EscapeError::BareCarriageReturn);
// A valid first character followed by trailing input.
287 check("spam", EscapeError::MoreThanOneChar);
288 check(r"\x0ff", EscapeError::MoreThanOneChar);
289 check(r#"\"a"#, EscapeError::MoreThanOneChar);
290 check(r"\na", EscapeError::MoreThanOneChar);
291 check(r"\ra", EscapeError::MoreThanOneChar);
292 check(r"\ta", EscapeError::MoreThanOneChar);
293 check(r"\\a", EscapeError::MoreThanOneChar);
294 check(r"\'a", EscapeError::MoreThanOneChar);
295 check(r"\0a", EscapeError::MoreThanOneChar);
296 check(r"\u{0}x", EscapeError::MoreThanOneChar);
297 check(r"\u{1F63b}}", EscapeError::MoreThanOneChar);
// Unknown escape characters, including non-ASCII ones.
299 check(r"\v", EscapeError::InvalidEscape);
300 check(r"\💩", EscapeError::InvalidEscape);
301 check(r"\●", EscapeError::InvalidEscape);
// Hex escapes: too short, non-hex digits, and values of 0x80 or above.
303 check(r"\x", EscapeError::TooShortHexEscape);
304 check(r"\x0", EscapeError::TooShortHexEscape);
305 check(r"\xf", EscapeError::TooShortHexEscape);
306 check(r"\xa", EscapeError::TooShortHexEscape);
307 check(r"\xx", EscapeError::InvalidCharInHexEscape);
308 check(r"\xы", EscapeError::InvalidCharInHexEscape);
309 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
310 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
311 check(r"\xff", EscapeError::OutOfRangeHexEscape);
312 check(r"\xFF", EscapeError::OutOfRangeHexEscape);
313 check(r"\x80", EscapeError::OutOfRangeHexEscape);
// Unicode escapes: malformed syntax and out-of-range values.
315 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
316 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
317 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
318 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
319 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
320 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
321 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
322 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
323 check(r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape);
324 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
// NOTE(review): the next check repeats the previous one verbatim.
325 check(r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape);
// Values in the range 0xDC00..=0xDFFF.
327 check(r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape);
328 check(r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape);
329 check(r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape);
// Values in the range 0xD800..=0xDBFF.
331 check(r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape);
332 check(r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape);
333 check(r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape);
// Well-formed char literals and the chars they must decode to.
337 fn test_unescape_char_good() {
338 fn check(literal_text: &str, expected_char: char) {
339 let actual_result = unescape_char(literal_text);
340 assert_eq!(actual_result, Ok(expected_char));
// Hex escapes at both ends of the range accepted in char mode.
355 check(r"\x00", '\0');
358 check(r"\x7f", 127 as char);
// Unicode escapes: leading zeros and `_` separators do not change the value.
360 check(r"\u{0}", '\0');
361 check(r"\u{000000}", '\0');
362 check(r"\u{41}", 'A');
363 check(r"\u{0041}", 'A');
364 check(r"\u{00_41}", 'A');
365 check(r"\u{4__1__}", 'A');
366 check(r"\u{1F63b}", '😻');
// Well-formed string literal contents and the strings they must decode to.
370 fn test_unescape_str_good() {
// `buf` holds the accumulated output, or the first error with its range.
371 fn check(literal_text: &str, expected: &str) {
372 let mut buf = Ok(String::with_capacity(literal_text.len()));
373 unescape_str(literal_text, &mut |range, c| {
374 if let Ok(b) = &mut buf {
377 Err(e) => buf = Err((range, e)),
// Borrow the result as `&str` so it can be compared with `expected`.
381 let buf = buf.as_ref().map(|it| it.as_ref());
382 assert_eq!(buf, Ok(expected))
// Unescaped whitespace is allowed in strings; `\r\n` comes out as `\n`.
387 check(" \t\n\r\n", " \t\n\n");
// A backslash before a line break removes the break and the whitespace
// that follows it.
389 check("hello \\\n world", "hello world");
390 check("hello \\\r\n world", "hello world");
// A single quote needs no escaping inside a string.
391 check("thread's", "thread's")
// Every malformed byte literal below must fail with the given error.
395 fn test_unescape_byte_bad() {
// Compares only the error kind; the reported offset is discarded.
396 fn check(literal_text: &str, expected_error: EscapeError) {
397 let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
398 assert_eq!(actual_result, Err(expected_error));
// Empty literal and a backslash with nothing after it.
401 check("", EscapeError::ZeroChars);
402 check(r"\", EscapeError::LoneSlash);
// Characters that may only appear in escaped form.
404 check("\n", EscapeError::EscapeOnlyChar);
405 check("\r\n", EscapeError::EscapeOnlyChar);
406 check("\t", EscapeError::EscapeOnlyChar);
407 check("'", EscapeError::EscapeOnlyChar);
408 check("\r", EscapeError::BareCarriageReturn);
// A valid first character followed by trailing input.
410 check("spam", EscapeError::MoreThanOneChar);
411 check(r"\x0ff", EscapeError::MoreThanOneChar);
412 check(r#"\"a"#, EscapeError::MoreThanOneChar);
413 check(r"\na", EscapeError::MoreThanOneChar);
414 check(r"\ra", EscapeError::MoreThanOneChar);
415 check(r"\ta", EscapeError::MoreThanOneChar);
416 check(r"\\a", EscapeError::MoreThanOneChar);
417 check(r"\'a", EscapeError::MoreThanOneChar);
418 check(r"\0a", EscapeError::MoreThanOneChar);
// Unknown escape characters, including non-ASCII ones.
420 check(r"\v", EscapeError::InvalidEscape);
421 check(r"\💩", EscapeError::InvalidEscape);
422 check(r"\●", EscapeError::InvalidEscape);
// Hex escapes: too short or containing non-hex digits.
424 check(r"\x", EscapeError::TooShortHexEscape);
425 check(r"\x0", EscapeError::TooShortHexEscape);
426 check(r"\xa", EscapeError::TooShortHexEscape);
427 check(r"\xf", EscapeError::TooShortHexEscape);
428 check(r"\xx", EscapeError::InvalidCharInHexEscape);
429 check(r"\xы", EscapeError::InvalidCharInHexEscape);
430 check(r"\x🦀", EscapeError::InvalidCharInHexEscape);
431 check(r"\xtt", EscapeError::InvalidCharInHexEscape);
// Syntactically broken unicode escapes report the syntax error, even in
// byte mode.
433 check(r"\u", EscapeError::NoBraceInUnicodeEscape);
434 check(r"\u[0123]", EscapeError::NoBraceInUnicodeEscape);
435 check(r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape);
436 check(r"\u{", EscapeError::UnclosedUnicodeEscape);
437 check(r"\u{0000", EscapeError::UnclosedUnicodeEscape);
438 check(r"\u{}", EscapeError::EmptyUnicodeEscape);
439 check(r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape);
440 check(r"\u{0000000}", EscapeError::OverlongUnicodeEscape);
// Non-ASCII characters cannot appear in a byte literal.
442 check("ы", EscapeError::NonAsciiCharInByte);
443 check("🦀", EscapeError::NonAsciiCharInByte);
// Syntactically complete unicode escapes are rejected in byte mode whatever
// their value, and before any trailing input is looked at.
445 check(r"\u{0}", EscapeError::UnicodeEscapeInByte);
446 check(r"\u{000000}", EscapeError::UnicodeEscapeInByte);
447 check(r"\u{41}", EscapeError::UnicodeEscapeInByte);
448 check(r"\u{0041}", EscapeError::UnicodeEscapeInByte);
449 check(r"\u{00_41}", EscapeError::UnicodeEscapeInByte);
450 check(r"\u{4__1__}", EscapeError::UnicodeEscapeInByte);
451 check(r"\u{1F63b}", EscapeError::UnicodeEscapeInByte);
452 check(r"\u{0}x", EscapeError::UnicodeEscapeInByte);
453 check(r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte);
454 check(r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte);
455 check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
// NOTE(review): the next check repeats the previous one verbatim.
456 check(r"\u{ffffff}", EscapeError::UnicodeEscapeInByte);
457 check(r"\u{DC00}", EscapeError::UnicodeEscapeInByte);
458 check(r"\u{DDDD}", EscapeError::UnicodeEscapeInByte);
459 check(r"\u{DFFF}", EscapeError::UnicodeEscapeInByte);
460 check(r"\u{D800}", EscapeError::UnicodeEscapeInByte);
461 check(r"\u{DAAA}", EscapeError::UnicodeEscapeInByte);
462 check(r"\u{DBFF}", EscapeError::UnicodeEscapeInByte);
// Well-formed byte literals and the bytes they must decode to.
466 fn test_unescape_byte_good() {
467 fn check(literal_text: &str, expected_byte: u8) {
468 let actual_result = unescape_byte(literal_text);
469 assert_eq!(actual_result, Ok(expected_byte));
// An escaped double quote is accepted inside a byte literal.
474 check(r#"\""#, b'"');
// Hex escapes; the case of the hex digits does not matter.
482 check(r"\x00", b'\0');
483 check(r"\x5a", b'Z');
484 check(r"\x5A", b'Z');
// Well-formed byte string contents and the bytes they must decode to.
492 fn test_unescape_byte_str_good() {
// `buf` holds the accumulated output, or the first error with its range.
493 fn check(literal_text: &str, expected: &[u8]) {
494 let mut buf = Ok(Vec::with_capacity(literal_text.len()));
495 unescape_byte_str(literal_text, &mut |range, c| {
496 if let Ok(b) = &mut buf {
499 Err(e) => buf = Err((range, e)),
// Borrow the result as a byte slice so it can be compared with `expected`.
503 let buf = buf.as_ref().map(|it| it.as_ref());
504 assert_eq!(buf, Ok(expected))
507 check("foo", b"foo");
// Unescaped whitespace is allowed; `\r\n` comes out as `\n`.
509 check(" \t\n\r\n", b" \t\n\n");
// A backslash before a line break removes the break and the whitespace
// that follows it.
511 check("hello \\\n world", b"hello world");
512 check("hello \\\r\n world", b"hello world");
// A single quote needs no escaping inside a byte string.
513 check("thread's", b"thread's")