]> git.lizzy.rs Git - rust.git/blob - src/libsyntax/parse/unescape.rs
comments
[rust.git] / src / libsyntax / parse / unescape.rs
//! Utilities for validating string and char literals and turning them into
//! the values they represent.
3
4 use std::str::Chars;
5 use std::ops::Range;
6
/// Reasons why the contents of a char, byte, string or byte string literal
/// could not be unescaped.
#[derive(Debug, PartialEq, Eq)]
pub(crate) enum EscapeError {
    /// A char/byte literal is empty: exactly one character was expected.
    ZeroChars,
    /// A char/byte literal still has input left after its first character.
    MoreThanOneChar,

    /// A backslash is the last character of the input.
    LoneSlash,
    /// A backslash is followed by a character that starts no known escape.
    InvalidEscape,
    /// A raw `\r` that is not immediately followed by `\n`.
    BareCarriageReturn,
    /// A character that may only appear in escaped form was written raw:
    /// tab, newline, `\r\n`, or the quote that delimits the current mode.
    EscapeOnlyChar,

    /// The input ended before both digits of a `\x` escape were read.
    TooShortHexEscape,
    /// One of the two characters after `\x` is not a hexadecimal digit.
    InvalidCharInHexEscape,
    /// A `\x` escape above `0x7F` in a non-byte (char/str) literal.
    OutOfRangeHexEscape,

    /// `\u` is not immediately followed by `{`.
    NoBraceInUnicodeEscape,
    /// A character inside `\u{...}` is neither a hexadecimal digit nor `_`.
    InvalidCharInUnicodeEscape,
    /// `\u{}`: the braces contain nothing.
    EmptyUnicodeEscape,
    /// The input ended before the closing `}` of a `\u{...}` escape.
    UnclosedUnicodeEscape,
    /// The first character inside `\u{...}` is `_`.
    LeadingUnderscoreUnicodeEscape,
    /// A `\u{...}` escape contains more than six hexadecimal digits.
    OverlongUnicodeEscape,
    /// The `\u{...}` value does not exceed `0x10FFFF` but is still not a
    /// valid `char` (i.e. it lies in the surrogate range).
    LoneSurrogateUnicodeEscape,
    /// The `\u{...}` value is greater than `0x10FFFF`.
    OutOfRangeUnicodeEscape,

    /// A well-formed `\u{...}` escape was used in a byte or byte string literal.
    UnicodeEscapeInByte,
    /// A raw non-ASCII character was used in a byte or byte string literal.
    NonAsciiCharInByte,
}
33
34 /// Takes a contents of a char literal (without quotes), and returns an
35 /// unescaped char or an error
36 pub(crate) fn unescape_char(literal_text: &str) -> Result<char, (usize, EscapeError)> {
37     let mut chars = literal_text.chars();
38     unescape_char_or_byte(&mut chars, Mode::Char)
39         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
40 }
41
42 /// Takes a contents of a string literal (without quotes) and produces a
43 /// sequence of escaped characters or errors.
44 pub(crate) fn unescape_str<F>(literal_text: &str, callback: &mut F)
45 where
46     F: FnMut(Range<usize>, Result<char, EscapeError>),
47 {
48     unescape_str_or_byte_str(literal_text, Mode::Str, callback)
49 }
50
51 pub(crate) fn unescape_byte(literal_text: &str) -> Result<u8, (usize, EscapeError)> {
52     let mut chars = literal_text.chars();
53     unescape_char_or_byte(&mut chars, Mode::Byte)
54         .map(byte_from_char)
55         .map_err(|err| (literal_text.len() - chars.as_str().len(), err))
56 }
57
58 /// Takes a contents of a string literal (without quotes) and produces a
59 /// sequence of escaped characters or errors.
60 pub(crate) fn unescape_byte_str<F>(literal_text: &str, callback: &mut F)
61 where
62     F: FnMut(Range<usize>, Result<u8, EscapeError>),
63 {
64     unescape_str_or_byte_str(literal_text, Mode::ByteStr, &mut |range, char| {
65         callback(range, char.map(byte_from_char))
66     })
67 }
68
/// The kind of literal whose contents are being unescaped.
#[derive(Debug, Clone, Copy)]
pub(crate) enum Mode {
    /// A char literal.
    Char,
    /// A string literal.
    Str,
    /// A byte literal.
    Byte,
    /// A byte string literal.
    ByteStr,
}

impl Mode {
    /// `true` for the single-character literal kinds (`Char` and `Byte`).
    fn in_single_quotes(self) -> bool {
        !self.in_double_quotes()
    }

    /// `true` for the string-like literal kinds (`Str` and `ByteStr`).
    pub(crate) fn in_double_quotes(self) -> bool {
        match self {
            Mode::Str | Mode::ByteStr => true,
            Mode::Char | Mode::Byte => false,
        }
    }

    /// `true` for the literal kinds that produce bytes (`Byte` and `ByteStr`).
    pub(crate) fn is_bytes(self) -> bool {
        match self {
            Mode::Char | Mode::Str => false,
            Mode::Byte | Mode::ByteStr => true,
        }
    }
}
96
97
98 fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
99     if first_char != '\\' {
100         return match first_char {
101             '\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
102             '\r' => Err(if chars.clone().next() == Some('\n') {
103                 EscapeError::EscapeOnlyChar
104             } else {
105                 EscapeError::BareCarriageReturn
106             }),
107             '\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
108             '"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
109             _ => {
110                 if mode.is_bytes() && !first_char.is_ascii() {
111                     return Err(EscapeError::NonAsciiCharInByte);
112                 }
113                 Ok(first_char)
114             }
115         };
116     }
117
118     let second_char = chars.next().ok_or(EscapeError::LoneSlash)?;
119
120     let res = match second_char {
121         '"' => '"',
122         'n' => '\n',
123         'r' => '\r',
124         't' => '\t',
125         '\\' => '\\',
126         '\'' => '\'',
127         '0' => '\0',
128
129         'x' => {
130             let hi = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
131             let hi = hi.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
132
133             let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
134             let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
135
136             let value = hi * 16 + lo;
137
138             if !mode.is_bytes() && !is_ascii(value) {
139                 return Err(EscapeError::OutOfRangeHexEscape);
140             }
141             let value = value as u8;
142
143             value as char
144         }
145
146         'u' => {
147             if chars.next() != Some('{') {
148                 return Err(EscapeError::NoBraceInUnicodeEscape);
149             }
150
151             let mut n_digits = 1;
152             let mut value: u32 = match chars.next().ok_or(EscapeError::UnclosedUnicodeEscape)? {
153                 '_' => return Err(EscapeError::LeadingUnderscoreUnicodeEscape),
154                 '}' => return Err(EscapeError::EmptyUnicodeEscape),
155                 c => c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?,
156             };
157
158             loop {
159                 match chars.next() {
160                     None => return Err(EscapeError::UnclosedUnicodeEscape),
161                     Some('_') => continue,
162                     Some('}') => {
163                         if n_digits > 6 {
164                             return Err(EscapeError::OverlongUnicodeEscape);
165                         }
166                         if mode.is_bytes() {
167                             return Err(EscapeError::UnicodeEscapeInByte);
168                         }
169
170                         break std::char::from_u32(value).ok_or_else(|| {
171                             if value > 0x10FFFF {
172                                 EscapeError::OutOfRangeUnicodeEscape
173                             } else {
174                                 EscapeError::LoneSurrogateUnicodeEscape
175                             }
176                         })?;
177                     }
178                     Some(c) => {
179                         let digit = c.to_digit(16).ok_or(EscapeError::InvalidCharInUnicodeEscape)?;
180                         n_digits += 1;
181                         if n_digits > 6 {
182                             continue;
183                         }
184                         let digit = digit as u32;
185                         value = value * 16 + digit;
186                     }
187                 };
188             }
189         }
190         _ => return Err(EscapeError::InvalidEscape),
191     };
192     Ok(res)
193 }
194
195 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
196     let first_char = chars.next().ok_or(EscapeError::ZeroChars)?;
197     let res = scan_escape(first_char, chars, mode)?;
198     if chars.next().is_some() {
199         return Err(EscapeError::MoreThanOneChar);
200     }
201     Ok(res)
202 }
203
204 /// Takes a contents of a string literal (without quotes) and produces a
205 /// sequence of escaped characters or errors.
206 fn unescape_str_or_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
207 where
208     F: FnMut(Range<usize>, Result<char, EscapeError>),
209 {
210     assert!(mode.in_double_quotes());
211     let initial_len = src.len();
212     let mut chars = src.chars();
213     while let Some(first_char) = chars.next() {
214         let start = initial_len - chars.as_str().len() - first_char.len_utf8();
215
216         let unescaped_char = match first_char {
217             '\\' => {
218                 let (second_char, third_char) = {
219                     let mut chars = chars.clone();
220                     (chars.next(), chars.next())
221                 };
222                 match (second_char, third_char) {
223                     (Some('\n'), _) | (Some('\r'), Some('\n')) => {
224                         skip_ascii_whitespace(&mut chars);
225                         continue;
226                     }
227                     _ => scan_escape(first_char, &mut chars, mode),
228                 }
229             }
230             '\r' => {
231                 let second_char = chars.clone().next();
232                 if second_char == Some('\n') {
233                     chars.next();
234                     Ok('\n')
235                 } else {
236                     scan_escape(first_char, &mut chars, mode)
237                 }
238             }
239             '\n' => Ok('\n'),
240             '\t' => Ok('\t'),
241             _ => scan_escape(first_char, &mut chars, mode),
242         };
243         let end = initial_len - chars.as_str().len();
244         callback(start..end, unescaped_char);
245     }
246
247     fn skip_ascii_whitespace(chars: &mut Chars<'_>) {
248         let str = chars.as_str();
249         let first_non_space = str
250             .bytes()
251             .position(|b| b != b' ' && b != b'\t' && b != b'\n' && b != b'\r')
252             .unwrap_or(str.len());
253         *chars = str[first_non_space..].chars()
254     }
255 }
256
/// Narrows a `char` to the byte with the same numeric value.
///
/// Panics if the value of `c` does not fit in a `u8`.
fn byte_from_char(c: char) -> u8 {
    let code = u32::from(c);
    assert!(code <= u32::from(u8::max_value()), "guaranteed because of Mode::Byte");
    code as u8
}
262
/// Returns `true` when `x` lies in the ASCII range (`0..=0x7F`).
fn is_ascii(x: u32) -> bool {
    x < 0x80
}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// Malformed char literal bodies are rejected with the expected error.
    #[test]
    fn test_unescape_char_bad() {
        let cases = vec![
            ("", EscapeError::ZeroChars),
            (r"\", EscapeError::LoneSlash),
            //
            ("\n", EscapeError::EscapeOnlyChar),
            ("\r\n", EscapeError::EscapeOnlyChar),
            ("\t", EscapeError::EscapeOnlyChar),
            ("'", EscapeError::EscapeOnlyChar),
            ("\r", EscapeError::BareCarriageReturn),
            //
            ("spam", EscapeError::MoreThanOneChar),
            (r"\x0ff", EscapeError::MoreThanOneChar),
            (r#"\"a"#, EscapeError::MoreThanOneChar),
            (r"\na", EscapeError::MoreThanOneChar),
            (r"\ra", EscapeError::MoreThanOneChar),
            (r"\ta", EscapeError::MoreThanOneChar),
            (r"\\a", EscapeError::MoreThanOneChar),
            (r"\'a", EscapeError::MoreThanOneChar),
            (r"\0a", EscapeError::MoreThanOneChar),
            (r"\u{0}x", EscapeError::MoreThanOneChar),
            (r"\u{1F63b}}", EscapeError::MoreThanOneChar),
            //
            (r"\v", EscapeError::InvalidEscape),
            (r"\💩", EscapeError::InvalidEscape),
            (r"\●", EscapeError::InvalidEscape),
            //
            (r"\x", EscapeError::TooShortHexEscape),
            (r"\x0", EscapeError::TooShortHexEscape),
            (r"\xf", EscapeError::TooShortHexEscape),
            (r"\xa", EscapeError::TooShortHexEscape),
            (r"\xx", EscapeError::InvalidCharInHexEscape),
            (r"\xы", EscapeError::InvalidCharInHexEscape),
            (r"\x🦀", EscapeError::InvalidCharInHexEscape),
            (r"\xtt", EscapeError::InvalidCharInHexEscape),
            (r"\xff", EscapeError::OutOfRangeHexEscape),
            (r"\xFF", EscapeError::OutOfRangeHexEscape),
            (r"\x80", EscapeError::OutOfRangeHexEscape),
            //
            (r"\u", EscapeError::NoBraceInUnicodeEscape),
            (r"\u[0123]", EscapeError::NoBraceInUnicodeEscape),
            (r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape),
            (r"\u{", EscapeError::UnclosedUnicodeEscape),
            (r"\u{0000", EscapeError::UnclosedUnicodeEscape),
            (r"\u{}", EscapeError::EmptyUnicodeEscape),
            (r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape),
            (r"\u{0000000}", EscapeError::OverlongUnicodeEscape),
            (r"\u{FFFFFF}", EscapeError::OutOfRangeUnicodeEscape),
            (r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape),
            (r"\u{ffffff}", EscapeError::OutOfRangeUnicodeEscape),
            //
            (r"\u{DC00}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DDDD}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DFFF}", EscapeError::LoneSurrogateUnicodeEscape),
            //
            (r"\u{D800}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DAAA}", EscapeError::LoneSurrogateUnicodeEscape),
            (r"\u{DBFF}", EscapeError::LoneSurrogateUnicodeEscape),
        ];

        for (literal_text, expected_error) in cases {
            // Only the error kind is checked here, not the reported offset.
            let actual_result = unescape_char(literal_text).map_err(|(_offset, err)| err);
            assert_eq!(actual_result, Err(expected_error));
        }
    }

    /// Well-formed char literal bodies unescape to the expected `char`.
    #[test]
    fn test_unescape_char_good() {
        let cases = vec![
            ("a", 'a'),
            ("ы", 'ы'),
            ("🦀", '🦀'),
            //
            (r#"\""#, '"'),
            (r"\n", '\n'),
            (r"\r", '\r'),
            (r"\t", '\t'),
            (r"\\", '\\'),
            (r"\'", '\''),
            (r"\0", '\0'),
            //
            (r"\x00", '\0'),
            (r"\x5a", 'Z'),
            (r"\x5A", 'Z'),
            (r"\x7f", 127 as char),
            //
            (r"\u{0}", '\0'),
            (r"\u{000000}", '\0'),
            (r"\u{41}", 'A'),
            (r"\u{0041}", 'A'),
            (r"\u{00_41}", 'A'),
            (r"\u{4__1__}", 'A'),
            (r"\u{1F63b}", '😻'),
        ];

        for (literal_text, expected_char) in cases {
            assert_eq!(unescape_char(literal_text), Ok(expected_char));
        }
    }

    /// Well-formed string literal bodies unescape to the expected text.
    #[test]
    fn test_unescape_str_good() {
        fn check(literal_text: &str, expected: &str) {
            // Collect characters until the first error, which is remembered
            // together with the range it was reported for.
            let mut unescaped = String::with_capacity(literal_text.len());
            let mut first_error = None;
            unescape_str(literal_text, &mut |range, c| {
                if first_error.is_none() {
                    match c {
                        Ok(c) => unescaped.push(c),
                        Err(e) => first_error = Some((range, e)),
                    }
                }
            });
            let outcome = match first_error {
                Some(failure) => Err(failure),
                None => Ok(unescaped.as_str()),
            };
            assert_eq!(outcome, Ok(expected))
        }

        let cases = [
            ("foo", "foo"),
            ("", ""),
            (" \t\n\r\n", " \t\n\n"),
            //
            ("hello \\\n     world", "hello world"),
            ("hello \\\r\n     world", "hello world"),
            ("thread's", "thread's"),
        ];
        for &(literal_text, expected) in cases.iter() {
            check(literal_text, expected);
        }
    }

    /// Malformed byte literal bodies are rejected with the expected error.
    #[test]
    fn test_unescape_byte_bad() {
        let cases = vec![
            ("", EscapeError::ZeroChars),
            (r"\", EscapeError::LoneSlash),
            //
            ("\n", EscapeError::EscapeOnlyChar),
            ("\r\n", EscapeError::EscapeOnlyChar),
            ("\t", EscapeError::EscapeOnlyChar),
            ("'", EscapeError::EscapeOnlyChar),
            ("\r", EscapeError::BareCarriageReturn),
            //
            ("spam", EscapeError::MoreThanOneChar),
            (r"\x0ff", EscapeError::MoreThanOneChar),
            (r#"\"a"#, EscapeError::MoreThanOneChar),
            (r"\na", EscapeError::MoreThanOneChar),
            (r"\ra", EscapeError::MoreThanOneChar),
            (r"\ta", EscapeError::MoreThanOneChar),
            (r"\\a", EscapeError::MoreThanOneChar),
            (r"\'a", EscapeError::MoreThanOneChar),
            (r"\0a", EscapeError::MoreThanOneChar),
            //
            (r"\v", EscapeError::InvalidEscape),
            (r"\💩", EscapeError::InvalidEscape),
            (r"\●", EscapeError::InvalidEscape),
            //
            (r"\x", EscapeError::TooShortHexEscape),
            (r"\x0", EscapeError::TooShortHexEscape),
            (r"\xa", EscapeError::TooShortHexEscape),
            (r"\xf", EscapeError::TooShortHexEscape),
            (r"\xx", EscapeError::InvalidCharInHexEscape),
            (r"\xы", EscapeError::InvalidCharInHexEscape),
            (r"\x🦀", EscapeError::InvalidCharInHexEscape),
            (r"\xtt", EscapeError::InvalidCharInHexEscape),
            //
            (r"\u", EscapeError::NoBraceInUnicodeEscape),
            (r"\u[0123]", EscapeError::NoBraceInUnicodeEscape),
            (r"\u{0x}", EscapeError::InvalidCharInUnicodeEscape),
            (r"\u{", EscapeError::UnclosedUnicodeEscape),
            (r"\u{0000", EscapeError::UnclosedUnicodeEscape),
            (r"\u{}", EscapeError::EmptyUnicodeEscape),
            (r"\u{_0000}", EscapeError::LeadingUnderscoreUnicodeEscape),
            (r"\u{0000000}", EscapeError::OverlongUnicodeEscape),
            //
            ("ы", EscapeError::NonAsciiCharInByte),
            ("🦀", EscapeError::NonAsciiCharInByte),
            //
            (r"\u{0}", EscapeError::UnicodeEscapeInByte),
            (r"\u{000000}", EscapeError::UnicodeEscapeInByte),
            (r"\u{41}", EscapeError::UnicodeEscapeInByte),
            (r"\u{0041}", EscapeError::UnicodeEscapeInByte),
            (r"\u{00_41}", EscapeError::UnicodeEscapeInByte),
            (r"\u{4__1__}", EscapeError::UnicodeEscapeInByte),
            (r"\u{1F63b}", EscapeError::UnicodeEscapeInByte),
            (r"\u{0}x", EscapeError::UnicodeEscapeInByte),
            (r"\u{1F63b}}", EscapeError::UnicodeEscapeInByte),
            (r"\u{FFFFFF}", EscapeError::UnicodeEscapeInByte),
            (r"\u{ffffff}", EscapeError::UnicodeEscapeInByte),
            (r"\u{ffffff}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DC00}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DDDD}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DFFF}", EscapeError::UnicodeEscapeInByte),
            (r"\u{D800}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DAAA}", EscapeError::UnicodeEscapeInByte),
            (r"\u{DBFF}", EscapeError::UnicodeEscapeInByte),
        ];

        for (literal_text, expected_error) in cases {
            // Only the error kind is checked here, not the reported offset.
            let actual_result = unescape_byte(literal_text).map_err(|(_offset, err)| err);
            assert_eq!(actual_result, Err(expected_error));
        }
    }

    /// Well-formed byte literal bodies unescape to the expected byte.
    #[test]
    fn test_unescape_byte_good() {
        let cases = vec![
            ("a", b'a'),
            //
            (r#"\""#, b'"'),
            (r"\n", b'\n'),
            (r"\r", b'\r'),
            (r"\t", b'\t'),
            (r"\\", b'\\'),
            (r"\'", b'\''),
            (r"\0", b'\0'),
            //
            (r"\x00", b'\0'),
            (r"\x5a", b'Z'),
            (r"\x5A", b'Z'),
            (r"\x7f", 127),
            (r"\x80", 128),
            (r"\xff", 255),
            (r"\xFF", 255),
        ];

        for (literal_text, expected_byte) in cases {
            assert_eq!(unescape_byte(literal_text), Ok(expected_byte));
        }
    }

    /// Well-formed byte string literal bodies unescape to the expected bytes.
    #[test]
    fn test_unescape_byte_str_good() {
        fn check(literal_text: &str, expected: &[u8]) {
            // Collect bytes until the first error, which is remembered
            // together with the range it was reported for.
            let mut bytes = Vec::with_capacity(literal_text.len());
            let mut first_error = None;
            unescape_byte_str(literal_text, &mut |range, c| {
                if first_error.is_none() {
                    match c {
                        Ok(c) => bytes.push(c),
                        Err(e) => first_error = Some((range, e)),
                    }
                }
            });
            let outcome = match first_error {
                Some(failure) => Err(failure),
                None => Ok(&bytes[..]),
            };
            assert_eq!(outcome, Ok(expected))
        }

        check("foo", b"foo");
        check("", b"");
        check(" \t\n\r\n", b" \t\n\n");

        check("hello \\\n     world", b"hello world");
        check("hello \\\r\n     world", b"hello world");
        check("thread's", b"thread's")
    }
}