]> git.lizzy.rs Git - rust.git/blob - src/libcore/str/lossy.rs
Rollup merge of #69340 - Centril:self-ctor-normalize, r=nikomatsakis
[rust.git] / src / libcore / str / lossy.rs
1 use crate::char;
2 use crate::fmt::{self, Write};
3 use crate::mem;
4 use crate::str as core_str;
5
6 /// Lossy UTF-8 string.
7 #[unstable(feature = "str_internals", issue = "none")]
8 pub struct Utf8Lossy {
9     bytes: [u8],
10 }
11
12 impl Utf8Lossy {
13     pub fn from_str(s: &str) -> &Utf8Lossy {
14         Utf8Lossy::from_bytes(s.as_bytes())
15     }
16
17     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
18         // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
19         unsafe { mem::transmute(bytes) }
20     }
21
22     pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
23         Utf8LossyChunksIter { source: &self.bytes }
24     }
25 }
26
27 /// Iterator over lossy UTF-8 string
28 #[unstable(feature = "str_internals", issue = "none")]
29 #[allow(missing_debug_implementations)]
30 pub struct Utf8LossyChunksIter<'a> {
31     source: &'a [u8],
32 }
33
34 #[unstable(feature = "str_internals", issue = "none")]
35 #[derive(PartialEq, Eq, Debug)]
36 pub struct Utf8LossyChunk<'a> {
37     /// Sequence of valid chars.
38     /// Can be empty between broken UTF-8 chars.
39     pub valid: &'a str,
40     /// Single broken char, empty if none.
41     /// Empty iff iterator item is last.
42     pub broken: &'a [u8],
43 }
44
45 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
46     type Item = Utf8LossyChunk<'a>;
47
48     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
49         if self.source.is_empty() {
50             return None;
51         }
52
53         const TAG_CONT_U8: u8 = 128;
54         fn safe_get(xs: &[u8], i: usize) -> u8 {
55             *xs.get(i).unwrap_or(&0)
56         }
57
58         let mut i = 0;
59         while i < self.source.len() {
60             let i_ = i;
61
62             // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
63             // only increases, so `0 <= i < self.source.len()`.
64             let byte = unsafe { *self.source.get_unchecked(i) };
65             i += 1;
66
67             if byte < 128 {
68             } else {
69                 let w = core_str::utf8_char_width(byte);
70
71                 macro_rules! error {
72                     () => {{
73                         // SAFETY: We have checked up to `i` that source is valid UTF-8.
74                         unsafe {
75                             let r = Utf8LossyChunk {
76                                 valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
77                                 broken: &self.source[i_..i],
78                             };
79                             self.source = &self.source[i..];
80                             return Some(r);
81                         }
82                     }};
83                 }
84
85                 match w {
86                     2 => {
87                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
88                             error!();
89                         }
90                         i += 1;
91                     }
92                     3 => {
93                         match (byte, safe_get(self.source, i)) {
94                             (0xE0, 0xA0..=0xBF) => (),
95                             (0xE1..=0xEC, 0x80..=0xBF) => (),
96                             (0xED, 0x80..=0x9F) => (),
97                             (0xEE..=0xEF, 0x80..=0xBF) => (),
98                             _ => {
99                                 error!();
100                             }
101                         }
102                         i += 1;
103                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
104                             error!();
105                         }
106                         i += 1;
107                     }
108                     4 => {
109                         match (byte, safe_get(self.source, i)) {
110                             (0xF0, 0x90..=0xBF) => (),
111                             (0xF1..=0xF3, 0x80..=0xBF) => (),
112                             (0xF4, 0x80..=0x8F) => (),
113                             _ => {
114                                 error!();
115                             }
116                         }
117                         i += 1;
118                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
119                             error!();
120                         }
121                         i += 1;
122                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
123                             error!();
124                         }
125                         i += 1;
126                     }
127                     _ => {
128                         error!();
129                     }
130                 }
131             }
132         }
133
134         let r = Utf8LossyChunk {
135             // SAFETY: We have checked that the entire source is valid UTF-8.
136             valid: unsafe { core_str::from_utf8_unchecked(self.source) },
137             broken: &[],
138         };
139         self.source = &[];
140         Some(r)
141     }
142 }
143
144 impl fmt::Display for Utf8Lossy {
145     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
146         // If we're the empty string then our iterator won't actually yield
147         // anything, so perform the formatting manually
148         if self.bytes.is_empty() {
149             return "".fmt(f);
150         }
151
152         for Utf8LossyChunk { valid, broken } in self.chunks() {
153             // If we successfully decoded the whole chunk as a valid string then
154             // we can return a direct formatting of the string which will also
155             // respect various formatting flags if possible.
156             if valid.len() == self.bytes.len() {
157                 assert!(broken.is_empty());
158                 return valid.fmt(f);
159             }
160
161             f.write_str(valid)?;
162             if !broken.is_empty() {
163                 f.write_char(char::REPLACEMENT_CHARACTER)?;
164             }
165         }
166         Ok(())
167     }
168 }
169
170 impl fmt::Debug for Utf8Lossy {
171     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
172         f.write_char('"')?;
173
174         for Utf8LossyChunk { valid, broken } in self.chunks() {
175             // Valid part.
176             // Here we partially parse UTF-8 again which is suboptimal.
177             {
178                 let mut from = 0;
179                 for (i, c) in valid.char_indices() {
180                     let esc = c.escape_debug();
181                     // If char needs escaping, flush backlog so far and write, else skip
182                     if esc.len() != 1 {
183                         f.write_str(&valid[from..i])?;
184                         for c in esc {
185                             f.write_char(c)?;
186                         }
187                         from = i + c.len_utf8();
188                     }
189                 }
190                 f.write_str(&valid[from..])?;
191             }
192
193             // Broken parts of string as hex escape.
194             for &b in broken {
195                 write!(f, "\\x{:02x}", b)?;
196             }
197         }
198
199         f.write_char('"')
200     }
201 }