git.lizzy.rs Git - rust.git/blob - src/libcore/str/lossy.rs
Rollup merge of #63055 - Mark-Simulacrum:save-analysis-clean-2, r=Xanewok
[rust.git] / src / libcore / str / lossy.rs
1 use crate::char;
2 use crate::str as core_str;
3 use crate::fmt::{self, Write};
4 use crate::mem;
5
6 /// Lossy UTF-8 string.
7 #[unstable(feature = "str_internals", issue = "0")]
8 pub struct Utf8Lossy {
9     bytes: [u8]
10 }
11
12 impl Utf8Lossy {
13     pub fn from_str(s: &str) -> &Utf8Lossy {
14         Utf8Lossy::from_bytes(s.as_bytes())
15     }
16
17     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
18         unsafe { mem::transmute(bytes) }
19     }
20
21     pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
22         Utf8LossyChunksIter { source: &self.bytes }
23     }
24 }
25
26
27 /// Iterator over lossy UTF-8 string
28 #[unstable(feature = "str_internals", issue = "0")]
29 #[allow(missing_debug_implementations)]
30 pub struct Utf8LossyChunksIter<'a> {
31     source: &'a [u8],
32 }
33
34 #[unstable(feature = "str_internals", issue = "0")]
35 #[derive(PartialEq, Eq, Debug)]
36 pub struct Utf8LossyChunk<'a> {
37     /// Sequence of valid chars.
38     /// Can be empty between broken UTF-8 chars.
39     pub valid: &'a str,
40     /// Single broken char, empty if none.
41     /// Empty iff iterator item is last.
42     pub broken: &'a [u8],
43 }
44
45 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
46     type Item = Utf8LossyChunk<'a>;
47
48     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
49         if self.source.is_empty() {
50             return None;
51         }
52
53         const TAG_CONT_U8: u8 = 128;
54         fn safe_get(xs: &[u8], i: usize) -> u8 {
55             *xs.get(i).unwrap_or(&0)
56         }
57
58         let mut i = 0;
59         while i < self.source.len() {
60             let i_ = i;
61
62             let byte = unsafe { *self.source.get_unchecked(i) };
63             i += 1;
64
65             if byte < 128 {
66
67             } else {
68                 let w = core_str::utf8_char_width(byte);
69
70                 macro_rules! error { () => ({
71                     unsafe {
72                         let r = Utf8LossyChunk {
73                             valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
74                             broken: &self.source[i_..i],
75                         };
76                         self.source = &self.source[i..];
77                         return Some(r);
78                     }
79                 })}
80
81                 match w {
82                     2 => {
83                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
84                             error!();
85                         }
86                         i += 1;
87                     }
88                     3 => {
89                         match (byte, safe_get(self.source, i)) {
90                             (0xE0, 0xA0 ..= 0xBF) => (),
91                             (0xE1 ..= 0xEC, 0x80 ..= 0xBF) => (),
92                             (0xED, 0x80 ..= 0x9F) => (),
93                             (0xEE ..= 0xEF, 0x80 ..= 0xBF) => (),
94                             _ => {
95                                 error!();
96                             }
97                         }
98                         i += 1;
99                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
100                             error!();
101                         }
102                         i += 1;
103                     }
104                     4 => {
105                         match (byte, safe_get(self.source, i)) {
106                             (0xF0, 0x90 ..= 0xBF) => (),
107                             (0xF1 ..= 0xF3, 0x80 ..= 0xBF) => (),
108                             (0xF4, 0x80 ..= 0x8F) => (),
109                             _ => {
110                                 error!();
111                             }
112                         }
113                         i += 1;
114                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
115                             error!();
116                         }
117                         i += 1;
118                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
119                             error!();
120                         }
121                         i += 1;
122                     }
123                     _ => {
124                         error!();
125                     }
126                 }
127             }
128         }
129
130         let r = Utf8LossyChunk {
131             valid: unsafe { core_str::from_utf8_unchecked(self.source) },
132             broken: &[],
133         };
134         self.source = &[];
135         Some(r)
136     }
137 }
138
139
140 impl fmt::Display for Utf8Lossy {
141     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
142         // If we're the empty string then our iterator won't actually yield
143         // anything, so perform the formatting manually
144         if self.bytes.is_empty() {
145             return "".fmt(f)
146         }
147
148         for Utf8LossyChunk { valid, broken } in self.chunks() {
149             // If we successfully decoded the whole chunk as a valid string then
150             // we can return a direct formatting of the string which will also
151             // respect various formatting flags if possible.
152             if valid.len() == self.bytes.len() {
153                 assert!(broken.is_empty());
154                 return valid.fmt(f)
155             }
156
157             f.write_str(valid)?;
158             if !broken.is_empty() {
159                 f.write_char(char::REPLACEMENT_CHARACTER)?;
160             }
161         }
162         Ok(())
163     }
164 }
165
166 impl fmt::Debug for Utf8Lossy {
167     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
168         f.write_char('"')?;
169
170         for Utf8LossyChunk { valid, broken } in self.chunks() {
171
172             // Valid part.
173             // Here we partially parse UTF-8 again which is suboptimal.
174             {
175                 let mut from = 0;
176                 for (i, c) in valid.char_indices() {
177                     let esc = c.escape_debug();
178                     // If char needs escaping, flush backlog so far and write, else skip
179                     if esc.len() != 1 {
180                         f.write_str(&valid[from..i])?;
181                         for c in esc {
182                             f.write_char(c)?;
183                         }
184                         from = i + c.len_utf8();
185                     }
186                 }
187                 f.write_str(&valid[from..])?;
188             }
189
190             // Broken parts of string as hex escape.
191             for &b in broken {
192                 write!(f, "\\x{:02x}", b)?;
193             }
194         }
195
196         f.write_char('"')
197     }
198 }