git.lizzy.rs Git - rust.git/blob - library/core/src/str/lossy.rs
Merge commit 'e8dca3e87d164d2806098c462c6ce41301341f68' into sync_from_cg_gcc
[rust.git] / library / core / src / str / lossy.rs
1 use crate::char;
2 use crate::fmt::{self, Write};
3 use crate::mem;
4
5 use super::from_utf8_unchecked;
6 use super::validations::utf8_char_width;
7
8 /// Lossy UTF-8 string.
9 #[unstable(feature = "str_internals", issue = "none")]
10 pub struct Utf8Lossy {
11     bytes: [u8],
12 }
13
14 impl Utf8Lossy {
15     #[must_use]
16     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
17         // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
18         unsafe { mem::transmute(bytes) }
19     }
20
21     pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
22         Utf8LossyChunksIter { source: &self.bytes }
23     }
24 }
25
/// Iterator over the chunks of a lossy UTF-8 string.
///
/// Every call to `next` consumes a prefix of the remaining bytes and returns
/// it as a [`Utf8LossyChunk`]; once no bytes are left the iterator yields
/// `None`.
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "str_internals", issue = "none")]
#[allow(missing_debug_implementations)]
pub struct Utf8LossyChunksIter<'a> {
    /// Bytes that have not yet been returned as part of a chunk.
    source: &'a [u8],
}
33
/// A single item yielded by [`Utf8LossyChunksIter`]: a run of valid UTF-8
/// followed by the invalid byte sequence (if any) that ended the run.
#[unstable(feature = "str_internals", issue = "none")]
#[derive(PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
    /// Sequence of valid chars.
    /// Can be empty between broken UTF-8 chars.
    pub valid: &'a str,
    /// Bytes of a single broken char, empty if none.
    /// An empty slice means the input ended right after `valid`, so this
    /// chunk is the last one the iterator yields. The converse does not
    /// hold: the last chunk carries a non-empty slice when the input ends
    /// with an invalid sequence.
    pub broken: &'a [u8],
}
44
45 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
46     type Item = Utf8LossyChunk<'a>;
47
48     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
49         if self.source.is_empty() {
50             return None;
51         }
52
53         const TAG_CONT_U8: u8 = 128;
54         fn safe_get(xs: &[u8], i: usize) -> u8 {
55             *xs.get(i).unwrap_or(&0)
56         }
57
58         let mut i = 0;
59         let mut valid_up_to = 0;
60         while i < self.source.len() {
61             // SAFETY: `i < self.source.len()` per previous line.
62             // For some reason the following are both significantly slower:
63             // while let Some(&byte) = self.source.get(i) {
64             // while let Some(byte) = self.source.get(i).copied() {
65             let byte = unsafe { *self.source.get_unchecked(i) };
66             i += 1;
67
68             if byte < 128 {
69                 // This could be a `1 => ...` case in the match below, but for
70                 // the common case of all-ASCII inputs, we bypass loading the
71                 // sizeable UTF8_CHAR_WIDTH table into cache.
72             } else {
73                 let w = utf8_char_width(byte);
74
75                 match w {
76                     2 => {
77                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
78                             break;
79                         }
80                         i += 1;
81                     }
82                     3 => {
83                         match (byte, safe_get(self.source, i)) {
84                             (0xE0, 0xA0..=0xBF) => (),
85                             (0xE1..=0xEC, 0x80..=0xBF) => (),
86                             (0xED, 0x80..=0x9F) => (),
87                             (0xEE..=0xEF, 0x80..=0xBF) => (),
88                             _ => break,
89                         }
90                         i += 1;
91                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
92                             break;
93                         }
94                         i += 1;
95                     }
96                     4 => {
97                         match (byte, safe_get(self.source, i)) {
98                             (0xF0, 0x90..=0xBF) => (),
99                             (0xF1..=0xF3, 0x80..=0xBF) => (),
100                             (0xF4, 0x80..=0x8F) => (),
101                             _ => break,
102                         }
103                         i += 1;
104                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
105                             break;
106                         }
107                         i += 1;
108                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
109                             break;
110                         }
111                         i += 1;
112                     }
113                     _ => break,
114                 }
115             }
116
117             valid_up_to = i;
118         }
119
120         // SAFETY: `i <= self.source.len()` because it is only ever incremented
121         // via `i += 1` and in between every single one of those increments, `i`
122         // is compared against `self.source.len()`. That happens either
123         // literally by `i < self.source.len()` in the while-loop's condition,
124         // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
125         // loop is terminated as soon as the latest `i += 1` has made `i` no
126         // longer less than `self.source.len()`, which means it'll be at most
127         // equal to `self.source.len()`.
128         let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
129         self.source = remaining;
130
131         // SAFETY: `valid_up_to <= i` because it is only ever assigned via
132         // `valid_up_to = i` and `i` only increases.
133         let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
134
135         Some(Utf8LossyChunk {
136             // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
137             valid: unsafe { from_utf8_unchecked(valid) },
138             broken,
139         })
140     }
141 }
142
143 impl fmt::Display for Utf8Lossy {
144     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
145         // If we're the empty string then our iterator won't actually yield
146         // anything, so perform the formatting manually
147         if self.bytes.is_empty() {
148             return "".fmt(f);
149         }
150
151         for Utf8LossyChunk { valid, broken } in self.chunks() {
152             // If we successfully decoded the whole chunk as a valid string then
153             // we can return a direct formatting of the string which will also
154             // respect various formatting flags if possible.
155             if valid.len() == self.bytes.len() {
156                 assert!(broken.is_empty());
157                 return valid.fmt(f);
158             }
159
160             f.write_str(valid)?;
161             if !broken.is_empty() {
162                 f.write_char(char::REPLACEMENT_CHARACTER)?;
163             }
164         }
165         Ok(())
166     }
167 }
168
169 impl fmt::Debug for Utf8Lossy {
170     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
171         f.write_char('"')?;
172
173         for Utf8LossyChunk { valid, broken } in self.chunks() {
174             // Valid part.
175             // Here we partially parse UTF-8 again which is suboptimal.
176             {
177                 let mut from = 0;
178                 for (i, c) in valid.char_indices() {
179                     let esc = c.escape_debug();
180                     // If char needs escaping, flush backlog so far and write, else skip
181                     if esc.len() != 1 {
182                         f.write_str(&valid[from..i])?;
183                         for c in esc {
184                             f.write_char(c)?;
185                         }
186                         from = i + c.len_utf8();
187                     }
188                 }
189                 f.write_str(&valid[from..])?;
190             }
191
192             // Broken parts of string as hex escape.
193             for &b in broken {
194                 write!(f, "\\x{:02x}", b)?;
195             }
196         }
197
198         f.write_char('"')
199     }
200 }