library/core/src/str/lossy.rs

   1 use crate::char;
   2 use crate::fmt::{self, Write};
   3 use crate::mem;
   4
   5 use super::from_utf8_unchecked;
   6 use super::validations::utf8_char_width;
   7
   8 /// Lossy UTF-8 string.
   9 #[unstable(feature = "str_internals", issue = "none")]
  10 pub struct Utf8Lossy {
  11     bytes: [u8],
  12 }
  13
  14 impl Utf8Lossy {
  15     #[must_use]
  16     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
  17         // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
  18         unsafe { mem::transmute(bytes) }
  19     }
  20
  21     pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
  22         Utf8LossyChunksIter { source: &self.bytes }
  23     }
  24 }
  25
  26 /// Iterator over lossy UTF-8 string
  27 #[must_use = "iterators are lazy and do nothing unless consumed"]
  28 #[unstable(feature = "str_internals", issue = "none")]
  29 #[allow(missing_debug_implementations)]
  30 pub struct Utf8LossyChunksIter<'a> {
  31     source: &'a [u8],
  32 }
  33
  34 #[unstable(feature = "str_internals", issue = "none")]
  35 #[derive(PartialEq, Eq, Debug)]
  36 pub struct Utf8LossyChunk<'a> {
  37     /// Sequence of valid chars.
  38     /// Can be empty between broken UTF-8 chars.
  39     pub valid: &'a str,
  40     /// Single broken char, empty if none.
  41     /// Empty iff iterator item is last.
  42     pub broken: &'a [u8],
  43 }
  44
  45 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
  46     type Item = Utf8LossyChunk<'a>;
  47
  48     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
  49         if self.source.is_empty() {
  50             return None;
  51         }
  52
  53         const TAG_CONT_U8: u8 = 128;
  54         fn safe_get(xs: &[u8], i: usize) -> u8 {
  55             *xs.get(i).unwrap_or(&0)
  56         }
  57
  58         let mut i = 0;
  59         let mut valid_up_to = 0;
  60         while i < self.source.len() {
  61             // SAFETY: `i < self.source.len()` per previous line.
  62             // For some reason the following are both significantly slower:
  63             // while let Some(&byte) = self.source.get(i) {
  64             // while let Some(byte) = self.source.get(i).copied() {
  65             let byte = unsafe { *self.source.get_unchecked(i) };
  66             i += 1;
  67
  68             if byte < 128 {
  69                 // This could be a `1 => ...` case in the match below, but for
  70                 // the common case of all-ASCII inputs, we bypass loading the
  71                 // sizeable UTF8_CHAR_WIDTH table into cache.
  72             } else {
  73                 let w = utf8_char_width(byte);
  74
  75                 match w {
  76                     2 => {
  77                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
  78                             break;
  79                         }
  80                         i += 1;
  81                     }
  82                     3 => {
  83                         match (byte, safe_get(self.source, i)) {
  84                             (0xE0, 0xA0..=0xBF) => (),
  85                             (0xE1..=0xEC, 0x80..=0xBF) => (),
  86                             (0xED, 0x80..=0x9F) => (),
  87                             (0xEE..=0xEF, 0x80..=0xBF) => (),
  88                             _ => break,
  89                         }
  90                         i += 1;
  91                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
  92                             break;
  93                         }
  94                         i += 1;
  95                     }
  96                     4 => {
  97                         match (byte, safe_get(self.source, i)) {
  98                             (0xF0, 0x90..=0xBF) => (),
  99                             (0xF1..=0xF3, 0x80..=0xBF) => (),
 100                             (0xF4, 0x80..=0x8F) => (),
 101                             _ => break,
 102                         }
 103                         i += 1;
 104                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 105                             break;
 106                         }
 107                         i += 1;
 108                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 109                             break;
 110                         }
 111                         i += 1;
 112                     }
 113                     _ => break,
 114                 }
 115             }
 116
 117             valid_up_to = i;
 118         }
 119
 120         // SAFETY: `i <= self.source.len()` because it is only ever incremented
 121         // via `i += 1` and in between every single one of those increments, `i`
 122         // is compared against `self.source.len()`. That happens either
 123         // literally by `i < self.source.len()` in the while-loop's condition,
 124         // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
 125         // loop is terminated as soon as the latest `i += 1` has made `i` no
 126         // longer less than `self.source.len()`, which means it'll be at most
 127         // equal to `self.source.len()`.
 128         let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
 129         self.source = remaining;
 130
 131         // SAFETY: `valid_up_to <= i` because it is only ever assigned via
 132         // `valid_up_to = i` and `i` only increases.
 133         let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
 134
 135         Some(Utf8LossyChunk {
 136             // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
 137             valid: unsafe { from_utf8_unchecked(valid) },
 138             broken,
 139         })
 140     }
 141 }
 142
 143 impl fmt::Display for Utf8Lossy {
 144     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 145         // If we're the empty string then our iterator won't actually yield
 146         // anything, so perform the formatting manually
 147         if self.bytes.is_empty() {
 148             return "".fmt(f);
 149         }
 150
 151         for Utf8LossyChunk { valid, broken } in self.chunks() {
 152             // If we successfully decoded the whole chunk as a valid string then
 153             // we can return a direct formatting of the string which will also
 154             // respect various formatting flags if possible.
 155             if valid.len() == self.bytes.len() {
 156                 assert!(broken.is_empty());
 157                 return valid.fmt(f);
 158             }
 159
 160             f.write_str(valid)?;
 161             if !broken.is_empty() {
 162                 f.write_char(char::REPLACEMENT_CHARACTER)?;
 163             }
 164         }
 165         Ok(())
 166     }
 167 }
 168
 169 impl fmt::Debug for Utf8Lossy {
 170     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 171         f.write_char('"')?;
 172
 173         for Utf8LossyChunk { valid, broken } in self.chunks() {
 174             // Valid part.
 175             // Here we partially parse UTF-8 again which is suboptimal.
 176             {
 177                 let mut from = 0;
 178                 for (i, c) in valid.char_indices() {
 179                     let esc = c.escape_debug();
 180                     // If char needs escaping, flush backlog so far and write, else skip
 181                     if esc.len() != 1 {
 182                         f.write_str(&valid[from..i])?;
 183                         for c in esc {
 184                             f.write_char(c)?;
 185                         }
 186                         from = i + c.len_utf8();
 187                     }
 188                 }
 189                 f.write_str(&valid[from..])?;
 190             }
 191
 192             // Broken parts of string as hex escape.
 193             for &b in broken {
 194                 write!(f, "\\x{:02x}", b)?;
 195             }
 196         }
 197
 198         f.write_char('"')
 199     }
 200 }