src/libcore/str/lossy.rs

   1 use crate::char;
   2 use crate::fmt::{self, Write};
   3 use crate::mem;
   4 use crate::str as core_str;
   5
   6 /// Lossy UTF-8 string.
   7 #[unstable(feature = "str_internals", issue = "none")]
   8 pub struct Utf8Lossy {
   9     bytes: [u8],
  10 }
  11
  12 impl Utf8Lossy {
  13     pub fn from_str(s: &str) -> &Utf8Lossy {
  14         Utf8Lossy::from_bytes(s.as_bytes())
  15     }
  16
  17     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
  18         // SAFETY: Both use the same memory layout, and UTF-8 correctness isn't required.
  19         unsafe { mem::transmute(bytes) }
  20     }
  21
  22     pub fn chunks(&self) -> Utf8LossyChunksIter<'_> {
  23         Utf8LossyChunksIter { source: &self.bytes }
  24     }
  25 }
  26
  27 /// Iterator over lossy UTF-8 string
  28 #[unstable(feature = "str_internals", issue = "none")]
  29 #[allow(missing_debug_implementations)]
  30 pub struct Utf8LossyChunksIter<'a> {
  31     source: &'a [u8],
  32 }
  33
  34 #[unstable(feature = "str_internals", issue = "none")]
  35 #[derive(PartialEq, Eq, Debug)]
  36 pub struct Utf8LossyChunk<'a> {
  37     /// Sequence of valid chars.
  38     /// Can be empty between broken UTF-8 chars.
  39     pub valid: &'a str,
  40     /// Single broken char, empty if none.
  41     /// Empty iff iterator item is last.
  42     pub broken: &'a [u8],
  43 }
  44
  45 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
  46     type Item = Utf8LossyChunk<'a>;
  47
  48     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
  49         if self.source.is_empty() {
  50             return None;
  51         }
  52
  53         const TAG_CONT_U8: u8 = 128;
  54         fn safe_get(xs: &[u8], i: usize) -> u8 {
  55             *xs.get(i).unwrap_or(&0)
  56         }
  57
  58         let mut i = 0;
  59         while i < self.source.len() {
  60             let i_ = i;
  61
  62             // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
  63             // only increases, so `0 <= i < self.source.len()`.
  64             let byte = unsafe { *self.source.get_unchecked(i) };
  65             i += 1;
  66
  67             if byte < 128 {
  68             } else {
  69                 let w = core_str::utf8_char_width(byte);
  70
  71                 macro_rules! error {
  72                     () => {{
  73                         // SAFETY: We have checked up to `i` that source is valid UTF-8.
  74                         unsafe {
  75                             let r = Utf8LossyChunk {
  76                                 valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
  77                                 broken: &self.source[i_..i],
  78                             };
  79                             self.source = &self.source[i..];
  80                             return Some(r);
  81                         }
  82                     }};
  83                 }
  84
  85                 match w {
  86                     2 => {
  87                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
  88                             error!();
  89                         }
  90                         i += 1;
  91                     }
  92                     3 => {
  93                         match (byte, safe_get(self.source, i)) {
  94                             (0xE0, 0xA0..=0xBF) => (),
  95                             (0xE1..=0xEC, 0x80..=0xBF) => (),
  96                             (0xED, 0x80..=0x9F) => (),
  97                             (0xEE..=0xEF, 0x80..=0xBF) => (),
  98                             _ => {
  99                                 error!();
 100                             }
 101                         }
 102                         i += 1;
 103                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 104                             error!();
 105                         }
 106                         i += 1;
 107                     }
 108                     4 => {
 109                         match (byte, safe_get(self.source, i)) {
 110                             (0xF0, 0x90..=0xBF) => (),
 111                             (0xF1..=0xF3, 0x80..=0xBF) => (),
 112                             (0xF4, 0x80..=0x8F) => (),
 113                             _ => {
 114                                 error!();
 115                             }
 116                         }
 117                         i += 1;
 118                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 119                             error!();
 120                         }
 121                         i += 1;
 122                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 123                             error!();
 124                         }
 125                         i += 1;
 126                     }
 127                     _ => {
 128                         error!();
 129                     }
 130                 }
 131             }
 132         }
 133
 134         let r = Utf8LossyChunk {
 135             // SAFETY: We have checked that the entire source is valid UTF-8.
 136             valid: unsafe { core_str::from_utf8_unchecked(self.source) },
 137             broken: &[],
 138         };
 139         self.source = &[];
 140         Some(r)
 141     }
 142 }
 143
 144 impl fmt::Display for Utf8Lossy {
 145     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 146         // If we're the empty string then our iterator won't actually yield
 147         // anything, so perform the formatting manually
 148         if self.bytes.is_empty() {
 149             return "".fmt(f);
 150         }
 151
 152         for Utf8LossyChunk { valid, broken } in self.chunks() {
 153             // If we successfully decoded the whole chunk as a valid string then
 154             // we can return a direct formatting of the string which will also
 155             // respect various formatting flags if possible.
 156             if valid.len() == self.bytes.len() {
 157                 assert!(broken.is_empty());
 158                 return valid.fmt(f);
 159             }
 160
 161             f.write_str(valid)?;
 162             if !broken.is_empty() {
 163                 f.write_char(char::REPLACEMENT_CHARACTER)?;
 164             }
 165         }
 166         Ok(())
 167     }
 168 }
 169
 170 impl fmt::Debug for Utf8Lossy {
 171     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 172         f.write_char('"')?;
 173
 174         for Utf8LossyChunk { valid, broken } in self.chunks() {
 175             // Valid part.
 176             // Here we partially parse UTF-8 again which is suboptimal.
 177             {
 178                 let mut from = 0;
 179                 for (i, c) in valid.char_indices() {
 180                     let esc = c.escape_debug();
 181                     // If char needs escaping, flush backlog so far and write, else skip
 182                     if esc.len() != 1 {
 183                         f.write_str(&valid[from..i])?;
 184                         for c in esc {
 185                             f.write_char(c)?;
 186                         }
 187                         from = i + c.len_utf8();
 188                     }
 189                 }
 190                 f.write_str(&valid[from..])?;
 191             }
 192
 193             // Broken parts of string as hex escape.
 194             for &b in broken {
 195                 write!(f, "\\x{:02x}", b)?;
 196             }
 197         }
 198
 199         f.write_char('"')
 200     }
 201 }