library/core/src/str/lossy.rs

   1 use crate::fmt;
   2 use crate::fmt::Formatter;
   3 use crate::fmt::Write;
   4 use crate::iter::FusedIterator;
   5
   6 use super::from_utf8_unchecked;
   7 use super::validations::utf8_char_width;
   8
/// An item returned by the [`Utf8Chunks`] iterator.
///
/// A `Utf8Chunk` stores a sequence of [`u8`] up to the first broken character
/// when decoding a UTF-8 string.
///
/// Every chunk is made of two adjacent parts of the input: a valid UTF-8
/// prefix, followed by the bytes of the broken character that ended it.
///
/// # Examples
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// // An invalid UTF-8 string
/// let bytes = b"foo\xF1\x80bar";
///
/// // Decode the first `Utf8Chunk`
/// let chunk = Utf8Chunks::new(bytes).next().unwrap();
///
/// // The first three characters are valid UTF-8
/// assert_eq!("foo", chunk.valid());
///
/// // The fourth character is broken
/// assert_eq!(b"\xF1\x80", chunk.invalid());
/// ```
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct Utf8Chunk<'a> {
    // Leading run of well-formed UTF-8; may be empty.
    valid: &'a str,
    // Bytes of the broken sequence that directly follows `valid`; empty only
    // when the scan reached the end of the input without hitting an error.
    invalid: &'a [u8],
}
  39
  40 impl<'a> Utf8Chunk<'a> {
  41     /// Returns the next validated UTF-8 substring.
  42     ///
  43     /// This substring can be empty at the start of the string or between
  44     /// broken UTF-8 characters.
  45     #[must_use]
  46     #[unstable(feature = "utf8_chunks", issue = "99543")]
  47     pub fn valid(&self) -> &'a str {
  48         self.valid
  49     }
  50
  51     /// Returns the invalid sequence that caused a failure.
  52     ///
  53     /// The returned slice will have a maximum length of 3 and starts after the
  54     /// substring given by [`valid`]. Decoding will resume after this sequence.
  55     ///
  56     /// If empty, this is the last chunk in the string. If non-empty, an
  57     /// unexpected byte was encountered or the end of the input was reached
  58     /// unexpectedly.
  59     ///
  60     /// Lossy decoding would replace this sequence with [`U+FFFD REPLACEMENT
  61     /// CHARACTER`].
  62     ///
  63     /// [`valid`]: Self::valid
  64     /// [`U+FFFD REPLACEMENT CHARACTER`]: crate::char::REPLACEMENT_CHARACTER
  65     #[must_use]
  66     #[unstable(feature = "utf8_chunks", issue = "99543")]
  67     pub fn invalid(&self) -> &'a [u8] {
  68         self.invalid
  69     }
  70 }
  71
/// Adapter whose `fmt::Debug` output renders a byte slice as a double-quoted
/// string: valid UTF-8 text is written with `char::escape_debug` escapes
/// applied, and each byte of an invalid sequence is written as a `\xNN`
/// escape with two uppercase hex digits.
#[must_use]
#[unstable(feature = "str_internals", issue = "none")]
pub struct Debug<'a>(&'a [u8]);
  75
  76 #[unstable(feature = "str_internals", issue = "none")]
  77 impl fmt::Debug for Debug<'_> {
  78     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
  79         f.write_char('"')?;
  80
  81         for chunk in Utf8Chunks::new(self.0) {
  82             // Valid part.
  83             // Here we partially parse UTF-8 again which is suboptimal.
  84             {
  85                 let valid = chunk.valid();
  86                 let mut from = 0;
  87                 for (i, c) in valid.char_indices() {
  88                     let esc = c.escape_debug();
  89                     // If char needs escaping, flush backlog so far and write, else skip
  90                     if esc.len() != 1 {
  91                         f.write_str(&valid[from..i])?;
  92                         for c in esc {
  93                             f.write_char(c)?;
  94                         }
  95                         from = i + c.len_utf8();
  96                     }
  97                 }
  98                 f.write_str(&valid[from..])?;
  99             }
 100
 101             // Broken parts of string as hex escape.
 102             for &b in chunk.invalid() {
 103                 write!(f, "\\x{:02X}", b)?;
 104             }
 105         }
 106
 107         f.write_char('"')
 108     }
 109 }
 110
/// An iterator used to decode a slice of mostly UTF-8 bytes to string slices
/// ([`&str`]) and byte slices ([`&[u8]`][byteslice]).
///
/// If you want a simple conversion from UTF-8 byte slices to string slices,
/// [`from_utf8`] is easier to use.
///
/// [byteslice]: slice
/// [`from_utf8`]: super::from_utf8
///
/// # Examples
///
/// This can be used to create functionality similar to
/// [`String::from_utf8_lossy`] without allocating heap memory:
///
/// ```
/// #![feature(utf8_chunks)]
///
/// use std::str::Utf8Chunks;
///
/// fn from_utf8_lossy<F>(input: &[u8], mut push: F) where F: FnMut(&str) {
///     for chunk in Utf8Chunks::new(input) {
///         push(chunk.valid());
///
///         if !chunk.invalid().is_empty() {
///             push("\u{FFFD}");
///         }
///     }
/// }
/// ```
///
/// [`String::from_utf8_lossy`]: ../../std/string/struct.String.html#method.from_utf8_lossy
#[must_use = "iterators are lazy and do nothing unless consumed"]
#[unstable(feature = "utf8_chunks", issue = "99543")]
#[derive(Clone)]
pub struct Utf8Chunks<'a> {
    // Input that has not been yielded yet. The bytes of every chunk returned
    // by `next` are split off the front of this slice.
    source: &'a [u8],
}
 148
 149 impl<'a> Utf8Chunks<'a> {
 150     /// Creates a new iterator to decode the bytes.
 151     #[unstable(feature = "utf8_chunks", issue = "99543")]
 152     pub fn new(bytes: &'a [u8]) -> Self {
 153         Self { source: bytes }
 154     }
 155
 156     #[doc(hidden)]
 157     #[unstable(feature = "str_internals", issue = "none")]
 158     pub fn debug(&self) -> Debug<'_> {
 159         Debug(self.source)
 160     }
 161 }
 162
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl<'a> Iterator for Utf8Chunks<'a> {
    type Item = Utf8Chunk<'a>;

    /// Splits the next chunk off the front of the remaining input.
    ///
    /// Scans forward for as long as the bytes form well-formed UTF-8 and
    /// stops at the first malformed sequence or at the end of the input. The
    /// returned chunk holds the valid prefix plus the bytes of the malformed
    /// sequence that ended the scan (none if the end of the input was reached
    /// without an error). Returns `None` once no input is left.
    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
        if self.source.is_empty() {
            return None;
        }

        // A continuation byte has the bit pattern `0b10xx_xxxx`, so masking
        // it with 192 (`0b1100_0000`) must leave exactly 128 (`0b1000_0000`).
        const TAG_CONT_U8: u8 = 128;
        // Bounds-checked read that yields 0 past the end of `xs`. 0 passes
        // none of the continuation-byte checks below, so a sequence cut short
        // by the end of the input is treated like any other malformed one.
        fn safe_get(xs: &[u8], i: usize) -> u8 {
            *xs.get(i).unwrap_or(&0)
        }

        // `i` is the index of the next byte to inspect; `valid_up_to` is the
        // length of the prefix already known to be well-formed. It only
        // catches up with `i` after a complete character has been accepted.
        let mut i = 0;
        let mut valid_up_to = 0;
        while i < self.source.len() {
            // SAFETY: `i < self.source.len()` per previous line.
            // For some reason the following are both significantly slower:
            // while let Some(&byte) = self.source.get(i) {
            // while let Some(byte) = self.source.get(i).copied() {
            let byte = unsafe { *self.source.get_unchecked(i) };
            i += 1;

            if byte < 128 {
                // This could be a `1 => ...` case in the match below, but for
                // the common case of all-ASCII inputs, we bypass loading the
                // sizeable UTF8_CHAR_WIDTH table into cache.
            } else {
                let w = utf8_char_width(byte);

                // Every `break` below leaves `valid_up_to` at the start of
                // the offending sequence while `i` already counts the bytes
                // of it that were accepted, so those bytes become the
                // chunk's invalid part.
                match w {
                    2 => {
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                    }
                    3 => {
                        // The range allowed for the second byte depends on
                        // the lead byte: after `0xE0` it excludes overlong
                        // encodings, after `0xED` it excludes the surrogate
                        // code points U+D800..=U+DFFF.
                        match (byte, safe_get(self.source, i)) {
                            (0xE0, 0xA0..=0xBF) => (),
                            (0xE1..=0xEC, 0x80..=0xBF) => (),
                            (0xED, 0x80..=0x9F) => (),
                            (0xEE..=0xEF, 0x80..=0xBF) => (),
                            _ => break,
                        }
                        i += 1;
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                    }
                    4 => {
                        // As above: after `0xF0` the range excludes overlong
                        // encodings, after `0xF4` it excludes code points
                        // above U+10FFFF.
                        match (byte, safe_get(self.source, i)) {
                            (0xF0, 0x90..=0xBF) => (),
                            (0xF1..=0xF3, 0x80..=0xBF) => (),
                            (0xF4, 0x80..=0x8F) => (),
                            _ => break,
                        }
                        i += 1;
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                        if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
                            break;
                        }
                        i += 1;
                    }
                    // A non-ASCII byte that does not start a 2-, 3- or 4-byte
                    // sequence is invalid on its own.
                    _ => break,
                }
            }

            // A whole character was accepted; extend the valid prefix.
            valid_up_to = i;
        }

        // SAFETY: `i <= self.source.len()` because it is only ever incremented
        // via `i += 1` and in between every single one of those increments, `i`
        // is compared against `self.source.len()`. That happens either
        // literally by `i < self.source.len()` in the while-loop's condition,
        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
        // loop is terminated as soon as the latest `i += 1` has made `i` no
        // longer less than `self.source.len()`, which means it'll be at most
        // equal to `self.source.len()`.
        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
        // The next call resumes right behind everything inspected here.
        self.source = remaining;

        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
        // `valid_up_to = i` and `i` only increases.
        let (valid, invalid) = unsafe { inspected.split_at_unchecked(valid_up_to) };

        Some(Utf8Chunk {
            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
            valid: unsafe { from_utf8_unchecked(valid) },
            invalid,
        })
    }
}
 261
// `next` returns `None` whenever `source` is empty, and it only ever replaces
// `source` with a suffix of itself, so an exhausted iterator stays exhausted.
#[unstable(feature = "utf8_chunks", issue = "99543")]
impl FusedIterator for Utf8Chunks<'_> {}
 264
 265 #[unstable(feature = "utf8_chunks", issue = "99543")]
 266 impl fmt::Debug for Utf8Chunks<'_> {
 267     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
 268         f.debug_struct("Utf8Chunks").field("source", &self.debug()).finish()
 269     }
 270 }