src/libstd_unicode/lossy.rs

   1 // Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use core::str as core_str;
  12 use core::fmt;
  13 use core::fmt::Write;
  14 use char;
  15 use core::mem;
  16
  17
/// Lossy UTF-8 string.
///
/// A wrapper around a byte slice that is not required to be valid UTF-8.
/// Its `Display` impl writes U+FFFD (the replacement character) in place of
/// each broken sequence, while its `Debug` impl writes every broken byte as a
/// `\xNN` hex escape.
#[unstable(feature = "str_internals", issue = "0")]
pub struct Utf8Lossy {
    // The wrapped bytes. This is the only field; `from_bytes` transmutes a
    // `&[u8]` directly into a `&Utf8Lossy`.
    bytes: [u8]
}
  23
  24 impl Utf8Lossy {
  25     pub fn from_str(s: &str) -> &Utf8Lossy {
  26         Utf8Lossy::from_bytes(s.as_bytes())
  27     }
  28
  29     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
  30         unsafe { mem::transmute(bytes) }
  31     }
  32
  33     pub fn chunks(&self) -> Utf8LossyChunksIter {
  34         Utf8LossyChunksIter { source: &self.bytes }
  35     }
  36 }
  37
  38
/// Iterator over lossy UTF-8 string
///
/// Yields `Utf8LossyChunk` items and stops once the input has been fully
/// consumed. An empty input yields no items at all.
#[unstable(feature = "str_internals", issue = "0")]
pub struct Utf8LossyChunksIter<'a> {
    // Bytes that have not been yielded yet; each call to `next` replaces this
    // with the tail that follows the chunk it returns.
    source: &'a [u8],
}
  44
/// One item yielded by `Utf8LossyChunksIter`: a run of valid UTF-8 followed
/// by the broken sequence (if any) that terminated it.
#[unstable(feature = "str_internals", issue = "0")]
#[derive(PartialEq, Eq, Debug)]
pub struct Utf8LossyChunk<'a> {
    /// Sequence of valid chars.
    /// Can be empty between broken UTF-8 chars, or when the input starts
    /// with a broken char.
    pub valid: &'a str,
    /// Single broken char (one to three bytes), empty if none.
    /// If this is empty, the item is the last one the iterator yields.
    /// The converse does not hold: when the input ends with a broken char,
    /// the last item carries that char here and no further item follows.
    pub broken: &'a [u8],
}
  55
  56 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
  57     type Item = Utf8LossyChunk<'a>;
  58
  59     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
  60         if self.source.len() == 0 {
  61             return None;
  62         }
  63
  64         const TAG_CONT_U8: u8 = 128;
  65         fn unsafe_get(xs: &[u8], i: usize) -> u8 {
  66             unsafe { *xs.get_unchecked(i) }
  67         }
  68         fn safe_get(xs: &[u8], i: usize) -> u8 {
  69             if i >= xs.len() { 0 } else { unsafe_get(xs, i) }
  70         }
  71
  72         let mut i = 0;
  73         while i < self.source.len() {
  74             let i_ = i;
  75
  76             let byte = unsafe_get(self.source, i);
  77             i += 1;
  78
  79             if byte < 128 {
  80
  81             } else {
  82                 let w = core_str::utf8_char_width(byte);
  83
  84                 macro_rules! error { () => ({
  85                     unsafe {
  86                         let r = Utf8LossyChunk {
  87                             valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
  88                             broken: &self.source[i_..i],
  89                         };
  90                         self.source = &self.source[i..];
  91                         return Some(r);
  92                     }
  93                 })}
  94
  95                 match w {
  96                     2 => {
  97                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
  98                             error!();
  99                         }
 100                         i += 1;
 101                     }
 102                     3 => {
 103                         match (byte, safe_get(self.source, i)) {
 104                             (0xE0, 0xA0 ... 0xBF) => (),
 105                             (0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
 106                             (0xED, 0x80 ... 0x9F) => (),
 107                             (0xEE ... 0xEF, 0x80 ... 0xBF) => (),
 108                             _ => {
 109                                 error!();
 110                             }
 111                         }
 112                         i += 1;
 113                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 114                             error!();
 115                         }
 116                         i += 1;
 117                     }
 118                     4 => {
 119                         match (byte, safe_get(self.source, i)) {
 120                             (0xF0, 0x90 ... 0xBF) => (),
 121                             (0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
 122                             (0xF4, 0x80 ... 0x8F) => (),
 123                             _ => {
 124                                 error!();
 125                             }
 126                         }
 127                         i += 1;
 128                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 129                             error!();
 130                         }
 131                         i += 1;
 132                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 133                             error!();
 134                         }
 135                         i += 1;
 136                     }
 137                     _ => {
 138                         error!();
 139                     }
 140                 }
 141             }
 142         }
 143
 144         let r = Utf8LossyChunk {
 145             valid: unsafe { core_str::from_utf8_unchecked(self.source) },
 146             broken: &[],
 147         };
 148         self.source = &[];
 149         return Some(r);
 150     }
 151 }
 152
 153
 154 impl fmt::Display for Utf8Lossy {
 155     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 156         // If we're the empty string then our iterator won't actually yield
 157         // anything, so perform the formatting manually
 158         if self.bytes.len() == 0 {
 159             return "".fmt(f)
 160         }
 161
 162         for Utf8LossyChunk { valid, broken } in self.chunks() {
 163             // If we successfully decoded the whole chunk as a valid string then
 164             // we can return a direct formatting of the string which will also
 165             // respect various formatting flags if possible.
 166             if valid.len() == self.bytes.len() {
 167                 assert!(broken.is_empty());
 168                 return valid.fmt(f)
 169             }
 170
 171             f.write_str(valid)?;
 172             if !broken.is_empty() {
 173                 f.write_char(char::REPLACEMENT_CHARACTER)?;
 174             }
 175         }
 176         Ok(())
 177     }
 178 }
 179
 180 impl fmt::Debug for Utf8Lossy {
 181     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 182         f.write_char('"')?;
 183
 184         for Utf8LossyChunk { valid, broken } in self.chunks() {
 185
 186             // Valid part.
 187             // Here we partially parse UTF-8 again which is suboptimal.
 188             {
 189                 let mut from = 0;
 190                 for (i, c) in valid.char_indices() {
 191                     let esc = c.escape_debug();
 192                     // If char needs escaping, flush backlog so far and write, else skip
 193                     if esc.len() != 1 {
 194                         f.write_str(&valid[from..i])?;
 195                         for c in esc {
 196                             f.write_char(c)?;
 197                         }
 198                         from = i + c.len_utf8();
 199                     }
 200                 }
 201                 f.write_str(&valid[from..])?;
 202             }
 203
 204             // Broken parts of string as hex escape.
 205             for &b in broken {
 206                 write!(f, "\\x{:02x}", b)?;
 207             }
 208         }
 209
 210         f.write_char('"')
 211     }
 212 }