src/libstd_unicode/lossy.rs

   1 // Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use core::str as core_str;
  12 use core::fmt;
  13 use core::fmt::Write;
  14 use char;
  15 use core::intrinsics;
  16
  17
  18 /// Lossy UTF-8 string.
  19 #[unstable(feature = "str_internals", issue = "0")]
  20 pub struct Utf8Lossy {
  21     bytes: [u8]
  22 }
  23
  24 impl Utf8Lossy {
  25     pub fn from_str(s: &str) -> &Utf8Lossy {
  26         Utf8Lossy::from_bytes(s.as_bytes())
  27     }
  28
  29     pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
  30         unsafe { intrinsics::transmute(bytes) }
  31     }
  32
  33     pub fn chunks(&self) -> Utf8LossyChunksIter {
  34         Utf8LossyChunksIter { source: &self.bytes }
  35     }
  36 }
  37
  38
  39 /// Iterator over lossy UTF-8 string
  40 #[unstable(feature = "str_internals", issue = "0")]
  41 pub struct Utf8LossyChunksIter<'a> {
  42     source: &'a [u8],
  43 }
  44
  45 #[unstable(feature = "str_internals", issue = "0")]
  46 #[derive(PartialEq, Eq, Debug)]
  47 pub struct Utf8LossyChunk<'a> {
  48     /// Sequence of valid chars.
  49     /// Can be empty between broken UTF-8 chars.
  50     pub valid: &'a str,
  51     /// Single broken char, empty if none.
  52     /// Empty iff iterator item is last.
  53     pub broken: &'a [u8],
  54 }
  55
  56 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
  57     type Item = Utf8LossyChunk<'a>;
  58
  59     fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
  60         if self.source.len() == 0 {
  61             return None;
  62         }
  63
  64         const TAG_CONT_U8: u8 = 128;
  65         fn unsafe_get(xs: &[u8], i: usize) -> u8 {
  66             unsafe { *xs.get_unchecked(i) }
  67         }
  68         fn safe_get(xs: &[u8], i: usize) -> u8 {
  69             if i >= xs.len() { 0 } else { unsafe_get(xs, i) }
  70         }
  71
  72         let mut i = 0;
  73         while i < self.source.len() {
  74             let i_ = i;
  75
  76             let byte = unsafe_get(self.source, i);
  77             i += 1;
  78
  79             if byte < 128 {
  80
  81             } else {
  82                 let w = core_str::utf8_char_width(byte);
  83
  84                 macro_rules! error { () => ({
  85                     unsafe {
  86                         let r = Utf8LossyChunk {
  87                             valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
  88                             broken: &self.source[i_..i],
  89                         };
  90                         self.source = &self.source[i..];
  91                         return Some(r);
  92                     }
  93                 })}
  94
  95                 match w {
  96                     2 => {
  97                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
  98                             error!();
  99                         }
 100                         i += 1;
 101                     }
 102                     3 => {
 103                         match (byte, safe_get(self.source, i)) {
 104                             (0xE0, 0xA0 ... 0xBF) => (),
 105                             (0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
 106                             (0xED, 0x80 ... 0x9F) => (),
 107                             (0xEE ... 0xEF, 0x80 ... 0xBF) => (),
 108                             _ => {
 109                                 error!();
 110                             }
 111                         }
 112                         i += 1;
 113                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 114                             error!();
 115                         }
 116                         i += 1;
 117                     }
 118                     4 => {
 119                         match (byte, safe_get(self.source, i)) {
 120                             (0xF0, 0x90 ... 0xBF) => (),
 121                             (0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
 122                             (0xF4, 0x80 ... 0x8F) => (),
 123                             _ => {
 124                                 error!();
 125                             }
 126                         }
 127                         i += 1;
 128                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 129                             error!();
 130                         }
 131                         i += 1;
 132                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
 133                             error!();
 134                         }
 135                         i += 1;
 136                     }
 137                     _ => {
 138                         error!();
 139                     }
 140                 }
 141             }
 142         }
 143
 144         let r = Utf8LossyChunk {
 145             valid: unsafe { core_str::from_utf8_unchecked(self.source) },
 146             broken: &[],
 147         };
 148         self.source = &[];
 149         return Some(r);
 150     }
 151 }
 152
 153
 154 impl fmt::Display for Utf8Lossy {
 155     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 156         for Utf8LossyChunk { valid, broken } in self.chunks() {
 157             f.write_str(valid)?;
 158             if !broken.is_empty() {
 159                 f.write_char(char::REPLACEMENT_CHARACTER)?;
 160             }
 161         }
 162         Ok(())
 163     }
 164 }
 165
 166 impl fmt::Debug for Utf8Lossy {
 167     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 168         f.write_char('"')?;
 169
 170         for Utf8LossyChunk { valid, broken } in self.chunks() {
 171
 172             // Valid part.
 173             // Here we partially parse UTF-8 again which is suboptimal.
 174             {
 175                 let mut from = 0;
 176                 for (i, c) in valid.char_indices() {
 177                     let esc = c.escape_debug();
 178                     // If char needs escaping, flush backlog so far and write, else skip
 179                     if esc.len() != 1 {
 180                         f.write_str(&valid[from..i])?;
 181                         for c in esc {
 182                             f.write_char(c)?;
 183                         }
 184                         from = i + c.len_utf8();
 185                     }
 186                 }
 187                 f.write_str(&valid[from..])?;
 188             }
 189
 190             // Broken parts of string as hex escape.
 191             for &b in broken {
 192                 write!(f, "\\x{:02x}", b)?;
 193             }
 194         }
 195
 196         f.write_char('"')
 197     }
 198 }