1 // Copyright 2012-2017 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 use core::str as core_str;
18 /// Lossy UTF-8 string.
///
/// A view over raw bytes that are not required to be valid UTF-8. It is
/// obtained from a byte slice via `from_bytes` (or from a `&str` via
/// `from_str`) and is consumed through `chunks`, which walks the `bytes`
/// field of this type.
19 #[unstable(feature = "str_internals", issue = "0")]
20 pub struct Utf8Lossy {
/// Views a string slice as a `Utf8Lossy` by forwarding its underlying
/// bytes to `from_bytes`.
25 pub fn from_str(s: &str) -> &Utf8Lossy {
26 Utf8Lossy::from_bytes(s.as_bytes())
/// Reinterprets a byte slice as a `Utf8Lossy` reference.
///
/// The body is a single transmute: no bytes are copied and no UTF-8
/// validation happens here.
29 pub fn from_bytes(bytes: &[u8]) -> &Utf8Lossy {
// NOTE(review): this transmute presumably relies on `Utf8Lossy` having the
// same layout as `[u8]` (i.e. being a thin wrapper around its `bytes`
// field) -- confirm against the struct definition.
30 unsafe { intrinsics::transmute(bytes) }
/// Returns an iterator over the chunks of this value.
///
/// The iterator starts with the whole `bytes` slice as its `source`.
33 pub fn chunks(&self) -> Utf8LossyChunksIter {
34 Utf8LossyChunksIter { source: &self.bytes }
39 /// Iterator over lossy UTF-8 string
///
/// Created by `Utf8Lossy::chunks`. Its `source` slice holds the bytes that
/// have not been yielded yet; `next` re-slices it past each chunk it
/// returns.
40 #[unstable(feature = "str_internals", issue = "0")]
41 pub struct Utf8LossyChunksIter<'a> {
/// One item produced by `Utf8LossyChunksIter`: a run of valid text
/// (`valid`) paired with the malformed bytes that terminated it (`broken`).
45 #[unstable(feature = "str_internals", issue = "0")]
46 #[derive(PartialEq, Eq, Debug)]
47 pub struct Utf8LossyChunk<'a> {
48 /// Sequence of valid chars.
49 /// Can be empty between broken UTF-8 chars.
51 /// Single broken char, empty if none.
52 /// Empty iff iterator item is last.
// Chunking logic: each call to `next` scans `source` from the front and
// returns either (valid prefix, malformed bytes) at the first invalid
// sequence, or the whole remainder as valid text when no error is found.
56 impl<'a> Iterator for Utf8LossyChunksIter<'a> {
57 type Item = Utf8LossyChunk<'a>;
// Produces the next chunk and advances `self.source` past what was returned.
59 fn next(&mut self) -> Option<Utf8LossyChunk<'a>> {
// Nothing left to scan.
60 if self.source.len() == 0 {
// 0b1000_0000. A byte is a UTF-8 continuation byte when its top two bits
// (`byte & 192`, i.e. mask 0b1100_0000) equal this tag.
64 const TAG_CONT_U8: u8 = 128;
// Unchecked read: every caller must guarantee `i < xs.len()`.
65 fn unsafe_get(xs: &[u8], i: usize) -> u8 {
66 unsafe { *xs.get_unchecked(i) }
// Bounds-tolerant read that yields 0 past the end of the slice. Since
// `0 & 192 != TAG_CONT_U8`, a sequence cut short by end of input fails the
// continuation test exactly like a sequence with a wrong byte.
68 fn safe_get(xs: &[u8], i: usize) -> u8 {
69 if i >= xs.len() { 0 } else { unsafe_get(xs, i) }
73 while i < self.source.len() {
// In bounds: guarded by the loop condition above.
76 let byte = unsafe_get(self.source, i);
// NOTE(review): presumably the total sequence length implied by the lead
// byte -- confirm against `core::str::utf8_char_width`.
82 let w = core_str::utf8_char_width(byte);
// Error exit shared by all checks below: yield the bytes before `i_` as
// valid text and the bytes in `i_..i` as the broken part, then drop both
// from `source`.
// NOTE(review): `i_` presumably marks where the current sequence started
// -- confirm at its binding.
84 macro_rules! error { () => ({
86 let r = Utf8LossyChunk {
87 valid: core_str::from_utf8_unchecked(&self.source[0..i_]),
88 broken: &self.source[i_..i],
90 self.source = &self.source[i..];
// The byte following the lead byte must be a continuation byte.
97 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
// Lead bytes 0xE0..=0xEF: the allowed range of the next byte depends on the
// lead byte. 0xE0 requires 0xA0..=0xBF (rejects overlong encodings) and
// 0xED requires 0x80..=0x9F (rejects UTF-16 surrogate code points).
103 match (byte, safe_get(self.source, i)) {
104 (0xE0, 0xA0 ... 0xBF) => (),
105 (0xE1 ... 0xEC, 0x80 ... 0xBF) => (),
106 (0xED, 0x80 ... 0x9F) => (),
107 (0xEE ... 0xEF, 0x80 ... 0xBF) => (),
// Remaining byte of this sequence must be a continuation byte.
113 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
// Lead bytes 0xF0..=0xF4: 0xF0 requires 0x90..=0xBF (rejects overlong
// encodings) and 0xF4 requires 0x80..=0x8F (rejects values above U+10FFFF).
119 match (byte, safe_get(self.source, i)) {
120 (0xF0, 0x90 ... 0xBF) => (),
121 (0xF1 ... 0xF3, 0x80 ... 0xBF) => (),
122 (0xF4, 0x80 ... 0x8F) => (),
// The two remaining bytes of this sequence must both be continuation bytes.
128 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
132 if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
// Reached when the loop found no error: everything left in `source` is
// returned as valid text.
144 let r = Utf8LossyChunk {
145 valid: unsafe { core_str::from_utf8_unchecked(self.source) },
// Lossy rendering: walks the chunks and substitutes U+FFFD for malformed
// input.
154 impl fmt::Display for Utf8Lossy {
155 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
156 for Utf8LossyChunk { valid, broken } in self.chunks() {
// One replacement character per non-empty broken part, regardless of how
// many bytes that part contains.
158 if !broken.is_empty() {
159 f.write_char(char::REPLACEMENT_CHARACTER)?;
// Debug rendering: valid text is written with `escape_debug` applied to the
// chars that need it, and malformed bytes are written as `\xNN` escapes.
166 impl fmt::Debug for Utf8Lossy {
167 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
170 for Utf8LossyChunk { valid, broken } in self.chunks() {
173 // Here we partially parse UTF-8 again which is suboptimal.
// `i` is the byte offset of `c` within `valid`.
176 for (i, c) in valid.char_indices() {
177 let esc = c.escape_debug();
178 // If char needs escaping, flush backlog so far and write, else skip
// `from..i` is the run of chars seen since the last flush.
180 f.write_str(&valid[from..i])?;
// Restart the backlog just past the char that was handled.
184 from = i + c.len_utf8();
// Flush whatever remains after the last flush point.
187 f.write_str(&valid[from..])?;
190 // Broken parts of string as hex escape.
// Emits a literal backslash and `x`, then the value as two zero-padded
// lowercase hex digits.
// NOTE(review): `b` is presumably each byte of `broken` -- confirm at its
// binding.
192 write!(f, "\\x{:02x}", b)?;