library/std/src/sys_common/wtf8.rs

   1 //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
   2 //!
   3 //! This library uses Rust’s type system to maintain
   4 //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
   5 //! like the `String` and `&str` types do for UTF-8.
   6 //!
   7 //! Since [WTF-8 must not be used
   8 //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
   9 //! this library deliberately does not provide access to the underlying bytes
  10 //! of WTF-8 strings,
  11 //! nor can it decode WTF-8 from arbitrary bytes.
  12 //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
  13
  14 // this module is imported from @SimonSapin's repo and has tons of dead code on
  15 // unix (it's mostly used on windows), so don't worry about dead code here.
  16 #![allow(dead_code)]
  17
  18 #[cfg(test)]
  19 mod tests;
  20
  21 use core::str::next_code_point;
  22
  23 use crate::borrow::Cow;
  24 use crate::char;
  25 use crate::fmt;
  26 use crate::hash::{Hash, Hasher};
  27 use crate::iter::FromIterator;
  28 use crate::mem;
  29 use crate::ops;
  30 use crate::rc::Rc;
  31 use crate::slice;
  32 use crate::str;
  33 use crate::sync::Arc;
  34 use crate::sys_common::AsInner;
  35
/// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER.
///
/// The lossy conversions and the `Display` implementation in this module
/// write this in place of every surrogate. It is three bytes long, the same
/// length as an encoded surrogate, which the in-place replacement in
/// `Wtf8Buf::into_string_lossy` depends on.
const UTF8_REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
  37
  38 /// A Unicode code point: from U+0000 to U+10FFFF.
  39 ///
  40 /// Compares with the `char` type,
  41 /// which represents a Unicode scalar value:
  42 /// a code point that is not a surrogate (U+D800 to U+DFFF).
  43 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
  44 pub struct CodePoint {
  45     value: u32,
  46 }
  47
  48 /// Format the code point as `U+` followed by four to six hexadecimal digits.
  49 /// Example: `U+1F4A9`
  50 impl fmt::Debug for CodePoint {
  51     #[inline]
  52     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
  53         write!(formatter, "U+{:04X}", self.value)
  54     }
  55 }
  56
  57 impl CodePoint {
  58     /// Unsafely creates a new `CodePoint` without checking the value.
  59     ///
  60     /// Only use when `value` is known to be less than or equal to 0x10FFFF.
  61     #[inline]
  62     pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
  63         CodePoint { value }
  64     }
  65
  66     /// Creates a new `CodePoint` if the value is a valid code point.
  67     ///
  68     /// Returns `None` if `value` is above 0x10FFFF.
  69     #[inline]
  70     pub fn from_u32(value: u32) -> Option<CodePoint> {
  71         match value {
  72             0..=0x10FFFF => Some(CodePoint { value }),
  73             _ => None,
  74         }
  75     }
  76
  77     /// Creates a new `CodePoint` from a `char`.
  78     ///
  79     /// Since all Unicode scalar values are code points, this always succeeds.
  80     #[inline]
  81     pub fn from_char(value: char) -> CodePoint {
  82         CodePoint { value: value as u32 }
  83     }
  84
  85     /// Returns the numeric value of the code point.
  86     #[inline]
  87     pub fn to_u32(&self) -> u32 {
  88         self.value
  89     }
  90
  91     /// Optionally returns a Unicode scalar value for the code point.
  92     ///
  93     /// Returns `None` if the code point is a surrogate (from U+D800 to U+DFFF).
  94     #[inline]
  95     pub fn to_char(&self) -> Option<char> {
  96         match self.value {
  97             0xD800..=0xDFFF => None,
  98             _ => Some(unsafe { char::from_u32_unchecked(self.value) }),
  99         }
 100     }
 101
 102     /// Returns a Unicode scalar value for the code point.
 103     ///
 104     /// Returns `'\u{FFFD}'` (the replacement character “�”)
 105     /// if the code point is a surrogate (from U+D800 to U+DFFF).
 106     #[inline]
 107     pub fn to_char_lossy(&self) -> char {
 108         self.to_char().unwrap_or('\u{FFFD}')
 109     }
 110 }
 111
 112 /// An owned, growable string of well-formed WTF-8 data.
 113 ///
 114 /// Similar to `String`, but can additionally contain surrogate code points
 115 /// if they’re not in a surrogate pair.
 116 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
 117 pub struct Wtf8Buf {
 118     bytes: Vec<u8>,
 119 }
 120
 121 impl ops::Deref for Wtf8Buf {
 122     type Target = Wtf8;
 123
 124     fn deref(&self) -> &Wtf8 {
 125         self.as_slice()
 126     }
 127 }
 128
 129 impl ops::DerefMut for Wtf8Buf {
 130     fn deref_mut(&mut self) -> &mut Wtf8 {
 131         self.as_mut_slice()
 132     }
 133 }
 134
 135 /// Format the string with double quotes,
 136 /// and surrogates as `\u` followed by four hexadecimal digits.
 137 /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
 138 impl fmt::Debug for Wtf8Buf {
 139     #[inline]
 140     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 141         fmt::Debug::fmt(&**self, formatter)
 142     }
 143 }
 144
 145 impl Wtf8Buf {
 146     /// Creates a new, empty WTF-8 string.
 147     #[inline]
 148     pub fn new() -> Wtf8Buf {
 149         Wtf8Buf { bytes: Vec::new() }
 150     }
 151
 152     /// Creates a new, empty WTF-8 string with pre-allocated capacity for `capacity` bytes.
 153     #[inline]
 154     pub fn with_capacity(capacity: usize) -> Wtf8Buf {
 155         Wtf8Buf { bytes: Vec::with_capacity(capacity) }
 156     }
 157
 158     /// Creates a WTF-8 string from a UTF-8 `String`.
 159     ///
 160     /// This takes ownership of the `String` and does not copy.
 161     ///
 162     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 163     #[inline]
 164     pub fn from_string(string: String) -> Wtf8Buf {
 165         Wtf8Buf { bytes: string.into_bytes() }
 166     }
 167
 168     /// Creates a WTF-8 string from a UTF-8 `&str` slice.
 169     ///
 170     /// This copies the content of the slice.
 171     ///
 172     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 173     #[inline]
 174     pub fn from_str(str: &str) -> Wtf8Buf {
 175         Wtf8Buf { bytes: <[_]>::to_vec(str.as_bytes()) }
 176     }
 177
 178     pub fn clear(&mut self) {
 179         self.bytes.clear()
 180     }
 181
 182     /// Creates a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
 183     ///
 184     /// This is lossless: calling `.encode_wide()` on the resulting string
 185     /// will always return the original code units.
 186     pub fn from_wide(v: &[u16]) -> Wtf8Buf {
 187         let mut string = Wtf8Buf::with_capacity(v.len());
 188         for item in char::decode_utf16(v.iter().cloned()) {
 189             match item {
 190                 Ok(ch) => string.push_char(ch),
 191                 Err(surrogate) => {
 192                     let surrogate = surrogate.unpaired_surrogate();
 193                     // Surrogates are known to be in the code point range.
 194                     let code_point = unsafe { CodePoint::from_u32_unchecked(surrogate as u32) };
 195                     // Skip the WTF-8 concatenation check,
 196                     // surrogate pairs are already decoded by decode_utf16
 197                     string.push_code_point_unchecked(code_point)
 198                 }
 199             }
 200         }
 201         string
 202     }
 203
 204     /// Copied from String::push
 205     /// This does **not** include the WTF-8 concatenation check.
 206     fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
 207         let mut bytes = [0; 4];
 208         let bytes = char::encode_utf8_raw(code_point.value, &mut bytes);
 209         self.bytes.extend_from_slice(bytes)
 210     }
 211
 212     #[inline]
 213     pub fn as_slice(&self) -> &Wtf8 {
 214         unsafe { Wtf8::from_bytes_unchecked(&self.bytes) }
 215     }
 216
 217     #[inline]
 218     pub fn as_mut_slice(&mut self) -> &mut Wtf8 {
 219         unsafe { Wtf8::from_mut_bytes_unchecked(&mut self.bytes) }
 220     }
 221
 222     /// Reserves capacity for at least `additional` more bytes to be inserted
 223     /// in the given `Wtf8Buf`.
 224     /// The collection may reserve more space to avoid frequent reallocations.
 225     ///
 226     /// # Panics
 227     ///
 228     /// Panics if the new capacity overflows `usize`.
 229     #[inline]
 230     pub fn reserve(&mut self, additional: usize) {
 231         self.bytes.reserve(additional)
 232     }
 233
 234     #[inline]
 235     pub fn reserve_exact(&mut self, additional: usize) {
 236         self.bytes.reserve_exact(additional)
 237     }
 238
 239     #[inline]
 240     pub fn shrink_to_fit(&mut self) {
 241         self.bytes.shrink_to_fit()
 242     }
 243
 244     #[inline]
 245     pub fn shrink_to(&mut self, min_capacity: usize) {
 246         self.bytes.shrink_to(min_capacity)
 247     }
 248
 249     /// Returns the number of bytes that this string buffer can hold without reallocating.
 250     #[inline]
 251     pub fn capacity(&self) -> usize {
 252         self.bytes.capacity()
 253     }
 254
 255     /// Append a UTF-8 slice at the end of the string.
 256     #[inline]
 257     pub fn push_str(&mut self, other: &str) {
 258         self.bytes.extend_from_slice(other.as_bytes())
 259     }
 260
 261     /// Append a WTF-8 slice at the end of the string.
 262     ///
 263     /// This replaces newly paired surrogates at the boundary
 264     /// with a supplementary code point,
 265     /// like concatenating ill-formed UTF-16 strings effectively would.
 266     #[inline]
 267     pub fn push_wtf8(&mut self, other: &Wtf8) {
 268         match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
 269             // Replace newly paired surrogates by a supplementary code point.
 270             (Some(lead), Some(trail)) => {
 271                 let len_without_lead_surrogate = self.len() - 3;
 272                 self.bytes.truncate(len_without_lead_surrogate);
 273                 let other_without_trail_surrogate = &other.bytes[3..];
 274                 // 4 bytes for the supplementary code point
 275                 self.bytes.reserve(4 + other_without_trail_surrogate.len());
 276                 self.push_char(decode_surrogate_pair(lead, trail));
 277                 self.bytes.extend_from_slice(other_without_trail_surrogate);
 278             }
 279             _ => self.bytes.extend_from_slice(&other.bytes),
 280         }
 281     }
 282
 283     /// Append a Unicode scalar value at the end of the string.
 284     #[inline]
 285     pub fn push_char(&mut self, c: char) {
 286         self.push_code_point_unchecked(CodePoint::from_char(c))
 287     }
 288
 289     /// Append a code point at the end of the string.
 290     ///
 291     /// This replaces newly paired surrogates at the boundary
 292     /// with a supplementary code point,
 293     /// like concatenating ill-formed UTF-16 strings effectively would.
 294     #[inline]
 295     pub fn push(&mut self, code_point: CodePoint) {
 296         if let trail @ 0xDC00..=0xDFFF = code_point.to_u32() {
 297             if let Some(lead) = (&*self).final_lead_surrogate() {
 298                 let len_without_lead_surrogate = self.len() - 3;
 299                 self.bytes.truncate(len_without_lead_surrogate);
 300                 self.push_char(decode_surrogate_pair(lead, trail as u16));
 301                 return;
 302             }
 303         }
 304
 305         // No newly paired surrogates at the boundary.
 306         self.push_code_point_unchecked(code_point)
 307     }
 308
 309     /// Shortens a string to the specified length.
 310     ///
 311     /// # Panics
 312     ///
 313     /// Panics if `new_len` > current length,
 314     /// or if `new_len` is not a code point boundary.
 315     #[inline]
 316     pub fn truncate(&mut self, new_len: usize) {
 317         assert!(is_code_point_boundary(self, new_len));
 318         self.bytes.truncate(new_len)
 319     }
 320
 321     /// Consumes the WTF-8 string and tries to convert it to UTF-8.
 322     ///
 323     /// This does not copy the data.
 324     ///
 325     /// If the contents are not well-formed UTF-8
 326     /// (that is, if the string contains surrogates),
 327     /// the original WTF-8 string is returned instead.
 328     pub fn into_string(self) -> Result<String, Wtf8Buf> {
 329         match self.next_surrogate(0) {
 330             None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
 331             Some(_) => Err(self),
 332         }
 333     }
 334
 335     /// Consumes the WTF-8 string and converts it lossily to UTF-8.
 336     ///
 337     /// This does not copy the data (but may overwrite parts of it in place).
 338     ///
 339     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
 340     pub fn into_string_lossy(mut self) -> String {
 341         let mut pos = 0;
 342         loop {
 343             match self.next_surrogate(pos) {
 344                 Some((surrogate_pos, _)) => {
 345                     pos = surrogate_pos + 3;
 346                     self.bytes[surrogate_pos..pos]
 347                         .copy_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
 348                 }
 349                 None => return unsafe { String::from_utf8_unchecked(self.bytes) },
 350             }
 351         }
 352     }
 353
 354     /// Converts this `Wtf8Buf` into a boxed `Wtf8`.
 355     #[inline]
 356     pub fn into_box(self) -> Box<Wtf8> {
 357         unsafe { mem::transmute(self.bytes.into_boxed_slice()) }
 358     }
 359
 360     /// Converts a `Box<Wtf8>` into a `Wtf8Buf`.
 361     pub fn from_box(boxed: Box<Wtf8>) -> Wtf8Buf {
 362         let bytes: Box<[u8]> = unsafe { mem::transmute(boxed) };
 363         Wtf8Buf { bytes: bytes.into_vec() }
 364     }
 365 }
 366
 367 /// Creates a new WTF-8 string from an iterator of code points.
 368 ///
 369 /// This replaces surrogate code point pairs with supplementary code points,
 370 /// like concatenating ill-formed UTF-16 strings effectively would.
 371 impl FromIterator<CodePoint> for Wtf8Buf {
 372     fn from_iter<T: IntoIterator<Item = CodePoint>>(iter: T) -> Wtf8Buf {
 373         let mut string = Wtf8Buf::new();
 374         string.extend(iter);
 375         string
 376     }
 377 }
 378
 379 /// Append code points from an iterator to the string.
 380 ///
 381 /// This replaces surrogate code point pairs with supplementary code points,
 382 /// like concatenating ill-formed UTF-16 strings effectively would.
 383 impl Extend<CodePoint> for Wtf8Buf {
 384     fn extend<T: IntoIterator<Item = CodePoint>>(&mut self, iter: T) {
 385         let iterator = iter.into_iter();
 386         let (low, _high) = iterator.size_hint();
 387         // Lower bound of one byte per code point (ASCII only)
 388         self.bytes.reserve(low);
 389         iterator.for_each(move |code_point| self.push(code_point));
 390     }
 391
 392     #[inline]
 393     fn extend_one(&mut self, code_point: CodePoint) {
 394         self.push(code_point);
 395     }
 396
 397     #[inline]
 398     fn extend_reserve(&mut self, additional: usize) {
 399         // Lower bound of one byte per code point (ASCII only)
 400         self.bytes.reserve(additional);
 401     }
 402 }
 403
 404 /// A borrowed slice of well-formed WTF-8 data.
 405 ///
 406 /// Similar to `&str`, but can additionally contain surrogate code points
 407 /// if they’re not in a surrogate pair.
 408 #[derive(Eq, Ord, PartialEq, PartialOrd)]
 409 pub struct Wtf8 {
 410     bytes: [u8],
 411 }
 412
 413 impl AsInner<[u8]> for Wtf8 {
 414     fn as_inner(&self) -> &[u8] {
 415         &self.bytes
 416     }
 417 }
 418
 419 /// Format the slice with double quotes,
 420 /// and surrogates as `\u` followed by four hexadecimal digits.
 421 /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
 422 impl fmt::Debug for Wtf8 {
 423     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 424         fn write_str_escaped(f: &mut fmt::Formatter<'_>, s: &str) -> fmt::Result {
 425             use crate::fmt::Write;
 426             for c in s.chars().flat_map(|c| c.escape_debug()) {
 427                 f.write_char(c)?
 428             }
 429             Ok(())
 430         }
 431
 432         formatter.write_str("\"")?;
 433         let mut pos = 0;
 434         while let Some((surrogate_pos, surrogate)) = self.next_surrogate(pos) {
 435             write_str_escaped(formatter, unsafe {
 436                 str::from_utf8_unchecked(&self.bytes[pos..surrogate_pos])
 437             })?;
 438             write!(formatter, "\\u{{{:x}}}", surrogate)?;
 439             pos = surrogate_pos + 3;
 440         }
 441         write_str_escaped(formatter, unsafe { str::from_utf8_unchecked(&self.bytes[pos..]) })?;
 442         formatter.write_str("\"")
 443     }
 444 }
 445
 446 impl fmt::Display for Wtf8 {
 447     fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
 448         let wtf8_bytes = &self.bytes;
 449         let mut pos = 0;
 450         loop {
 451             match self.next_surrogate(pos) {
 452                 Some((surrogate_pos, _)) => {
 453                     formatter.write_str(unsafe {
 454                         str::from_utf8_unchecked(&wtf8_bytes[pos..surrogate_pos])
 455                     })?;
 456                     formatter.write_str(UTF8_REPLACEMENT_CHARACTER)?;
 457                     pos = surrogate_pos + 3;
 458                 }
 459                 None => {
 460                     let s = unsafe { str::from_utf8_unchecked(&wtf8_bytes[pos..]) };
 461                     if pos == 0 { return s.fmt(formatter) } else { return formatter.write_str(s) }
 462                 }
 463             }
 464         }
 465     }
 466 }
 467
 468 impl Wtf8 {
 469     /// Creates a WTF-8 slice from a UTF-8 `&str` slice.
 470     ///
 471     /// Since WTF-8 is a superset of UTF-8, this always succeeds.
 472     #[inline]
 473     pub fn from_str(value: &str) -> &Wtf8 {
 474         unsafe { Wtf8::from_bytes_unchecked(value.as_bytes()) }
 475     }
 476
 477     /// Creates a WTF-8 slice from a WTF-8 byte slice.
 478     ///
 479     /// Since the byte slice is not checked for valid WTF-8, this functions is
 480     /// marked unsafe.
 481     #[inline]
 482     unsafe fn from_bytes_unchecked(value: &[u8]) -> &Wtf8 {
 483         mem::transmute(value)
 484     }
 485
 486     /// Creates a mutable WTF-8 slice from a mutable WTF-8 byte slice.
 487     ///
 488     /// Since the byte slice is not checked for valid WTF-8, this functions is
 489     /// marked unsafe.
 490     #[inline]
 491     unsafe fn from_mut_bytes_unchecked(value: &mut [u8]) -> &mut Wtf8 {
 492         mem::transmute(value)
 493     }
 494
 495     /// Returns the length, in WTF-8 bytes.
 496     #[inline]
 497     pub fn len(&self) -> usize {
 498         self.bytes.len()
 499     }
 500
 501     #[inline]
 502     pub fn is_empty(&self) -> bool {
 503         self.bytes.is_empty()
 504     }
 505
 506     /// Returns the code point at `position` if it is in the ASCII range,
 507     /// or `b'\xFF' otherwise.
 508     ///
 509     /// # Panics
 510     ///
 511     /// Panics if `position` is beyond the end of the string.
 512     #[inline]
 513     pub fn ascii_byte_at(&self, position: usize) -> u8 {
 514         match self.bytes[position] {
 515             ascii_byte @ 0x00..=0x7F => ascii_byte,
 516             _ => 0xFF,
 517         }
 518     }
 519
 520     /// Returns an iterator for the string’s code points.
 521     #[inline]
 522     pub fn code_points(&self) -> Wtf8CodePoints<'_> {
 523         Wtf8CodePoints { bytes: self.bytes.iter() }
 524     }
 525
 526     /// Tries to convert the string to UTF-8 and return a `&str` slice.
 527     ///
 528     /// Returns `None` if the string contains surrogates.
 529     ///
 530     /// This does not copy the data.
 531     #[inline]
 532     pub fn as_str(&self) -> Option<&str> {
 533         // Well-formed WTF-8 is also well-formed UTF-8
 534         // if and only if it contains no surrogate.
 535         match self.next_surrogate(0) {
 536             None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 537             Some(_) => None,
 538         }
 539     }
 540
 541     /// Lossily converts the string to UTF-8.
 542     /// Returns a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
 543     ///
 544     /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
 545     ///
 546     /// This only copies the data if necessary (if it contains any surrogate).
 547     pub fn to_string_lossy(&self) -> Cow<'_, str> {
 548         let surrogate_pos = match self.next_surrogate(0) {
 549             None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
 550             Some((pos, _)) => pos,
 551         };
 552         let wtf8_bytes = &self.bytes;
 553         let mut utf8_bytes = Vec::with_capacity(self.len());
 554         utf8_bytes.extend_from_slice(&wtf8_bytes[..surrogate_pos]);
 555         utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
 556         let mut pos = surrogate_pos + 3;
 557         loop {
 558             match self.next_surrogate(pos) {
 559                 Some((surrogate_pos, _)) => {
 560                     utf8_bytes.extend_from_slice(&wtf8_bytes[pos..surrogate_pos]);
 561                     utf8_bytes.extend_from_slice(UTF8_REPLACEMENT_CHARACTER.as_bytes());
 562                     pos = surrogate_pos + 3;
 563                 }
 564                 None => {
 565                     utf8_bytes.extend_from_slice(&wtf8_bytes[pos..]);
 566                     return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) });
 567                 }
 568             }
 569         }
 570     }
 571
 572     /// Converts the WTF-8 string to potentially ill-formed UTF-16
 573     /// and return an iterator of 16-bit code units.
 574     ///
 575     /// This is lossless:
 576     /// calling `Wtf8Buf::from_ill_formed_utf16` on the resulting code units
 577     /// would always return the original WTF-8 string.
 578     #[inline]
 579     pub fn encode_wide(&self) -> EncodeWide<'_> {
 580         EncodeWide { code_points: self.code_points(), extra: 0 }
 581     }
 582
 583     #[inline]
 584     fn next_surrogate(&self, mut pos: usize) -> Option<(usize, u16)> {
 585         let mut iter = self.bytes[pos..].iter();
 586         loop {
 587             let b = *iter.next()?;
 588             if b < 0x80 {
 589                 pos += 1;
 590             } else if b < 0xE0 {
 591                 iter.next();
 592                 pos += 2;
 593             } else if b == 0xED {
 594                 match (iter.next(), iter.next()) {
 595                     (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
 596                         return Some((pos, decode_surrogate(b2, b3)));
 597                     }
 598                     _ => pos += 3,
 599                 }
 600             } else if b < 0xF0 {
 601                 iter.next();
 602                 iter.next();
 603                 pos += 3;
 604             } else {
 605                 iter.next();
 606                 iter.next();
 607                 iter.next();
 608                 pos += 4;
 609             }
 610         }
 611     }
 612
 613     #[inline]
 614     fn final_lead_surrogate(&self) -> Option<u16> {
 615         match self.bytes {
 616             [.., 0xED, b2 @ 0xA0..=0xAF, b3] => Some(decode_surrogate(b2, b3)),
 617             _ => None,
 618         }
 619     }
 620
 621     #[inline]
 622     fn initial_trail_surrogate(&self) -> Option<u16> {
 623         match self.bytes {
 624             [0xED, b2 @ 0xB0..=0xBF, b3, ..] => Some(decode_surrogate(b2, b3)),
 625             _ => None,
 626         }
 627     }
 628
 629     pub fn clone_into(&self, buf: &mut Wtf8Buf) {
 630         self.bytes.clone_into(&mut buf.bytes)
 631     }
 632
 633     /// Boxes this `Wtf8`.
 634     #[inline]
 635     pub fn into_box(&self) -> Box<Wtf8> {
 636         let boxed: Box<[u8]> = self.bytes.into();
 637         unsafe { mem::transmute(boxed) }
 638     }
 639
 640     /// Creates a boxed, empty `Wtf8`.
 641     pub fn empty_box() -> Box<Wtf8> {
 642         let boxed: Box<[u8]> = Default::default();
 643         unsafe { mem::transmute(boxed) }
 644     }
 645
 646     #[inline]
 647     pub fn into_arc(&self) -> Arc<Wtf8> {
 648         let arc: Arc<[u8]> = Arc::from(&self.bytes);
 649         unsafe { Arc::from_raw(Arc::into_raw(arc) as *const Wtf8) }
 650     }
 651
 652     #[inline]
 653     pub fn into_rc(&self) -> Rc<Wtf8> {
 654         let rc: Rc<[u8]> = Rc::from(&self.bytes);
 655         unsafe { Rc::from_raw(Rc::into_raw(rc) as *const Wtf8) }
 656     }
 657
 658     #[inline]
 659     pub fn make_ascii_lowercase(&mut self) {
 660         self.bytes.make_ascii_lowercase()
 661     }
 662
 663     #[inline]
 664     pub fn make_ascii_uppercase(&mut self) {
 665         self.bytes.make_ascii_uppercase()
 666     }
 667
 668     #[inline]
 669     pub fn to_ascii_lowercase(&self) -> Wtf8Buf {
 670         Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
 671     }
 672
 673     #[inline]
 674     pub fn to_ascii_uppercase(&self) -> Wtf8Buf {
 675         Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
 676     }
 677
 678     #[inline]
 679     pub fn is_ascii(&self) -> bool {
 680         self.bytes.is_ascii()
 681     }
 682
 683     #[inline]
 684     pub fn eq_ignore_ascii_case(&self, other: &Self) -> bool {
 685         self.bytes.eq_ignore_ascii_case(&other.bytes)
 686     }
 687 }
 688
 689 /// Returns a slice of the given string for the byte range [`begin`..`end`).
 690 ///
 691 /// # Panics
 692 ///
 693 /// Panics when `begin` and `end` do not point to code point boundaries,
 694 /// or point beyond the end of the string.
 695 impl ops::Index<ops::Range<usize>> for Wtf8 {
 696     type Output = Wtf8;
 697
 698     #[inline]
 699     fn index(&self, range: ops::Range<usize>) -> &Wtf8 {
 700         // is_code_point_boundary checks that the index is in [0, .len()]
 701         if range.start <= range.end
 702             && is_code_point_boundary(self, range.start)
 703             && is_code_point_boundary(self, range.end)
 704         {
 705             unsafe { slice_unchecked(self, range.start, range.end) }
 706         } else {
 707             slice_error_fail(self, range.start, range.end)
 708         }
 709     }
 710 }
 711
 712 /// Returns a slice of the given string from byte `begin` to its end.
 713 ///
 714 /// # Panics
 715 ///
 716 /// Panics when `begin` is not at a code point boundary,
 717 /// or is beyond the end of the string.
 718 impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
 719     type Output = Wtf8;
 720
 721     #[inline]
 722     fn index(&self, range: ops::RangeFrom<usize>) -> &Wtf8 {
 723         // is_code_point_boundary checks that the index is in [0, .len()]
 724         if is_code_point_boundary(self, range.start) {
 725             unsafe { slice_unchecked(self, range.start, self.len()) }
 726         } else {
 727             slice_error_fail(self, range.start, self.len())
 728         }
 729     }
 730 }
 731
 732 /// Returns a slice of the given string from its beginning to byte `end`.
 733 ///
 734 /// # Panics
 735 ///
 736 /// Panics when `end` is not at a code point boundary,
 737 /// or is beyond the end of the string.
 738 impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
 739     type Output = Wtf8;
 740
 741     #[inline]
 742     fn index(&self, range: ops::RangeTo<usize>) -> &Wtf8 {
 743         // is_code_point_boundary checks that the index is in [0, .len()]
 744         if is_code_point_boundary(self, range.end) {
 745             unsafe { slice_unchecked(self, 0, range.end) }
 746         } else {
 747             slice_error_fail(self, 0, range.end)
 748         }
 749     }
 750 }
 751
 752 impl ops::Index<ops::RangeFull> for Wtf8 {
 753     type Output = Wtf8;
 754
 755     #[inline]
 756     fn index(&self, _range: ops::RangeFull) -> &Wtf8 {
 757         self
 758     }
 759 }
 760
 761 #[inline]
 762 fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
 763     // The first byte is assumed to be 0xED
 764     0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
 765 }
 766
 767 #[inline]
 768 fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
 769     let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
 770     unsafe { char::from_u32_unchecked(code_point) }
 771 }
 772
 773 /// Copied from core::str::StrPrelude::is_char_boundary
 774 #[inline]
 775 pub fn is_code_point_boundary(slice: &Wtf8, index: usize) -> bool {
 776     if index == slice.len() {
 777         return true;
 778     }
 779     match slice.bytes.get(index) {
 780         None => false,
 781         Some(&b) => b < 128 || b >= 192,
 782     }
 783 }
 784
 785 /// Copied from core::str::raw::slice_unchecked
 786 #[inline]
 787 pub unsafe fn slice_unchecked(s: &Wtf8, begin: usize, end: usize) -> &Wtf8 {
 788     // memory layout of an &[u8] and &Wtf8 are the same
 789     Wtf8::from_bytes_unchecked(slice::from_raw_parts(s.bytes.as_ptr().add(begin), end - begin))
 790 }
 791
/// Reports a failed slicing operation on `s` by panicking; never returns.
///
/// Copied from core::str::raw::slice_error_fail
///
/// # Panics
///
/// Always panics: with an assertion failure when `begin` is greater than
/// `end`, otherwise with a message naming both indices and the string.
#[inline(never)]
pub fn slice_error_fail(s: &Wtf8, begin: usize, end: usize) -> ! {
    // A reversed range is reported by the assertion, before the boundary message.
    assert!(begin <= end);
    panic!("index {} and/or {} in `{:?}` do not lie on character boundary", begin, end, s);
}
 798
/// Iterator for the code points of a WTF-8 string.
///
/// Created with the method `.code_points()`.
#[derive(Clone)]
pub struct Wtf8CodePoints<'a> {
    // Bytes of the string that have not been decoded yet.
    bytes: slice::Iter<'a, u8>,
}
 806
 807 impl<'a> Iterator for Wtf8CodePoints<'a> {
 808     type Item = CodePoint;
 809
 810     #[inline]
 811     fn next(&mut self) -> Option<CodePoint> {
 812         next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
 813     }
 814
 815     #[inline]
 816     fn size_hint(&self) -> (usize, Option<usize>) {
 817         let len = self.bytes.len();
 818         (len.saturating_add(3) / 4, Some(len))
 819     }
 820 }
 821
/// Generates a wide character sequence for potentially ill-formed UTF-16.
///
/// Created by `Wtf8::encode_wide`.
#[stable(feature = "rust1", since = "1.0.0")]
#[derive(Clone)]
pub struct EncodeWide<'a> {
    // Code points that still have to be encoded.
    code_points: Wtf8CodePoints<'a>,
    // Second code unit of a code point that took two units, kept until the
    // next call to `next`; 0 means that nothing is pending.
    extra: u16,
}
 829
 830 // Copied from libunicode/u_str.rs
 831 #[stable(feature = "rust1", since = "1.0.0")]
 832 impl<'a> Iterator for EncodeWide<'a> {
 833     type Item = u16;
 834
 835     #[inline]
 836     fn next(&mut self) -> Option<u16> {
 837         if self.extra != 0 {
 838             let tmp = self.extra;
 839             self.extra = 0;
 840             return Some(tmp);
 841         }
 842
 843         let mut buf = [0; 2];
 844         self.code_points.next().map(|code_point| {
 845             let n = char::encode_utf16_raw(code_point.value, &mut buf).len();
 846             if n == 2 {
 847                 self.extra = buf[1];
 848             }
 849             buf[0]
 850         })
 851     }
 852
 853     #[inline]
 854     fn size_hint(&self) -> (usize, Option<usize>) {
 855         let (low, high) = self.code_points.size_hint();
 856         let ext = (self.extra != 0) as usize;
 857         // every code point gets either one u16 or two u16,
 858         // so this iterator is between 1 or 2 times as
 859         // long as the underlying iterator.
 860         (low + ext, high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(ext)))
 861     }
 862 }
 863
 864 impl Hash for CodePoint {
 865     #[inline]
 866     fn hash<H: Hasher>(&self, state: &mut H) {
 867         self.value.hash(state)
 868     }
 869 }
 870
 871 impl Hash for Wtf8Buf {
 872     #[inline]
 873     fn hash<H: Hasher>(&self, state: &mut H) {
 874         state.write(&self.bytes);
 875         0xfeu8.hash(state)
 876     }
 877 }
 878
 879 impl Hash for Wtf8 {
 880     #[inline]
 881     fn hash<H: Hasher>(&self, state: &mut H) {
 882         state.write(&self.bytes);
 883         0xfeu8.hash(state)
 884     }
 885 }