1 // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
13 //! This library uses Rust’s type system to maintain
14 //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
15 //! like the `String` and `&str` types do for UTF-8.
17 //! Since [WTF-8 must not be used
18 //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
19 //! this library deliberately does not provide access to the underlying bytes
21 //! nor can it decode WTF-8 from arbitrary bytes.
22 //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
26 use core::char::{encode_utf8_raw, encode_utf16_raw};
27 use core::str::{char_range_at_raw, next_code_point};
28 use core::raw::Slice as RawSlice;
34 use hash::{Hash, Writer, Hasher};
35 use iter::FromIterator;
41 use string::{String, CowString};
42 use sys_common::AsInner;
43 use unicode::str::{Utf16Item, utf16_items};
46 static UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
48 /// A Unicode code point: from U+0000 to U+10FFFF.
50 /// Compare with the `char` type,
51 /// which represents a Unicode scalar value:
52 /// a code point that is not a surrogate (U+D800 to U+DFFF).
53 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
54 pub struct CodePoint {
58 /// Format the code point as `U+` followed by four to six hexadecimal digits.
59 /// Example: `U+1F4A9`
60 impl fmt::Debug for CodePoint {
62 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
63 write!(formatter, "U+{:04X}", self.value)
68 /// Unsafely create a new `CodePoint` without checking the value.
70 /// Only use when `value` is known to be less than or equal to 0x10FFFF.
72 pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
73 CodePoint { value: value }
76 /// Create a new `CodePoint` if the value is a valid code point.
78 /// Return `None` if `value` is above 0x10FFFF.
80 pub fn from_u32(value: u32) -> Option<CodePoint> {
82 0 ... 0x10FFFF => Some(CodePoint { value: value }),
87 /// Create a new `CodePoint` from a `char`.
89 /// Since all Unicode scalar values are code points, this always succeeds.
91 pub fn from_char(value: char) -> CodePoint {
92 CodePoint { value: value as u32 }
95 /// Return the numeric value of the code point.
97 pub fn to_u32(&self) -> u32 {
101 /// Optionally return a Unicode scalar value for the code point.
103 /// Return `None` if the code point is a surrogate (from U+D800 to U+DFFF).
105 pub fn to_char(&self) -> Option<char> {
107 0xD800 ... 0xDFFF => None,
108 _ => Some(unsafe { mem::transmute(self.value) })
112 /// Return a Unicode scalar value for the code point.
114 /// Return `'\u{FFFD}'` (the replacement character “�”)
115 /// if the code point is a surrogate (from U+D800 to U+DFFF).
117 pub fn to_char_lossy(&self) -> char {
118 self.to_char().unwrap_or('\u{FFFD}')
122 /// An owned, growable string of well-formed WTF-8 data.
124 /// Similar to `String`, but can additionally contain surrogate code points
125 /// if they’re not in a surrogate pair.
126 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
131 impl ops::Deref for Wtf8Buf {
134 fn deref(&self) -> &Wtf8 {
139 /// Format the string with double quotes,
140 /// and surrogates as `\u{` followed by four hexadecimal digits and `}`.
141 /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
142 impl fmt::Debug for Wtf8Buf {
144 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
145 fmt::Debug::fmt(&**self, formatter)
150 /// Create a new, empty WTF-8 string.
152 pub fn new() -> Wtf8Buf {
153 Wtf8Buf { bytes: Vec::new() }
156 /// Create a new, empty WTF-8 string with pre-allocated capacity for `n` bytes.
158 pub fn with_capacity(n: uint) -> Wtf8Buf {
159 Wtf8Buf { bytes: Vec::with_capacity(n) }
162 /// Create a WTF-8 string from a UTF-8 `String`.
164 /// This takes ownership of the `String` and does not copy.
166 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
168 pub fn from_string(string: String) -> Wtf8Buf {
169 Wtf8Buf { bytes: string.into_bytes() }
172 /// Create a WTF-8 string from a UTF-8 `&str` slice.
174 /// This copies the content of the slice.
176 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
178 pub fn from_str(str: &str) -> Wtf8Buf {
179 Wtf8Buf { bytes: slice::SliceExt::to_vec(str.as_bytes()) }
182 /// Create a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
184 /// This is lossless: calling `.encode_wide()` on the resulting string
185 /// will always return the original code units.
186 pub fn from_wide(v: &[u16]) -> Wtf8Buf {
187 let mut string = Wtf8Buf::with_capacity(v.len());
188 for item in utf16_items(v) {
190 Utf16Item::ScalarValue(c) => string.push_char(c),
191 Utf16Item::LoneSurrogate(s) => {
192 // Surrogates are known to be in the code point range.
193 let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
194 // Skip the WTF-8 concatenation check,
195 // surrogate pairs are already decoded by utf16_items
196 string.push_code_point_unchecked(code_point)
203 /// Copied from String::push
204 /// This does **not** include the WTF-8 concatenation check.
205 fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
206 let cur_len = self.len();
207 // This may use up to 4 bytes.
211 // Attempt to not use an intermediate buffer by just pushing bytes
212 // directly onto this string.
213 let slice = RawSlice {
214 data: self.bytes.as_ptr().offset(cur_len as int),
217 let used = encode_utf8_raw(code_point.value, mem::transmute(slice))
219 self.bytes.set_len(cur_len + used);
224 pub fn as_slice(&self) -> &Wtf8 {
225 unsafe { mem::transmute(&*self.bytes) }
228 /// Reserves capacity for at least `additional` more bytes to be inserted
229 /// in the given `Wtf8Buf`.
230 /// The collection may reserve more space to avoid frequent reallocations.
234 /// Panics if the new capacity overflows `uint`.
236 pub fn reserve(&mut self, additional: uint) {
237 self.bytes.reserve(additional)
240 /// Returns the number of bytes that this string buffer can hold without reallocating.
242 pub fn capacity(&self) -> uint {
243 self.bytes.capacity()
246 /// Append a UTF-8 slice at the end of the string.
248 pub fn push_str(&mut self, other: &str) {
249 self.bytes.push_all(other.as_bytes())
252 /// Append a WTF-8 slice at the end of the string.
254 /// This replaces newly paired surrogates at the boundary
255 /// with a supplementary code point,
256 /// like concatenating ill-formed UTF-16 strings effectively would.
258 pub fn push_wtf8(&mut self, other: &Wtf8) {
259 match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
260 // Replace newly paired surrogates by a supplementary code point.
261 (Some(lead), Some(trail)) => {
262 let len_without_lead_surrogate = self.len() - 3;
263 self.bytes.truncate(len_without_lead_surrogate);
264 let other_without_trail_surrogate = &other.bytes[3..];
265 // 4 bytes for the supplementary code point
266 self.bytes.reserve(4 + other_without_trail_surrogate.len());
267 self.push_char(decode_surrogate_pair(lead, trail));
268 self.bytes.push_all(other_without_trail_surrogate);
270 _ => self.bytes.push_all(&other.bytes)
274 /// Append a Unicode scalar value at the end of the string.
276 pub fn push_char(&mut self, c: char) {
277 self.push_code_point_unchecked(CodePoint::from_char(c))
280 /// Append a code point at the end of the string.
282 /// This replaces newly paired surrogates at the boundary
283 /// with a supplementary code point,
284 /// like concatenating ill-formed UTF-16 strings effectively would.
286 pub fn push(&mut self, code_point: CodePoint) {
287 match code_point.to_u32() {
288 trail @ 0xDC00...0xDFFF => {
289 match (&*self).final_lead_surrogate() {
291 let len_without_lead_surrogate = self.len() - 3;
292 self.bytes.truncate(len_without_lead_surrogate);
293 self.push_char(decode_surrogate_pair(lead, trail as u16));
302 // No newly paired surrogates at the boundary.
303 self.push_code_point_unchecked(code_point)
306 /// Shortens a string to the specified length.
310 /// Panics if `new_len` > current length,
311 /// or if `new_len` is not a code point boundary.
313 pub fn truncate(&mut self, new_len: uint) {
314 assert!(is_code_point_boundary(self, new_len));
315 self.bytes.truncate(new_len)
318 /// Consume the WTF-8 string and try to convert it to UTF-8.
320 /// This does not copy the data.
322 /// If the contents are not well-formed UTF-8
323 /// (that is, if the string contains surrogates),
324 /// the original WTF-8 string is returned instead.
325 pub fn into_string(self) -> Result<String, Wtf8Buf> {
326 match self.next_surrogate(0) {
327 None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
328 Some(_) => Err(self),
332 /// Consume the WTF-8 string and convert it lossily to UTF-8.
334 /// This does not copy the data (but may overwrite parts of it in place).
336 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
337 pub fn into_string_lossy(mut self) -> String {
340 match self.next_surrogate(pos) {
341 Some((surrogate_pos, _)) => {
342 pos = surrogate_pos + 3;
343 slice::bytes::copy_memory(
344 &mut self.bytes[surrogate_pos .. pos],
345 UTF8_REPLACEMENT_CHARACTER
348 None => return unsafe { String::from_utf8_unchecked(self.bytes) }
354 /// Create a new WTF-8 string from an iterator of code points.
356 /// This replaces surrogate code point pairs with supplementary code points,
357 /// like concatenating ill-formed UTF-16 strings effectively would.
358 impl FromIterator<CodePoint> for Wtf8Buf {
359 fn from_iter<T: Iterator<Item=CodePoint>>(iterator: T) -> Wtf8Buf {
360 let mut string = Wtf8Buf::new();
361 string.extend(iterator);
366 /// Append code points from an iterator to the string.
368 /// This replaces surrogate code point pairs with supplementary code points,
369 /// like concatenating ill-formed UTF-16 strings effectively would.
370 impl Extend<CodePoint> for Wtf8Buf {
371 fn extend<T: Iterator<Item=CodePoint>>(&mut self, iterator: T) {
372 let (low, _high) = iterator.size_hint();
373 // Lower bound of one byte per code point (ASCII only)
374 self.bytes.reserve(low);
375 for code_point in iterator {
376 self.push(code_point);
381 /// A borrowed slice of well-formed WTF-8 data.
383 /// Similar to `&str`, but can additionally contain surrogate code points
384 /// if they’re not in a surrogate pair.
389 impl AsInner<[u8]> for Wtf8 {
390 fn as_inner(&self) -> &[u8] { &self.bytes }
393 // FIXME: https://github.com/rust-lang/rust/issues/18805
394 impl PartialEq for Wtf8 {
395 fn eq(&self, other: &Wtf8) -> bool { self.bytes.eq(&other.bytes) }
398 // FIXME: https://github.com/rust-lang/rust/issues/18805
401 // FIXME: https://github.com/rust-lang/rust/issues/18738
402 impl PartialOrd for Wtf8 {
404 fn partial_cmp(&self, other: &Wtf8) -> Option<cmp::Ordering> {
405 self.bytes.partial_cmp(&other.bytes)
408 fn lt(&self, other: &Wtf8) -> bool { self.bytes.lt(&other.bytes) }
410 fn le(&self, other: &Wtf8) -> bool { self.bytes.le(&other.bytes) }
412 fn gt(&self, other: &Wtf8) -> bool { self.bytes.gt(&other.bytes) }
414 fn ge(&self, other: &Wtf8) -> bool { self.bytes.ge(&other.bytes) }
417 // FIXME: https://github.com/rust-lang/rust/issues/18738
420 fn cmp(&self, other: &Wtf8) -> cmp::Ordering { self.bytes.cmp(&other.bytes) }
423 /// Format the slice with double quotes,
424 /// and surrogates as `\u{` followed by four hexadecimal digits and `}`.
425 /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
426 impl fmt::Debug for Wtf8 {
427 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
428 try!(formatter.write_str("\""));
431 match self.next_surrogate(pos) {
433 Some((surrogate_pos, surrogate)) => {
434 try!(formatter.write_str(unsafe {
435 // the data in this slice is valid UTF-8, transmute to &str
436 mem::transmute(&self.bytes[pos .. surrogate_pos])
438 try!(write!(formatter, "\\u{{{:X}}}", surrogate));
439 pos = surrogate_pos + 3;
443 try!(formatter.write_str(unsafe {
444 // the data in this slice is valid UTF-8, transmute to &str
445 mem::transmute(&self.bytes[pos..])
447 formatter.write_str("\"")
452 /// Create a WTF-8 slice from a UTF-8 `&str` slice.
454 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
456 pub fn from_str(value: &str) -> &Wtf8 {
457 unsafe { mem::transmute(value.as_bytes()) }
460 /// Return the length, in WTF-8 bytes.
462 pub fn len(&self) -> uint {
466 /// Return the code point at `position` if it is in the ASCII range,
467 /// or `b'\xFF'` otherwise.
471 /// Panics if `position` is beyond the end of the string.
473 pub fn ascii_byte_at(&self, position: uint) -> u8 {
474 match self.bytes[position] {
475 ascii_byte @ 0x00 ... 0x7F => ascii_byte,
480 /// Return the code point at `position`.
484 /// Panics if `position` is not at a code point boundary,
485 /// or is beyond the end of the string.
487 pub fn code_point_at(&self, position: uint) -> CodePoint {
488 let (code_point, _) = self.code_point_range_at(position);
492 /// Return the code point at `position`
493 /// and the position of the next code point.
497 /// Panics if `position` is not at a code point boundary,
498 /// or is beyond the end of the string.
500 pub fn code_point_range_at(&self, position: uint) -> (CodePoint, uint) {
501 let (c, n) = char_range_at_raw(&self.bytes, position);
502 (CodePoint { value: c }, n)
505 /// Return an iterator for the string’s code points.
507 pub fn code_points(&self) -> Wtf8CodePoints {
508 Wtf8CodePoints { bytes: self.bytes.iter() }
511 /// Try to convert the string to UTF-8 and return a `&str` slice.
513 /// Return `None` if the string contains surrogates.
515 /// This does not copy the data.
517 pub fn as_str(&self) -> Option<&str> {
518 // Well-formed WTF-8 is also well-formed UTF-8
519 // if and only if it contains no surrogate.
520 match self.next_surrogate(0) {
521 None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
526 /// Lossily convert the string to UTF-8.
527 /// Return a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
529 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
531 /// This only copies the data if necessary (if it contains any surrogate).
532 pub fn to_string_lossy(&self) -> CowString {
533 let surrogate_pos = match self.next_surrogate(0) {
534 None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
535 Some((pos, _)) => pos,
537 let wtf8_bytes = &self.bytes;
538 let mut utf8_bytes = Vec::with_capacity(self.len());
539 utf8_bytes.push_all(&wtf8_bytes[..surrogate_pos]);
540 utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER);
541 let mut pos = surrogate_pos + 3;
543 match self.next_surrogate(pos) {
544 Some((surrogate_pos, _)) => {
545 utf8_bytes.push_all(&wtf8_bytes[pos .. surrogate_pos]);
546 utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER);
547 pos = surrogate_pos + 3;
550 utf8_bytes.push_all(&wtf8_bytes[pos..]);
551 return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) })
557 /// Convert the WTF-8 string to potentially ill-formed UTF-16
558 /// and return an iterator of 16-bit code units.
560 /// This is lossless:
561 /// calling `Wtf8Buf::from_wide` on the resulting code units
562 /// would always return the original WTF-8 string.
564 pub fn encode_wide(&self) -> EncodeWide {
565 EncodeWide { code_points: self.code_points(), extra: 0 }
569 fn next_surrogate(&self, mut pos: uint) -> Option<(uint, u16)> {
570 let mut iter = self.bytes[pos..].iter();
572 let b = match iter.next() {
581 } else if b == 0xED {
582 match (iter.next(), iter.next()) {
583 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
584 return Some((pos, decode_surrogate(b2, b3)))
602 fn final_lead_surrogate(&self) -> Option<u16> {
603 let len = self.len();
607 match &self.bytes[(len - 3)..] {
608 [0xED, b2 @ 0xA0...0xAF, b3] => Some(decode_surrogate(b2, b3)),
614 fn initial_trail_surrogate(&self) -> Option<u16> {
615 let len = self.len();
619 match &self.bytes[..3] {
620 [0xED, b2 @ 0xB0...0xBF, b3] => Some(decode_surrogate(b2, b3)),
627 /// Return a slice of the given string for the byte range [`begin`..`end`).
631 /// Panics when `begin` and `end` do not point to code point boundaries,
632 /// or point beyond the end of the string.
633 impl ops::Index<ops::Range<usize>> for Wtf8 {
637 fn index(&self, range: &ops::Range<usize>) -> &Wtf8 {
638 // is_code_point_boundary checks that the index is in [0, .len()]
639 if range.start <= range.end &&
640 is_code_point_boundary(self, range.start) &&
641 is_code_point_boundary(self, range.end) {
642 unsafe { slice_unchecked(self, range.start, range.end) }
644 slice_error_fail(self, range.start, range.end)
649 /// Return a slice of the given string from byte `begin` to its end.
653 /// Panics when `begin` is not at a code point boundary,
654 /// or is beyond the end of the string.
655 impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
659 fn index(&self, range: &ops::RangeFrom<usize>) -> &Wtf8 {
660 // is_code_point_boundary checks that the index is in [0, .len()]
661 if is_code_point_boundary(self, range.start) {
662 unsafe { slice_unchecked(self, range.start, self.len()) }
664 slice_error_fail(self, range.start, self.len())
669 /// Return a slice of the given string from its beginning to byte `end`.
673 /// Panics when `end` is not at a code point boundary,
674 /// or is beyond the end of the string.
675 impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
679 fn index(&self, range: &ops::RangeTo<usize>) -> &Wtf8 {
680 // is_code_point_boundary checks that the index is in [0, .len()]
681 if is_code_point_boundary(self, range.end) {
682 unsafe { slice_unchecked(self, 0, range.end) }
684 slice_error_fail(self, 0, range.end)
689 impl ops::Index<ops::RangeFull> for Wtf8 {
693 fn index(&self, _range: &ops::RangeFull) -> &Wtf8 {
/// Rebuild a 16-bit surrogate from the last two bytes of its three-byte
/// encoding.
///
/// The first byte is not passed in: it is assumed to be 0xED.
/// Each of the two remaining bytes contributes its low six bits.
fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
    let upper_six = ((second_byte & 0x3F) as u16) << 6;
    let lower_six = (third_byte & 0x3F) as u16;
    0xD800 | upper_six | lower_six
}
/// Combine a lead and a trail surrogate into the supplementary code point
/// they encode as a UTF-16 pair.
///
/// `lead` is expected to be at least 0xD800 and `trail` at least 0xDC00:
/// those offsets are subtracted without any check.
fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
    // Ten payload bits come from each surrogate.
    let high_ten = ((lead - 0xD800) as u32) << 10;
    let low_ten = (trail - 0xDC00) as u32;
    let scalar = 0x10000 + (high_ten | low_ten);
    // The result is reinterpreted as a `char` without validation.
    unsafe { mem::transmute(scalar) }
}
710 /// Copied from core::str::StrPrelude::is_char_boundary
712 pub fn is_code_point_boundary(slice: &Wtf8, index: uint) -> bool {
713 if index == slice.len() { return true; }
714 match slice.bytes.get(index) {
716 Some(&b) => b < 128u8 || b >= 192u8,
720 /// Copied from core::str::raw::slice_unchecked
722 pub unsafe fn slice_unchecked(s: &Wtf8, begin: uint, end: uint) -> &Wtf8 {
723 mem::transmute(RawSlice {
724 data: s.bytes.as_ptr().offset(begin as int),
729 /// Copied from core::str::raw::slice_error_fail
731 pub fn slice_error_fail(s: &Wtf8, begin: uint, end: uint) -> ! {
732 assert!(begin <= end);
733 panic!("index {} and/or {} in `{:?}` do not lie on character boundary",
737 /// Iterator for the code points of a WTF-8 string.
739 /// Created with the method `.code_points()`.
741 pub struct Wtf8CodePoints<'a> {
742 bytes: slice::Iter<'a, u8>
745 impl<'a> Iterator for Wtf8CodePoints<'a> {
746 type Item = CodePoint;
749 fn next(&mut self) -> Option<CodePoint> {
750 next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
754 fn size_hint(&self) -> (uint, Option<uint>) {
755 let (len, _) = self.bytes.size_hint();
756 (len.saturating_add(3) / 4, Some(len))
761 pub struct EncodeWide<'a> {
762 code_points: Wtf8CodePoints<'a>,
766 // Copied from libunicode/u_str.rs
767 impl<'a> Iterator for EncodeWide<'a> {
771 fn next(&mut self) -> Option<u16> {
773 let tmp = self.extra;
778 let mut buf = [0u16; 2];
779 self.code_points.next().map(|code_point| {
780 let n = encode_utf16_raw(code_point.value, &mut buf)
782 if n == 2 { self.extra = buf[1]; }
788 fn size_hint(&self) -> (uint, Option<uint>) {
789 let (low, high) = self.code_points.size_hint();
790 // every code point gets either one u16 or two u16,
791 // so this iterator is between 1 or 2 times as
792 // long as the underlying iterator.
793 (low, high.and_then(|n| n.checked_mul(2)))
797 impl<S: Writer + Hasher> Hash<S> for CodePoint {
799 fn hash(&self, state: &mut S) {
800 self.value.hash(state)
804 impl<S: Writer + Hasher> Hash<S> for Wtf8Buf {
806 fn hash(&self, state: &mut S) {
807 state.write(&self.bytes);
812 impl<'a, S: Writer + Hasher> Hash<S> for Wtf8 {
814 fn hash(&self, state: &mut S) {
815 state.write(&self.bytes);
820 impl AsciiExt for Wtf8 {
821 type Owned = Wtf8Buf;
823 fn is_ascii(&self) -> bool {
824 self.bytes.is_ascii()
826 fn to_ascii_uppercase(&self) -> Wtf8Buf {
827 Wtf8Buf { bytes: self.bytes.to_ascii_uppercase() }
829 fn to_ascii_lowercase(&self) -> Wtf8Buf {
830 Wtf8Buf { bytes: self.bytes.to_ascii_lowercase() }
832 fn eq_ignore_ascii_case(&self, other: &Wtf8) -> bool {
833 self.bytes.eq_ignore_ascii_case(&other.bytes)
836 fn make_ascii_uppercase(&mut self) { self.bytes.make_ascii_uppercase() }
837 fn make_ascii_lowercase(&mut self) { self.bytes.make_ascii_lowercase() }
846 use string::CowString;
849 fn code_point_from_u32() {
850 assert!(CodePoint::from_u32(0).is_some());
851 assert!(CodePoint::from_u32(0xD800).is_some());
852 assert!(CodePoint::from_u32(0x10FFFF).is_some());
853 assert!(CodePoint::from_u32(0x110000).is_none());
857 fn code_point_to_u32() {
858 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
859 assert_eq!(c(0).to_u32(), 0);
860 assert_eq!(c(0xD800).to_u32(), 0xD800);
861 assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
865 fn code_point_from_char() {
866 assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
867 assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9);
871 fn code_point_to_string() {
872 assert_eq!(format!("{:?}", CodePoint::from_char('a')), "U+0061");
873 assert_eq!(format!("{:?}", CodePoint::from_char('💩')), "U+1F4A9");
877 fn code_point_to_char() {
878 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
879 assert_eq!(c(0x61).to_char(), Some('a'));
880 assert_eq!(c(0x1F4A9).to_char(), Some('💩'));
881 assert_eq!(c(0xD800).to_char(), None);
885 fn code_point_to_char_lossy() {
886 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
887 assert_eq!(c(0x61).to_char_lossy(), 'a');
888 assert_eq!(c(0x1F4A9).to_char_lossy(), '💩');
889 assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}');
894 assert_eq!(Wtf8Buf::new().bytes, b"");
898 fn wtf8buf_from_str() {
899 assert_eq!(Wtf8Buf::from_str("").bytes, b"");
900 assert_eq!(Wtf8Buf::from_str("aé 💩").bytes,
901 b"a\xC3\xA9 \xF0\x9F\x92\xA9");
905 fn wtf8buf_from_string() {
906 assert_eq!(Wtf8Buf::from_string(String::from_str("")).bytes, b"");
907 assert_eq!(Wtf8Buf::from_string(String::from_str("aé 💩")).bytes,
908 b"a\xC3\xA9 \xF0\x9F\x92\xA9");
912 fn wtf8buf_from_wide() {
913 assert_eq!(Wtf8Buf::from_wide(&[]).bytes, b"");
914 assert_eq!(Wtf8Buf::from_wide(
915 &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes,
916 b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
920 fn wtf8buf_push_str() {
921 let mut string = Wtf8Buf::new();
922 assert_eq!(string.bytes, b"");
923 string.push_str("aé 💩");
924 assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
928 fn wtf8buf_push_char() {
929 let mut string = Wtf8Buf::from_str("aé ");
930 assert_eq!(string.bytes, b"a\xC3\xA9 ");
931 string.push_char('💩');
932 assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
937 let mut string = Wtf8Buf::from_str("aé ");
938 assert_eq!(string.bytes, b"a\xC3\xA9 ");
939 string.push(CodePoint::from_char('💩'));
940 assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
942 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
944 let mut string = Wtf8Buf::new();
945 string.push(c(0xD83D)); // lead
946 string.push(c(0xDCA9)); // trail
947 assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
949 let mut string = Wtf8Buf::new();
950 string.push(c(0xD83D)); // lead
951 string.push(c(0x20)); // not surrogate
952 string.push(c(0xDCA9)); // trail
953 assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
955 let mut string = Wtf8Buf::new();
956 string.push(c(0xD800)); // lead
957 string.push(c(0xDBFF)); // lead
958 assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
960 let mut string = Wtf8Buf::new();
961 string.push(c(0xD800)); // lead
962 string.push(c(0xE000)); // not surrogate
963 assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
965 let mut string = Wtf8Buf::new();
966 string.push(c(0xD7FF)); // not surrogate
967 string.push(c(0xDC00)); // trail
968 assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
970 let mut string = Wtf8Buf::new();
971 string.push(c(0x61)); // not surrogate, < 3 bytes
972 string.push(c(0xDC00)); // trail
973 assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
975 let mut string = Wtf8Buf::new();
976 string.push(c(0xDC00)); // trail
977 assert_eq!(string.bytes, b"\xED\xB0\x80");
981 fn wtf8buf_push_wtf8() {
982 let mut string = Wtf8Buf::from_str("aé");
983 assert_eq!(string.bytes, b"a\xC3\xA9");
984 string.push_wtf8(Wtf8::from_str(" 💩"));
985 assert_eq!(string.bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
987 fn w(value: &[u8]) -> &Wtf8 { unsafe { transmute(value) } }
989 let mut string = Wtf8Buf::new();
990 string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
991 string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
992 assert_eq!(string.bytes, b"\xF0\x9F\x92\xA9"); // Magic!
994 let mut string = Wtf8Buf::new();
995 string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
996 string.push_wtf8(w(b" ")); // not surrogate
997 string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
998 assert_eq!(string.bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1000 let mut string = Wtf8Buf::new();
1001 string.push_wtf8(w(b"\xED\xA0\x80")); // lead
1002 string.push_wtf8(w(b"\xED\xAF\xBF")); // lead
1003 assert_eq!(string.bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1005 let mut string = Wtf8Buf::new();
1006 string.push_wtf8(w(b"\xED\xA0\x80")); // lead
1007 string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate
1008 assert_eq!(string.bytes, b"\xED\xA0\x80\xEE\x80\x80");
1010 let mut string = Wtf8Buf::new();
1011 string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate
1012 string.push_wtf8(w(b"\xED\xB0\x80")); // trail
1013 assert_eq!(string.bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1015 let mut string = Wtf8Buf::new();
1016 string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes
1017 string.push_wtf8(w(b"\xED\xB0\x80")); // trail
1018 assert_eq!(string.bytes, b"\x61\xED\xB0\x80");
1020 let mut string = Wtf8Buf::new();
1021 string.push_wtf8(w(b"\xED\xB0\x80")); // trail
1022 assert_eq!(string.bytes, b"\xED\xB0\x80");
1026 fn wtf8buf_truncate() {
1027 let mut string = Wtf8Buf::from_str("aé");
1029 assert_eq!(string.bytes, b"a");
1034 fn wtf8buf_truncate_fail_code_point_boundary() {
1035 let mut string = Wtf8Buf::from_str("aé");
1041 fn wtf8buf_truncate_fail_longer() {
1042 let mut string = Wtf8Buf::from_str("aé");
1047 fn wtf8buf_into_string() {
1048 let mut string = Wtf8Buf::from_str("aé 💩");
1049 assert_eq!(string.clone().into_string(), Ok(String::from_str("aé 💩")));
1050 string.push(CodePoint::from_u32(0xD800).unwrap());
1051 assert_eq!(string.clone().into_string(), Err(string));
1055 fn wtf8buf_into_string_lossy() {
1056 let mut string = Wtf8Buf::from_str("aé 💩");
1057 assert_eq!(string.clone().into_string_lossy(), String::from_str("aé 💩"));
1058 string.push(CodePoint::from_u32(0xD800).unwrap());
1059 assert_eq!(string.clone().into_string_lossy(), String::from_str("aé 💩�"));
1063 fn wtf8buf_from_iterator() {
1064 fn f(values: &[u32]) -> Wtf8Buf {
1065 values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
1067 assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1069 assert_eq!(f(&[0xD83D, 0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
1070 assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1071 assert_eq!(f(&[0xD800, 0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1072 assert_eq!(f(&[0xD800, 0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
1073 assert_eq!(f(&[0xD7FF, 0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1074 assert_eq!(f(&[0x61, 0xDC00]).bytes, b"\x61\xED\xB0\x80");
1075 assert_eq!(f(&[0xDC00]).bytes, b"\xED\xB0\x80");
1079 fn wtf8buf_extend() {
1080 fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
1081 fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() }
1082 let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
1083 string.extend(extended.iter().map(c));
1087 assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes,
1088 b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1090 assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes, b"\xF0\x9F\x92\xA9"); // Magic!
1091 assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes, b"\xED\xA0\xBD \xED\xB2\xA9");
1092 assert_eq!(e(&[0xD800], &[0xDBFF]).bytes, b"\xED\xA0\x80\xED\xAF\xBF");
1093 assert_eq!(e(&[0xD800], &[0xE000]).bytes, b"\xED\xA0\x80\xEE\x80\x80");
1094 assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes, b"\xED\x9F\xBF\xED\xB0\x80");
1095 assert_eq!(e(&[0x61], &[0xDC00]).bytes, b"\x61\xED\xB0\x80");
1096 assert_eq!(e(&[], &[0xDC00]).bytes, b"\xED\xB0\x80");
1101 let mut string = Wtf8Buf::from_str("aé 💩");
1102 string.push(CodePoint::from_u32(0xD800).unwrap());
1103 assert_eq!(format!("{:?}", string), r#""aé 💩\u{D800}""#);
1107 fn wtf8buf_as_slice() {
1108 assert_eq!(Wtf8Buf::from_str("aé").as_slice(), Wtf8::from_str("aé"));
1113 let mut string = Wtf8Buf::from_str("aé 💩");
1114 string.push(CodePoint::from_u32(0xD800).unwrap());
1115 assert_eq!(format!("{:?}", string), r#""aé 💩\u{D800}""#);
1119 fn wtf8_from_str() {
1120 assert_eq!(&Wtf8::from_str("").bytes, b"");
1121 assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1126 assert_eq!(Wtf8::from_str("").len(), 0);
1127 assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
1132 assert_eq!(&Wtf8::from_str("aé 💩")[1.. 4].bytes, b"\xC3\xA9 ");
1137 fn wtf8_slice_not_code_point_boundary() {
1138 &Wtf8::from_str("aé 💩")[2.. 4];
1142 fn wtf8_slice_from() {
1143 assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
1148 fn wtf8_slice_from_not_code_point_boundary() {
1149 &Wtf8::from_str("aé 💩")[2..];
1153 fn wtf8_slice_to() {
1154 assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
1159 fn wtf8_slice_to_not_code_point_boundary() {
1160 &Wtf8::from_str("aé 💩")[5..];
1164 fn wtf8_ascii_byte_at() {
1165 let slice = Wtf8::from_str("aé 💩");
1166 assert_eq!(slice.ascii_byte_at(0), b'a');
1167 assert_eq!(slice.ascii_byte_at(1), b'\xFF');
1168 assert_eq!(slice.ascii_byte_at(2), b'\xFF');
1169 assert_eq!(slice.ascii_byte_at(3), b' ');
1170 assert_eq!(slice.ascii_byte_at(4), b'\xFF');
1174 fn wtf8_code_point_at() {
1175 let mut string = Wtf8Buf::from_str("aé ");
1176 string.push(CodePoint::from_u32(0xD83D).unwrap());
1177 string.push_char('💩');
1178 assert_eq!(string.code_point_at(0), CodePoint::from_char('a'));
1179 assert_eq!(string.code_point_at(1), CodePoint::from_char('é'));
1180 assert_eq!(string.code_point_at(3), CodePoint::from_char(' '));
1181 assert_eq!(string.code_point_at(4), CodePoint::from_u32(0xD83D).unwrap());
1182 assert_eq!(string.code_point_at(7), CodePoint::from_char('💩'));
1186 fn wtf8_code_point_range_at() {
1187 let mut string = Wtf8Buf::from_str("aé ");
1188 string.push(CodePoint::from_u32(0xD83D).unwrap());
1189 string.push_char('💩');
1190 assert_eq!(string.code_point_range_at(0), (CodePoint::from_char('a'), 1));
1191 assert_eq!(string.code_point_range_at(1), (CodePoint::from_char('é'), 3));
1192 assert_eq!(string.code_point_range_at(3), (CodePoint::from_char(' '), 4));
1193 assert_eq!(string.code_point_range_at(4), (CodePoint::from_u32(0xD83D).unwrap(), 7));
1194 assert_eq!(string.code_point_range_at(7), (CodePoint::from_char('💩'), 11));
1198 fn wtf8_code_points() {
1199 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
1200 fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
1201 string.code_points().map(|c| c.to_char()).collect::<Vec<_>>()
1203 let mut string = Wtf8Buf::from_str("é ");
1204 assert_eq!(cp(&string), vec![Some('é'), Some(' ')]);
1205 string.push(c(0xD83D));
1206 assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]);
1207 string.push(c(0xDCA9));
1208 assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]);
1213 assert_eq!(Wtf8::from_str("").as_str(), Some(""));
1214 assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
1215 let mut string = Wtf8Buf::new();
1216 string.push(CodePoint::from_u32(0xD800).unwrap());
1217 assert_eq!(string.as_str(), None);
1221 fn wtf8_to_string_lossy() {
1222 assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
1223 assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
1224 let mut string = Wtf8Buf::from_str("aé 💩");
1225 string.push(CodePoint::from_u32(0xD800).unwrap());
1226 let expected: CowString = Cow::Owned(String::from_str("aé 💩�"));
1227 assert_eq!(string.to_string_lossy(), expected);
1231 fn wtf8_encode_wide() {
1232 let mut string = Wtf8Buf::from_str("aé ");
1233 string.push(CodePoint::from_u32(0xD83D).unwrap());
1234 string.push_char('💩');
1235 assert_eq!(string.encode_wide().collect::<Vec<_>>(),
1236 vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);