1 // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Implementation of [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
13 //! This library uses Rust’s type system to maintain
14 //! [well-formedness](https://simonsapin.github.io/wtf-8/#well-formed),
15 //! like the `String` and `&str` types do for UTF-8.
17 //! Since [WTF-8 must not be used
18 //! for interchange](https://simonsapin.github.io/wtf-8/#intended-audience),
19 //! this library deliberately does not provide access to the underlying bytes
21 //! nor can it decode WTF-8 from arbitrary bytes.
22 //! WTF-8 strings can be obtained from UTF-8, UTF-16, or code points.
26 use core::char::{encode_utf8_raw, encode_utf16_raw};
27 use core::str::{char_range_at_raw, next_code_point};
28 use core::raw::Slice as RawSlice;
33 use hash::{Hash, Writer, Hasher};
34 use iter::FromIterator;
40 use string::{String, CowString};
41 use unicode::str::{Utf16Item, utf16_items};
44 static UTF8_REPLACEMENT_CHARACTER: &'static [u8] = b"\xEF\xBF\xBD";
46 /// A Unicode code point: from U+0000 to U+10FFFF.
48 /// Compare with the `char` type,
49 /// which represents a Unicode scalar value:
50 /// a code point that is not a surrogate (U+D800 to U+DFFF).
51 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone, Copy)]
52 pub struct CodePoint {
56 /// Format the code point as `U+` followed by four to six hexadecimal digits.
57 /// Example: `U+1F4A9`
58 impl fmt::Debug for CodePoint {
60 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
// `{:04X}` zero-pads to at least four upper-case hexadecimal digits and
// never truncates, so larger values print with five or six digits.
61 write!(formatter, "U+{:04X}", self.value)
66 /// Unsafely create a new `CodePoint` without checking the value.
68 /// Only use when `value` is known to be less than or equal to 0x10FFFF.
70 pub unsafe fn from_u32_unchecked(value: u32) -> CodePoint {
71 CodePoint { value: value }
74 /// Create a new `CodePoint` if the value is a valid code point.
76 /// Return `None` if `value` is above 0x10FFFF.
78 pub fn from_u32(value: u32) -> Option<CodePoint> {
// The accepted range deliberately includes the surrogates
// (U+D800 to U+DFFF): unlike `char`, a `CodePoint` may hold one.
80 0 ... 0x10FFFF => Some(CodePoint { value: value }),
85 /// Create a new `CodePoint` from a `char`.
87 /// Since all Unicode scalar values are code points, this always succeeds.
89 pub fn from_char(value: char) -> CodePoint {
90 CodePoint { value: value as u32 }
93 /// Return the numeric value of the code point.
95 pub fn to_u32(&self) -> u32 {
99 /// Optionally return a Unicode scalar value for the code point.
101 /// Return `None` if the code point is a surrogate (from U+D800 to U+DFFF).
103 pub fn to_char(&self) -> Option<char> {
105 0xD800 ... 0xDFFF => None,
// Surrogates were rejected by the arm above. The transmute additionally
// relies on `self.value` never exceeding 0x10FFFF, which is the invariant
// the constructors document; together these make it a valid `char`.
106 _ => Some(unsafe { mem::transmute(self.value) })
110 /// Return a Unicode scalar value for the code point.
112 /// Return `'\u{FFFD}'` (the replacement character “�”)
113 /// if the code point is a surrogate (from U+D800 to U+DFFF).
115 pub fn to_char_lossy(&self) -> char {
116 self.to_char().unwrap_or('\u{FFFD}')
120 /// An owned, growable string of well-formed WTF-8 data.
122 /// Similar to `String`, but can additionally contain surrogate code points
123 /// if they’re not in a surrogate pair.
124 #[derive(Eq, PartialEq, Ord, PartialOrd, Clone)]
129 impl ops::Deref for Wtf8Buf {
132 fn deref(&self) -> &Wtf8 {
137 /// Format the string with double quotes,
138 /// and surrogates as `\u{` followed by four hexadecimal digits and `}`.
139 /// Example: `"a\u{D800}"` for a string with code points [U+0061, U+D800]
140 impl fmt::Debug for Wtf8Buf {
142 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
143 self.as_slice().fmt(formatter)
148 /// Create a new, empty WTF-8 string.
150 pub fn new() -> Wtf8Buf {
151 Wtf8Buf { bytes: Vec::new() }
154 /// Create a new, empty WTF-8 string with pre-allocated capacity for `n` bytes.
156 pub fn with_capacity(n: uint) -> Wtf8Buf {
157 Wtf8Buf { bytes: Vec::with_capacity(n) }
160 /// Create a WTF-8 string from a UTF-8 `String`.
162 /// This takes ownership of the `String` and does not copy.
164 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
166 pub fn from_string(string: String) -> Wtf8Buf {
167 Wtf8Buf { bytes: string.into_bytes() }
170 /// Create a WTF-8 string from a UTF-8 `&str` slice.
172 /// This copies the content of the slice.
174 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
176 pub fn from_str(str: &str) -> Wtf8Buf {
177 Wtf8Buf { bytes: slice::SliceExt::to_vec(str.as_bytes()) }
180 /// Create a WTF-8 string from a potentially ill-formed UTF-16 slice of 16-bit code units.
182 /// This is lossless: calling `.encode_wide()` on the resulting string
183 /// will always return the original code units.
184 pub fn from_wide(v: &[u16]) -> Wtf8Buf {
// One byte per code unit is a lower bound on the encoded length
// (it is exact only for ASCII-only input).
185 let mut string = Wtf8Buf::with_capacity(v.len());
186 for item in utf16_items(v) {
188 Utf16Item::ScalarValue(c) => string.push_char(c),
189 Utf16Item::LoneSurrogate(s) => {
190 // Surrogates are known to be in the code point range.
191 let code_point = unsafe { CodePoint::from_u32_unchecked(s as u32) };
192 // Skip the WTF-8 concatenation check,
193 // surrogate pairs are already decoded by utf16_items
194 string.push_code_point_unchecked(code_point)
201 /// Copied from String::push
202 /// This does **not** include the WTF-8 concatenation check.
203 fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
204 let cur_len = self.len();
205 // This may use up to 4 bytes.
209 // Attempt to not use an intermediate buffer by just pushing bytes
210 // directly onto this string.
211 let slice = RawSlice {
// Start writing right after the last initialised byte.
// NOTE(review): assumes spare capacity for the encoded bytes was
// reserved before this point (see the 4-byte note above) -- confirm.
212 data: self.bytes.as_ptr().offset(cur_len as int),
// Encode straight into the vector's buffer through the raw slice built above.
215 let used = encode_utf8_raw(code_point.value, mem::transmute(slice))
// Extend the vector's length by the byte count held in `used`.
217 self.bytes.set_len(cur_len + used);
222 pub fn as_slice(&self) -> &Wtf8 {
223 unsafe { mem::transmute(self.bytes.as_slice()) }
226 /// Reserves capacity for at least `additional` more bytes to be inserted
227 /// in the given `Wtf8Buf`.
228 /// The collection may reserve more space to avoid frequent reallocations.
232 /// Panics if the new capacity overflows `uint`.
234 pub fn reserve(&mut self, additional: uint) {
235 self.bytes.reserve(additional)
238 /// Returns the number of bytes that this string buffer can hold without reallocating.
240 pub fn capacity(&self) -> uint {
241 self.bytes.capacity()
244 /// Append a UTF-8 slice at the end of the string.
246 pub fn push_str(&mut self, other: &str) {
247 self.bytes.push_all(other.as_bytes())
250 /// Append a WTF-8 slice at the end of the string.
252 /// This replaces newly paired surrogates at the boundary
253 /// with a supplementary code point,
254 /// like concatenating ill-formed UTF-16 strings effectively would.
256 pub fn push_wtf8(&mut self, other: &Wtf8) {
257 match ((&*self).final_lead_surrogate(), other.initial_trail_surrogate()) {
258 // Replace newly paired surrogates by a supplementary code point.
259 (Some(lead), Some(trail)) => {
// An encoded surrogate is always three bytes long: drop the lead from
// the end of `self` and skip the trail at the start of `other`.
260 let len_without_lead_surrogate = self.len() - 3;
261 self.bytes.truncate(len_without_lead_surrogate);
262 let other_without_trail_surrogate = &other.bytes[3..];
263 // 4 bytes for the supplementary code point
264 self.bytes.reserve(4 + other_without_trail_surrogate.len());
265 self.push_char(decode_surrogate_pair(lead, trail));
266 self.bytes.push_all(other_without_trail_surrogate);
// No lead/trail pair meets at the boundary: plain byte concatenation.
268 _ => self.bytes.push_all(&other.bytes)
272 /// Append a Unicode scalar value at the end of the string.
274 pub fn push_char(&mut self, c: char) {
275 self.push_code_point_unchecked(CodePoint::from_char(c))
278 /// Append a code point at the end of the string.
280 /// This replaces newly paired surrogates at the boundary
281 /// with a supplementary code point,
282 /// like concatenating ill-formed UTF-16 strings effectively would.
284 pub fn push(&mut self, code_point: CodePoint) {
285 match code_point.to_u32() {
// Only a trail surrogate can complete a pair with what is already stored.
286 trail @ 0xDC00...0xDFFF => {
287 match (&*self).final_lead_surrogate() {
// Remove the three-byte lead surrogate, then append the single
// supplementary code point that the pair represents.
289 let len_without_lead_surrogate = self.len() - 3;
290 self.bytes.truncate(len_without_lead_surrogate);
291 self.push_char(decode_surrogate_pair(lead, trail as u16));
300 // No newly paired surrogates at the boundary.
301 self.push_code_point_unchecked(code_point)
304 /// Shortens a string to the specified length.
308 /// Panics if `new_len` > current length,
309 /// or if `new_len` is not a code point boundary.
311 pub fn truncate(&mut self, new_len: uint) {
// A single assertion covers both documented panics: the boundary check
// is also what rejects a `new_len` beyond the current length.
312 assert!(is_code_point_boundary(self.as_slice(), new_len));
313 self.bytes.truncate(new_len)
316 /// Consume the WTF-8 string and try to convert it to UTF-8.
318 /// This does not copy the data.
320 /// If the contents are not well-formed UTF-8
321 /// (that is, if the string contains surrogates),
322 /// the original WTF-8 string is returned instead.
323 pub fn into_string(self) -> Result<String, Wtf8Buf> {
324 match self.next_surrogate(0) {
325 None => Ok(unsafe { String::from_utf8_unchecked(self.bytes) }),
326 Some(_) => Err(self),
330 /// Consume the WTF-8 string and convert it lossily to UTF-8.
332 /// This does not copy the data (but may overwrite parts of it in place).
334 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”)
335 pub fn into_string_lossy(mut self) -> String {
338 match self.next_surrogate(pos) {
339 Some((surrogate_pos, _)) => {
// The replacement character is three bytes in UTF-8, exactly as long
// as an encoded surrogate, so it can overwrite the surrogate in place
// without shifting the rest of the buffer.
340 pos = surrogate_pos + 3;
341 slice::bytes::copy_memory(
342 &mut self.bytes[surrogate_pos .. pos],
343 UTF8_REPLACEMENT_CHARACTER
// No surrogate remains from `pos` on, and every earlier one has been
// overwritten, so the bytes are handed over as UTF-8 without re-checking.
346 None => return unsafe { String::from_utf8_unchecked(self.bytes) }
352 /// Create a new WTF-8 string from an iterator of code points.
354 /// This replaces surrogate code point pairs with supplementary code points,
355 /// like concatenating ill-formed UTF-16 strings effectively would.
356 impl FromIterator<CodePoint> for Wtf8Buf {
357 fn from_iter<T: Iterator<Item=CodePoint>>(iterator: T) -> Wtf8Buf {
358 let mut string = Wtf8Buf::new();
359 string.extend(iterator);
364 /// Append code points from an iterator to the string.
366 /// This replaces surrogate code point pairs with supplementary code points,
367 /// like concatenating ill-formed UTF-16 strings effectively would.
368 impl Extend<CodePoint> for Wtf8Buf {
369 fn extend<T: Iterator<Item=CodePoint>>(&mut self, mut iterator: T) {
370 let (low, _high) = iterator.size_hint();
371 // Lower bound of one byte per code point (ASCII only)
372 self.bytes.reserve(low);
373 for code_point in iterator {
374 self.push(code_point);
379 /// A borrowed slice of well-formed WTF-8 data.
381 /// Similar to `&str`, but can additionally contain surrogate code points
382 /// if they’re not in a surrogate pair.
387 // FIXME: https://github.com/rust-lang/rust/issues/18805
388 impl PartialEq for Wtf8 {
389 fn eq(&self, other: &Wtf8) -> bool { self.bytes.eq(&other.bytes) }
392 // FIXME: https://github.com/rust-lang/rust/issues/18805
395 // FIXME: https://github.com/rust-lang/rust/issues/18738
396 impl PartialOrd for Wtf8 {
398 fn partial_cmp(&self, other: &Wtf8) -> Option<cmp::Ordering> {
399 self.bytes.partial_cmp(&other.bytes)
402 fn lt(&self, other: &Wtf8) -> bool { self.bytes.lt(&other.bytes) }
404 fn le(&self, other: &Wtf8) -> bool { self.bytes.le(&other.bytes) }
406 fn gt(&self, other: &Wtf8) -> bool { self.bytes.gt(&other.bytes) }
408 fn ge(&self, other: &Wtf8) -> bool { self.bytes.ge(&other.bytes) }
411 // FIXME: https://github.com/rust-lang/rust/issues/18738
414 fn cmp(&self, other: &Wtf8) -> cmp::Ordering { self.bytes.cmp(&other.bytes) }
417 /// Format the slice with double quotes,
418 /// and surrogates as `\u{` followed by four hexadecimal digits and `}`.
419 /// Example: `"a\u{D800}"` for a slice with code points [U+0061, U+D800]
///
/// Everything between surrogates is written as-is through `write_str`.
420 impl fmt::Debug for Wtf8 {
421 fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
422 try!(formatter.write_str("\""));
425 match self.next_surrogate(pos) {
427 Some((surrogate_pos, surrogate)) => {
// Write the surrogate-free run that precedes this surrogate.
428 try!(formatter.write_str(unsafe {
429 // the data in this slice is valid UTF-8, transmute to &str
430 mem::transmute(&self.bytes[pos .. surrogate_pos])
// Emits `\u{XXXX}`: `\\` is a literal backslash and the doubled braces
// are literal braces around the upper-case hexadecimal value.
432 try!(write!(formatter, "\\u{{{:X}}}", surrogate));
// Skip the three bytes of the encoded surrogate.
433 pos = surrogate_pos + 3;
// Write whatever follows the last surrogate (or the whole slice if none).
437 try!(formatter.write_str(unsafe {
438 // the data in this slice is valid UTF-8, transmute to &str
439 mem::transmute(&self.bytes[pos..])
441 formatter.write_str("\"")
446 /// Create a WTF-8 slice from a UTF-8 `&str` slice.
448 /// Since WTF-8 is a superset of UTF-8, this always succeeds.
450 pub fn from_str(value: &str) -> &Wtf8 {
451 unsafe { mem::transmute(value.as_bytes()) }
454 /// Return the length, in WTF-8 bytes.
456 pub fn len(&self) -> uint {
460 /// Return the code point at `position` if it is in the ASCII range,
461 /// or `b'\xFF'` otherwise.
465 /// Panics if `position` is beyond the end of the string.
467 pub fn ascii_byte_at(&self, position: uint) -> u8 {
468 match self.bytes[position] {
469 ascii_byte @ 0x00 ... 0x7F => ascii_byte,
474 /// Return the code point at `position`.
478 /// Panics if `position` is not at a code point boundary,
479 /// or is beyond the end of the string.
481 pub fn code_point_at(&self, position: uint) -> CodePoint {
482 let (code_point, _) = self.code_point_range_at(position);
486 /// Return the code point at `position`
487 /// and the position of the next code point.
491 /// Panics if `position` is not at a code point boundary,
492 /// or is beyond the end of the string.
494 pub fn code_point_range_at(&self, position: uint) -> (CodePoint, uint) {
495 let (c, n) = char_range_at_raw(&self.bytes, position);
496 (CodePoint { value: c }, n)
499 /// Return an iterator for the string’s code points.
501 pub fn code_points(&self) -> Wtf8CodePoints {
502 Wtf8CodePoints { bytes: self.bytes.iter() }
505 /// Try to convert the string to UTF-8 and return a `&str` slice.
507 /// Return `None` if the string contains surrogates.
509 /// This does not copy the data.
511 pub fn as_str(&self) -> Option<&str> {
512 // Well-formed WTF-8 is also well-formed UTF-8
513 // if and only if it contains no surrogate.
514 match self.next_surrogate(0) {
515 None => Some(unsafe { str::from_utf8_unchecked(&self.bytes) }),
520 /// Lossily convert the string to UTF-8.
521 /// Return a UTF-8 `&str` slice if the contents are well-formed in UTF-8.
523 /// Surrogates are replaced with `"\u{FFFD}"` (the replacement character “�”).
525 /// This only copies the data if necessary (if it contains any surrogate).
526 pub fn to_string_lossy(&self) -> CowString {
// Fast path: without any surrogate the bytes already are UTF-8 and can
// be borrowed. Otherwise remember where the first surrogate starts.
527 let surrogate_pos = match self.next_surrogate(0) {
528 None => return Cow::Borrowed(unsafe { str::from_utf8_unchecked(&self.bytes) }),
529 Some((pos, _)) => pos,
531 let wtf8_bytes = &self.bytes;
// The output is exactly as long as the input: each three-byte surrogate
// is replaced by the three-byte replacement character.
532 let mut utf8_bytes = Vec::with_capacity(self.len());
533 utf8_bytes.push_all(&wtf8_bytes[..surrogate_pos]);
534 utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER);
535 let mut pos = surrogate_pos + 3;
537 match self.next_surrogate(pos) {
538 Some((surrogate_pos, _)) => {
// Copy the run before the next surrogate, then substitute it.
539 utf8_bytes.push_all(&wtf8_bytes[pos .. surrogate_pos]);
540 utf8_bytes.push_all(UTF8_REPLACEMENT_CHARACTER);
541 pos = surrogate_pos + 3;
// Copy the tail that follows the last surrogate.
544 utf8_bytes.push_all(&wtf8_bytes[pos..]);
545 return Cow::Owned(unsafe { String::from_utf8_unchecked(utf8_bytes) })
551 /// Convert the WTF-8 string to potentially ill-formed UTF-16
552 /// and return an iterator of 16-bit code units.
554 /// This is lossless:
555 /// calling `Wtf8Buf::from_wide` on the resulting code units
556 /// would always return the original WTF-8 string.
558 pub fn encode_wide(&self) -> EncodeWide {
559 EncodeWide { code_points: self.code_points(), extra: 0 }
/// Scan forward from byte offset `pos` for an encoded surrogate.
/// On a hit, return its byte offset together with the decoded 16-bit value.
563 fn next_surrogate(&self, mut pos: uint) -> Option<(uint, u16)> {
564 let mut iter = self.bytes[pos..].iter();
566 let b = match iter.next() {
575 } else if b == 0xED {
576 match (iter.next(), iter.next()) {
// After a 0xED lead byte, a second byte >= 0xA0 encodes U+D800..U+DFFF.
// Smaller second bytes encode U+D000..U+D7FF, which are not surrogates.
577 (Some(&b2), Some(&b3)) if b2 >= 0xA0 => {
578 return Some((pos, decode_surrogate(b2, b3)))
/// Return the lead surrogate encoded by the last three bytes, if there is one.
596 fn final_lead_surrogate(&self) -> Option<u16> {
597 let len = self.len();
601 match &self.bytes[(len - 3)..] {
// 0xED followed by 0xA0..0xAF is the encoding of a lead surrogate
// (U+D800 to U+DBFF).
602 [0xED, b2 @ 0xA0...0xAF, b3] => Some(decode_surrogate(b2, b3)),
/// Return the trail surrogate encoded by the first three bytes, if there is one.
608 fn initial_trail_surrogate(&self) -> Option<u16> {
609 let len = self.len();
613 match &self.bytes[..3] {
// 0xED followed by 0xB0..0xBF is the encoding of a trail surrogate
// (U+DC00 to U+DFFF).
614 [0xED, b2 @ 0xB0...0xBF, b3] => Some(decode_surrogate(b2, b3)),
621 /// Return a slice of the given string for the byte range [`begin`..`end`).
625 /// Panics when `begin` and `end` do not point to code point boundaries,
626 /// or point beyond the end of the string.
627 impl ops::Index<ops::Range<usize>> for Wtf8 {
631 fn index(&self, range: &ops::Range<usize>) -> &Wtf8 {
632 // is_code_point_boundary checks that the index is in [0, .len()]
633 if range.start <= range.end &&
634 is_code_point_boundary(self, range.start) &&
635 is_code_point_boundary(self, range.end) {
636 unsafe { slice_unchecked(self, range.start, range.end) }
638 slice_error_fail(self, range.start, range.end)
643 /// Return a slice of the given string from byte `begin` to its end.
647 /// Panics when `begin` is not at a code point boundary,
648 /// or is beyond the end of the string.
649 impl ops::Index<ops::RangeFrom<usize>> for Wtf8 {
653 fn index(&self, range: &ops::RangeFrom<usize>) -> &Wtf8 {
654 // is_code_point_boundary checks that the index is in [0, .len()]
655 if is_code_point_boundary(self, range.start) {
656 unsafe { slice_unchecked(self, range.start, self.len()) }
658 slice_error_fail(self, range.start, self.len())
663 /// Return a slice of the given string from its beginning to byte `end`.
667 /// Panics when `end` is not at a code point boundary,
668 /// or is beyond the end of the string.
669 impl ops::Index<ops::RangeTo<usize>> for Wtf8 {
673 fn index(&self, range: &ops::RangeTo<usize>) -> &Wtf8 {
674 // is_code_point_boundary checks that the index is in [0, .len()]
675 if is_code_point_boundary(self, range.end) {
676 unsafe { slice_unchecked(self, 0, range.end) }
678 slice_error_fail(self, 0, range.end)
683 impl ops::Index<ops::RangeFull> for Wtf8 {
687 fn index(&self, _range: &ops::RangeFull) -> &Wtf8 {
/// Rebuild a 16-bit surrogate from the second and third bytes of its
/// three-byte encoding.
693 fn decode_surrogate(second_byte: u8, third_byte: u8) -> u16 {
694 // The first byte is assumed to be 0xED
// Each continuation byte carries six payload bits (mask 0x3F). `&` binds
// tighter than `|`, so this is 0xD800 | (second << 6) | third, each masked.
695 0xD800 | (second_byte as u16 & 0x3F) << 6 | third_byte as u16 & 0x3F
/// Combine a lead and a trail surrogate into the supplementary code point
/// they encode in UTF-16.
///
/// Callers must pass `lead` in 0xD800..0xDBFF and `trail` in 0xDC00..0xDFFF;
/// other values make the subtractions below underflow or produce a value
/// that is not a valid `char`.
699 fn decode_surrogate_pair(lead: u16, trail: u16) -> char {
// Ten bits come from each surrogate; the result lies in 0x10000..0x10FFFF.
700 let code_point = 0x10000 + ((((lead - 0xD800) as u32) << 10) | (trail - 0xDC00) as u32);
// Within that range the value is never a surrogate, so it is a valid `char`.
701 unsafe { mem::transmute(code_point) }
/// Return whether `index` is the start of a code point in `slice`
/// or is equal to the length of `slice`.
///
704 /// Copied from core::str::StrPrelude::is_char_boundary
706 pub fn is_code_point_boundary(slice: &Wtf8, index: uint) -> bool {
// The end of the slice counts as a boundary.
707 if index == slice.len() { return true; }
708 match slice.bytes.get(index) {
// Bytes in 128..192 (0b10xxxxxx) are continuation bytes;
// any other byte starts a code point.
710 Some(&b) => b < 128u8 || b >= 192u8,
714 /// Copied from core::str::raw::slice_unchecked
716 pub unsafe fn slice_unchecked(s: &Wtf8, begin: uint, end: uint) -> &Wtf8 {
717 mem::transmute(RawSlice {
718 data: s.bytes.as_ptr().offset(begin as int),
723 /// Copied from core::str::raw::slice_error_fail
725 pub fn slice_error_fail(s: &Wtf8, begin: uint, end: uint) -> ! {
726 assert!(begin <= end);
727 panic!("index {} and/or {} in `{:?}` do not lie on character boundary",
731 /// Iterator for the code points of a WTF-8 string.
733 /// Created with the method `.code_points()`.
735 pub struct Wtf8CodePoints<'a> {
736 bytes: slice::Iter<'a, u8>
739 impl<'a> Iterator for Wtf8CodePoints<'a> {
740 type Item = CodePoint;
743 fn next(&mut self) -> Option<CodePoint> {
744 next_code_point(&mut self.bytes).map(|c| CodePoint { value: c })
748 fn size_hint(&self) -> (uint, Option<uint>) {
749 let (len, _) = self.bytes.size_hint();
// A code point takes between one and four bytes, so `len` remaining bytes
// hold at least ceil(len / 4) and at most `len` code points.
// `saturating_add` keeps the rounding from overflowing.
750 (len.saturating_add(3) / 4, Some(len))
755 pub struct EncodeWide<'a> {
756 code_points: Wtf8CodePoints<'a>,
760 // Copied from libunicode/u_str.rs
761 impl<'a> Iterator for EncodeWide<'a> {
765 fn next(&mut self) -> Option<u16> {
767 let tmp = self.extra;
772 let mut buf = [0u16; 2];
773 self.code_points.next().map(|code_point| {
774 let n = encode_utf16_raw(code_point.value, buf.as_mut_slice())
776 if n == 2 { self.extra = buf[1]; }
782 fn size_hint(&self) -> (uint, Option<uint>) {
783 let (low, high) = self.code_points.size_hint();
784 // Every code point yields either one or two u16 code units,
785 // so this iterator is between 1 and 2 times as
786 // long as the underlying iterator.
// `checked_mul` turns an overflowing upper bound into `None` (unknown).
787 (low, high.and_then(|n| n.checked_mul(2)))
791 impl<S: Writer + Hasher> Hash<S> for CodePoint {
793 fn hash(&self, state: &mut S) {
794 self.value.hash(state)
798 impl<S: Writer + Hasher> Hash<S> for Wtf8Buf {
800 fn hash(&self, state: &mut S) {
801 state.write(self.bytes.as_slice());
806 impl<'a, S: Writer + Hasher> Hash<S> for Wtf8 {
808 fn hash(&self, state: &mut S) {
809 state.write(&self.bytes);
820 use string::CowString;
823 fn code_point_from_u32() {
824 assert!(CodePoint::from_u32(0).is_some());
825 assert!(CodePoint::from_u32(0xD800).is_some());
826 assert!(CodePoint::from_u32(0x10FFFF).is_some());
827 assert!(CodePoint::from_u32(0x110000).is_none());
831 fn code_point_to_u32() {
832 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
833 assert_eq!(c(0).to_u32(), 0);
834 assert_eq!(c(0xD800).to_u32(), 0xD800);
835 assert_eq!(c(0x10FFFF).to_u32(), 0x10FFFF);
839 fn code_point_from_char() {
840 assert_eq!(CodePoint::from_char('a').to_u32(), 0x61);
841 assert_eq!(CodePoint::from_char('💩').to_u32(), 0x1F4A9);
845 fn code_point_to_string() {
846 assert_eq!(format!("{:?}", CodePoint::from_char('a')).as_slice(), "U+0061");
847 assert_eq!(format!("{:?}", CodePoint::from_char('💩')).as_slice(), "U+1F4A9");
851 fn code_point_to_char() {
852 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
853 assert_eq!(c(0x61).to_char(), Some('a'));
854 assert_eq!(c(0x1F4A9).to_char(), Some('💩'));
855 assert_eq!(c(0xD800).to_char(), None);
859 fn code_point_to_char_lossy() {
860 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
861 assert_eq!(c(0x61).to_char_lossy(), 'a');
862 assert_eq!(c(0x1F4A9).to_char_lossy(), '💩');
863 assert_eq!(c(0xD800).to_char_lossy(), '\u{FFFD}');
868 assert_eq!(Wtf8Buf::new().bytes.as_slice(), b"");
872 fn wtf8buf_from_str() {
873 assert_eq!(Wtf8Buf::from_str("").bytes.as_slice(), b"");
874 assert_eq!(Wtf8Buf::from_str("aé 💩").bytes.as_slice(),
875 b"a\xC3\xA9 \xF0\x9F\x92\xA9");
879 fn wtf8buf_from_string() {
880 assert_eq!(Wtf8Buf::from_string(String::from_str("")).bytes.as_slice(), b"");
881 assert_eq!(Wtf8Buf::from_string(String::from_str("aé 💩")).bytes.as_slice(),
882 b"a\xC3\xA9 \xF0\x9F\x92\xA9");
886 fn wtf8buf_from_wide() {
887 assert_eq!(Wtf8Buf::from_wide(&[]).bytes.as_slice(), b"");
888 assert_eq!(Wtf8Buf::from_wide(
889 &[0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]).bytes.as_slice(),
890 b"a\xC3\xA9 \xED\xA0\xBD\xF0\x9F\x92\xA9");
894 fn wtf8buf_push_str() {
895 let mut string = Wtf8Buf::new();
896 assert_eq!(string.bytes.as_slice(), b"");
897 string.push_str("aé 💩");
898 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9 \xF0\x9F\x92\xA9");
902 fn wtf8buf_push_char() {
903 let mut string = Wtf8Buf::from_str("aé ");
904 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9 ");
905 string.push_char('💩');
906 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9 \xF0\x9F\x92\xA9");
911 let mut string = Wtf8Buf::from_str("aé ");
912 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9 ");
913 string.push(CodePoint::from_char('💩'));
914 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9 \xF0\x9F\x92\xA9");
916 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
918 let mut string = Wtf8Buf::new();
919 string.push(c(0xD83D)); // lead
920 string.push(c(0xDCA9)); // trail
921 assert_eq!(string.bytes.as_slice(), b"\xF0\x9F\x92\xA9"); // Magic!
923 let mut string = Wtf8Buf::new();
924 string.push(c(0xD83D)); // lead
925 string.push(c(0x20)); // not surrogate
926 string.push(c(0xDCA9)); // trail
927 assert_eq!(string.bytes.as_slice(), b"\xED\xA0\xBD \xED\xB2\xA9");
929 let mut string = Wtf8Buf::new();
930 string.push(c(0xD800)); // lead
931 string.push(c(0xDBFF)); // lead
932 assert_eq!(string.bytes.as_slice(), b"\xED\xA0\x80\xED\xAF\xBF");
934 let mut string = Wtf8Buf::new();
935 string.push(c(0xD800)); // lead
936 string.push(c(0xE000)); // not surrogate
937 assert_eq!(string.bytes.as_slice(), b"\xED\xA0\x80\xEE\x80\x80");
939 let mut string = Wtf8Buf::new();
940 string.push(c(0xD7FF)); // not surrogate
941 string.push(c(0xDC00)); // trail
942 assert_eq!(string.bytes.as_slice(), b"\xED\x9F\xBF\xED\xB0\x80");
944 let mut string = Wtf8Buf::new();
945 string.push(c(0x61)); // not surrogate, < 3 bytes
946 string.push(c(0xDC00)); // trail
947 assert_eq!(string.bytes.as_slice(), b"\x61\xED\xB0\x80");
949 let mut string = Wtf8Buf::new();
950 string.push(c(0xDC00)); // trail
951 assert_eq!(string.bytes.as_slice(), b"\xED\xB0\x80");
955 fn wtf8buf_push_wtf8() {
956 let mut string = Wtf8Buf::from_str("aé");
957 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9");
958 string.push_wtf8(Wtf8::from_str(" 💩"));
959 assert_eq!(string.bytes.as_slice(), b"a\xC3\xA9 \xF0\x9F\x92\xA9");
961 fn w(value: &[u8]) -> &Wtf8 { unsafe { transmute(value) } }
963 let mut string = Wtf8Buf::new();
964 string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
965 string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
966 assert_eq!(string.bytes.as_slice(), b"\xF0\x9F\x92\xA9"); // Magic!
968 let mut string = Wtf8Buf::new();
969 string.push_wtf8(w(b"\xED\xA0\xBD")); // lead
970 string.push_wtf8(w(b" ")); // not surrogate
971 string.push_wtf8(w(b"\xED\xB2\xA9")); // trail
972 assert_eq!(string.bytes.as_slice(), b"\xED\xA0\xBD \xED\xB2\xA9");
974 let mut string = Wtf8Buf::new();
975 string.push_wtf8(w(b"\xED\xA0\x80")); // lead
976 string.push_wtf8(w(b"\xED\xAF\xBF")); // lead
977 assert_eq!(string.bytes.as_slice(), b"\xED\xA0\x80\xED\xAF\xBF");
979 let mut string = Wtf8Buf::new();
980 string.push_wtf8(w(b"\xED\xA0\x80")); // lead
981 string.push_wtf8(w(b"\xEE\x80\x80")); // not surrogate
982 assert_eq!(string.bytes.as_slice(), b"\xED\xA0\x80\xEE\x80\x80");
984 let mut string = Wtf8Buf::new();
985 string.push_wtf8(w(b"\xED\x9F\xBF")); // not surrogate
986 string.push_wtf8(w(b"\xED\xB0\x80")); // trail
987 assert_eq!(string.bytes.as_slice(), b"\xED\x9F\xBF\xED\xB0\x80");
989 let mut string = Wtf8Buf::new();
990 string.push_wtf8(w(b"a")); // not surrogate, < 3 bytes
991 string.push_wtf8(w(b"\xED\xB0\x80")); // trail
992 assert_eq!(string.bytes.as_slice(), b"\x61\xED\xB0\x80");
994 let mut string = Wtf8Buf::new();
995 string.push_wtf8(w(b"\xED\xB0\x80")); // trail
996 assert_eq!(string.bytes.as_slice(), b"\xED\xB0\x80");
1000 fn wtf8buf_truncate() {
1001 let mut string = Wtf8Buf::from_str("aé");
1003 assert_eq!(string.bytes.as_slice(), b"a");
1008 fn wtf8buf_truncate_fail_code_point_boundary() {
1009 let mut string = Wtf8Buf::from_str("aé");
1015 fn wtf8buf_truncate_fail_longer() {
1016 let mut string = Wtf8Buf::from_str("aé");
1021 fn wtf8buf_into_string() {
1022 let mut string = Wtf8Buf::from_str("aé 💩");
1023 assert_eq!(string.clone().into_string(), Ok(String::from_str("aé 💩")));
1024 string.push(CodePoint::from_u32(0xD800).unwrap());
1025 assert_eq!(string.clone().into_string(), Err(string));
1029 fn wtf8buf_into_string_lossy() {
1030 let mut string = Wtf8Buf::from_str("aé 💩");
1031 assert_eq!(string.clone().into_string_lossy(), String::from_str("aé 💩"));
1032 string.push(CodePoint::from_u32(0xD800).unwrap());
1033 assert_eq!(string.clone().into_string_lossy(), String::from_str("aé 💩�"));
1037 fn wtf8buf_from_iterator() {
1038 fn f(values: &[u32]) -> Wtf8Buf {
1039 values.iter().map(|&c| CodePoint::from_u32(c).unwrap()).collect::<Wtf8Buf>()
1041 assert_eq!(f(&[0x61, 0xE9, 0x20, 0x1F4A9]).bytes.as_slice(), b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1043 assert_eq!(f(&[0xD83D, 0xDCA9]).bytes.as_slice(), b"\xF0\x9F\x92\xA9"); // Magic!
1044 assert_eq!(f(&[0xD83D, 0x20, 0xDCA9]).bytes.as_slice(), b"\xED\xA0\xBD \xED\xB2\xA9");
1045 assert_eq!(f(&[0xD800, 0xDBFF]).bytes.as_slice(), b"\xED\xA0\x80\xED\xAF\xBF");
1046 assert_eq!(f(&[0xD800, 0xE000]).bytes.as_slice(), b"\xED\xA0\x80\xEE\x80\x80");
1047 assert_eq!(f(&[0xD7FF, 0xDC00]).bytes.as_slice(), b"\xED\x9F\xBF\xED\xB0\x80");
1048 assert_eq!(f(&[0x61, 0xDC00]).bytes.as_slice(), b"\x61\xED\xB0\x80");
1049 assert_eq!(f(&[0xDC00]).bytes.as_slice(), b"\xED\xB0\x80");
1053 fn wtf8buf_extend() {
1054 fn e(initial: &[u32], extended: &[u32]) -> Wtf8Buf {
1055 fn c(value: &u32) -> CodePoint { CodePoint::from_u32(*value).unwrap() }
1056 let mut string = initial.iter().map(c).collect::<Wtf8Buf>();
1057 string.extend(extended.iter().map(c));
1061 assert_eq!(e(&[0x61, 0xE9], &[0x20, 0x1F4A9]).bytes.as_slice(),
1062 b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1064 assert_eq!(e(&[0xD83D], &[0xDCA9]).bytes.as_slice(), b"\xF0\x9F\x92\xA9"); // Magic!
1065 assert_eq!(e(&[0xD83D, 0x20], &[0xDCA9]).bytes.as_slice(), b"\xED\xA0\xBD \xED\xB2\xA9");
1066 assert_eq!(e(&[0xD800], &[0xDBFF]).bytes.as_slice(), b"\xED\xA0\x80\xED\xAF\xBF");
1067 assert_eq!(e(&[0xD800], &[0xE000]).bytes.as_slice(), b"\xED\xA0\x80\xEE\x80\x80");
1068 assert_eq!(e(&[0xD7FF], &[0xDC00]).bytes.as_slice(), b"\xED\x9F\xBF\xED\xB0\x80");
1069 assert_eq!(e(&[0x61], &[0xDC00]).bytes.as_slice(), b"\x61\xED\xB0\x80");
1070 assert_eq!(e(&[], &[0xDC00]).bytes.as_slice(), b"\xED\xB0\x80");
1075 let mut string = Wtf8Buf::from_str("aé 💩");
1076 string.push(CodePoint::from_u32(0xD800).unwrap());
1077 assert_eq!(format!("{:?}", string).as_slice(), r#""aé 💩\u{D800}""#);
1081 fn wtf8buf_as_slice() {
1082 assert_eq!(Wtf8Buf::from_str("aé").as_slice(), Wtf8::from_str("aé"));
1087 let mut string = Wtf8Buf::from_str("aé 💩");
1088 string.push(CodePoint::from_u32(0xD800).unwrap());
1089 assert_eq!(format!("{:?}", string.as_slice()).as_slice(), r#""aé 💩\u{D800}""#);
1093 fn wtf8_from_str() {
1094 assert_eq!(&Wtf8::from_str("").bytes, b"");
1095 assert_eq!(&Wtf8::from_str("aé 💩").bytes, b"a\xC3\xA9 \xF0\x9F\x92\xA9");
1100 assert_eq!(Wtf8::from_str("").len(), 0);
1101 assert_eq!(Wtf8::from_str("aé 💩").len(), 8);
1106 assert_eq!(&Wtf8::from_str("aé 💩")[1.. 4].bytes, b"\xC3\xA9 ");
1111 fn wtf8_slice_not_code_point_boundary() {
1112 &Wtf8::from_str("aé 💩")[2.. 4];
1116 fn wtf8_slice_from() {
1117 assert_eq!(&Wtf8::from_str("aé 💩")[1..].bytes, b"\xC3\xA9 \xF0\x9F\x92\xA9");
1122 fn wtf8_slice_from_not_code_point_boundary() {
1123 &Wtf8::from_str("aé 💩")[2..];
1127 fn wtf8_slice_to() {
1128 assert_eq!(&Wtf8::from_str("aé 💩")[..4].bytes, b"a\xC3\xA9 ");
1133 fn wtf8_slice_to_not_code_point_boundary() {
// Exercise the range-to (`..end`) indexing path: the four-byte code point
// starts at byte 4, so an end index of 5 falls inside it and must panic.
// The previous `[5..]` only repeated the range-from test.
1134 &Wtf8::from_str("aé 💩")[..5];
1138 fn wtf8_ascii_byte_at() {
1139 let slice = Wtf8::from_str("aé 💩");
1140 assert_eq!(slice.ascii_byte_at(0), b'a');
1141 assert_eq!(slice.ascii_byte_at(1), b'\xFF');
1142 assert_eq!(slice.ascii_byte_at(2), b'\xFF');
1143 assert_eq!(slice.ascii_byte_at(3), b' ');
1144 assert_eq!(slice.ascii_byte_at(4), b'\xFF');
1148 fn wtf8_code_point_at() {
1149 let mut string = Wtf8Buf::from_str("aé ");
1150 string.push(CodePoint::from_u32(0xD83D).unwrap());
1151 string.push_char('💩');
1152 assert_eq!(string.code_point_at(0), CodePoint::from_char('a'));
1153 assert_eq!(string.code_point_at(1), CodePoint::from_char('é'));
1154 assert_eq!(string.code_point_at(3), CodePoint::from_char(' '));
1155 assert_eq!(string.code_point_at(4), CodePoint::from_u32(0xD83D).unwrap());
1156 assert_eq!(string.code_point_at(7), CodePoint::from_char('💩'));
1160 fn wtf8_code_point_range_at() {
1161 let mut string = Wtf8Buf::from_str("aé ");
1162 string.push(CodePoint::from_u32(0xD83D).unwrap());
1163 string.push_char('💩');
1164 assert_eq!(string.code_point_range_at(0), (CodePoint::from_char('a'), 1));
1165 assert_eq!(string.code_point_range_at(1), (CodePoint::from_char('é'), 3));
1166 assert_eq!(string.code_point_range_at(3), (CodePoint::from_char(' '), 4));
1167 assert_eq!(string.code_point_range_at(4), (CodePoint::from_u32(0xD83D).unwrap(), 7));
1168 assert_eq!(string.code_point_range_at(7), (CodePoint::from_char('💩'), 11));
1172 fn wtf8_code_points() {
1173 fn c(value: u32) -> CodePoint { CodePoint::from_u32(value).unwrap() }
1174 fn cp(string: &Wtf8Buf) -> Vec<Option<char>> {
1175 string.code_points().map(|c| c.to_char()).collect::<Vec<_>>()
1177 let mut string = Wtf8Buf::from_str("é ");
1178 assert_eq!(cp(&string), vec![Some('é'), Some(' ')]);
1179 string.push(c(0xD83D));
1180 assert_eq!(cp(&string), vec![Some('é'), Some(' '), None]);
1181 string.push(c(0xDCA9));
1182 assert_eq!(cp(&string), vec![Some('é'), Some(' '), Some('💩')]);
1187 assert_eq!(Wtf8::from_str("").as_str(), Some(""));
1188 assert_eq!(Wtf8::from_str("aé 💩").as_str(), Some("aé 💩"));
1189 let mut string = Wtf8Buf::new();
1190 string.push(CodePoint::from_u32(0xD800).unwrap());
1191 assert_eq!(string.as_str(), None);
1195 fn wtf8_to_string_lossy() {
1196 assert_eq!(Wtf8::from_str("").to_string_lossy(), Cow::Borrowed(""));
1197 assert_eq!(Wtf8::from_str("aé 💩").to_string_lossy(), Cow::Borrowed("aé 💩"));
1198 let mut string = Wtf8Buf::from_str("aé 💩");
1199 string.push(CodePoint::from_u32(0xD800).unwrap());
1200 let expected: CowString = Cow::Owned(String::from_str("aé 💩�"));
1201 assert_eq!(string.to_string_lossy(), expected);
1205 fn wtf8_encode_wide() {
1206 let mut string = Wtf8Buf::from_str("aé ");
1207 string.push(CodePoint::from_u32(0xD83D).unwrap());
1208 string.push_char('💩');
1209 assert_eq!(string.encode_wide().collect::<Vec<_>>(),
1210 vec![0x61, 0xE9, 0x20, 0xD83D, 0xD83D, 0xDCA9]);