use core::mem;
use core::ops::{self, Add, Index, IndexMut};
use core::ptr;
-use core::slice;
use core::str::pattern::Pattern;
use rustc_unicode::char::{decode_utf16, REPLACEMENT_CHARACTER};
use rustc_unicode::str as unicode_str;
pub fn push(&mut self, ch: char) {
match ch.len_utf8() {
1 => self.vec.push(ch as u8),
- ch_len => {
- let cur_len = self.len();
- // This may use up to 4 bytes.
- self.vec.reserve(ch_len);
-
- unsafe {
- // Attempt to not use an intermediate buffer by just pushing bytes
- // directly onto this string.
- let slice = slice::from_raw_parts_mut(self.vec
- .as_mut_ptr()
- .offset(cur_len as isize),
- ch_len);
- let used = ch.encode_utf8(slice).unwrap_or(0);
- self.vec.set_len(cur_len + used);
- }
- }
+ _ => self.vec.extend_from_slice(ch.encode_utf8().as_slice()),
}
}
let len = self.len();
assert!(idx <= len);
assert!(self.is_char_boundary(idx));
- self.vec.reserve(4);
- let mut bits = [0; 4];
- let amt = ch.encode_utf8(&mut bits).unwrap();
+ let bits = ch.encode_utf8();
+ let bits = bits.as_slice();
+ let amt = bits.len();
+ self.vec.reserve(amt);
unsafe {
ptr::copy(self.vec.as_ptr().offset(idx as isize),
#[test]
fn test_chars_decoding() {
- let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) {
- let len = c.encode_utf8(&mut bytes).unwrap_or(0);
- let s = ::std::str::from_utf8(&bytes[..len]).unwrap();
+ let bytes = c.encode_utf8();
+ let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
if Some(c) != s.chars().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c);
}
#[test]
fn test_chars_rev_decoding() {
- let mut bytes = [0; 4];
for c in (0..0x110000).filter_map(::std::char::from_u32) {
- let len = c.encode_utf8(&mut bytes).unwrap_or(0);
- let s = ::std::str::from_utf8(&bytes[..len]).unwrap();
+ let bytes = c.encode_utf8();
+ let s = ::std::str::from_utf8(bytes.as_slice()).unwrap();
if Some(c) != s.chars().rev().next() {
panic!("character {:x}={} does not decode correctly", c as u32, c);
}
fn len_utf8(self) -> usize;
#[stable(feature = "core", since = "1.6.0")]
fn len_utf16(self) -> usize;
- #[stable(feature = "core", since = "1.6.0")]
- fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
- #[stable(feature = "core", since = "1.6.0")]
- fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
+ #[unstable(feature = "unicode", issue = "27784")]
+ fn encode_utf8(self) -> EncodeUtf8;
+ #[unstable(feature = "unicode", issue = "27784")]
+ fn encode_utf16(self) -> EncodeUtf16;
}
#[stable(feature = "core", since = "1.6.0")]
}
#[inline]
- fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
- encode_utf8_raw(self as u32, dst)
+ fn encode_utf8(self) -> EncodeUtf8 {
+ let code = self as u32;
+ let mut buf = [0; 4];
+ let pos = if code < MAX_ONE_B {
+ buf[3] = code as u8;
+ 3
+ } else if code < MAX_TWO_B {
+ buf[2] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
+ buf[3] = (code & 0x3F) as u8 | TAG_CONT;
+ 2
+ } else if code < MAX_THREE_B {
+ buf[1] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
+ buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+ buf[3] = (code & 0x3F) as u8 | TAG_CONT;
+ 1
+ } else {
+ buf[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
+ buf[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
+ buf[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
+ buf[3] = (code & 0x3F) as u8 | TAG_CONT;
+ 0
+ };
+ EncodeUtf8 { buf: buf, pos: pos }
}
#[inline]
- fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
- encode_utf16_raw(self as u32, dst)
- }
-}
-
-/// Encodes a raw u32 value as UTF-8 into the provided byte buffer,
-/// and then returns the number of bytes written.
-///
-/// If the buffer is not large enough, nothing will be written into it
-/// and a `None` will be returned.
-#[inline]
-#[unstable(feature = "char_internals",
- reason = "this function should not be exposed publicly",
- issue = "0")]
-#[doc(hidden)]
-pub fn encode_utf8_raw(code: u32, dst: &mut [u8]) -> Option<usize> {
- // Marked #[inline] to allow llvm optimizing it away
- if code < MAX_ONE_B && !dst.is_empty() {
- dst[0] = code as u8;
- Some(1)
- } else if code < MAX_TWO_B && dst.len() >= 2 {
- dst[0] = (code >> 6 & 0x1F) as u8 | TAG_TWO_B;
- dst[1] = (code & 0x3F) as u8 | TAG_CONT;
- Some(2)
- } else if code < MAX_THREE_B && dst.len() >= 3 {
- dst[0] = (code >> 12 & 0x0F) as u8 | TAG_THREE_B;
- dst[1] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
- dst[2] = (code & 0x3F) as u8 | TAG_CONT;
- Some(3)
- } else if dst.len() >= 4 {
- dst[0] = (code >> 18 & 0x07) as u8 | TAG_FOUR_B;
- dst[1] = (code >> 12 & 0x3F) as u8 | TAG_CONT;
- dst[2] = (code >> 6 & 0x3F) as u8 | TAG_CONT;
- dst[3] = (code & 0x3F) as u8 | TAG_CONT;
- Some(4)
- } else {
- None
- }
-}
-
-/// Encodes a raw u32 value as UTF-16 into the provided `u16` buffer,
-/// and then returns the number of `u16`s written.
-///
-/// If the buffer is not large enough, nothing will be written into it
-/// and a `None` will be returned.
-#[inline]
-#[unstable(feature = "char_internals",
- reason = "this function should not be exposed publicly",
- issue = "0")]
-#[doc(hidden)]
-pub fn encode_utf16_raw(mut ch: u32, dst: &mut [u16]) -> Option<usize> {
- // Marked #[inline] to allow llvm optimizing it away
- if (ch & 0xFFFF) == ch && !dst.is_empty() {
- // The BMP falls through (assuming non-surrogate, as it should)
- dst[0] = ch as u16;
- Some(1)
- } else if dst.len() >= 2 {
- // Supplementary planes break into surrogates.
- ch -= 0x1_0000;
- dst[0] = 0xD800 | ((ch >> 10) as u16);
- dst[1] = 0xDC00 | ((ch as u16) & 0x3FF);
- Some(2)
- } else {
- None
+ fn encode_utf16(self) -> EncodeUtf16 {
+ let mut buf = [0; 2];
+ let mut code = self as u32;
+ let pos = if (code & 0xFFFF) == code {
+ // The BMP falls through (assuming non-surrogate, as it should)
+ buf[1] = code as u16;
+ 1
+ } else {
+ // Supplementary planes break into surrogates.
+ code -= 0x1_0000;
+ buf[0] = 0xD800 | ((code >> 10) as u16);
+ buf[1] = 0xDC00 | ((code as u16) & 0x3FF);
+ 0
+ };
+ EncodeUtf16 { buf: buf, pos: pos }
}
}
}
}
}
+
+/// An iterator over `u8` entries representing the UTF-8 encoding of a `char`
+/// value.
+///
+/// Constructed via the `.encode_utf8()` method on `char`.
+#[unstable(feature = "unicode", issue = "27784")]
+#[derive(Debug)]
+pub struct EncodeUtf8 {
+ buf: [u8; 4],
+ pos: usize,
+}
+
+impl EncodeUtf8 {
+ /// Returns the remaining bytes of this iterator as a slice.
+ #[unstable(feature = "unicode", issue = "27784")]
+ pub fn as_slice(&self) -> &[u8] {
+ &self.buf[self.pos..]
+ }
+}
+
+#[unstable(feature = "unicode", issue = "27784")]
+impl Iterator for EncodeUtf8 {
+ type Item = u8;
+
+ fn next(&mut self) -> Option<u8> {
+ if self.pos == self.buf.len() {
+ None
+ } else {
+ let ret = Some(self.buf[self.pos]);
+ self.pos += 1;
+ ret
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.as_slice().iter().size_hint()
+ }
+}
+
+/// An iterator over `u16` entries representing the UTF-16 encoding of a `char`
+/// value.
+///
+/// Constructed via the `.encode_utf16()` method on `char`.
+#[unstable(feature = "unicode", issue = "27784")]
+#[derive(Debug)]
+pub struct EncodeUtf16 {
+ buf: [u16; 2],
+ pos: usize,
+}
+
+impl EncodeUtf16 {
+ /// Returns the remaining `u16` entries of this iterator as a slice.
+ #[unstable(feature = "unicode", issue = "27784")]
+ pub fn as_slice(&self) -> &[u16] {
+ &self.buf[self.pos..]
+ }
+}
+
+
+#[unstable(feature = "unicode", issue = "27784")]
+impl Iterator for EncodeUtf16 {
+ type Item = u16;
+
+ fn next(&mut self) -> Option<u16> {
+ if self.pos == self.buf.len() {
+ None
+ } else {
+ let ret = Some(self.buf[self.pos]);
+ self.pos += 1;
+ ret
+ }
+ }
+
+ fn size_hint(&self) -> (usize, Option<usize>) {
+ self.as_slice().iter().size_hint()
+ }
+}
/// This function will return an instance of `Error` on error.
#[stable(feature = "fmt_write_char", since = "1.1.0")]
fn write_char(&mut self, c: char) -> Result {
- let mut utf_8 = [0u8; 4];
- let bytes_written = c.encode_utf8(&mut utf_8).unwrap_or(0);
- self.write_str(unsafe { str::from_utf8_unchecked(&utf_8[..bytes_written]) })
+ self.write_str(unsafe {
+ str::from_utf8_unchecked(c.encode_utf8().as_slice())
+ })
}
/// Glue for usage of the `write!` macro with implementors of this trait.
// Writes the sign if it exists, and then the prefix if it was requested
let write_prefix = |f: &mut Formatter| {
if let Some(c) = sign {
- let mut b = [0; 4];
- let n = c.encode_utf8(&mut b).unwrap_or(0);
- let b = unsafe { str::from_utf8_unchecked(&b[..n]) };
- try!(f.buf.write_str(b));
+ try!(f.buf.write_str(unsafe {
+ str::from_utf8_unchecked(c.encode_utf8().as_slice())
+ }));
}
if prefixed { f.buf.write_str(prefix) }
else { Ok(()) }
rt::v1::Alignment::Center => (padding / 2, (padding + 1) / 2),
};
- let mut fill = [0; 4];
- let len = self.fill.encode_utf8(&mut fill).unwrap_or(0);
- let fill = unsafe { str::from_utf8_unchecked(&fill[..len]) };
+ let fill = self.fill.encode_utf8();
+ let fill = unsafe {
+ str::from_utf8_unchecked(fill.as_slice())
+ };
for _ in 0..pre_pad {
try!(self.buf.write_str(fill));
if f.width.is_none() && f.precision.is_none() {
f.write_char(*self)
} else {
- let mut utf8 = [0; 4];
- let amt = self.encode_utf8(&mut utf8).unwrap_or(0);
- let s: &str = unsafe { str::from_utf8_unchecked(&utf8[..amt]) };
- f.pad(s)
+ f.pad(unsafe {
+ str::from_utf8_unchecked(self.encode_utf8().as_slice())
+ })
}
}
}
#[test]
fn test_encode_utf8() {
fn check(input: char, expect: &[u8]) {
- let mut buf = [0; 4];
- let n = input.encode_utf8(&mut buf).unwrap_or(0);
- assert_eq!(&buf[..n], expect);
+ assert_eq!(input.encode_utf8().as_slice(), expect);
+ for (a, b) in input.encode_utf8().zip(expect) {
+ assert_eq!(a, *b);
+ }
}
check('x', &[0x78]);
#[test]
fn test_encode_utf16() {
fn check(input: char, expect: &[u16]) {
- let mut buf = [0; 2];
- let n = input.encode_utf16(&mut buf).unwrap_or(0);
- assert_eq!(&buf[..n], expect);
+ assert_eq!(input.encode_utf16().as_slice(), expect);
+ for (a, b) in input.encode_utf16().zip(expect) {
+ assert_eq!(a, *b);
+ }
}
check('x', &[0x0078]);
// stable reexports
#[stable(feature = "rust1", since = "1.0.0")]
-pub use core::char::{MAX, from_u32, from_u32_unchecked, from_digit, EscapeUnicode, EscapeDefault};
+pub use core::char::{MAX, from_u32, from_u32_unchecked, from_digit};
+#[stable(feature = "rust1", since = "1.0.0")]
+pub use core::char::{EscapeUnicode, EscapeDefault, EncodeUtf8, EncodeUtf16};
// unstable reexports
#[unstable(feature = "unicode", issue = "27783")]
C::len_utf16(self)
}
- /// Encodes this character as UTF-8 into the provided byte buffer, and then
- /// returns the number of bytes written.
+ /// Returns an iterator over the bytes of this character as UTF-8.
///
- /// If the buffer is not large enough, nothing will be written into it and a
- /// `None` will be returned. A buffer of length four is large enough to
- /// encode any `char`.
+ /// The returned iterator also has an `as_slice()` method to view the
+ /// encoded bytes as a byte slice.
///
/// # Examples
///
- /// In both of these examples, 'ß' takes two bytes to encode.
- ///
/// ```
/// #![feature(unicode)]
///
- /// let mut b = [0; 2];
+ /// let iterator = 'ß'.encode_utf8();
+ /// assert_eq!(iterator.as_slice(), [0xc3, 0x9f]);
///
- /// let result = 'ß'.encode_utf8(&mut b);
- ///
- /// assert_eq!(result, Some(2));
- /// ```
- ///
- /// A buffer that's too small:
- ///
- /// ```
- /// #![feature(unicode)]
- ///
- /// let mut b = [0; 1];
- ///
- /// let result = 'ß'.encode_utf8(&mut b);
- ///
- /// assert_eq!(result, None);
+ /// for (i, byte) in iterator.enumerate() {
+ /// println!("byte {}: {:x}", i, byte);
+ /// }
/// ```
- #[unstable(feature = "unicode",
- reason = "pending decision about Iterator/Writer/Reader",
- issue = "27784")]
+ #[unstable(feature = "unicode", issue = "27784")]
#[inline]
- pub fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
- C::encode_utf8(self, dst)
+ pub fn encode_utf8(self) -> EncodeUtf8 {
+ C::encode_utf8(self)
}
- /// Encodes this character as UTF-16 into the provided `u16` buffer, and
- /// then returns the number of `u16`s written.
+ /// Returns an iterator over the `u16` entries of this character as UTF-16.
///
- /// If the buffer is not large enough, nothing will be written into it and a
- /// `None` will be returned. A buffer of length 2 is large enough to encode
- /// any `char`.
+ /// The returned iterator also has an `as_slice()` method to view the
+ /// encoded form as a slice.
///
/// # Examples
///
- /// In both of these examples, '𝕊' takes two `u16`s to encode.
- ///
- /// ```
- /// #![feature(unicode)]
- ///
- /// let mut b = [0; 2];
- ///
- /// let result = '𝕊'.encode_utf16(&mut b);
- ///
- /// assert_eq!(result, Some(2));
- /// ```
- ///
- /// A buffer that's too small:
- ///
/// ```
/// #![feature(unicode)]
///
- /// let mut b = [0; 1];
+ /// let iterator = '𝕊'.encode_utf16();
+ /// assert_eq!(iterator.as_slice(), [0xd835, 0xdd4a]);
///
- /// let result = '𝕊'.encode_utf16(&mut b);
- ///
- /// assert_eq!(result, None);
+ /// for (i, val) in iterator.enumerate() {
+ /// println!("entry {}: {:x}", i, val);
+ /// }
/// ```
- #[unstable(feature = "unicode",
- reason = "pending decision about Iterator/Writer/Reader",
- issue = "27784")]
+ #[unstable(feature = "unicode", issue = "27784")]
#[inline]
- pub fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
- C::encode_utf16(self, dst)
+ pub fn encode_utf16(self) -> EncodeUtf16 {
+ C::encode_utf16(self)
}
/// Returns true if this `char` is an alphabetic code point, and false if not.
#![feature(core_char_ext)]
#![feature(lang_items)]
#![feature(staged_api)]
+#![feature(unicode)]
mod tables;
mod u_str;
return Some(tmp);
}
- let mut buf = [0; 2];
self.chars.next().map(|ch| {
- let n = CharExt::encode_utf16(ch, &mut buf).unwrap_or(0);
- if n == 2 {
- self.extra = buf[1];
+ let n = CharExt::encode_utf16(ch);
+ let n = n.as_slice();
+ if n.len() == 2 {
+ self.extra = n[1];
}
- buf[0]
+ n[0]
})
}
}
fn escape_char(writer: &mut fmt::Write, v: char) -> EncodeResult {
- let mut buf = [0; 4];
- let n = v.encode_utf8(&mut buf).unwrap();
- let buf = unsafe { str::from_utf8_unchecked(&buf[..n]) };
- escape_str(writer, buf)
+ escape_str(writer, unsafe {
+ str::from_utf8_unchecked(v.encode_utf8().as_slice())
+ })
}
fn spaces(wr: &mut fmt::Write, mut n: usize) -> EncodeResult {
// unix (it's mostly used on windows), so don't worry about dead code here.
#![allow(dead_code)]
-use core::char::{encode_utf8_raw, encode_utf16_raw};
use core::str::next_code_point;
use ascii::*;
/// Copied from String::push
/// This does **not** include the WTF-8 concatenation check.
fn push_code_point_unchecked(&mut self, code_point: CodePoint) {
- let cur_len = self.len();
- // This may use up to 4 bytes.
- self.reserve(4);
-
- unsafe {
- // Attempt to not use an intermediate buffer by just pushing bytes
- // directly onto this string.
- let slice = slice::from_raw_parts_mut(
- self.bytes.as_mut_ptr().offset(cur_len as isize), 4
- );
- let used = encode_utf8_raw(code_point.value, slice).unwrap();
- self.bytes.set_len(cur_len + used);
- }
+ let bytes = unsafe {
+ char::from_u32_unchecked(code_point.value).encode_utf8()
+ };
+ self.bytes.extend_from_slice(bytes.as_slice());
}
#[inline]
return Some(tmp);
}
- let mut buf = [0; 2];
self.code_points.next().map(|code_point| {
- let n = encode_utf16_raw(code_point.value, &mut buf)
- .unwrap_or(0);
- if n == 2 { self.extra = buf[1]; }
- buf[0]
+ let n = unsafe {
+ char::from_u32_unchecked(code_point.value).encode_utf16()
+ };
+ let n = n.as_slice();
+ if n.len() == 2 {
+ self.extra = n[1];
+ }
+ n[0]
})
}