1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `Box<T>` types, meaning that it has one, and only one,
24 owner. This type cannot be implicitly copied, and is moved out of
25 when passed to other functions.
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = "I am an owned string".to_owned();
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
65 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
66 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
67 encoded UTF-8 sequences. Additionally, strings are not null-terminated
68 and can contain null codepoints.
70 The actual representation of strings has direct mappings to vectors:
72 * `~str` is the same as `~[u8]`
73 * `&str` is the same as `&[u8]`
82 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
83 use container::Container;
86 use iter::{Iterator, FromIterator, Extendable, range};
87 use iter::{Filter, AdditiveIterator, Map};
88 use iter::{Rev, DoubleEndedIterator, ExactSize};
91 use option::{None, Option, Some};
93 use from_str::FromStr;
95 use slice::{OwnedVector, ImmutableVector, MutableVector};
103 Section: Creating a string
106 /// Consumes a vector of bytes to create a new utf-8 string.
107 /// Returns None if the vector contains invalid UTF-8.
108 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
110 Some(unsafe { raw::from_utf8_owned(vv) })
116 /// Converts a vector to a string slice without performing any allocations.
118 /// Once the slice has been validated as utf-8, it is transmuted in-place and
119 /// returned as a `&str` instead of a `&[u8]`.
121 /// Returns None if the slice is not utf-8.
122 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
124 Some(unsafe { raw::from_utf8(v) })
128 impl FromStr for ~str {
130 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
133 /// Convert a byte to a UTF-8 string
137 /// Fails if invalid UTF-8
138 pub fn from_byte(b: u8) -> ~str {
140 unsafe { ::cast::transmute(box [b]) }
143 /// Convert a char to a string
144 pub fn from_char(ch: char) -> ~str {
145 let mut buf = StrBuf::new();
150 /// Convert a vector of chars to a string
151 pub fn from_chars(chs: &[char]) -> ~str {
152 chs.iter().map(|c| *c).collect()
155 /// Methods for vectors of strings
156 pub trait StrVector {
157 /// Concatenate a vector of strings.
158 fn concat(&self) -> ~str;
160 /// Concatenate a vector of strings, placing a given separator between each.
161 fn connect(&self, sep: &str) -> ~str;
164 impl<'a, S: Str> StrVector for &'a [S] {
165 fn concat(&self) -> ~str {
166 if self.is_empty() { return "".to_owned(); }
168 // `len` calculation may overflow, but push_str will check boundaries
169 let len = self.iter().map(|s| s.as_slice().len()).sum();
171 let mut result = StrBuf::with_capacity(len);
173 for s in self.iter() {
174 result.push_str(s.as_slice())
180 fn connect(&self, sep: &str) -> ~str {
181 if self.is_empty() { return "".to_owned(); }
184 if sep.is_empty() { return self.concat(); }
186 // this is wrong without the guarantee that `self` is non-empty
187 // `len` calculation may overflow, but push_str will check boundaries
188 let len = sep.len() * (self.len() - 1)
189 + self.iter().map(|s| s.as_slice().len()).sum();
190 let mut result = StrBuf::with_capacity(len);
191 let mut first = true;
193 for s in self.iter() {
197 result.push_str(sep);
199 result.push_str(s.as_slice());
205 impl<'a, S: Str> StrVector for Vec<S> {
207 fn concat(&self) -> ~str {
208 self.as_slice().concat()
212 fn connect(&self, sep: &str) -> ~str {
213 self.as_slice().connect(sep)
217 /// Something that can be used to compare against a character
219 /// Determine if the splitter should split at the given character
220 fn matches(&mut self, char) -> bool;
221 /// Indicate if this is only concerned about ASCII characters,
222 /// which can allow for a faster implementation.
223 fn only_ascii(&self) -> bool;
226 impl CharEq for char {
228 fn matches(&mut self, c: char) -> bool { *self == c }
231 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
234 impl<'a> CharEq for |char|: 'a -> bool {
236 fn matches(&mut self, c: char) -> bool { (*self)(c) }
239 fn only_ascii(&self) -> bool { false }
242 impl CharEq for extern "Rust" fn(char) -> bool {
244 fn matches(&mut self, c: char) -> bool { (*self)(c) }
247 fn only_ascii(&self) -> bool { false }
250 impl<'a> CharEq for &'a [char] {
252 fn matches(&mut self, c: char) -> bool {
253 self.iter().any(|&mut m| m.matches(c))
257 fn only_ascii(&self) -> bool {
258 self.iter().all(|m| m.only_ascii())
266 /// External iterator for a string's characters.
267 /// Use with the `std::iter` module.
269 pub struct Chars<'a> {
270 /// The slice remaining to be iterated
274 impl<'a> Iterator<char> for Chars<'a> {
276 fn next(&mut self) -> Option<char> {
277 // Decode the next codepoint, then update
278 // the slice to be just the remaining part
279 if self.string.len() != 0 {
280 let CharRange {ch, next} = self.string.char_range_at(0);
282 self.string = raw::slice_unchecked(self.string, next, self.string.len());
291 fn size_hint(&self) -> (uint, Option<uint>) {
292 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
296 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
298 fn next_back(&mut self) -> Option<char> {
299 if self.string.len() != 0 {
300 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
302 self.string = raw::slice_unchecked(self.string, 0, next);
311 /// External iterator for a string's characters and their byte offsets.
312 /// Use with the `std::iter` module.
314 pub struct CharOffsets<'a> {
315 /// The original string to be iterated
320 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
322 fn next(&mut self) -> Option<(uint, char)> {
323 // Compute the byte offset by using the pointer offset between
324 // the original string slice and the iterator's remaining part
325 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
326 self.iter.next().map(|ch| (offset, ch))
330 fn size_hint(&self) -> (uint, Option<uint>) {
331 self.iter.size_hint()
335 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
337 fn next_back(&mut self) -> Option<(uint, char)> {
338 self.iter.next_back().map(|ch| {
339 let offset = self.iter.string.len() +
340 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
346 #[deprecated = "replaced by Rev<Chars<'a>>"]
347 pub type RevChars<'a> = Rev<Chars<'a>>;
349 #[deprecated = "replaced by Rev<CharOffsets<'a>>"]
350 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
352 /// External iterator for a string's bytes.
353 /// Use with the `std::iter` module.
355 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
357 #[deprecated = "replaced by Rev<Bytes<'a>>"]
358 pub type RevBytes<'a> = Rev<Bytes<'a>>;
360 /// An iterator over the substrings of a string, separated by `sep`.
362 pub struct CharSplits<'a, Sep> {
363 /// The slice remaining to be iterated
366 /// Whether an empty string at the end is allowed
367 allow_trailing_empty: bool,
372 #[deprecated = "replaced by Rev<CharSplits<'a, Sep>>"]
373 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
375 /// An iterator over the substrings of a string, separated by `sep`,
376 /// splitting at most `count` times.
378 pub struct CharSplitsN<'a, Sep> {
379 iter: CharSplits<'a, Sep>,
380 /// The number of splits remaining
385 /// An iterator over the words of a string, separated by a sequence of whitespace
387 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
389 /// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
390 pub type AnyLines<'a> =
391 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
393 impl<'a, Sep> CharSplits<'a, Sep> {
395 fn get_end(&mut self) -> Option<&'a str> {
396 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
397 self.finished = true;
405 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
407 fn next(&mut self) -> Option<&'a str> {
408 if self.finished { return None }
410 let mut next_split = None;
412 for (idx, byte) in self.string.bytes().enumerate() {
413 if self.sep.matches(byte as char) && byte < 128u8 {
414 next_split = Some((idx, idx + 1));
419 for (idx, ch) in self.string.char_indices() {
420 if self.sep.matches(ch) {
421 next_split = Some((idx, self.string.char_range_at(idx).next));
427 Some((a, b)) => unsafe {
428 let elt = raw::slice_unchecked(self.string, 0, a);
429 self.string = raw::slice_unchecked(self.string, b, self.string.len());
432 None => self.get_end(),
437 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
438 for CharSplits<'a, Sep> {
440 fn next_back(&mut self) -> Option<&'a str> {
441 if self.finished { return None }
443 if !self.allow_trailing_empty {
444 self.allow_trailing_empty = true;
445 match self.next_back() {
446 Some(elt) if !elt.is_empty() => return Some(elt),
447 _ => if self.finished { return None }
450 let len = self.string.len();
451 let mut next_split = None;
454 for (idx, byte) in self.string.bytes().enumerate().rev() {
455 if self.sep.matches(byte as char) && byte < 128u8 {
456 next_split = Some((idx, idx + 1));
461 for (idx, ch) in self.string.char_indices().rev() {
462 if self.sep.matches(ch) {
463 next_split = Some((idx, self.string.char_range_at(idx).next));
469 Some((a, b)) => unsafe {
470 let elt = raw::slice_unchecked(self.string, b, len);
471 self.string = raw::slice_unchecked(self.string, 0, a);
474 None => { self.finished = true; Some(self.string) }
479 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
481 fn next(&mut self) -> Option<&'a str> {
484 if self.invert { self.iter.next_back() } else { self.iter.next() }
491 /// An iterator over the start and end indices of the matches of a
492 /// substring within a larger string
494 pub struct MatchIndices<'a> {
500 /// An iterator over the substrings of a string separated by a given
503 pub struct StrSplits<'a> {
504 it: MatchIndices<'a>,
509 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
511 fn next(&mut self) -> Option<(uint, uint)> {
512 // See Issue #1932 for why this is a naive search
513 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
514 let mut match_start = 0;
517 while self.position < h_len {
518 if self.haystack[self.position] == self.needle[match_i] {
519 if match_i == 0 { match_start = self.position; }
523 if match_i == n_len {
525 return Some((match_start, self.position));
528 // failed match, backtrack
531 self.position = match_start;
540 impl<'a> Iterator<&'a str> for StrSplits<'a> {
542 fn next(&mut self) -> Option<&'a str> {
543 if self.finished { return None; }
545 match self.it.next() {
546 Some((from, to)) => {
547 let ret = Some(self.it.haystack.slice(self.last_end, from));
552 self.finished = true;
553 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
559 // Helper functions used for Unicode normalization
560 fn canonical_sort(comb: &mut [(char, u8)]) {
564 let len = comb.len();
565 for i in range(0, len) {
566 let mut swapped = false;
567 for j in range(1, len-i) {
568 let class_a = *comb[j-1].ref1();
569 let class_b = *comb[j].ref1();
570 if class_a != 0 && class_b != 0 && class_a > class_b {
575 if !swapped { break; }
580 enum NormalizationForm {
585 /// External iterator over the characters of a string's normalized form.
586 /// Use with the `std::iter` module.
588 pub struct Normalizations<'a> {
589 kind: NormalizationForm,
591 buffer: Vec<(char, u8)>,
595 impl<'a> Iterator<char> for Normalizations<'a> {
597 fn next(&mut self) -> Option<char> {
598 use unicode::decompose::canonical_combining_class;
600 match self.buffer.as_slice().head() {
606 Some(&(c, _)) if self.sorted => {
610 _ => self.sorted = false
613 let decomposer = match self.kind {
614 NFD => char::decompose_canonical,
615 NFKD => char::decompose_compatible
619 for ch in self.iter {
620 let buffer = &mut self.buffer;
621 let sorted = &mut self.sorted;
623 let class = canonical_combining_class(d);
624 if class == 0 && !*sorted {
625 canonical_sort(buffer.as_mut_slice());
628 buffer.push((d, class));
635 canonical_sort(self.buffer.as_mut_slice());
639 match self.buffer.shift() {
644 Some((c, _)) => Some(c),
649 fn size_hint(&self) -> (uint, Option<uint>) {
650 let (lower, _) = self.iter.size_hint();
655 /// Replace all occurrences of one string with another
659 /// * s - The string containing substrings to replace
660 /// * from - The string to replace
661 /// * to - The replacement string
665 /// The original string with all occurrences of `from` replaced with `to`
666 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
667 let mut result = StrBuf::new();
668 let mut last_end = 0;
669 for (start, end) in s.match_indices(from) {
670 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
674 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
679 Section: Comparing strings
682 // share the implementation of the lang-item vs. non-lang-item
685 fn eq_slice_(a: &str, b: &str) -> bool {
686 a.len() == b.len() && unsafe {
687 libc::memcmp(a.as_ptr() as *libc::c_void,
688 b.as_ptr() as *libc::c_void,
689 a.len() as libc::size_t) == 0
693 /// Bytewise slice equality
697 pub fn eq_slice(a: &str, b: &str) -> bool {
701 /// Bytewise slice equality
704 pub fn eq_slice(a: &str, b: &str) -> bool {
708 /// Bytewise string equality
710 #[lang="uniq_str_eq"]
712 pub fn eq(a: &~str, b: &~str) -> bool {
718 pub fn eq(a: &~str, b: &~str) -> bool {
726 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
727 /// returning `true` in that case, or, if it is invalid, `false` with
728 /// `iter` reset such that it is pointing at the first byte in the
729 /// invalid sequence.
731 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
733 // save the current thing we're pointing at.
736 // restore the iterator we had at the start of this codepoint.
737 macro_rules! err ( () => { {*iter = old; return false} });
738 macro_rules! next ( () => {
741 // we needed data, but there was none: error!
746 let first = match iter.next() {
748 // we're at the end of the iterator and a codepoint
749 // boundary at the same time, so this string is valid.
753 // ASCII characters are always valid, so only large
754 // bytes need more examination.
756 let w = utf8_char_width(first);
757 let second = next!();
758 // 2-byte encoding is for codepoints \u0080 to \u07ff
759 // first C2 80 last DF BF
760 // 3-byte encoding is for codepoints \u0800 to \uffff
761 // first E0 A0 80 last EF BF BF
762 // excluding surrogate codepoints \ud800 to \udfff
763 // ED A0 80 to ED BF BF
764 // 4-byte encoding is for codepoints \u10000 to \u10ffff
765 // first F0 90 80 80 last F4 8F BF BF
767 // Use the UTF-8 syntax from the RFC
769 // https://tools.ietf.org/html/rfc3629
771 // UTF8-2 = %xC2-DF UTF8-tail
772 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
773 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
774 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
775 // %xF4 %x80-8F 2( UTF8-tail )
777 2 => if second & 192 != TAG_CONT_U8 {err!()},
779 match (first, second, next!() & 192) {
780 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
781 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
782 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
783 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
788 match (first, second, next!() & 192, next!() & 192) {
789 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
790 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
791 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
801 /// Determines if a vector of bytes contains valid UTF-8.
802 pub fn is_utf8(v: &[u8]) -> bool {
803 run_utf8_validation_iterator(&mut v.iter())
807 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
808 let mut it = v.iter();
810 let ok = run_utf8_validation_iterator(&mut it);
814 // work out how many valid bytes we've consumed
815 // (run_utf8_validation_iterator resets the iterator to just
816 // after the last good byte), which we can do because the
817 // vector iterator size_hint is exact.
818 let (remaining, _) = it.size_hint();
819 Some(v.len() - remaining)
823 /// Determines if a vector of `u16` contains valid UTF-16
824 pub fn is_utf16(v: &[u16]) -> bool {
825 let mut it = v.iter();
826 macro_rules! next ( ($ret:expr) => {
827 match it.next() { Some(u) => *u, None => return $ret }
833 match char::from_u32(u as u32) {
836 let u2 = next!(false);
837 if u < 0xD7FF || u > 0xDBFF ||
838 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
844 /// An iterator that decodes UTF-16 encoded codepoints from a vector
847 pub struct UTF16Items<'a> {
848 iter: slice::Items<'a, u16>
850 /// The possibilities for values decoded from a `u16` stream.
851 #[deriving(Eq, TotalEq, Clone, Show)]
853 /// A valid codepoint.
855 /// An invalid surrogate without its pair.
860 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
861 /// replacement character (U+FFFD).
863 pub fn to_char_lossy(&self) -> char {
866 LoneSurrogate(_) => '\uFFFD'
871 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
872 fn next(&mut self) -> Option<UTF16Item> {
873 let u = match self.iter.next() {
878 if u < 0xD800 || 0xDFFF < u {
880 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
881 } else if u >= 0xDC00 {
882 // a trailing surrogate
883 Some(LoneSurrogate(u))
885 // preserve state for rewinding.
888 let u2 = match self.iter.next() {
891 None => return Some(LoneSurrogate(u))
893 if u2 < 0xDC00 || u2 > 0xDFFF {
894 // not a trailing surrogate so we're not a valid
895 // surrogate pair, so rewind to redecode u2 next time.
897 return Some(LoneSurrogate(u))
900 // all ok, so let's decode it.
901 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
902 Some(ScalarValue(unsafe {cast::transmute(c)}))
907 fn size_hint(&self) -> (uint, Option<uint>) {
908 let (low, high) = self.iter.size_hint();
909 // we could be entirely valid surrogates (2 elements per
910 // char), or entirely non-surrogates (1 element per char)
915 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
916 /// returning invalid surrogates as `LoneSurrogate`s.
922 /// use std::str::{ScalarValue, LoneSurrogate};
924 /// // 𝄞mus<invalid>ic<invalid>
925 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
926 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
929 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
930 /// ~[ScalarValue('𝄞'),
931 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
932 /// LoneSurrogate(0xDD1E),
933 /// ScalarValue('i'), ScalarValue('c'),
934 /// LoneSurrogate(0xD834)]);
936 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
937 UTF16Items { iter : v.iter() }
940 /// Return a slice of `v` ending at (and not including) the first NUL
949 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
950 /// // no NULs so no change
951 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
955 /// assert_eq!(str::truncate_utf16_at_nul(v),
956 /// &['a' as u16, 'b' as u16]);
958 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
959 match v.iter().position(|c| *c == 0) {
960 // don't include the 0
961 Some(i) => v.slice_to(i),
966 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
967 /// if `v` contains any invalid data.
975 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
976 /// 0x0073, 0x0069, 0x0063];
977 /// assert_eq!(str::from_utf16(v), Some("𝄞music".to_owned()));
979 /// // 𝄞mu<invalid>ic
981 /// assert_eq!(str::from_utf16(v), None);
983 pub fn from_utf16(v: &[u16]) -> Option<~str> {
984 let mut s = StrBuf::with_capacity(v.len() / 2);
985 for c in utf16_items(v) {
987 ScalarValue(c) => s.push_char(c),
988 LoneSurrogate(_) => return None
994 /// Decode a UTF-16 encoded vector `v` into a string, replacing
995 /// invalid data with the replacement character (U+FFFD).
1001 /// // 𝄞mus<invalid>ic<invalid>
1002 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1003 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1006 /// assert_eq!(str::from_utf16_lossy(v),
1007 /// "𝄞mus\uFFFDic\uFFFD".to_owned());
1009 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1010 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1013 // https://tools.ietf.org/html/rfc3629
1014 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1015 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1016 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1017 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1018 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1019 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1020 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1021 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1022 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1023 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1024 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1025 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1026 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1027 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1028 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1029 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1030 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1033 /// Given a first byte, determine how many bytes are in this UTF-8 character
1035 pub fn utf8_char_width(b: u8) -> uint {
1036 return UTF8_CHAR_WIDTH[b as uint] as uint;
1039 /// Struct that contains a `char` and the index of the first byte of
1040 /// the next `char` in a string. This can be used as a data structure
1041 /// for iterating over the UTF-8 bytes of a string.
1042 pub struct CharRange {
1045 /// Index of the first byte of the next `char`
1049 // Return the initial codepoint accumulator for the first byte.
1050 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1051 // for width 3, and 3 bits for width 4
1052 macro_rules! utf8_first_byte(
1053 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1056 // return the value of $ch updated with continuation byte $byte
1057 macro_rules! utf8_acc_cont_byte(
1058 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
1061 static TAG_CONT_U8: u8 = 128u8; // 0b1000_0000: a byte `b` is a UTF-8 continuation byte when `b & 192 == TAG_CONT_U8`
1063 /// Converts a vector of bytes to a new utf-8 string.
1064 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1069 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1070 /// let output = std::str::from_utf8_lossy(input);
1071 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1073 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1074 let firstbad = match first_non_utf8_index(v) {
1075 None => return Slice(unsafe { cast::transmute(v) }),
1079 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1080 let mut i = firstbad;
1081 let total = v.len();
1082 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1083 unsafe { *xs.unsafe_ref(i) }
1085 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1093 let mut res = StrBuf::with_capacity(total);
1097 res.push_bytes(v.slice_to(i))
1101 // subseqidx is the index of the first byte of the subsequence we're looking at.
1102 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1104 let mut subseqidx = firstbad;
1108 let byte = unsafe_get(v, i);
1111 macro_rules! error(() => ({
1113 if subseqidx != i_ {
1114 res.push_bytes(v.slice(subseqidx, i_));
1117 res.push_bytes(REPLACEMENT);
1122 // subseqidx handles this
1124 let w = utf8_char_width(byte);
1128 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1135 match (byte, safe_get(v, i, total)) {
1136 (0xE0 , 0xA0 .. 0xBF) => (),
1137 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1138 (0xED , 0x80 .. 0x9F) => (),
1139 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1146 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1153 match (byte, safe_get(v, i, total)) {
1154 (0xF0 , 0x90 .. 0xBF) => (),
1155 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1156 (0xF4 , 0x80 .. 0x8F) => (),
1163 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1168 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1181 if subseqidx < total {
1183 res.push_bytes(v.slice(subseqidx, total))
1186 Owned(res.into_owned())
1193 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1194 /// This can be useful as an optimization when an allocation is sometimes
1195 /// needed but not always.
1196 pub enum MaybeOwned<'a> {
1197 /// A borrowed string
1203 /// SendStr is a specialization of `MaybeOwned` to be sendable
1204 pub type SendStr = MaybeOwned<'static>;
1206 impl<'a> MaybeOwned<'a> {
1207 /// Returns `true` if this `MaybeOwned` wraps an owned string
1209 pub fn is_owned(&self) -> bool {
1216 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1218 pub fn is_slice(&self) -> bool {
1226 /// Trait for moving into a `MaybeOwned`
1227 pub trait IntoMaybeOwned<'a> {
1228 /// Moves self into a `MaybeOwned`
1229 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1232 impl<'a> IntoMaybeOwned<'a> for ~str {
1234 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1237 impl<'a> IntoMaybeOwned<'a> for &'a str {
1239 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1242 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1244 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1247 impl<'a> Eq for MaybeOwned<'a> {
1249 fn eq(&self, other: &MaybeOwned) -> bool {
1250 self.as_slice() == other.as_slice()
1254 impl<'a> TotalEq for MaybeOwned<'a> {}
1256 impl<'a> Ord for MaybeOwned<'a> {
1258 fn lt(&self, other: &MaybeOwned) -> bool {
1259 self.as_slice().lt(&other.as_slice())
1263 impl<'a> TotalOrd for MaybeOwned<'a> {
1265 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1266 self.as_slice().cmp(&other.as_slice())
1270 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1272 fn equiv(&self, other: &S) -> bool {
1273 self.as_slice() == other.as_slice()
1277 impl<'a> Str for MaybeOwned<'a> {
1279 fn as_slice<'b>(&'b self) -> &'b str {
1282 Owned(ref s) => s.as_slice()
1287 fn into_owned(self) -> ~str {
1289 Slice(s) => s.to_owned(),
1295 impl<'a> Container for MaybeOwned<'a> {
1297 fn len(&self) -> uint { self.as_slice().len() }
1300 impl<'a> Clone for MaybeOwned<'a> {
1302 fn clone(&self) -> MaybeOwned<'a> {
1304 Slice(s) => Slice(s),
1305 Owned(ref s) => Owned(s.to_owned())
1310 impl<'a> Default for MaybeOwned<'a> {
1312 fn default() -> MaybeOwned<'a> { Slice("") }
1315 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1317 fn hash(&self, hasher: &mut H) {
1319 Slice(s) => s.hash(hasher),
1320 Owned(ref s) => s.hash(hasher),
1325 impl<'a> fmt::Show for MaybeOwned<'a> {
1327 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1329 Slice(ref s) => s.fmt(f),
1330 Owned(ref s) => s.fmt(f)
1335 /// Unsafe operations
1338 use container::Container;
1344 use slice::{MutableVector, ImmutableVector, OwnedVector, Vector};
1345 use str::{is_utf8, StrSlice};
1348 /// Create a Rust string from a *u8 buffer of the given length
1349 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1350 let mut v = Vec::with_capacity(len);
1351 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1354 assert!(is_utf8(v.as_slice()));
1355 ::cast::transmute(v.move_iter().collect::<~[u8]>())
1358 #[lang="strdup_uniq"]
1361 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1362 from_buf_len(ptr, len)
1365 /// Create a Rust string from a null-terminated C string
1366 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1371 curr = buf.offset(i);
1373 from_buf_len(buf as *u8, i as uint)
1376 /// Converts a slice of bytes to a string slice without checking
1377 /// that the string contains valid UTF-8.
1378 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1382 /// Converts an owned vector of bytes to a new owned string. This assumes
1383 /// that the utf-8-ness of the vector has already been validated
1385 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1389 /// Converts a byte to a string.
1390 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(box [u]) }
1392 /// Form a slice from a C string. Unsafe because the caller must ensure the
1393 /// C string has the static lifetime, or else the return value may be
1394 /// invalidated later.
1395 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1399 while *curr != 0u8 {
1401 curr = s.offset(len as int);
1403 let v = Slice { data: s, len: len };
1404 assert!(is_utf8(::cast::transmute(v)));
1405 ::cast::transmute(v)
1408 /// Takes a bytewise (not UTF-8) slice from a string.
1410 /// Returns the substring from [`begin`..`end`).
1414 /// If begin is greater than end.
1415 /// If end is greater than the length of the string.
1417 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1418 assert!(begin <= end);
1419 assert!(end <= s.len());
1420 slice_unchecked(s, begin, end)
1423 /// Takes a bytewise (not UTF-8) slice from a string.
1425 /// Returns the substring from [`begin`..`end`).
1427 /// Caller must check slice boundaries!
1429 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1430 cast::transmute(Slice {
1431 data: s.as_ptr().offset(begin as int),
1436 /// Access the str in its vector representation.
1437 /// The caller must preserve the valid UTF-8 property when modifying.
1439 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1443 /// Sets the length of a string
1445 /// This will explicitly set the size of the string, without actually
1446 /// modifying its buffers, so it is up to the caller to ensure that
1447 /// the string is actually the specified size.
1449 fn test_from_buf_len() {
1451 let a = box [65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1453 let c = from_buf_len(b, 3u);
1454 assert_eq!(c, "AAA".to_owned());
1460 Section: Trait implementations
1464 #[allow(missing_doc)]
1466 use container::Container;
1467 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1470 use option::{Some, None};
1471 use str::{Str, StrSlice, eq_slice};
1474 impl<'a> Add<&'a str,~str> for &'a str {
1476 fn add(&self, rhs: & &'a str) -> ~str {
1477 let mut ret = StrBuf::from_owned_str(self.to_owned());
1483 impl<'a> TotalOrd for &'a str {
1485 fn cmp(&self, other: & &'a str) -> Ordering {
1486 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1487 match s_b.cmp(&o_b) {
1488 Greater => return Greater,
1489 Less => return Less,
1494 self.len().cmp(&other.len())
1498 impl TotalOrd for ~str {
1500 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1503 impl<'a> Eq for &'a str {
1505 fn eq(&self, other: & &'a str) -> bool {
1506 eq_slice((*self), (*other))
1509 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1514 fn eq(&self, other: &~str) -> bool {
1515 eq_slice((*self), (*other))
1519 impl<'a> TotalEq for &'a str {}
1521 impl TotalEq for ~str {}
1523 impl<'a> Ord for &'a str {
1525 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1530 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1533 impl<'a, S: Str> Equiv<S> for &'a str {
1535 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1538 impl<'a, S: Str> Equiv<S> for ~str {
1540 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1547 /// Any string that can be represented as a slice
1549 /// Work with `self` as a slice.
1550 fn as_slice<'a>(&'a self) -> &'a str;
1552 /// Convert `self` into a ~str, not making a copy if possible.
1553 fn into_owned(self) -> ~str;
1555 /// Convert `self` into a `StrBuf`.
1557 fn to_strbuf(&self) -> StrBuf {
1558 StrBuf::from_str(self.as_slice())
1561 /// Convert `self` into a `StrBuf`, not making a copy if possible.
1563 fn into_strbuf(self) -> StrBuf {
1564 StrBuf::from_owned_str(self.into_owned())
1568 impl<'a> Str for &'a str {
1570 fn as_slice<'a>(&'a self) -> &'a str { *self }
1573 fn into_owned(self) -> ~str { self.to_owned() }
1576 impl<'a> Str for ~str {
1578 fn as_slice<'a>(&'a self) -> &'a str {
1579 let s: &'a str = *self; s
1583 fn into_owned(self) -> ~str { self }
1586 impl<'a> Container for &'a str {
1588 fn len(&self) -> uint {
1593 impl Container for ~str {
1595 fn len(&self) -> uint { self.as_slice().len() }
1598 /// Methods for string slices
1599 pub trait StrSlice<'a> {
1600 /// Returns true if one string contains another
1604 /// - needle - The string to look for
1605 fn contains<'a>(&self, needle: &'a str) -> bool;
1607 /// Returns true if a string contains a char.
1611 /// - needle - The char to look for
1612 fn contains_char(&self, needle: char) -> bool;
1614 /// An iterator over the characters of `self`. Note, this iterates
1615 /// over unicode code-points, not unicode graphemes.
1620 /// let v: ~[char] = "abc åäö".chars().collect();
1621 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1623 fn chars(&self) -> Chars<'a>;
1625 /// Do not use this - it is deprecated.
1626 #[deprecated = "replaced by .chars().rev()"]
1627 fn chars_rev(&self) -> Rev<Chars<'a>>;
1629 /// An iterator over the bytes of `self`
1630 fn bytes(&self) -> Bytes<'a>;
1632 /// Do not use this - it is deprecated.
1633 #[deprecated = "replaced by .bytes().rev()"]
1634 fn bytes_rev(&self) -> Rev<Bytes<'a>>;
1636 /// An iterator over the characters of `self` and their byte offsets.
1637 fn char_indices(&self) -> CharOffsets<'a>;
1639 /// Do not use this - it is deprecated.
1640 #[deprecated = "replaced by .char_indices().rev()"]
1641 fn char_indices_rev(&self) -> Rev<CharOffsets<'a>>;
1643 /// An iterator over substrings of `self`, separated by characters
1644 /// matched by `sep`.
1649 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1650 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1652 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1653 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1655 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1656 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1658 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1660 /// An iterator over substrings of `self`, separated by characters
1661 /// matched by `sep`, restricted to splitting at most `count`
1667 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1668 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1670 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1671 /// assert_eq!(v, ~["abc", "def2ghi"]);
1673 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1674 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1676 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1678 /// An iterator over substrings of `self`, separated by characters
1679 /// matched by `sep`.
1681 /// Equivalent to `split`, except that the trailing substring
1682 /// is skipped if empty (terminator semantics).
1687 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1688 /// assert_eq!(v, ~["A", "B"]);
1690 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1691 /// assert_eq!(v, ~["A", "", "B", ""]);
1693 /// let v: ~[&str] = "Mary had a little lamb".split(' ').rev().collect();
1694 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1696 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).rev().collect();
1697 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1699 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').rev().collect();
1700 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1702 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1704 /// Do not use this - it is deprecated.
1705 #[deprecated = "replaced by .split(sep).rev()"]
1706 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> Rev<CharSplits<'a, Sep>>;
1708 /// An iterator over substrings of `self`, separated by characters
1709 /// matched by `sep`, starting from the end of the string.
1710 /// Restricted to splitting at most `count` times.
1715 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1716 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1718 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1719 /// assert_eq!(v, ~["ghi", "abc1def"]);
1721 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1722 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1724 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1726 /// An iterator over the start and end indices of the disjoint
1727 /// matches of `sep` within `self`.
1729 /// That is, each returned value `(start, end)` satisfies
1730 /// `self.slice(start, end) == sep`. For matches of `sep` within
1731 /// `self` that overlap, only the indices corresponding to the
1732 /// first match are returned.
/// Fails if `sep` is empty.
1737 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1738 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1740 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1741 /// assert_eq!(v, ~[(1,4), (4,7)]);
1743 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1744 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1746 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1748 /// An iterator over the substrings of `self` separated by `sep`.
1753 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1754 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1756 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1757 /// assert_eq!(v, ~["1", "", "2"]);
1759 fn split_str(&self, &'a str) -> StrSplits<'a>;
1761 /// An iterator over the lines of a string (subsequences separated
1762 /// by `\n`). This does not include the empty string after a
1768 /// let four_lines = "foo\nbar\n\nbaz\n";
1769 /// let v: ~[&str] = four_lines.lines().collect();
1770 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1772 fn lines(&self) -> CharSplits<'a, char>;
1774 /// An iterator over the lines of a string, separated by either
1775 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1776 /// empty trailing line.
1781 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1782 /// let v: ~[&str] = four_lines.lines_any().collect();
1783 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1785 fn lines_any(&self) -> AnyLines<'a>;
1787 /// An iterator over the words of a string (subsequences separated
1788 /// by any sequence of whitespace). Sequences of whitespace are
1789 /// collapsed, so empty "words" are not included.
1794 /// let some_words = " Mary had\ta little \n\t lamb";
1795 /// let v: ~[&str] = some_words.words().collect();
1796 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1798 fn words(&self) -> Words<'a>;
1800 /// An Iterator over the string in Unicode Normalization Form D
1801 /// (canonical decomposition).
1802 fn nfd_chars(&self) -> Normalizations<'a>;
1804 /// An Iterator over the string in Unicode Normalization Form KD
1805 /// (compatibility decomposition).
1806 fn nfkd_chars(&self) -> Normalizations<'a>;
1808 /// Returns true if the string contains only whitespace.
1810 /// Whitespace characters are determined by `char::is_whitespace`.
1815 /// assert!(" \t\n".is_whitespace());
1816 /// assert!("".is_whitespace());
1818 /// assert!( !"abc".is_whitespace());
1820 fn is_whitespace(&self) -> bool;
1822 /// Returns true if the string contains only alphanumeric code
1825 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1830 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1831 /// assert!("".is_alphanumeric());
1833 /// assert!( !" &*~".is_alphanumeric());
1835 fn is_alphanumeric(&self) -> bool;
1837 /// Returns the number of Unicode code points (`char`) that a
1840 /// This does not perform any normalization, and is `O(n)`, since
1841 /// UTF-8 is a variable width encoding of code points.
1843 /// *Warning*: The number of code points in a string does not directly
1844 /// correspond to the number of visible characters or width of the
1845 /// visible text due to composing characters, and double- and
1846 /// zero-width ones.
1848 /// See also `.len()` for the byte length.
1853 /// // composed forms of `ö` and `é`
1854 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1855 /// // decomposed forms of `ö` and `é`
1856 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1858 /// assert_eq!(c.char_len(), 15);
1859 /// assert_eq!(d.char_len(), 17);
1861 /// assert_eq!(c.len(), 21);
1862 /// assert_eq!(d.len(), 23);
1864 /// // the two strings *look* the same
1865 /// println!("{}", c);
1866 /// println!("{}", d);
1868 fn char_len(&self) -> uint;
1870 /// Returns a slice of the given string from the byte range
1871 /// [`begin`..`end`).
1873 /// This operation is `O(1)`.
1875 /// Fails when `begin` and `end` do not point to valid characters
1876 /// or point beyond the last character of the string.
1878 /// See also `slice_to` and `slice_from` for slicing prefixes and
1879 /// suffixes of strings, and `slice_chars` for slicing based on
1880 /// code point counts.
1885 /// let s = "Löwe 老虎 Léopard";
1886 /// assert_eq!(s.slice(0, 1), "L");
1888 /// assert_eq!(s.slice(1, 9), "öwe 老");
1890 /// // these will fail:
1891 /// // byte 2 lies within `ö`:
1892 /// // s.slice(2, 3);
1894 /// // byte 8 lies within `老`
1895 /// // s.slice(1, 8);
1897 /// // byte 100 is outside the string
1898 /// // s.slice(3, 100);
1900 fn slice(&self, begin: uint, end: uint) -> &'a str;
1902 /// Returns a slice of the string from `begin` to its end.
1904 /// Equivalent to `self.slice(begin, self.len())`.
1906 /// Fails when `begin` does not point to a valid character, or is
1909 /// See also `slice`, `slice_to` and `slice_chars`.
1910 fn slice_from(&self, begin: uint) -> &'a str;
1912 /// Returns a slice of the string from the beginning to byte
1915 /// Equivalent to `self.slice(0, end)`.
1917 /// Fails when `end` does not point to a valid character, or is
1920 /// See also `slice`, `slice_from` and `slice_chars`.
1921 fn slice_to(&self, end: uint) -> &'a str;
1923 /// Returns a slice of the string from the character range
1924 /// [`begin`..`end`).
1926 /// That is, start at the `begin`-th code point of the string and
1927 /// continue to the `end`-th code point. This does not detect or
1928 /// handle edge cases such as leaving a combining character as the
1929 /// first code point of the string.
1931 /// Due to the design of UTF-8, this operation is `O(end)`.
1932 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1933 /// variants that use byte indices rather than code point
1936 /// Fails if `begin` > `end` or if either `begin` or `end` is
1937 /// beyond the last character of the string.
1942 /// let s = "Löwe 老虎 Léopard";
1943 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
1944 /// assert_eq!(s.slice_chars(5, 7), "老虎");
1946 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
1948 /// Returns true if `needle` is a prefix of the string.
1949 fn starts_with(&self, needle: &str) -> bool;
1951 /// Returns true if `needle` is a suffix of the string.
1952 fn ends_with(&self, needle: &str) -> bool;
1954 /// Escape each char in `self` with `char::escape_default`.
1955 fn escape_default(&self) -> ~str;
1957 /// Escape each char in `self` with `char::escape_unicode`.
1958 fn escape_unicode(&self) -> ~str;
1960 /// Returns a string with leading and trailing whitespace removed.
1961 fn trim(&self) -> &'a str;
1963 /// Returns a string with leading whitespace removed.
1964 fn trim_left(&self) -> &'a str;
1966 /// Returns a string with trailing whitespace removed.
1967 fn trim_right(&self) -> &'a str;
1969 /// Returns a string with leading and trailing characters that match `to_trim` removed.
1973 /// * to_trim - a character matcher
1978 /// assert_eq!("11foo1bar11".trim_chars('1'), "foo1bar")
1979 /// assert_eq!("12foo1bar12".trim_chars(&['1', '2']), "foo1bar")
1980 /// assert_eq!("123foo1bar123".trim_chars(|c: char| c.is_digit()), "foo1bar")
1982 fn trim_chars<C: CharEq>(&self, to_trim: C) -> &'a str;
1984 /// Returns a string with leading characters that match `to_trim` removed.
1988 /// * to_trim - a character matcher
1993 /// assert_eq!("11foo1bar11".trim_left_chars('1'), "foo1bar11")
1994 /// assert_eq!("12foo1bar12".trim_left_chars(&['1', '2']), "foo1bar12")
1995 /// assert_eq!("123foo1bar123".trim_left_chars(|c: char| c.is_digit()), "foo1bar123")
1997 fn trim_left_chars<C: CharEq>(&self, to_trim: C) -> &'a str;
1999 /// Returns a string with trailing characters that match `to_trim` removed.
2003 /// * to_trim - a character matcher
2008 /// assert_eq!("11foo1bar11".trim_right_chars('1'), "11foo1bar")
2009 /// assert_eq!("12foo1bar12".trim_right_chars(&['1', '2']), "12foo1bar")
2010 /// assert_eq!("123foo1bar123".trim_right_chars(|c: char| c.is_digit()), "123foo1bar")
2012 fn trim_right_chars<C: CharEq>(&self, to_trim: C) -> &'a str;
2014 /// Replace all occurrences of one string with another.
2018 /// * `from` - The string to replace
2019 /// * `to` - The replacement string
2023 /// The original string with all occurrences of `from` replaced with `to`.
2028 /// let s = "Do you know the muffin man,
2029 /// The muffin man, the muffin man, ...".to_owned();
2031 /// assert_eq!(s.replace("muffin man", "little lamb"),
2032 /// "Do you know the little lamb,
2033 /// The little lamb, the little lamb, ...".to_owned());
2035 /// // not found, so no change.
2036 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2038 fn replace(&self, from: &str, to: &str) -> ~str;
2040 /// Copy a slice into a new owned str.
2041 fn to_owned(&self) -> ~str;
2043 /// Converts to a vector of `u16` encoded as UTF-16.
2044 fn to_utf16(&self) -> ~[u16];
2046 /// Check that `index`-th byte lies at the start and/or end of a
2047 /// UTF-8 code point sequence.
2049 /// The start and end of the string (when `index == self.len()`)
2050 /// are considered to be boundaries.
2052 /// Fails if `index` is greater than `self.len()`.
2057 /// let s = "Löwe 老虎 Léopard";
2058 /// assert!(s.is_char_boundary(0));
2060 /// assert!(s.is_char_boundary(6));
2061 /// assert!(s.is_char_boundary(s.len()));
2063 /// // second byte of `ö`
2064 /// assert!(!s.is_char_boundary(2));
2066 /// // third byte of `老`
2067 /// assert!(!s.is_char_boundary(8));
2069 fn is_char_boundary(&self, index: uint) -> bool;
2071 /// Pluck a character out of a string and return the index of the next
2074 /// This function can be used to iterate over the unicode characters of a
2079 /// This example manually iterates through the characters of a
2080 /// string; this should normally be done by `.chars()` or
2081 /// `.char_indices`.
2084 /// use std::str::CharRange;
2086 /// let s = "中华Việt Nam";
2088 /// while i < s.len() {
2089 /// let CharRange {ch, next} = s.char_range_at(i);
2090 /// println!("{}: {}", i, ch);
2112 /// * self - The string
2113 /// * start - The byte offset of the char to extract
2117 /// A record {ch: char, next: uint} containing the char value and the byte
2118 /// index of the next unicode character.
2122 /// If `start` is greater than or equal to the length of the string.
2123 /// If `start` is not the index of the beginning of a valid UTF-8 character.
2124 fn char_range_at(&self, start: uint) -> CharRange;
2126 /// Given a byte position and a str, return the previous char and its position.
2128 /// This function can be used to iterate over a unicode string in reverse.
2130 /// Returns 0 for next index if called on start index 0.
2131 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2133 /// Plucks the character starting at the `i`th byte of a string
2134 fn char_at(&self, i: uint) -> char;
2136 /// Plucks the character ending at the `i`th byte of a string
2137 fn char_at_reverse(&self, i: uint) -> char;
2139 /// Work with the byte buffer of a string as a byte slice.
2140 fn as_bytes(&self) -> &'a [u8];
2142 /// Returns the byte index of the first character of `self` that
2143 /// matches `search`.
2147 /// `Some` containing the byte index of the first matching character
2148 /// or `None` if there is no match.
2153 /// let s = "Löwe 老虎 Léopard";
2155 /// assert_eq!(s.find('L'), Some(0));
2156 /// assert_eq!(s.find('é'), Some(14));
2158 /// // the first space
2159 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2161 /// // neither are found
2162 /// assert_eq!(s.find(&['1', '2']), None);
2164 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2166 /// Returns the byte index of the last character of `self` that
2167 /// matches `search`.
2171 /// `Some` containing the byte index of the last matching character
2172 /// or `None` if there is no match.
2177 /// let s = "Löwe 老虎 Léopard";
2179 /// assert_eq!(s.rfind('L'), Some(13));
2180 /// assert_eq!(s.rfind('é'), Some(14));
2182 /// // the second space
2183 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2185 /// // searches for an occurrence of either `1` or `2`, but neither are found
2186 /// assert_eq!(s.rfind(&['1', '2']), None);
2188 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2190 /// Returns the byte index of the first matching substring
2194 /// * `needle` - The string to search for
2198 /// `Some` containing the byte index of the first matching substring
2199 /// or `None` if there is no match.
2204 /// let s = "Löwe 老虎 Léopard";
2206 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2207 /// assert_eq!(s.find_str("muffin man"), None);
2209 fn find_str(&self, &str) -> Option<uint>;
2211 /// Given a string, make a new string with repeated copies of it.
2212 fn repeat(&self, nn: uint) -> ~str;
2214 /// Retrieves the first character from a string slice and returns
2215 /// it. This does not allocate a new string; instead, it returns a
2216 /// slice that points one character beyond the character that was
2217 /// shifted. If the string does not contain any characters,
2218 /// a tuple of None and an empty string is returned instead.
2223 /// let s = "Löwe 老虎 Léopard";
2224 /// let (c, s1) = s.slice_shift_char();
2225 /// assert_eq!(c, Some('L'));
2226 /// assert_eq!(s1, "öwe 老虎 Léopard");
2228 /// let (c, s2) = s1.slice_shift_char();
2229 /// assert_eq!(c, Some('ö'));
2230 /// assert_eq!(s2, "we 老虎 Léopard");
2232 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2234 /// Levenshtein Distance between two strings.
2235 fn lev_distance(&self, t: &str) -> uint;
2237 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2239 /// Fails if `inner` is not a direct slice contained within self.
2244 /// let string = "a\nb\nc";
2245 /// let lines: ~[&str] = string.lines().collect();
2247 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2248 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2249 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2251 fn subslice_offset(&self, inner: &str) -> uint;
2253 /// Return an unsafe pointer to the string's buffer.
2255 /// The caller must ensure that the string outlives this pointer,
2256 /// and that it is not reallocated (e.g. by pushing to the
2258 fn as_ptr(&self) -> *u8;
2261 impl<'a> StrSlice<'a> for &'a str {
2263 fn contains<'a>(&self, needle: &'a str) -> bool {
2264 self.find_str(needle).is_some()
2268 fn contains_char(&self, needle: char) -> bool {
2269 self.find(needle).is_some()
2273 fn chars(&self) -> Chars<'a> {
2274 Chars{string: *self}
2278 #[deprecated = "replaced by .chars().rev()"]
2279 fn chars_rev(&self) -> Rev<Chars<'a>> {
2284 fn bytes(&self) -> Bytes<'a> {
2285 self.as_bytes().iter().map(|&b| b)
2289 #[deprecated = "replaced by .bytes().rev()"]
2290 fn bytes_rev(&self) -> Rev<Bytes<'a>> {
2295 fn char_indices(&self) -> CharOffsets<'a> {
2296 CharOffsets{string: *self, iter: self.chars()}
2300 #[deprecated = "replaced by .char_indices().rev()"]
2301 fn char_indices_rev(&self) -> Rev<CharOffsets<'a>> {
2302 self.char_indices().rev()
2306 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2309 only_ascii: sep.only_ascii(),
2311 allow_trailing_empty: true,
2317 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2318 -> CharSplitsN<'a, Sep> {
2320 iter: self.split(sep),
2327 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2328 -> CharSplits<'a, Sep> {
2330 allow_trailing_empty: false,
2336 #[deprecated = "replaced by .split(sep).rev()"]
2337 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> Rev<CharSplits<'a, Sep>> {
2338 self.split(sep).rev()
2342 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2343 -> CharSplitsN<'a, Sep> {
2345 iter: self.split(sep),
2352 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2353 assert!(!sep.is_empty())
2362 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2364 it: self.match_indices(sep),
2371 fn lines(&self) -> CharSplits<'a, char> {
2372 self.split_terminator('\n')
2375 fn lines_any(&self) -> AnyLines<'a> {
2376 self.lines().map(|line| {
2378 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2384 fn words(&self) -> Words<'a> {
2385 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2389 fn nfd_chars(&self) -> Normalizations<'a> {
2399 fn nfkd_chars(&self) -> Normalizations<'a> {
2409 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2412 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2415 fn char_len(&self) -> uint { self.chars().len() }
2418 fn slice(&self, begin: uint, end: uint) -> &'a str {
2419 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2420 unsafe { raw::slice_bytes(*self, begin, end) }
2424 fn slice_from(&self, begin: uint) -> &'a str {
2425 self.slice(begin, self.len())
2429 fn slice_to(&self, end: uint) -> &'a str {
2430 assert!(self.is_char_boundary(end));
2431 unsafe { raw::slice_bytes(*self, 0, end) }
2434 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2435 assert!(begin <= end);
2437 let mut begin_byte = None;
2438 let mut end_byte = None;
2440 // This could be even more efficient by not decoding,
2441 // only finding the char boundaries
2442 for (idx, _) in self.char_indices() {
2443 if count == begin { begin_byte = Some(idx); }
2444 if count == end { end_byte = Some(idx); break; }
2447 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2448 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2450 match (begin_byte, end_byte) {
2451 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2452 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2453 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2458 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2459 let n = needle.len();
2460 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2464 fn ends_with(&self, needle: &str) -> bool {
2465 let (m, n) = (self.len(), needle.len());
2466 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2469 fn escape_default(&self) -> ~str {
2470 let mut out = StrBuf::with_capacity(self.len());
2471 for c in self.chars() {
2472 c.escape_default(|c| out.push_char(c));
2477 fn escape_unicode(&self) -> ~str {
2478 let mut out = StrBuf::with_capacity(self.len());
2479 for c in self.chars() {
2480 c.escape_unicode(|c| out.push_char(c));
2486 fn trim(&self) -> &'a str {
2487 self.trim_left().trim_right()
2491 fn trim_left(&self) -> &'a str {
2492 self.trim_left_chars(char::is_whitespace)
2496 fn trim_right(&self) -> &'a str {
2497 self.trim_right_chars(char::is_whitespace)
2501 fn trim_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
2502 let cur = match self.find(|c: char| !to_trim.matches(c)) {
2504 Some(i) => unsafe { raw::slice_bytes(*self, i, self.len()) }
2506 match cur.rfind(|c: char| !to_trim.matches(c)) {
2509 let right = cur.char_range_at(i).next;
2510 unsafe { raw::slice_bytes(cur, 0, right) }
2516 fn trim_left_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
2517 match self.find(|c: char| !to_trim.matches(c)) {
2519 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2524 fn trim_right_chars<C: CharEq>(&self, mut to_trim: C) -> &'a str {
2525 match self.rfind(|c: char| !to_trim.matches(c)) {
2528 let next = self.char_range_at(last).next;
2529 unsafe { raw::slice_bytes(*self, 0u, next) }
2534 fn replace(&self, from: &str, to: &str) -> ~str {
2535 let mut result = StrBuf::new();
2536 let mut last_end = 0;
2537 for (start, end) in self.match_indices(from) {
2538 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2539 result.push_str(to);
2542 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2547 fn to_owned(&self) -> ~str {
2548 let len = self.len();
2550 let mut v = Vec::with_capacity(len);
2552 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2554 ::cast::transmute(v.move_iter().collect::<~[u8]>())
2558 fn to_utf16(&self) -> ~[u16] {
2559 let mut u = Vec::new();; // NOTE(review): the second `;` is a stray empty statement and can be dropped
2560 for ch in self.chars() {
2561 let mut buf = [0u16, ..2]; // two-unit scratch buffer: one code point needs at most two UTF-16 units
2562 let n = ch.encode_utf16(buf /* as mut slice! */);
2563 u.push_all(buf.slice_to(n)); // keep only the first `n` units (presumably the count `encode_utf16` wrote)
2565 u.move_iter().collect()
2569 fn is_char_boundary(&self, index: uint) -> bool {
2570 if index == self.len() { return true; } // the end of the string always counts as a boundary
2571 let b = self[index];
2572 return b < 128u8 || b >= 192u8; // true unless `b` is in 128..191, the `10xxxxxx` continuation-byte range
2576 fn char_range_at(&self, i: uint) -> CharRange {
2577 if self[i] < 128u8 {
2578 return CharRange {ch: self[i] as char, next: i + 1 };
2581 // Multibyte case is a fn to allow char_range_at to inline cleanly
2582 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2583 let mut val = s[i] as u32;
2584 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2587 val = utf8_first_byte!(val, w);
2588 val = utf8_acc_cont_byte!(val, s[i + 1]);
2589 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2590 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2592 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2595 return multibyte_char_range_at(*self, i);
2599 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2600 let mut prev = start;
2602 prev = prev.saturating_sub(1);
2603 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2605 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2606 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2607 // while there is a previous byte == 10......
2608 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2612 let mut val = s[i] as u32;
2613 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2616 val = utf8_first_byte!(val, w);
2617 val = utf8_acc_cont_byte!(val, s[i + 1]);
2618 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2619 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2621 return CharRange {ch: unsafe { transmute(val) }, next: i};
2624 return multibyte_char_range_at_reverse(*self, prev);
2628 fn char_at(&self, i: uint) -> char {
2629 self.char_range_at(i).ch
2633 fn char_at_reverse(&self, i: uint) -> char {
2634 self.char_range_at_reverse(i).ch
2638 fn as_bytes(&self) -> &'a [u8] {
2639 unsafe { cast::transmute(*self) }
2642 fn find<C: CharEq>(&self, mut search: C) -> Option<uint> {
2643 if search.only_ascii() {
2644 self.bytes().position(|b| search.matches(b as char))
2646 for (index, c) in self.char_indices() {
2647 if search.matches(c) { return Some(index); }
2653 fn rfind<C: CharEq>(&self, mut search: C) -> Option<uint> {
2654 if search.only_ascii() {
2655 self.bytes().rposition(|b| search.matches(b as char))
2657 for (index, c) in self.char_indices().rev() {
2658 if search.matches(c) { return Some(index); }
2664 fn find_str(&self, needle: &str) -> Option<uint> {
2665 if needle.is_empty() {
2668 self.match_indices(needle)
2670 .map(|(start, _end)| start)
2674 fn repeat(&self, nn: uint) -> ~str {
2675 let mut ret = StrBuf::with_capacity(nn * self.len());
2676 for _ in range(0, nn) {
2677 ret.push_str(*self);
2683 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2684 if self.is_empty() {
2685 return (None, *self);
2687 let CharRange {ch, next} = self.char_range_at(0u);
2688 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2689 return (Some(ch), next_s);
2693 fn lev_distance(&self, t: &str) -> uint {
2694 let slen = self.len(); // byte length of `self`, not a code point count
2697 if slen == 0 { return tlen; }
2698 if tlen == 0 { return slen; }
2700 let mut dcol = Vec::from_fn(tlen + 1, |x| x);
// NOTE(review): the loops below step over `chars()`, while `slen` (and
// presumably `tlen`) are byte lengths. For non-ASCII input `dcol` then has
// more cells than the inner loop ever writes, so `dcol.get(tlen)` may read a
// cell that still holds its initial value, and the early returns above report
// byte counts rather than character counts — confirm and consider sizing by
// `char_len()` instead.
2702 for (i, sc) in self.chars().enumerate() {
2704 let mut current = i;
2705 *dcol.get_mut(0) = current + 1;
2707 for (j, tc) in t.chars().enumerate() {
2709 let next = *dcol.get(j + 1);
2712 *dcol.get_mut(j + 1) = current;
2714 *dcol.get_mut(j + 1) = ::cmp::min(current, next);
2715 *dcol.get_mut(j + 1) = ::cmp::min(*dcol.get(j + 1),
2723 return *dcol.get(tlen);
2726 fn subslice_offset(&self, inner: &str) -> uint {
2727 let a_start = self.as_ptr() as uint;
2728 let a_end = a_start + self.len();
2729 let b_start = inner.as_ptr() as uint;
2730 let b_end = b_start + inner.len();
2732 assert!(a_start <= b_start);
2733 assert!(b_end <= a_end);
2738 fn as_ptr(&self) -> *u8 {
2743 /// Methods for owned strings
2744 pub trait OwnedStr {
2745 /// Consumes the string, returning the underlying byte buffer.
2747 /// The buffer does not have a null terminator.
2748 fn into_bytes(self) -> ~[u8];
2750 /// Pushes the given string onto this string, returning the concatenation of the two strings.
2751 fn append(self, rhs: &str) -> ~str;
2754 impl OwnedStr for ~str {
2756 fn into_bytes(self) -> ~[u8] {
2757 unsafe { cast::transmute(self) }
2761 fn append(self, rhs: &str) -> ~str {
2762 let mut new_str = StrBuf::from_owned_str(self);
2763 new_str.push_str(rhs);
2764 new_str.into_owned()
2768 impl Clone for ~str {
2770 fn clone(&self) -> ~str {
2775 impl FromIterator<char> for ~str {
2777 fn from_iter<T: Iterator<char>>(iterator: T) -> ~str {
2778 let (lower, _) = iterator.size_hint();
2779 let mut buf = StrBuf::with_capacity(lower);
2780 buf.extend(iterator);
2785 // This works because every lifetime is a sub-lifetime of 'static
2786 impl<'a> Default for &'a str {
2787 fn default() -> &'a str { "" }
2790 impl Default for ~str {
2791 fn default() -> ~str { "".to_owned() }
2796 use iter::AdditiveIterator;
2797 use default::Default;
2804 assert!((eq(&"".to_owned(), &"".to_owned())));
2805 assert!((eq(&"foo".to_owned(), &"foo".to_owned())));
2806 assert!((!eq(&"foo".to_owned(), &"bar".to_owned())));
#[test]
fn test_eq_slice() {
    // The same text sliced from the front and the back of a literal.
    let front = "foobar".slice(0, 3);
    let back = "barfoo".slice(3, 6);
    assert!(eq_slice(front, "foo"));
    assert!(eq_slice(back, "foo"));

    // Same length, different last character.
    assert!(!eq_slice("foo1", "foo2"));
}
2819 assert!("" <= "foo");
2820 assert!("foo" <= "foo");
2821 assert!("foo" != "bar");
2826 assert_eq!("".len(), 0u);
2827 assert_eq!("hello world".len(), 11u);
2828 assert_eq!("\x63".len(), 1u);
2829 assert_eq!("\xa2".len(), 2u);
2830 assert_eq!("\u03c0".len(), 2u);
2831 assert_eq!("\u2620".len(), 3u);
2832 assert_eq!("\U0001d11e".len(), 4u);
2834 assert_eq!("".char_len(), 0u);
2835 assert_eq!("hello world".char_len(), 11u);
2836 assert_eq!("\x63".char_len(), 1u);
2837 assert_eq!("\xa2".char_len(), 1u);
2838 assert_eq!("\u03c0".char_len(), 1u);
2839 assert_eq!("\u2620".char_len(), 1u);
2840 assert_eq!("\U0001d11e".char_len(), 1u);
2841 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2846 assert_eq!("hello".find('l'), Some(2u));
2847 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2848 assert!("hello".find('x').is_none());
2849 assert!("hello".find(|c:char| c == 'x').is_none());
2850 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2851 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2856 assert_eq!("hello".rfind('l'), Some(3u));
2857 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2858 assert!("hello".rfind('x').is_none());
2859 assert!("hello".rfind(|c:char| c == 'x').is_none());
2860 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2861 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2866 let empty = "".to_owned();
2867 let s: ~str = empty.chars().collect();
2868 assert_eq!(empty, s);
2869 let data = "ประเทศไทย中".to_owned();
2870 let s: ~str = data.chars().collect();
2871 assert_eq!(data, s);
#[test]
fn test_into_bytes() {
    // Consuming an owned string must yield exactly its bytes.
    let bytes = "asdf".to_owned().into_bytes();
    assert_eq!(bytes!("asdf"), bytes.as_slice());
}
#[test]
fn test_find_str() {
    // byte positions
    assert_eq!("".find_str(""), Some(0u));
    assert!("banana".find_str("apple pie").is_none());

    let ascii = "abcabc";
    assert_eq!(ascii.slice(0u, 6u).find_str("ab"), Some(0u));
    assert_eq!(ascii.slice(2u, 6u).find_str("ab"), Some(1u));
    assert!(ascii.slice(2u, 4u).find_str("ab").is_none());

    // Two back-to-back copies of a 43-byte string.
    let mut data = "ประเทศไทย中华Việt Nam".to_owned();
    data = data + data;
    assert!(data.find_str("ไท华").is_none());

    let first = data.slice(0u, 43u);
    let second = data.slice(43u, 86u);
    assert_eq!(first.find_str(""), Some(0u));
    assert_eq!(data.slice(6u, 43u).find_str(""), Some(0u));

    // Every needle sits at the same relative offset in both copies.
    let expected = [("ประ", 0u), ("ทศไ", 12u), ("ย中", 24u), ("iệt", 34u), ("Nam", 40u)];
    for &(needle, offset) in expected.iter() {
        assert_eq!(first.find_str(needle), Some(offset));
    }
    for &(needle, offset) in expected.iter() {
        assert_eq!(second.find_str(needle), Some(offset));
    }
}
2912 fn test_slice_chars() {
2913 fn t(a: &str, b: &str, start: uint) {
2914 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2917 t("hello", "llo", 2);
2918 t("hello", "el", 1);
2921 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2926 fn t(v: &[~str], s: &str) {
2927 assert_eq!(v.concat(), s.to_str());
2929 t(["you".to_owned(), "know".to_owned(), "I'm".to_owned(),
2930 "no".to_owned(), "good".to_owned()], "youknowI'mnogood");
2931 let v: &[~str] = [];
2933 t(["hi".to_owned()], "hi");
2938 fn t(v: &[~str], sep: &str, s: &str) {
2939 assert_eq!(v.connect(sep), s.to_str());
2941 t(["you".to_owned(), "know".to_owned(), "I'm".to_owned(),
2942 "no".to_owned(), "good".to_owned()],
2943 " ", "you know I'm no good");
2944 let v: &[~str] = [];
2946 t(["hi".to_owned()], " ", "hi");
2950 fn test_concat_slices() {
2951 fn t(v: &[&str], s: &str) {
2952 assert_eq!(v.concat(), s.to_str());
2954 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2955 let v: &[&str] = [];
2961 fn test_connect_slices() {
2962 fn t(v: &[&str], sep: &str, s: &str) {
2963 assert_eq!(v.connect(sep), s.to_str());
2965 t(["you", "know", "I'm", "no", "good"],
2966 " ", "you know I'm no good");
2968 t(["hi"], " ", "hi");
2973 assert_eq!("x".repeat(4), "xxxx".to_owned());
2974 assert_eq!("hi".repeat(4), "hihihihi".to_owned());
2975 assert_eq!("ไท华".repeat(3), "ไท华ไท华ไท华".to_owned());
2976 assert_eq!("".repeat(4), "".to_owned());
2977 assert_eq!("hi".repeat(0), "".to_owned());
2981 fn test_unsafe_slice() {
2982 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
2983 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
2984 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
2985 fn a_million_letter_a() -> ~str {
2987 let mut rs = StrBuf::new();
2989 rs.push_str("aaaaaaaaaa");
2994 fn half_a_million_letter_a() -> ~str {
2996 let mut rs = StrBuf::new();
2998 rs.push_str("aaaaa");
3003 let letters = a_million_letter_a();
3004 assert!(half_a_million_letter_a() ==
3005 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
#[test]
fn test_starts_with() {
    // (haystack, prefix, expected result)
    let cases = [
        ("", "", true),
        ("abc", "", true),
        ("abc", "a", true),
        ("a", "abc", false),
        ("", "abc", false),
        ("ödd", "-", false),
        ("ödd", "öd", true),
    ];
    for &(haystack, prefix, expected) in cases.iter() {
        assert!(haystack.starts_with(prefix) == expected);
    }
}
#[test]
fn test_ends_with() {
    // (haystack, suffix, expected result)
    let cases = [
        ("", "", true),
        ("abc", "", true),
        ("abc", "c", true),
        ("a", "abc", false),
        ("", "abc", false),
        ("ddö", "-", false),
        ("ddö", "dö", true),
    ];
    for &(haystack, suffix, expected) in cases.iter() {
        assert!(haystack.ends_with(suffix) == expected);
    }
}
#[test]
fn test_is_empty() {
    // Only the zero-length string is empty.
    assert_eq!("".is_empty(), true);
    assert_eq!("a".is_empty(), false);
}
3039 assert_eq!("".replace(a, "b"), "".to_owned());
3040 assert_eq!("a".replace(a, "b"), "b".to_owned());
3041 assert_eq!("ab".replace(a, "b"), "bb".to_owned());
3043 assert!(" test test ".replace(test, "toast") ==
3044 " toast toast ".to_owned());
3045 assert_eq!(" test test ".replace(test, ""), " ".to_owned());
#[test]
fn test_replace_2a() {
    // The needle is a prefix of the haystack.
    let haystack = "ประเทศไทย中华".to_owned();
    let replacement = "دولة الكويت".to_owned();

    let needle = "ประเ".to_owned();
    let expected = "دولة الكويتทศไทย中华".to_owned();
    assert_eq!(haystack.replace(needle, replacement), expected);
}
#[test]
fn test_replace_2b() {
    // The needle sits in the interior of the haystack.
    let haystack = "ประเทศไทย中华".to_owned();
    let replacement = "دولة الكويت".to_owned();

    let needle = "ะเ".to_owned();
    let expected = "ปรدولة الكويتทศไทย中华".to_owned();
    assert_eq!(haystack.replace(needle, replacement), expected);
}
#[test]
fn test_replace_2c() {
    // The needle is a suffix of the haystack.
    let haystack = "ประเทศไทย中华".to_owned();
    let replacement = "دولة الكويت".to_owned();

    let needle = "中华".to_owned();
    let expected = "ประเทศไทยدولة الكويت".to_owned();
    assert_eq!(haystack.replace(needle, replacement), expected);
}
#[test]
fn test_replace_2d() {
    // The needle never occurs, so the haystack must come back unchanged.
    let haystack = "ประเทศไทย中华".to_owned();
    let replacement = "دولة الكويت".to_owned();

    let absent = "ไท华".to_owned();
    assert_eq!(haystack.replace(absent, replacement), haystack);
}
3089 assert_eq!("ab", "abc".slice(0, 2));
3090 assert_eq!("bc", "abc".slice(1, 3));
3091 assert_eq!("", "abc".slice(1, 1));
3092 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3094 let data = "ประเทศไทย中华";
3095 assert_eq!("ป", data.slice(0, 3));
3096 assert_eq!("ร", data.slice(3, 6));
3097 assert_eq!("", data.slice(3, 3));
3098 assert_eq!("华", data.slice(30, 33));
3100 fn a_million_letter_X() -> ~str {
3102 let mut rs = StrBuf::new();
3104 rs.push_str("华华华华华华华华华华");
3109 fn half_a_million_letter_X() -> ~str {
3111 let mut rs = StrBuf::new();
3113 rs.push_str("华华华华华");
3118 let letters = a_million_letter_X();
3119 assert!(half_a_million_letter_X() ==
3120 letters.slice(0u, 3u * 500000u).to_owned());
3125 let ss = "中华Việt Nam";
3127 assert_eq!("华", ss.slice(3u, 6u));
3128 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3130 assert_eq!("ab", "abc".slice(0u, 2u));
3131 assert_eq!("bc", "abc".slice(1u, 3u));
3132 assert_eq!("", "abc".slice(1u, 1u));
3134 assert_eq!("中", ss.slice(0u, 3u));
3135 assert_eq!("华V", ss.slice(3u, 7u));
3136 assert_eq!("", ss.slice(3u, 3u));
3151 fn test_slice_fail() {
3152 "中华Việt Nam".slice(0u, 2u);
#[test]
fn test_slice_from() {
    // (start byte, expected tail of "abcd")
    let cases = [(0u, "abcd"), (2u, "cd"), (4u, "")];
    for &(begin, tail) in cases.iter() {
        assert_eq!("abcd".slice_from(begin), tail);
    }
}
#[test]
fn test_slice_to() {
    // (end byte, expected head of "abcd")
    let cases = [(0u, ""), (2u, "ab"), (4u, "abcd")];
    for &(end, head) in cases.iter() {
        assert_eq!("abcd".slice_to(end), head);
    }
}
#[test]
fn test_trim_left_chars() {
    let nothing: &[char] = &[];
    let star_or_space: &[char] = &['*', ' '];
    let one_or_two: &[char] = &['1', '2'];

    // Character-set matchers.
    assert_eq!(" *** foo *** ".trim_left_chars(nothing), " *** foo *** ");
    assert_eq!(" *** foo *** ".trim_left_chars(star_or_space), "foo *** ");
    assert_eq!(" *** *** ".trim_left_chars(star_or_space), "");
    assert_eq!("foo *** ".trim_left_chars(star_or_space), "foo *** ");

    // Single char, char set and closure matchers.
    assert_eq!("11foo1bar11".trim_left_chars('1'), "foo1bar11");
    assert_eq!("12foo1bar12".trim_left_chars(one_or_two), "foo1bar12");
    assert_eq!("123foo1bar123".trim_left_chars(|c: char| c.is_digit()), "foo1bar123");
}
#[test]
fn test_trim_right_chars() {
    let nothing: &[char] = &[];
    let star_or_space: &[char] = &['*', ' '];
    let one_or_two: &[char] = &['1', '2'];

    // Character-set matchers.
    assert_eq!(" *** foo *** ".trim_right_chars(nothing), " *** foo *** ");
    assert_eq!(" *** foo *** ".trim_right_chars(star_or_space), " *** foo");
    assert_eq!(" *** *** ".trim_right_chars(star_or_space), "");
    assert_eq!(" *** foo".trim_right_chars(star_or_space), " *** foo");

    // Single char, char set and closure matchers.
    assert_eq!("11foo1bar11".trim_right_chars('1'), "11foo1bar");
    assert_eq!("12foo1bar12".trim_right_chars(one_or_two), "12foo1bar");
    assert_eq!("123foo1bar123".trim_right_chars(|c: char| c.is_digit()), "123foo1bar");
}
#[test]
fn test_trim_chars() {
    let nothing: &[char] = &[];
    let star_or_space: &[char] = &['*', ' '];
    let one_or_two: &[char] = &['1', '2'];

    // Character-set matchers.
    assert_eq!(" *** foo *** ".trim_chars(nothing), " *** foo *** ");
    assert_eq!(" *** foo *** ".trim_chars(star_or_space), "foo");
    assert_eq!(" *** *** ".trim_chars(star_or_space), "");
    assert_eq!("foo".trim_chars(star_or_space), "foo");

    // Single char, char set and closure matchers.
    assert_eq!("11foo1bar11".trim_chars('1'), "foo1bar");
    assert_eq!("12foo1bar12".trim_chars(one_or_two), "foo1bar");
    assert_eq!("123foo1bar123".trim_chars(|c: char| c.is_digit()), "foo1bar");
}
3208 fn test_trim_left() {
3209 assert_eq!("".trim_left(), "");
3210 assert_eq!("a".trim_left(), "a");
3211 assert_eq!(" ".trim_left(), "");
3212 assert_eq!(" blah".trim_left(), "blah");
3213 assert_eq!(" \u3000 wut".trim_left(), "wut");
3214 assert_eq!("hey ".trim_left(), "hey ");
3218 fn test_trim_right() {
3219 assert_eq!("".trim_right(), "");
3220 assert_eq!("a".trim_right(), "a");
3221 assert_eq!(" ".trim_right(), "");
3222 assert_eq!("blah ".trim_right(), "blah");
3223 assert_eq!("wut \u3000 ".trim_right(), "wut");
3224 assert_eq!(" hey".trim_right(), " hey");
3229 assert_eq!("".trim(), "");
3230 assert_eq!("a".trim(), "a");
3231 assert_eq!(" ".trim(), "");
3232 assert_eq!(" blah ".trim(), "blah");
3233 assert_eq!("\nwut \u3000 ".trim(), "wut");
3234 assert_eq!(" hey dude ".trim(), "hey dude");
3238 fn test_is_whitespace() {
3239 assert!("".is_whitespace());
3240 assert!(" ".is_whitespace());
3241 assert!("\u2009".is_whitespace()); // Thin space
3242 assert!(" \n\t ".is_whitespace());
3243 assert!(!" _ ".is_whitespace());
#[test]
fn test_slice_shift_char() {
    // The first character is split off and the rest comes back as a slice.
    let (head, rest) = "ประเทศไทย中".slice_shift_char();
    assert_eq!((head, rest), (Some('ป'), "ระเทศไทย中"));
}
3253 fn test_slice_shift_char_2() {
3255 assert_eq!(empty.slice_shift_char(), (None, ""));
3260 // deny overlong encodings
3261 assert!(!is_utf8([0xc0, 0x80]));
3262 assert!(!is_utf8([0xc0, 0xae]));
3263 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3264 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3265 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3266 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3267 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3270 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3271 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3273 assert!(is_utf8([0xC2, 0x80]));
3274 assert!(is_utf8([0xDF, 0xBF]));
3275 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3276 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3277 assert!(is_utf8([0xEE, 0x80, 0x80]));
3278 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3279 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3280 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3284 fn test_is_utf16() {
3285 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3293 // surrogate pairs (randomly generated with Python 3's
3294 // .encode('utf-16be'))
3295 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3296 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3297 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3299 // mixtures (also random)
3300 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3301 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3302 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3305 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3308 // surrogate + regular unit
3310 // surrogate + lead surrogate
3312 // unterminated surrogate
3314 // trail surrogate without a lead
3317 // random byte sequences that Python 3's .decode('utf-16be')
3319 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3320 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3321 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3322 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3323 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3324 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3325 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3326 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3327 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3328 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3329 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3330 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3331 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3332 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3333 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3334 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3335 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3336 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3337 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3338 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3339 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3343 fn test_raw_from_c_str() {
3345 let a = box [65, 65, 65, 65, 65, 65, 65, 0];
3347 let c = raw::from_c_str(b);
3348 assert_eq!(c, "AAAAAAA".to_owned());
3353 fn test_as_bytes() {
3356 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3357 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3360 assert_eq!("".as_bytes(), &[]);
3361 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3362 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
3367 fn test_as_bytes_fail() {
3368 // Don't double free. (I'm not sure if this exercises the
3369 // original problem code path anymore.)
3370 let s = "".to_owned();
3371 let _bytes = s.as_bytes();
3377 let buf = "hello".as_ptr();
3379 assert_eq!(*buf.offset(0), 'h' as u8);
3380 assert_eq!(*buf.offset(1), 'e' as u8);
3381 assert_eq!(*buf.offset(2), 'l' as u8);
3382 assert_eq!(*buf.offset(3), 'l' as u8);
3383 assert_eq!(*buf.offset(4), 'o' as u8);
3388 fn test_subslice_offset() {
3389 let a = "kernelsprite";
3390 let b = a.slice(7, a.len());
3391 let c = a.slice(0, a.len() - 6);
3392 assert_eq!(a.subslice_offset(b), 7);
3393 assert_eq!(a.subslice_offset(c), 0);
3395 let string = "a\nb\nc";
3396 let lines: ~[&str] = string.lines().collect();
3397 assert_eq!(string.subslice_offset(lines[0]), 0);
3398 assert_eq!(string.subslice_offset(lines[1]), 2);
3399 assert_eq!(string.subslice_offset(lines[2]), 4);
#[test]
#[should_fail]
fn test_subslice_offset_2() {
    // Two unrelated literals: the second is not a sub-slice of the first,
    // so the containment asserts in `subslice_offset` must fire.
    let outer = "alchemiter";
    let unrelated = "cruxtruder";
    outer.subslice_offset(unrelated);
}
3411 fn vec_str_conversions() {
3412 let s1: ~str = "All mimsy were the borogoves".to_owned();
3414 let v: ~[u8] = s1.as_bytes().to_owned();
3415 let s2: ~str = from_utf8(v).unwrap().to_owned();
3416 let mut i: uint = 0u;
3417 let n1: uint = s1.len();
3418 let n2: uint = v.len();
#[test]
fn test_contains() {
    // (haystack, needle, expected result)
    let ascii_cases = [
        ("abcde", "bcd", true),
        ("abcde", "abcd", true),
        ("abcde", "bcde", true),
        ("abcde", "", true),
        ("", "", true),
        ("abcde", "def", false),
        ("", "a", false),
    ];
    for &(haystack, needle, expected) in ascii_cases.iter() {
        assert!(haystack.contains(needle) == expected);
    }

    // Multi-byte needles against an owned haystack.
    let data = "ประเทศไทย中华Việt Nam".to_owned();
    let unicode_cases = [("ประเ", true), ("ะเ", true), ("中华", true), ("ไท华", false)];
    for &(needle, expected) in unicode_cases.iter() {
        assert!(data.contains(needle) == expected);
    }
}
#[test]
fn test_contains_char() {
    // (haystack, needle, expected result)
    let cases = [("abc", 'b', true), ("a", 'a', true), ("abc", 'd', false), ("", 'a', false)];
    for &(haystack, needle, expected) in cases.iter() {
        assert!(haystack.contains_char(needle) == expected);
    }
}
3458 [("𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n".to_owned(),
3459 box [0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3460 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3461 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3462 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3464 ("𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n".to_owned(),
3465 box [0xd801_u16, 0xdc12_u16, 0xd801_u16,
3466 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3467 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3468 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3469 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3472 ("𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n".to_owned(),
3473 box [0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3474 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3475 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3476 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3477 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3478 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3479 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3481 ("𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n".to_owned(),
3482 box [0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3483 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3484 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3485 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3486 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3487 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3488 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3489 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3490 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3491 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3493 // Issue #12318, even-numbered non-BMP planes
3494 ("\U00020000".to_owned(),
3495 box [0xD840, 0xDC00])];
3497 for p in pairs.iter() {
3498 let (s, u) = (*p).clone();
3499 assert!(is_utf16(u));
3500 assert_eq!(s.to_utf16(), u);
3502 assert_eq!(from_utf16(u).unwrap(), s);
3503 assert_eq!(from_utf16_lossy(u), s);
3505 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3506 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
#[test]
fn test_utf16_invalid() {
    // completely positive cases tested above.

    // A single unit from the surrogate range.
    assert!(from_utf16([0xD800]) == None);
    // The same surrogate unit twice.
    assert!(from_utf16([0xD800, 0xD800]) == None);
    // An ordinary unit followed by a lone surrogate.
    assert!(from_utf16([0x0061, 0xDC00]) == None);
    // A surrogate unit on each side of a well-formed pair.
    assert!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]) == None);
}
#[test]
fn test_utf16_lossy() {
    // completely positive cases tested above.

    // Each undecodable unit becomes one U+FFFD.
    let single = from_utf16_lossy([0xD800]);
    assert_eq!(single, "\uFFFD".to_owned());

    let doubled = from_utf16_lossy([0xD800, 0xD800]);
    assert_eq!(doubled, "\uFFFD\uFFFD".to_owned());

    // Valid units next to a bad one are kept.
    let after_ascii = from_utf16_lossy([0x0061, 0xDC00]);
    assert_eq!(after_ascii, "a\uFFFD".to_owned());

    let around_pair = from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]);
    assert_eq!(around_pair, "\uFFFD𐒋\uFFFD".to_owned());
}
3541 fn test_truncate_utf16_at_nul() {
3543 assert_eq!(truncate_utf16_at_nul(v), &[]);
3546 assert_eq!(truncate_utf16_at_nul(v), &[]);
3549 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3552 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3555 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3560 let s = "ศไทย中华Việt Nam".to_owned();
3561 let v = box ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3563 for ch in v.iter() {
3564 assert!(s.char_at(pos) == *ch);
3565 pos += from_char(*ch).len();
3570 fn test_char_at_reverse() {
3571 let s = "ศไทย中华Việt Nam".to_owned();
3572 let v = box ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3573 let mut pos = s.len();
3574 for ch in v.iter().rev() {
3575 assert!(s.char_at_reverse(pos) == *ch);
3576 pos -= from_char(*ch).len();
#[test]
fn test_escape_unicode() {
    // (input, expected escaped form)
    let cases = [
        ("abc", "\\x61\\x62\\x63"),
        ("a c", "\\x61\\x20\\x63"),
        ("\r\n\t", "\\x0d\\x0a\\x09"),
        ("'\"\\", "\\x27\\x22\\x5c"),
        ("\x00\x01\xfe\xff", "\\x00\\x01\\xfe\\xff"),
        ("\u0100\uffff", "\\u0100\\uffff"),
        ("\U00010000\U0010ffff", "\\U00010000\\U0010ffff"),
        ("ab\ufb00", "\\x61\\x62\\ufb00"),
        ("\U0001d4ea\r", "\\U0001d4ea\\x0d"),
    ];
    for &(input, escaped) in cases.iter() {
        assert_eq!(input.escape_unicode(), escaped.to_owned());
    }
}
#[test]
fn test_escape_default() {
    // (input, expected escaped form)
    let cases = [
        ("abc", "abc"),
        ("a c", "a c"),
        ("\r\n\t", "\\r\\n\\t"),
        ("'\"\\", "\\'\\\"\\\\"),
        ("\u0100\uffff", "\\u0100\\uffff"),
        ("\U00010000\U0010ffff", "\\U00010000\\U0010ffff"),
        ("ab\ufb00", "ab\\ufb00"),
        ("\U0001d4ea\r", "\\U0001d4ea\\r"),
    ];
    for &(input, escaped) in cases.iter() {
        assert_eq!(input.escape_default(), escaped.to_owned());
    }
}
#[test]
fn test_total_ord() {
    // Each comparison used to be a bare `==` expression statement whose
    // result was thrown away, so nothing was checked. Assert them.

    // A longer string sorts after its own prefix, and vice versa.
    assert!("1234".cmp(&("123")) == Greater);
    assert!("123".cmp(&("1234")) == Less);
    assert!("1234".cmp(&("1234")) == Equal);
    // The first differing character decides, regardless of length.
    assert!("12345555".cmp(&("123456")) == Less);
    assert!("22".cmp(&("1234")) == Greater);
}
#[test]
fn test_char_range_at() {
    let data = "b¢€𤭢𤭢€¢b".to_owned();

    // (byte offset, character expected at that offset)
    let expected = [
        (0u, 'b'), (1u, '¢'), (3u, '€'), (6u, '𤭢'),
        (10u, '𤭢'), (14u, '€'), (17u, '¢'), (19u, 'b'),
    ];
    for &(offset, ch) in expected.iter() {
        assert_eq!(ch, data.char_range_at(offset).ch);
    }
}
#[test]
fn test_char_range_at_reverse_underflow() {
    // Stepping back from offset 0 must report `next == 0`.
    let range = "abc".char_range_at_reverse(0);
    assert_eq!(range.next, 0);
}
3634 #![allow(unnecessary_allocation)]
3636 ($s1:expr, $s2:expr, $e:expr) => { {
3640 assert_eq!(s1 + s2, e.to_owned());
3641 assert_eq!(s1.to_owned() + s2, e.to_owned());
3645 t!("foo", "bar", "foobar");
3646 t!("foo", "bar".to_owned(), "foobar");
3647 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
3648 t!("ศไทย中", "华Việt Nam".to_owned(), "ศไทย中华Việt Nam");
3652 fn test_iterator() {
3654 let s = "ศไทย中华Việt Nam".to_owned();
3655 let v = box ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3658 let mut it = s.chars();
3661 assert_eq!(c, v[pos]);
3664 assert_eq!(pos, v.len());
3668 fn test_rev_iterator() {
3670 let s = "ศไทย中华Việt Nam".to_owned();
3671 let v = box ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3674 let mut it = s.chars().rev();
3677 assert_eq!(c, v[pos]);
3680 assert_eq!(pos, v.len());
3684 fn test_iterator_clone() {
3685 let s = "ศไทย中华Việt Nam";
3686 let mut it = s.chars();
3688 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
3692 fn test_bytesator() {
3693 let s = "ศไทย中华Việt Nam".to_owned();
3695 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3696 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3701 for b in s.bytes() {
3702 assert_eq!(b, v[pos]);
3708 fn test_bytes_revator() {
3709 let s = "ศไทย中华Việt Nam".to_owned();
3711 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3712 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3715 let mut pos = v.len();
3717 for b in s.bytes().rev() {
3719 assert_eq!(b, v[pos]);
3724 fn test_char_indicesator() {
3726 let s = "ศไทย中华Việt Nam";
3727 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
3728 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3731 let mut it = s.char_indices();
3734 assert_eq!(c, (p[pos], v[pos]));
3737 assert_eq!(pos, v.len());
3738 assert_eq!(pos, p.len());
3742 fn test_char_indices_revator() {
3744 let s = "ศไทย中华Việt Nam";
3745 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
3746 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3749 let mut it = s.char_indices().rev();
3752 assert_eq!(c, (p[pos], v[pos]));
3755 assert_eq!(pos, v.len());
3756 assert_eq!(pos, p.len());
3760 fn test_split_char_iterator() {
3761 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3763 let split: ~[&str] = data.split(' ').collect();
3764 assert_eq!( split, box ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3766 let mut rsplit: ~[&str] = data.split(' ').rev().collect();
3768 assert_eq!(rsplit, box ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3770 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
3771 assert_eq!( split, box ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3773 let mut rsplit: ~[&str] = data.split(|c: char| c == ' ').rev().collect();
3775 assert_eq!(rsplit, box ["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3778 let split: ~[&str] = data.split('ä').collect();
3779 assert_eq!( split, box ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3781 let mut rsplit: ~[&str] = data.split('ä').rev().collect();
3783 assert_eq!(rsplit, box ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3785 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
3786 assert_eq!( split, box ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3788 let mut rsplit: ~[&str] = data.split(|c: char| c == 'ä').rev().collect();
3790 assert_eq!(rsplit, box ["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3794 fn test_splitn_char_iterator() {
3795 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3797 let split: ~[&str] = data.splitn(' ', 3).collect();
3798 assert_eq!(split, box ["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3800 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
3801 assert_eq!(split, box ["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3804 let split: ~[&str] = data.splitn('ä', 3).collect();
3805 assert_eq!(split, box ["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3807 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
3808 assert_eq!(split, box ["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3812 fn test_rsplitn_char_iterator() {
3813 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3815 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
3817 assert_eq!(split, box ["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3819 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
3821 assert_eq!(split, box ["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3824 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
3826 assert_eq!(split, box ["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3828 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
3830 assert_eq!(split, box ["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3834 fn test_split_char_iterator_no_trailing() {
3835 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3837 let split: ~[&str] = data.split('\n').collect();
3838 assert_eq!(split, box ["", "Märy häd ä little lämb", "Little lämb", ""]);
3840 let split: ~[&str] = data.split_terminator('\n').collect();
3841 assert_eq!(split, box ["", "Märy häd ä little lämb", "Little lämb"]);
3845 fn test_rev_split_char_iterator_no_trailing() {
3846 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3848 let mut split: ~[&str] = data.split('\n').rev().collect();
3850 assert_eq!(split, box ["", "Märy häd ä little lämb", "Little lämb", ""]);
3852 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
3854 assert_eq!(split, box ["", "Märy häd ä little lämb", "Little lämb"]);
3859 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
3860 let words: ~[&str] = data.words().collect();
3861 assert_eq!(words, box ["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
3865 fn test_nfd_chars() {
3866 assert_eq!("abc".nfd_chars().collect::<~str>(), "abc".to_owned());
3867 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), "d\u0307\u01c4".to_owned());
3868 assert_eq!("\u2026".nfd_chars().collect::<~str>(), "\u2026".to_owned());
3869 assert_eq!("\u2126".nfd_chars().collect::<~str>(), "\u03a9".to_owned());
3870 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), "d\u0323\u0307".to_owned());
3871 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), "d\u0323\u0307".to_owned());
3872 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), "a\u0301".to_owned());
3873 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), "\u0301a".to_owned());
3874 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), "\u1111\u1171\u11b6".to_owned());
3875 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), "\u1100\u1162".to_owned());
3879 fn test_nfkd_chars() {
3880 assert_eq!("abc".nfkd_chars().collect::<~str>(), "abc".to_owned());
3881 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), "d\u0307DZ\u030c".to_owned());
3882 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), "...".to_owned());
3883 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), "\u03a9".to_owned());
3884 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), "d\u0323\u0307".to_owned());
3885 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), "d\u0323\u0307".to_owned());
3886 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), "a\u0301".to_owned());
3887 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), "\u0301a".to_owned());
3888 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), "\u1111\u1171\u11b6".to_owned());
3889 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), "\u1100\u1162".to_owned());
3894 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3895 let lines: ~[&str] = data.lines().collect();
3896 assert_eq!(lines, box ["", "Märy häd ä little lämb", "", "Little lämb"]);
3898 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3899 let lines: ~[&str] = data.lines().collect();
3900 assert_eq!(lines, box ["", "Märy häd ä little lämb", "", "Little lämb"]);
3904 fn test_split_strator() {
3905 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3906 let v: ~[&str] = s.split_str(sep).collect();
3909 t("--1233345--", "12345", box ["--1233345--"]);
3910 t("abc::hello::there", "::", box ["abc", "hello", "there"]);
3911 t("::hello::there", "::", box ["", "hello", "there"]);
3912 t("hello::there::", "::", box ["hello", "there", ""]);
3913 t("::hello::there::", "::", box ["", "hello", "there", ""]);
3914 t("ประเทศไทย中华Việt Nam", "中华", box ["ประเทศไทย", "Việt Nam"]);
3915 t("zzXXXzzYYYzz", "zz", box ["", "XXX", "YYY", ""]);
3916 t("zzXXXzYYYz", "XXX", box ["zz", "zYYYz"]);
3917 t(".XXX.YYY.", ".", box ["", "XXX", "YYY", ""]);
3918 t("", ".", box [""]);
3919 t("zz", "zz", box ["",""]);
3920 t("ok", "z", box ["ok"]);
3921 t("zzz", "zz", box ["","z"]);
3922 t("zzzzz", "zz", box ["","","z"]);
3926 fn test_str_default() {
3927 use default::Default;
3928 fn t<S: Default + Str>() {
3929 let s: S = Default::default();
3930 assert_eq!(s.as_slice(), "");
#[test]
fn test_str_container() {
    // Adds up `len()` over any slice of `Container` values.
    fn sum_len<S: Container>(v: &[S]) -> uint {
        v.iter().fold(0u, |total, item| total + item.len())
    }

    let owned = "01234".to_owned();
    // Borrowed literals, owned strings and a borrowed owned string.
    assert_eq!(5, sum_len(["012", "", "34"]));
    assert_eq!(5, sum_len(["01".to_owned(), "2".to_owned(), "34".to_owned(), "".to_owned()]));
    assert_eq!(5, sum_len([owned.as_slice()]));
}
#[test]
fn test_str_from_utf8() {
    // Valid ASCII and valid multi-byte input come back as string slices.
    assert_eq!(from_utf8(bytes!("hello")), Some("hello"));
    assert_eq!(from_utf8(bytes!("ศไทย中华Việt Nam")), Some("ศไทย中华Việt Nam"));

    // A trailing 0xff byte is rejected.
    assert_eq!(from_utf8(bytes!("hello", 0xff)), None);
}
// Checks `from_utf8_owned` with the same three inputs as the borrowed
// variant's test, passed as owned byte vectors; the expected results are
// owned strings (or `None` for the invalid input).
3962 fn test_str_from_utf8_owned() {
// Plain ASCII.
3963 let xs = bytes!("hello").to_owned();
3964 assert_eq!(from_utf8_owned(xs), Some("hello".to_owned()));
// Valid multi-byte text.
3966 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
3967 assert_eq!(from_utf8_owned(xs), Some("ศไทย中华Việt Nam".to_owned()));
// 0xff never occurs in well-formed UTF-8, so `None` is expected.
3969 let xs = bytes!("hello", 0xff).to_owned();
3970 assert_eq!(from_utf8_owned(xs), None);
// Checks `from_utf8_lossy`: well-formed input is expected back unchanged as
// `Slice`, while malformed input is expected as `Owned` text in which the
// offending bytes are replaced by U+FFFD.
3974 fn test_str_from_utf8_lossy() {
// Well-formed ASCII and multi-byte input: expected as `Slice`.
3975 let xs = bytes!("hello");
3976 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
3978 let xs = bytes!("ศไทย中华Việt Nam");
3979 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// A lead byte with no continuation (0xC2) and a byte that is never valid (0xFF):
// one replacement character is expected for each.
3981 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
3982 assert_eq!(from_utf8_lossy(xs), Owned("Hello\uFFFD There\uFFFD Goodbye".to_owned()));
// 0xC0 0x80 (overlong form) is expected to give two replacements; the
// truncated three-byte sequence 0xE6 0x83 is expected to give one.
3984 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
3985 assert_eq!(from_utf8_lossy(xs), Owned("Hello\uFFFD\uFFFD There\uFFFD Goodbye".to_owned()));
// 0xF5 is not a valid lead byte; a continuation byte following it is
// expected to be replaced separately.
3987 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
3988 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFDfoo\uFFFD\uFFFDbar".to_owned()));
// Four-byte sequences starting with 0xF1 cut short after 1, 2 and 3 bytes:
// each truncated sequence is expected to collapse into a single replacement.
3990 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
3991 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFDfoo\uFFFDbar\uFFFDbaz".to_owned()));
// Lead byte 0xF4: a following 0x80 is expected to be absorbed into one
// replacement, whereas a following 0xBF is expected to be replaced on its own.
3993 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
3994 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz".to_owned()));
// 0xF0 0x80 0x80 0x80 (overlong form) is expected to give four replacements;
// 0xF0 0x90 0x80 0x80 is expected to decode to U+10000.
3996 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
3997 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFD\uFFFD\uFFFD\uFFFD\
3998 foo\U00010000bar".to_owned()));
// 0xED 0xA0 0x80 and 0xED 0xBF 0xBF (encoded surrogate code points): three
// replacements are expected for each sequence.
4001 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4002 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFD\uFFFD\uFFFDfoo\
4003 \uFFFD\uFFFD\uFFFDbar".to_owned()));
// Checks that `from_str`, asked for an `Option<~str>`, yields `Some` owned
// string holding the same text as its input.
4007 fn test_from_str() {
4008 let owned: Option<~str> = from_str("string");
4009 assert_eq!(owned, Some("string".to_owned()));
// Exercises the trait-provided operations on both variants of the maybe-owned
// string (`Slice` and `Owned`): length, slice view, string conversion,
// formatting, ordering, default value, and cross-variant comparison.
4013 fn test_maybe_owned_traits() {
// Borrowed variant.
4014 let s = Slice("abcde");
4015 assert_eq!(s.len(), 5);
4016 assert_eq!(s.as_slice(), "abcde");
4017 assert_eq!(s.to_str(), "abcde".to_owned());
4018 assert_eq!(format!("{}", s), "abcde".to_owned());
// Ordering is checked against the other variant.
4019 assert!(s.lt(&Owned("bcdef".to_owned())));
// The default value is expected to compare equal to an empty `Slice`.
4020 assert_eq!(Slice(""), Default::default());
// Owned variant: the same checks, mirrored.
4022 let o = Owned("abcde".to_owned());
4023 assert_eq!(o.len(), 5);
4024 assert_eq!(o.as_slice(), "abcde");
4025 assert_eq!(o.to_str(), "abcde".to_owned());
4026 assert_eq!(format!("{}", o), "abcde".to_owned());
4027 assert!(o.lt(&Slice("bcdef")));
// The default value is also expected to compare equal to an empty `Owned`.
4028 assert_eq!(Owned("".to_owned()), Default::default());
// Values holding the same text are expected to compare as equal in both
// directions, whichever variant holds it.
4030 assert!(s.cmp(&o) == Equal);
4031 assert!(s.equiv(&o));
4033 assert!(o.cmp(&s) == Equal);
4034 assert!(o.equiv(&s));
// Checks that `is_slice()` and `is_owned()` report which variant a value is.
4038 fn test_maybe_owned_methods() {
// A `Slice` value: `is_slice()` is true and `is_owned()` is false.
4039 let s = Slice("abcde");
4040 assert!(s.is_slice());
4041 assert!(!s.is_owned());
// An `Owned` value: the two answers are reversed.
4043 let o = Owned("abcde".to_owned());
4044 assert!(!o.is_slice());
4045 assert!(o.is_owned());
// Checks that a clone compares equal to a value holding the same text.
// All four pairings of `Slice`/`Owned` on either side of the comparison are covered.
4049 fn test_maybe_owned_clone() {
4050 assert_eq!(Owned("abcde".to_owned()), Slice("abcde").clone());
4051 assert_eq!(Owned("abcde".to_owned()), Owned("abcde".to_owned()).clone());
4052 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4053 assert_eq!(Slice("abcde"), Owned("abcde".to_owned()).clone());
// Checks that `into_owned()` yields an owned string with the same text,
// whether it is called on a `Slice` or on an `Owned` value.
4057 fn test_maybe_owned_into_owned() {
4058 assert_eq!(Slice("abcde").into_owned(), "abcde".to_owned());
4059 assert_eq!(Owned("abcde".to_owned()).into_owned(), "abcde".to_owned());
// Checks `into_maybe_owned()` on a string literal and on an owned string.
// Each result is compared against both a `Slice` and an `Owned` value holding
// the same text.
4063 fn test_into_maybe_owned() {
4064 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4065 assert_eq!(("abcde".to_owned()).into_maybe_owned(), Slice("abcde"));
4066 assert_eq!("abcde".into_maybe_owned(), Owned("abcde".to_owned()));
4067 assert_eq!(("abcde".to_owned()).into_maybe_owned(), Owned("abcde".to_owned()));
4074 use self::test::Bencher;
// Benchmark: forward `chars()` iteration over text mixing multi-byte and
// ASCII characters.
4079 fn char_iterator(b: &mut Bencher) {
4080 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// The expected count is computed once, outside the timed closure.
4081 let len = s.char_len();
// Timed work: the iterator's `len()` must equal `char_len()`.
4083 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: forward `chars()` iteration over ASCII-only text. The input is
// one string literal that spans six source lines.
4087 fn char_iterator_ascii(b: &mut Bencher) {
4088 let s = "Mary had a little lamb, Little lamb
4089 Mary had a little lamb, Little lamb
4090 Mary had a little lamb, Little lamb
4091 Mary had a little lamb, Little lamb
4092 Mary had a little lamb, Little lamb
4093 Mary had a little lamb, Little lamb";
// The expected count is computed once, outside the timed closure.
4094 let len = s.char_len();
// Timed work: the iterator's `len()` must equal `char_len()`.
4096 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: reversed `chars()` iteration over text mixing multi-byte and
// ASCII characters.
4100 fn char_iterator_rev(b: &mut Bencher) {
4101 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// The expected count is computed once, outside the timed closure.
4102 let len = s.char_len();
// Timed work: the reversed iterator's `len()` must equal `char_len()`.
4104 b.iter(|| assert_eq!(s.chars().rev().len(), len));
// Benchmark: forward `char_indices()` iteration over text mixing multi-byte
// and ASCII characters.
4108 fn char_indicesator(b: &mut Bencher) {
4109 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// The expected count is computed once, outside the timed closure.
4110 let len = s.char_len();
// Timed work: the iterator's `len()` must equal `char_len()`.
4112 b.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: reversed `char_indices()` iteration over text mixing multi-byte
// and ASCII characters.
4116 fn char_indicesator_rev(b: &mut Bencher) {
4117 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// The expected count is computed once, outside the timed closure.
4118 let len = s.char_len();
// Timed work: the reversed iterator's `len()` must equal `char_len()`.
4120 b.iter(|| assert_eq!(s.char_indices().rev().len(), len));
// Benchmark: splitting multi-byte text on an ASCII `char` separator.
4124 fn split_unicode_ascii(b: &mut Bencher) {
4125 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
// 'V' occurs twice in `s`, hence the expected three pieces.
4127 b.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: splitting multi-byte text with a custom `CharEq` separator
// whose `only_ascii()` answers false.
4131 fn split_unicode_not_ascii(b: &mut Bencher) {
// Wrapper around a single `char`, used as the separator.
4132 struct NotAscii(char);
4133 impl CharEq for NotAscii {
// NOTE(review): only the destructuring of `self` is visible here; the
// comparison that produces the result is missing from this listing.
4134 fn matches(&mut self, c: char) -> bool {
4135 let NotAscii(cc) = *self;
// Always reports false, even when the wrapped character is ASCII.
4138 fn only_ascii(&self) -> bool { false }
4140 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
// 'V' occurs twice in `s`, hence the expected three pieces.
4142 b.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splitting ASCII text on a `char` separator.
4147 fn split_ascii(b: &mut Bencher) {
4148 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// The reference piece count is computed once, outside the timed closure.
4149 let len = s.split(' ').len();
// Timed work: the same split must give the reference count.
4151 b.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splitting ASCII text with a custom `CharEq` separator whose
// `only_ascii()` answers false.
4155 fn split_not_ascii(b: &mut Bencher) {
// Wrapper around a single `char`, used as the separator.
4156 struct NotAscii(char);
4157 impl CharEq for NotAscii {
// NOTE(review): only the destructuring of `self` is visible here; the
// comparison that produces the result is missing from this listing.
4159 fn matches(&mut self, c: char) -> bool {
4160 let NotAscii(cc) = *self;
// Always reports false, even though the wrapped character used below is ASCII.
4163 fn only_ascii(&self) -> bool { false }
4165 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// The reference piece count comes from splitting on the plain `char`.
4166 let len = s.split(' ').len();
// Timed work: splitting via `NotAscii(' ')` must give the reference count.
4168 b.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splitting ASCII text with a named function as the separator predicate.
4172 fn split_extern_fn(b: &mut Bencher) {
4173 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// The reference piece count comes from splitting on the plain `char`.
4174 let len = s.split(' ').len();
// Predicate: true exactly for the space character.
4175 fn pred(c: char) -> bool { c == ' ' }
// Timed work: splitting via `pred` must give the reference count.
4177 b.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splitting ASCII text with a closure as the separator predicate.
4181 fn split_closure(b: &mut Bencher) {
4182 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// The reference piece count comes from splitting on the plain `char`.
4183 let len = s.split(' ').len();
// Timed work: the closure matches exactly the space character, and the
// split must give the reference count.
4185 b.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splitting ASCII text with a slice of characters as the separator.
4189 fn split_slice(b: &mut Bencher) {
4190 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// The reference piece count comes from splitting on the plain `char`.
4191 let len = s.split(' ').len();
// Timed work: the one-element slice `&[' ']` must give the reference count.
4193 b.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark set-up for UTF-8 validation of a 100-byte ASCII input.
// NOTE(review): the timed closure itself is not visible in this listing.
4197 fn is_utf8_100_ascii(b: &mut Bencher) {
4199 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4200 Lorem ipsum dolor sit amet, consectetur. ");
// Guards the input size the benchmark's name promises.
4202 assert_eq!(100, s.len());
// Benchmark set-up for UTF-8 validation of a 100-byte multi-byte input.
// NOTE(review): the timed closure itself is not visible in this listing.
4209 fn is_utf8_100_multibyte(b: &mut Bencher) {
4210 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guards the input size the benchmark's name promises.
4211 assert_eq!(100, s.len());
// Benchmark: `from_utf8_lossy` on a 100-byte ASCII input.
// NOTE(review): the `b.iter` wrapper around the call is not visible in this listing.
4218 fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
4219 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4220 Lorem ipsum dolor sit amet, consectetur. ");
// Guards the input size the benchmark's name promises.
4222 assert_eq!(100, s.len());
// The result is discarded; only the conversion itself is of interest.
4224 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on a 100-byte multi-byte input.
// NOTE(review): the `b.iter` wrapper around the call is not visible in this listing.
4229 fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
4230 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guards the input size the benchmark's name promises.
4231 assert_eq!(100, s.len());
// The result is discarded; only the conversion itself is of interest.
4233 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on a short input that mixes ASCII text with
// malformed sequences (0xC0 0x80 and a truncated 0xE6 0x83).
// NOTE(review): the `b.iter` wrapper around the call is not visible in this listing.
4238 fn from_utf8_lossy_invalid(b: &mut Bencher) {
4239 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
// The result is discarded; only the conversion itself is of interest.
4241 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on input made up solely of the byte 0xF5,
// which is not a valid UTF-8 lead byte.
// NOTE(review): the `b.iter` wrapper around the call is not visible in this listing.
4246 fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
// Presumably 100 copies of 0xF5 (`from_elem(count, value)`), going by the
// benchmark's name. Confirm against the `Vec::from_elem` signature.
4247 let s = Vec::from_elem(100, 0xF5u8);
// The result is discarded; only the conversion itself is of interest.
4249 let _ = from_utf8_lossy(s.as_slice());
4254 fn bench_connect(b: &mut Bencher) {
4255 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4257 let v = [s, s, s, s, s, s, s, s, s, s];
4259 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);