1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = ~"I am an owned string";
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
65 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
66 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
67 encoded UTF-8 sequences. Additionally, strings are not null-terminated
68 and can contain null codepoints.
70 The actual representations of strings have direct mappings to vectors:
72 * `~str` is the same as `~[u8]`
73 * `&str` is the same as `&[u8]`
82 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
83 use container::Container;
86 use iter::{Iterator, FromIterator, Extendable, range};
87 use iter::{Filter, AdditiveIterator, Map};
88 use iter::{Rev, DoubleEndedIterator, ExactSize};
91 use option::{None, Option, Some};
93 use from_str::FromStr;
95 use slice::{OwnedVector, ImmutableVector, MutableVector};
103 Section: Creating a string
106 /// Consumes a vector of bytes to create a new utf-8 string.
107 /// Returns None if the vector contains invalid UTF-8.
108 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
110 Some(unsafe { raw::from_utf8_owned(vv) })
116 /// Converts a vector to a string slice without performing any allocations.
118 /// Once the slice has been validated as utf-8, it is transmuted in-place and
119 /// returned as a '&str' instead of a '&[u8]'
121 /// Returns None if the slice is not utf-8.
122 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
124 Some(unsafe { raw::from_utf8(v) })
128 impl FromStr for ~str {
/// Parsing a string out of a string cannot fail: this always returns
/// `Some`, wrapping the result of `s.to_owned()`.
130 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
133 /// Convert a byte to a UTF-8 string
137 /// Fails if invalid UTF-8
138 pub fn from_byte(b: u8) -> ~str {
140 unsafe { ::cast::transmute(~[b]) }
143 /// Convert a char to a string
144 pub fn from_char(ch: char) -> ~str {
145 let mut buf = StrBuf::new();
150 /// Convert a vector of chars to a string
151 pub fn from_chars(chs: &[char]) -> ~str {
152 chs.iter().map(|c| *c).collect()
155 /// Methods for vectors of strings
156 pub trait StrVector {
157 /// Concatenate a vector of strings.
158 fn concat(&self) -> ~str;
160 /// Concatenate a vector of strings, placing a given separator between each.
161 fn connect(&self, sep: &str) -> ~str;
164 impl<'a, S: Str> StrVector for &'a [S] {
165 fn concat(&self) -> ~str {
166 if self.is_empty() { return ~""; }
168 // `len` calculation may overflow, but push_str will check boundaries
169 let len = self.iter().map(|s| s.as_slice().len()).sum();
171 let mut result = StrBuf::with_capacity(len);
173 for s in self.iter() {
174 result.push_str(s.as_slice())
180 fn connect(&self, sep: &str) -> ~str {
181 if self.is_empty() { return ~""; }
184 if sep.is_empty() { return self.concat(); }
186 // this is wrong without the guarantee that `self` is non-empty
187 // `len` calculation may overflow, but push_str will check boundaries
188 let len = sep.len() * (self.len() - 1)
189 + self.iter().map(|s| s.as_slice().len()).sum();
190 let mut result = StrBuf::with_capacity(len);
191 let mut first = true;
193 for s in self.iter() {
197 result.push_str(sep);
199 result.push_str(s.as_slice());
205 impl<'a, S: Str> StrVector for Vec<S> {
207 fn concat(&self) -> ~str {
208 self.as_slice().concat()
212 fn connect(&self, sep: &str) -> ~str {
213 self.as_slice().connect(sep)
217 /// Something that can be used to compare against a character
219 /// Determine if the splitter should split at the given character
220 fn matches(&self, char) -> bool;
221 /// Indicate if this is only concerned about ASCII characters,
222 /// which can allow for a faster implementation.
223 fn only_ascii(&self) -> bool;
226 impl CharEq for char {
/// A single character matches exactly when `c` is equal to it.
228 fn matches(&self, c: char) -> bool { *self == c }
/// Reports `true` when this character's numeric value is below 128.
230 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
233 impl<'a> CharEq for |char|: 'a -> bool {
/// Calls the closure with `c` and returns its answer.
235 fn matches(&self, c: char) -> bool { (*self)(c) }
/// Always reports `false` for a closure.
// NOTE(review): presumably because an arbitrary predicate cannot be known
// to care only about ASCII characters -- confirm against the callers.
237 fn only_ascii(&self) -> bool { false }
240 impl CharEq for extern "Rust" fn(char) -> bool {
/// Calls the function with `c` and returns its answer.
242 fn matches(&self, c: char) -> bool { (*self)(c) }
/// Always reports `false` for a bare function.
// NOTE(review): presumably because an arbitrary predicate cannot be known
// to care only about ASCII characters -- confirm against the callers.
244 fn only_ascii(&self) -> bool { false }
247 impl<'a, C: CharEq> CharEq for &'a [C] {
249 fn matches(&self, c: char) -> bool {
250 self.iter().any(|m| m.matches(c))
253 fn only_ascii(&self) -> bool {
254 self.iter().all(|m| m.only_ascii())
262 /// External iterator for a string's characters.
263 /// Use with the `std::iter` module.
265 pub struct Chars<'a> {
266 /// The slice remaining to be iterated
270 impl<'a> Iterator<char> for Chars<'a> {
272 fn next(&mut self) -> Option<char> {
273 // Decode the next codepoint, then update
274 // the slice to be just the remaining part
275 if self.string.len() != 0 {
276 let CharRange {ch, next} = self.string.char_range_at(0);
278 self.string = raw::slice_unchecked(self.string, next, self.string.len());
287 fn size_hint(&self) -> (uint, Option<uint>) {
288 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
292 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
294 fn next_back(&mut self) -> Option<char> {
295 if self.string.len() != 0 {
296 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
298 self.string = raw::slice_unchecked(self.string, 0, next);
307 /// External iterator for a string's characters and their byte offsets.
308 /// Use with the `std::iter` module.
310 pub struct CharOffsets<'a> {
311 /// The original string to be iterated
316 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
318 fn next(&mut self) -> Option<(uint, char)> {
319 // Compute the byte offset by using the pointer offset between
320 // the original string slice and the iterator's remaining part
321 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
322 self.iter.next().map(|ch| (offset, ch))
326 fn size_hint(&self) -> (uint, Option<uint>) {
327 self.iter.size_hint()
331 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
333 fn next_back(&mut self) -> Option<(uint, char)> {
334 self.iter.next_back().map(|ch| {
335 let offset = self.iter.string.len() +
336 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
342 /// External iterator for a string's characters in reverse order.
343 /// Use with the `std::iter` module.
344 pub type RevChars<'a> = Rev<Chars<'a>>;
346 /// External iterator for a string's characters and their byte offsets in reverse order.
347 /// Use with the `std::iter` module.
348 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
350 /// External iterator for a string's bytes.
351 /// Use with the `std::iter` module.
353 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
355 /// External iterator for a string's bytes in reverse order.
356 /// Use with the `std::iter` module.
357 pub type RevBytes<'a> = Rev<Bytes<'a>>;
359 /// An iterator over the substrings of a string, separated by `sep`.
361 pub struct CharSplits<'a, Sep> {
362 /// The slice remaining to be iterated
365 /// Whether an empty string at the end is allowed
366 allow_trailing_empty: bool,
371 /// An iterator over the substrings of a string, separated by `sep`,
372 /// starting from the back of the string.
373 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
375 /// An iterator over the substrings of a string, separated by `sep`,
376 /// splitting at most `count` times.
378 pub struct CharSplitsN<'a, Sep> {
379 iter: CharSplits<'a, Sep>,
380 /// The number of splits remaining
385 /// An iterator over the words of a string, separated by a sequence of whitespace
387 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
389 /// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
390 pub type AnyLines<'a> =
391 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
393 impl<'a, Sep> CharSplits<'a, Sep> {
395 fn get_end(&mut self) -> Option<&'a str> {
396 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
397 self.finished = true;
405 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
407 fn next(&mut self) -> Option<&'a str> {
408 if self.finished { return None }
410 let mut next_split = None;
412 for (idx, byte) in self.string.bytes().enumerate() {
413 if self.sep.matches(byte as char) && byte < 128u8 {
414 next_split = Some((idx, idx + 1));
419 for (idx, ch) in self.string.char_indices() {
420 if self.sep.matches(ch) {
421 next_split = Some((idx, self.string.char_range_at(idx).next));
427 Some((a, b)) => unsafe {
428 let elt = raw::slice_unchecked(self.string, 0, a);
429 self.string = raw::slice_unchecked(self.string, b, self.string.len());
432 None => self.get_end(),
437 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
438 for CharSplits<'a, Sep> {
440 fn next_back(&mut self) -> Option<&'a str> {
441 if self.finished { return None }
443 if !self.allow_trailing_empty {
444 self.allow_trailing_empty = true;
445 match self.next_back() {
446 Some(elt) if !elt.is_empty() => return Some(elt),
447 _ => if self.finished { return None }
450 let len = self.string.len();
451 let mut next_split = None;
454 for (idx, byte) in self.string.bytes().enumerate().rev() {
455 if self.sep.matches(byte as char) && byte < 128u8 {
456 next_split = Some((idx, idx + 1));
461 for (idx, ch) in self.string.char_indices_rev() {
462 if self.sep.matches(ch) {
463 next_split = Some((idx, self.string.char_range_at(idx).next));
469 Some((a, b)) => unsafe {
470 let elt = raw::slice_unchecked(self.string, b, len);
471 self.string = raw::slice_unchecked(self.string, 0, a);
474 None => { self.finished = true; Some(self.string) }
479 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
481 fn next(&mut self) -> Option<&'a str> {
484 if self.invert { self.iter.next_back() } else { self.iter.next() }
491 /// An iterator over the start and end indices of the matches of a
492 /// substring within a larger string
494 pub struct MatchIndices<'a> {
500 /// An iterator over the substrings of a string separated by a given
503 pub struct StrSplits<'a> {
504 it: MatchIndices<'a>,
509 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
511 fn next(&mut self) -> Option<(uint, uint)> {
512 // See Issue #1932 for why this is a naive search
513 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
514 let mut match_start = 0;
517 while self.position < h_len {
518 if self.haystack[self.position] == self.needle[match_i] {
519 if match_i == 0 { match_start = self.position; }
523 if match_i == n_len {
525 return Some((match_start, self.position));
528 // failed match, backtrack
531 self.position = match_start;
540 impl<'a> Iterator<&'a str> for StrSplits<'a> {
542 fn next(&mut self) -> Option<&'a str> {
543 if self.finished { return None; }
545 match self.it.next() {
546 Some((from, to)) => {
547 let ret = Some(self.it.haystack.slice(self.last_end, from));
552 self.finished = true;
553 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
559 // Helper functions used for Unicode normalization
560 fn canonical_sort(comb: &mut [(char, u8)]) {
564 let len = comb.len();
565 for i in range(0, len) {
566 let mut swapped = false;
567 for j in range(1, len-i) {
568 let class_a = *comb[j-1].ref1();
569 let class_b = *comb[j].ref1();
570 if class_a != 0 && class_b != 0 && class_a > class_b {
575 if !swapped { break; }
580 enum NormalizationForm {
585 /// External iterator for a string's normalization's characters.
586 /// Use with the `std::iter` module.
588 pub struct Normalizations<'a> {
589 kind: NormalizationForm,
591 buffer: Vec<(char, u8)>,
595 impl<'a> Iterator<char> for Normalizations<'a> {
597 fn next(&mut self) -> Option<char> {
598 use unicode::decompose::canonical_combining_class;
600 match self.buffer.as_slice().head() {
606 Some(&(c, _)) if self.sorted => {
610 _ => self.sorted = false
613 let decomposer = match self.kind {
614 NFD => char::decompose_canonical,
615 NFKD => char::decompose_compatible
619 for ch in self.iter {
620 let buffer = &mut self.buffer;
621 let sorted = &mut self.sorted;
623 let class = canonical_combining_class(d);
624 if class == 0 && !*sorted {
625 canonical_sort(buffer.as_mut_slice());
628 buffer.push((d, class));
635 canonical_sort(self.buffer.as_mut_slice());
639 match self.buffer.shift() {
644 Some((c, _)) => Some(c),
649 fn size_hint(&self) -> (uint, Option<uint>) {
650 let (lower, _) = self.iter.size_hint();
655 /// Replace all occurrences of one string with another
659 /// * s - The string containing substrings to replace
660 /// * from - The string to replace
661 /// * to - The replacement string
665 /// The original string with all occurrences of `from` replaced with `to`
666 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
667 let mut result = StrBuf::new();
668 let mut last_end = 0;
669 for (start, end) in s.match_indices(from) {
670 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
674 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
679 Section: Comparing strings
682 // share the implementation of the lang-item vs. non-lang-item
685 fn eq_slice_(a: &str, b: &str) -> bool {
686 a.len() == b.len() && unsafe {
687 libc::memcmp(a.as_ptr() as *libc::c_void,
688 b.as_ptr() as *libc::c_void,
689 a.len() as libc::size_t) == 0
693 /// Bytewise slice equality
697 pub fn eq_slice(a: &str, b: &str) -> bool {
701 /// Bytewise slice equality
704 pub fn eq_slice(a: &str, b: &str) -> bool {
708 /// Bytewise string equality
710 #[lang="uniq_str_eq"]
712 pub fn eq(a: &~str, b: &~str) -> bool {
718 pub fn eq(a: &~str, b: &~str) -> bool {
726 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
727 /// returning `true` in that case, or, if it is invalid, `false` with
728 /// `iter` reset such that it is pointing at the first byte in the
729 /// invalid sequence.
731 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
733 // save the current thing we're pointing at.
736 // restore the iterator we had at the start of this codepoint.
737 macro_rules! err ( () => { {*iter = old; return false} });
738 macro_rules! next ( () => {
741 // we needed data, but there was none: error!
746 let first = match iter.next() {
748 // we're at the end of the iterator and a codepoint
749 // boundary at the same time, so this string is valid.
753 // ASCII characters are always valid, so only large
754 // bytes need more examination.
756 let w = utf8_char_width(first);
757 let second = next!();
758 // 2-byte encoding is for codepoints \u0080 to \u07ff
759 // first C2 80 last DF BF
760 // 3-byte encoding is for codepoints \u0800 to \uffff
761 // first E0 A0 80 last EF BF BF
762 // excluding surrogates codepoints \ud800 to \udfff
763 // ED A0 80 to ED BF BF
764 // 4-byte encoding is for codepoints \u10000 to \u10ffff
765 // first F0 90 80 80 last F4 8F BF BF
767 // Use the UTF-8 syntax from the RFC
769 // https://tools.ietf.org/html/rfc3629
771 // UTF8-2 = %xC2-DF UTF8-tail
772 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
773 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
774 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
775 // %xF4 %x80-8F 2( UTF8-tail )
777 2 => if second & 192 != TAG_CONT_U8 {err!()},
779 match (first, second, next!() & 192) {
780 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
781 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
782 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
783 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
788 match (first, second, next!() & 192, next!() & 192) {
789 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
790 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
791 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
801 /// Determines if a vector of bytes contains valid UTF-8.
802 pub fn is_utf8(v: &[u8]) -> bool {
803 run_utf8_validation_iterator(&mut v.iter())
807 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
808 let mut it = v.iter();
810 let ok = run_utf8_validation_iterator(&mut it);
814 // work out how many valid bytes we've consumed
815 // (run_utf8_validation_iterator resets the iterator to just
816 // after the last good byte), which we can do because the
817 // vector iterator size_hint is exact.
818 let (remaining, _) = it.size_hint();
819 Some(v.len() - remaining)
823 /// Determines if a vector of `u16` contains valid UTF-16
824 pub fn is_utf16(v: &[u16]) -> bool {
825 let mut it = v.iter();
826 macro_rules! next ( ($ret:expr) => {
827 match it.next() { Some(u) => *u, None => return $ret }
833 match char::from_u32(u as u32) {
836 let u2 = next!(false);
837 if u < 0xD7FF || u > 0xDBFF ||
838 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
844 /// An iterator that decodes UTF-16 encoded codepoints from a vector
847 pub struct UTF16Items<'a> {
848 iter: slice::Items<'a, u16>
850 /// The possibilities for values decoded from a `u16` stream.
851 #[deriving(Eq, TotalEq, Clone, Show)]
853 /// A valid codepoint.
855 /// An invalid surrogate without its pair.
860 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
861 /// replacement character (U+FFFD).
863 pub fn to_char_lossy(&self) -> char {
866 LoneSurrogate(_) => '\uFFFD'
871 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
872 fn next(&mut self) -> Option<UTF16Item> {
873 let u = match self.iter.next() {
878 if u < 0xD800 || 0xDFFF < u {
880 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
881 } else if u >= 0xDC00 {
882 // a trailing surrogate
883 Some(LoneSurrogate(u))
885 // preserve state for rewinding.
888 let u2 = match self.iter.next() {
891 None => return Some(LoneSurrogate(u))
893 if u2 < 0xDC00 || u2 > 0xDFFF {
894 // not a trailing surrogate so we're not a valid
895 // surrogate pair, so rewind to redecode u2 next time.
897 return Some(LoneSurrogate(u))
900 // all ok, so let's decode it.
901 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
902 Some(ScalarValue(unsafe {cast::transmute(c)}))
907 fn size_hint(&self) -> (uint, Option<uint>) {
908 let (low, high) = self.iter.size_hint();
909 // we could be entirely valid surrogates (2 elements per
910 // char), or entirely non-surrogates (1 element per char)
915 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
916 /// returning invalid surrogates as `LoneSurrogate`s.
922 /// use std::str::{ScalarValue, LoneSurrogate};
924 /// // 𝄞mus<invalid>ic<invalid>
925 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
926 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
929 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
930 /// ~[ScalarValue('𝄞'),
931 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
932 /// LoneSurrogate(0xDD1E),
933 /// ScalarValue('i'), ScalarValue('c'),
934 /// LoneSurrogate(0xD834)]);
936 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
937 UTF16Items { iter : v.iter() }
940 /// Return a slice of `v` ending at (and not including) the first NUL
949 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
950 /// // no NULs so no change
951 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
955 /// assert_eq!(str::truncate_utf16_at_nul(v),
956 /// &['a' as u16, 'b' as u16]);
958 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
959 match v.iter().position(|c| *c == 0) {
960 // don't include the 0
961 Some(i) => v.slice_to(i),
966 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
967 /// if `v` contains any invalid data.
975 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
976 /// 0x0073, 0x0069, 0x0063];
977 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
979 /// // 𝄞mu<invalid>ic
981 /// assert_eq!(str::from_utf16(v), None);
983 pub fn from_utf16(v: &[u16]) -> Option<~str> {
984 let mut s = StrBuf::with_capacity(v.len() / 2);
985 for c in utf16_items(v) {
987 ScalarValue(c) => s.push_char(c),
988 LoneSurrogate(_) => return None
994 /// Decode a UTF-16 encoded vector `v` into a string, replacing
995 /// invalid data with the replacement character (U+FFFD).
1001 /// // 𝄞mus<invalid>ic<invalid>
1002 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1003 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1006 /// assert_eq!(str::from_utf16_lossy(v),
1007 /// ~"𝄞mus\uFFFDic\uFFFD");
1009 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1010 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1013 // https://tools.ietf.org/html/rfc3629
1014 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1015 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1016 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1017 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1018 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1019 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1020 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1021 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1022 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1023 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1024 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1025 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1026 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1027 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1028 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1029 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1030 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1033 /// Given a first byte, determine how many bytes are in this UTF-8 character
1035 pub fn utf8_char_width(b: u8) -> uint {
1036 return UTF8_CHAR_WIDTH[b as uint] as uint;
1039 /// Struct that contains a `char` and the index of the first byte of
1040 /// the next `char` in a string. This can be used as a data structure
1041 /// for iterating over the UTF-8 bytes of a string.
1042 pub struct CharRange {
1045 /// Index of the first byte of the next `char`
1049 // Return the initial codepoint accumulator for the first byte.
1050 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1051 // for width 3, and 3 bits for width 4
1052 macro_rules! utf8_first_byte(
1053 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1056 // return the value of $ch updated with continuation byte $byte
1057 macro_rules! utf8_acc_cont_byte(
1058 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
1061 static TAG_CONT_U8: u8 = 128u8;
1063 /// Converts a vector of bytes to a new utf-8 string.
1064 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1069 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1070 /// let output = std::str::from_utf8_lossy(input);
1071 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1073 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1074 let firstbad = match first_non_utf8_index(v) {
1075 None => return Slice(unsafe { cast::transmute(v) }),
1079 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1080 let mut i = firstbad;
1081 let total = v.len();
1082 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1083 unsafe { *xs.unsafe_ref(i) }
1085 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1093 let mut res = StrBuf::with_capacity(total);
1097 res.push_bytes(v.slice_to(i))
1101 // subseqidx is the index of the first byte of the subsequence we're looking at.
1102 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1104 let mut subseqidx = firstbad;
1108 let byte = unsafe_get(v, i);
1111 macro_rules! error(() => ({
1113 if subseqidx != i_ {
1114 res.push_bytes(v.slice(subseqidx, i_));
1117 res.push_bytes(REPLACEMENT);
1122 // subseqidx handles this
1124 let w = utf8_char_width(byte);
1128 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1135 match (byte, safe_get(v, i, total)) {
1136 (0xE0 , 0xA0 .. 0xBF) => (),
1137 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1138 (0xED , 0x80 .. 0x9F) => (),
1139 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1146 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1153 match (byte, safe_get(v, i, total)) {
1154 (0xF0 , 0x90 .. 0xBF) => (),
1155 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1156 (0xF4 , 0x80 .. 0x8F) => (),
1163 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1168 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1181 if subseqidx < total {
1183 res.push_bytes(v.slice(subseqidx, total))
1186 Owned(res.into_owned())
1193 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1194 /// This can be useful as an optimization when an allocation is sometimes
1195 /// needed but not always.
1196 pub enum MaybeOwned<'a> {
1197 /// A borrowed string
1203 /// SendStr is a specialization of `MaybeOwned` to be sendable
1204 pub type SendStr = MaybeOwned<'static>;
1206 impl<'a> MaybeOwned<'a> {
1207 /// Returns `true` if this `MaybeOwned` wraps an owned string
1209 pub fn is_owned(&self) -> bool {
1216 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1218 pub fn is_slice(&self) -> bool {
1226 /// Trait for moving into a `MaybeOwned`
1227 pub trait IntoMaybeOwned<'a> {
1228 /// Moves self into a `MaybeOwned`
1229 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1232 impl<'a> IntoMaybeOwned<'a> for ~str {
/// Wraps the owned string in the `Owned` variant.
1234 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1237 impl<'a> IntoMaybeOwned<'a> for &'a str {
/// Wraps the borrowed string in the `Slice` variant.
1239 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1242 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
/// A `MaybeOwned` is already in the target form, so it is returned as is.
1244 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1247 impl<'a> Eq for MaybeOwned<'a> {
1249 fn eq(&self, other: &MaybeOwned) -> bool {
1250 self.as_slice() == other.as_slice()
1254 impl<'a> TotalEq for MaybeOwned<'a> {}
1256 impl<'a> Ord for MaybeOwned<'a> {
1258 fn lt(&self, other: &MaybeOwned) -> bool {
1259 self.as_slice().lt(&other.as_slice())
1263 impl<'a> TotalOrd for MaybeOwned<'a> {
1265 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1266 self.as_slice().cmp(&other.as_slice())
1270 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1272 fn equiv(&self, other: &S) -> bool {
1273 self.as_slice() == other.as_slice()
1277 impl<'a> Str for MaybeOwned<'a> {
1279 fn as_slice<'b>(&'b self) -> &'b str {
1282 Owned(ref s) => s.as_slice()
1287 fn into_owned(self) -> ~str {
1289 Slice(s) => s.to_owned(),
1295 impl<'a> Container for MaybeOwned<'a> {
/// Length of the string, as reported by `len()` on its `as_slice()` view;
/// the answer does not depend on which variant is held.
1297 fn len(&self) -> uint { self.as_slice().len() }
1300 impl<'a> Clone for MaybeOwned<'a> {
1302 fn clone(&self) -> MaybeOwned<'a> {
1304 Slice(s) => Slice(s),
1305 Owned(ref s) => Owned(s.to_owned())
1310 impl<'a> Default for MaybeOwned<'a> {
/// The default value is the `Slice` variant holding the empty string
/// literal.
1312 fn default() -> MaybeOwned<'a> { Slice("") }
1315 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1317 fn hash(&self, hasher: &mut H) {
1319 Slice(s) => s.hash(hasher),
1320 Owned(ref s) => s.hash(hasher),
1325 impl<'a> fmt::Show for MaybeOwned<'a> {
1327 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1329 Slice(ref s) => s.fmt(f),
1330 Owned(ref s) => s.fmt(f)
1335 /// Unsafe operations
1338 use container::Container;
1344 use slice::{MutableVector, ImmutableVector, OwnedVector, Vector};
1345 use str::{is_utf8, StrSlice};
1348 /// Create a Rust string from a *u8 buffer of the given length
1349 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1350 let mut v = Vec::with_capacity(len);
1351 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1354 assert!(is_utf8(v.as_slice()));
1355 ::cast::transmute(v.move_iter().collect::<~[u8]>())
1358 #[lang="strdup_uniq"]
1361 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1362 from_buf_len(ptr, len)
1365 /// Create a Rust string from a null-terminated C string
1366 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1371 curr = buf.offset(i);
1373 from_buf_len(buf as *u8, i as uint)
1376 /// Converts a slice of bytes to a string slice without checking
1377 /// that the string contains valid UTF-8.
1378 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1382 /// Converts an owned vector of bytes to a new owned string. This assumes
1383 /// that the utf-8-ness of the vector has already been validated
1385 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1389 /// Converts a byte to a string.
///
/// The byte is placed in a one-element vector and handed straight to
/// `from_utf8_owned`; nothing here validates it, so the caller has to make
/// sure that the single byte is valid UTF-8 on its own.
1390 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1392 /// Form a slice from a C string. Unsafe because the caller must ensure the
1393 /// C string has the static lifetime, or else the return value may be
1394 /// invalidated later.
1395 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1399 while *curr != 0u8 {
1401 curr = s.offset(len as int);
1403 let v = Slice { data: s, len: len };
1404 assert!(is_utf8(::cast::transmute(v)));
1405 ::cast::transmute(v)
1408 /// Takes a bytewise (not UTF-8) slice from a string.
1410 /// Returns the substring from [`begin`..`end`).
1414 /// If begin is greater than end.
1415 /// If end is greater than the length of the string.
1417 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1418 assert!(begin <= end);
1419 assert!(end <= s.len());
1420 slice_unchecked(s, begin, end)
1423 /// Takes a bytewise (not UTF-8) slice from a string.
1425 /// Returns the substring from [`begin`..`end`).
1427 /// Caller must check slice boundaries!
1429 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1430 cast::transmute(Slice {
1431 data: s.as_ptr().offset(begin as int),
1436 /// Access the str in its vector representation.
1437 /// The caller must preserve the valid UTF-8 property when modifying.
1439 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1443 /// Sets the length of a string
1445 /// This will explicitly set the size of the string, without actually
1446 /// modifying its buffers, so it is up to the caller to ensure that
1447 /// the string is actually the specified size.
1449 fn test_from_buf_len() {
1451 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1453 let c = from_buf_len(b, 3u);
1454 assert_eq!(c, ~"AAA");
1460 Section: Trait implementations
1464 #[allow(missing_doc)]
1466 use container::Container;
1467 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1470 use option::{Some, None};
1471 use str::{Str, StrSlice, eq_slice};
1474 impl<'a> Add<&'a str,~str> for &'a str {
1476 fn add(&self, rhs: & &'a str) -> ~str {
1477 let mut ret = StrBuf::from_owned_str(self.to_owned());
1483 impl<'a> TotalOrd for &'a str { // total order on string slices: bytewise lexicographic
1485 fn cmp(&self, other: & &'a str) -> Ordering {
1486 for (s_b, o_b) in self.bytes().zip(other.bytes()) { // `zip` stops at the shorter string
1487 match s_b.cmp(&o_b) {
1488 Greater => return Greater,
1489 Less => return Less,
1494 self.len().cmp(&other.len()) // no differing byte in the shared prefix: order by byte length
1498 impl TotalOrd for ~str {
1500 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1503 impl<'a> Eq for &'a str {
1505 fn eq(&self, other: & &'a str) -> bool {
1506 eq_slice((*self), (*other))
1509 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1514 fn eq(&self, other: &~str) -> bool {
1515 eq_slice((*self), (*other))
1519 impl<'a> TotalEq for &'a str {}
1521 impl TotalEq for ~str {}
1523 impl<'a> Ord for &'a str {
1525 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1530 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1533 impl<'a, S: Str> Equiv<S> for &'a str {
1535 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1538 impl<'a, S: Str> Equiv<S> for ~str {
1540 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1547 /// Any string that can be represented as a slice
1549 /// Work with `self` as a slice.
1550 fn as_slice<'a>(&'a self) -> &'a str;
1552 /// Convert `self` into a ~str, not making a copy if possible.
1553 fn into_owned(self) -> ~str;
1555 /// Convert `self` into a `StrBuf`.
1557 fn to_strbuf(&self) -> StrBuf {
1558 StrBuf::from_str(self.as_slice())
1561 /// Convert `self` into a `StrBuf`, not making a copy if possible.
1563 fn into_strbuf(self) -> StrBuf {
1564 StrBuf::from_owned_str(self.into_owned())
1568 impl<'a> Str for &'a str {
1570 fn as_slice<'a>(&'a self) -> &'a str { *self }
1573 fn into_owned(self) -> ~str { self.to_owned() }
1576 impl<'a> Str for ~str {
1578 fn as_slice<'a>(&'a self) -> &'a str {
1579 let s: &'a str = *self; s
1583 fn into_owned(self) -> ~str { self }
1586 impl<'a> Container for &'a str {
1588 fn len(&self) -> uint {
1593 impl Container for ~str {
1595 fn len(&self) -> uint { self.as_slice().len() }
1598 /// Methods for string slices
1599 pub trait StrSlice<'a> {
1600 /// Returns true if one string contains another
1604 /// - needle - The string to look for
1605 fn contains<'a>(&self, needle: &'a str) -> bool;
1607 /// Returns true if a string contains a char.
1611 /// - needle - The char to look for
1612 fn contains_char(&self, needle: char) -> bool;
1614 /// An iterator over the characters of `self`. Note, this iterates
1615 /// over unicode code-points, not unicode graphemes.
1620 /// let v: ~[char] = "abc åäö".chars().collect();
1621 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1623 fn chars(&self) -> Chars<'a>;
1625 /// An iterator over the characters of `self`, in reverse order.
1626 fn chars_rev(&self) -> RevChars<'a>;
1628 /// An iterator over the bytes of `self`
1629 fn bytes(&self) -> Bytes<'a>;
1631 /// An iterator over the bytes of `self`, in reverse order
1632 fn bytes_rev(&self) -> RevBytes<'a>;
1634 /// An iterator over the characters of `self` and their byte offsets.
1635 fn char_indices(&self) -> CharOffsets<'a>;
1637 /// An iterator over the characters of `self` and their byte offsets,
1638 /// in reverse order.
1639 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1641 /// An iterator over substrings of `self`, separated by characters
1642 /// matched by `sep`.
1647 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1648 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1650 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1651 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1653 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1654 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1656 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1658 /// An iterator over substrings of `self`, separated by characters
1659 /// matched by `sep`, restricted to splitting at most `count`
1665 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1666 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1668 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1669 /// assert_eq!(v, ~["abc", "def2ghi"]);
1671 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1672 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1674 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1676 /// An iterator over substrings of `self`, separated by characters
1677 /// matched by `sep`.
1679 /// Equivalent to `split`, except that the trailing substring
1680 /// is skipped if empty (terminator semantics).
1685 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1686 /// assert_eq!(v, ~["A", "B"]);
1688 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1689 /// assert_eq!(v, ~["A", "", "B", ""]);
1691 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1693 /// An iterator over substrings of `self`, separated by characters
1694 /// matched by `sep`, in reverse order.
1699 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1700 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1702 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1703 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1705 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1706 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1708 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1710 /// An iterator over substrings of `self`, separated by characters
1711 /// matched by `sep`, starting from the end of the string.
1712 /// Restricted to splitting at most `count` times.
1717 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1718 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1720 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1721 /// assert_eq!(v, ~["ghi", "abc1def"]);
1723 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1724 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1726 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1728 /// An iterator over the start and end indices of the disjoint
1729 /// matches of `sep` within `self`.
1731 /// That is, each returned value `(start, end)` satisfies
1732 /// `self.slice(start, end) == sep`. For matches of `sep` within
1733 /// `self` that overlap, only the indices corresponding to the
1734 /// first match are returned. Fails if `sep` is empty.
1739 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1740 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1742 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1743 /// assert_eq!(v, ~[(1,4), (4,7)]);
1745 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1746 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1748 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1750 /// An iterator over the substrings of `self` separated by `sep`.
1755 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1756 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1758 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1759 /// assert_eq!(v, ~["1", "", "2"]);
1761 fn split_str(&self, &'a str) -> StrSplits<'a>;
1763 /// An iterator over the lines of a string (subsequences separated
1764 /// by `\n`). This does not include the empty string after a
1770 /// let four_lines = "foo\nbar\n\nbaz\n";
1771 /// let v: ~[&str] = four_lines.lines().collect();
1772 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1774 fn lines(&self) -> CharSplits<'a, char>;
1776 /// An iterator over the lines of a string, separated by either
1777 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1778 /// empty trailing line.
1783 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1784 /// let v: ~[&str] = four_lines.lines_any().collect();
1785 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1787 fn lines_any(&self) -> AnyLines<'a>;
1789 /// An iterator over the words of a string (subsequences separated
1790 /// by any sequence of whitespace). Sequences of whitespace are
1791 /// collapsed, so empty "words" are not included.
1796 /// let some_words = " Mary had\ta little \n\t lamb";
1797 /// let v: ~[&str] = some_words.words().collect();
1798 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1800 fn words(&self) -> Words<'a>;
1802 /// An Iterator over the string in Unicode Normalization Form D
1803 /// (canonical decomposition).
1804 fn nfd_chars(&self) -> Normalizations<'a>;
1806 /// An Iterator over the string in Unicode Normalization Form KD
1807 /// (compatibility decomposition).
1808 fn nfkd_chars(&self) -> Normalizations<'a>;
1810 /// Returns true if the string contains only whitespace.
1812 /// Whitespace characters are determined by `char::is_whitespace`.
1817 /// assert!(" \t\n".is_whitespace());
1818 /// assert!("".is_whitespace());
1820 /// assert!( !"abc".is_whitespace());
1822 fn is_whitespace(&self) -> bool;
1824 /// Returns true if the string contains only alphanumeric code
1827 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1832 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1833 /// assert!("".is_alphanumeric());
1835 /// assert!( !" &*~".is_alphanumeric());
1837 fn is_alphanumeric(&self) -> bool;
1839 /// Returns the number of Unicode code points (`char`) that a
1842 /// This does not perform any normalization, and is `O(n)`, since
1843 /// UTF-8 is a variable width encoding of code points.
1845 /// *Warning*: The number of code points in a string does not directly
1846 /// correspond to the number of visible characters or width of the
1847 /// visible text due to composing characters, and double- and
1848 /// zero-width ones.
1850 /// See also `.len()` for the byte length.
1855 /// // composed forms of `ö` and `é`
1856 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1857 /// // decomposed forms of `ö` and `é`
1858 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1860 /// assert_eq!(c.char_len(), 15);
1861 /// assert_eq!(d.char_len(), 17);
1863 /// assert_eq!(c.len(), 21);
1864 /// assert_eq!(d.len(), 23);
1866 /// // the two strings *look* the same
1867 /// println!("{}", c);
1868 /// println!("{}", d);
1870 fn char_len(&self) -> uint;
1872 /// Returns a slice of the given string from the byte range
1873 /// [`begin`..`end`).
1875 /// This operation is `O(1)`.
1877 /// Fails when `begin` and `end` do not point to valid characters
1878 /// or point beyond the last character of the string.
1880 /// See also `slice_to` and `slice_from` for slicing prefixes and
1881 /// suffixes of strings, and `slice_chars` for slicing based on
1882 /// code point counts.
1887 /// let s = "Löwe 老虎 Léopard";
1888 /// assert_eq!(s.slice(0, 1), "L");
1890 /// assert_eq!(s.slice(1, 9), "öwe 老");
1892 /// // these will fail:
1893 /// // byte 2 lies within `ö`:
1894 /// // s.slice(2, 3);
1896 /// // byte 8 lies within `老`
1897 /// // s.slice(1, 8);
1899 /// // byte 100 is outside the string
1900 /// // s.slice(3, 100);
1902 fn slice(&self, begin: uint, end: uint) -> &'a str;
1904 /// Returns a slice of the string from `begin` to its end.
1906 /// Equivalent to `self.slice(begin, self.len())`.
1908 /// Fails when `begin` does not point to a valid character, or is
1911 /// See also `slice`, `slice_to` and `slice_chars`.
1912 fn slice_from(&self, begin: uint) -> &'a str;
1914 /// Returns a slice of the string from the beginning to byte
1917 /// Equivalent to `self.slice(0, end)`.
1919 /// Fails when `end` does not point to a valid character, or is
1922 /// See also `slice`, `slice_from` and `slice_chars`.
1923 fn slice_to(&self, end: uint) -> &'a str;
1925 /// Returns a slice of the string from the character range
1926 /// [`begin`..`end`).
1928 /// That is, start at the `begin`-th code point of the string and
1929 /// continue to the `end`-th code point. This does not detect or
1930 /// handle edge cases such as leaving a combining character as the
1931 /// first code point of the string.
1933 /// Due to the design of UTF-8, this operation is `O(end)`.
1934 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1935 /// variants that use byte indices rather than code point
1938 /// Fails if `begin` > `end` or if either `begin` or `end` is
1939 /// beyond the last character of the string.
1944 /// let s = "Löwe 老虎 Léopard";
1945 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
1946 /// assert_eq!(s.slice_chars(5, 7), "老虎");
1948 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
1950 /// Returns true if `needle` is a prefix of the string.
1951 fn starts_with(&self, needle: &str) -> bool;
1953 /// Returns true if `needle` is a suffix of the string.
1954 fn ends_with(&self, needle: &str) -> bool;
1956 /// Escape each char in `self` with `char::escape_default`.
1957 fn escape_default(&self) -> ~str;
1959 /// Escape each char in `self` with `char::escape_unicode`.
1960 fn escape_unicode(&self) -> ~str;
1962 /// Returns a string with leading and trailing whitespace removed.
1963 fn trim(&self) -> &'a str;
1965 /// Returns a string with leading whitespace removed.
1966 fn trim_left(&self) -> &'a str;
1968 /// Returns a string with trailing whitespace removed.
1969 fn trim_right(&self) -> &'a str;
1971 /// Returns a string with characters that match `to_trim` removed.
1975 /// * to_trim - a character matcher
1980 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1981 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1982 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1984 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
1986 /// Returns a string with leading characters that match `to_trim` removed.
1990 /// * to_trim - a character matcher
1995 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
1996 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
1997 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
1999 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2001 /// Returns a string with trailing characters that match `to_trim` removed.
2005 /// * to_trim - a character matcher
2010 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2011 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2012 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2014 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2016 /// Replace all occurrences of one string with another.
2020 /// * `from` - The string to replace
2021 /// * `to` - The replacement string
2025 /// The original string with all occurrences of `from` replaced with `to`.
2030 /// let s = ~"Do you know the muffin man,
2031 /// The muffin man, the muffin man, ...";
2033 /// assert_eq!(s.replace("muffin man", "little lamb"),
2034 /// ~"Do you know the little lamb,
2035 /// The little lamb, the little lamb, ...");
2037 /// // not found, so no change.
2038 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2040 fn replace(&self, from: &str, to: &str) -> ~str;
2042 /// Copy a slice into a new owned str.
2043 fn to_owned(&self) -> ~str;
2045 /// Converts to a vector of `u16` encoded as UTF-16.
2046 fn to_utf16(&self) -> ~[u16];
2048 /// Check that `index`-th byte lies at the start and/or end of a
2049 /// UTF-8 code point sequence.
2051 /// The start and end of the string (when `index == self.len()`)
2052 /// are considered to be boundaries.
2054 /// Fails if `index` is greater than `self.len()`.
2059 /// let s = "Löwe 老虎 Léopard";
2060 /// assert!(s.is_char_boundary(0));
2062 /// assert!(s.is_char_boundary(6));
2063 /// assert!(s.is_char_boundary(s.len()));
2065 /// // second byte of `ö`
2066 /// assert!(!s.is_char_boundary(2));
2068 /// // third byte of `老`
2069 /// assert!(!s.is_char_boundary(8));
2071 fn is_char_boundary(&self, index: uint) -> bool;
2073 /// Pluck a character out of a string and return the index of the next
2076 /// This function can be used to iterate over the unicode characters of a
2081 /// This example manually iterates through the characters of a
2082 /// string; this should normally be done by `.chars()` or
2083 /// `.char_indices`.
2086 /// use std::str::CharRange;
2088 /// let s = "中华Việt Nam";
2090 /// while i < s.len() {
2091 /// let CharRange {ch, next} = s.char_range_at(i);
2092 /// println!("{}: {}", i, ch);
2114 /// * s - The string
2115 /// * i - The byte offset of the char to extract
2119 /// A record {ch: char, next: uint} containing the char value and the byte
2120 /// index of the next unicode character.
2124 /// If `i` is greater than or equal to the length of the string.
2125 /// If `i` is not the index of the beginning of a valid UTF-8 character.
2126 fn char_range_at(&self, start: uint) -> CharRange;
2128 /// Given a byte position and a str, return the previous char and its position.
2130 /// This function can be used to iterate over a unicode string in reverse.
2132 /// Returns 0 for next index if called on start index 0.
2133 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2135 /// Plucks the character starting at the `i`th byte of a string
2136 fn char_at(&self, i: uint) -> char;
2138 /// Plucks the character ending at the `i`th byte of a string
2139 fn char_at_reverse(&self, i: uint) -> char;
2141 /// Work with the byte buffer of a string as a byte slice.
2142 fn as_bytes(&self) -> &'a [u8];
2144 /// Returns the byte index of the first character of `self` that
2145 /// matches `search`.
2149 /// `Some` containing the byte index of the first matching character
2150 /// or `None` if there is no match.
2155 /// let s = "Löwe 老虎 Léopard";
2157 /// assert_eq!(s.find('L'), Some(0));
2158 /// assert_eq!(s.find('é'), Some(14));
2160 /// // the first space
2161 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2163 /// // neither are found
2164 /// assert_eq!(s.find(&['1', '2']), None);
2166 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2168 /// Returns the byte index of the last character of `self` that
2169 /// matches `search`.
2173 /// `Some` containing the byte index of the last matching character
2174 /// or `None` if there is no match.
2179 /// let s = "Löwe 老虎 Léopard";
2181 /// assert_eq!(s.rfind('L'), Some(13));
2182 /// assert_eq!(s.rfind('é'), Some(14));
2184 /// // the second space
2185 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2187 /// // searches for an occurrence of either `1` or `2`, but neither are found
2188 /// assert_eq!(s.rfind(&['1', '2']), None);
2190 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2192 /// Returns the byte index of the first matching substring
2196 /// * `needle` - The string to search for
2200 /// `Some` containing the byte index of the first matching substring
2201 /// or `None` if there is no match.
2206 /// let s = "Löwe 老虎 Léopard";
2208 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2209 /// assert_eq!(s.find_str("muffin man"), None);
2211 fn find_str(&self, &str) -> Option<uint>;
2213 /// Given a string, make a new string with repeated copies of it.
2214 fn repeat(&self, nn: uint) -> ~str;
2216 /// Retrieves the first character from a string slice and returns
2217 /// it. This does not allocate a new string; instead, it returns a
2218 /// slice that points one character beyond the character that was
2219 /// shifted. If the string does not contain any characters,
2220 /// a tuple of None and an empty string is returned instead.
2225 /// let s = "Löwe 老虎 Léopard";
2226 /// let (c, s1) = s.slice_shift_char();
2227 /// assert_eq!(c, Some('L'));
2228 /// assert_eq!(s1, "öwe 老虎 Léopard");
2230 /// let (c, s2) = s1.slice_shift_char();
2231 /// assert_eq!(c, Some('ö'));
2232 /// assert_eq!(s2, "we 老虎 Léopard");
2234 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2236 /// Levenshtein Distance between two strings.
2237 fn lev_distance(&self, t: &str) -> uint;
2239 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2241 /// Fails if `inner` is not a direct slice contained within self.
2246 /// let string = "a\nb\nc";
2247 /// let lines: ~[&str] = string.lines().collect();
2249 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2250 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2251 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2253 fn subslice_offset(&self, inner: &str) -> uint;
2255 /// Return an unsafe pointer to the string's buffer.
2257 /// The caller must ensure that the string outlives this pointer,
2258 /// and that it is not reallocated (e.g. by pushing to the
2260 fn as_ptr(&self) -> *u8;
2263 impl<'a> StrSlice<'a> for &'a str {
2265 fn contains<'a>(&self, needle: &'a str) -> bool {
2266 self.find_str(needle).is_some()
2270 fn contains_char(&self, needle: char) -> bool {
2271 self.find(needle).is_some()
2275 fn chars(&self) -> Chars<'a> {
2276 Chars{string: *self}
2280 fn chars_rev(&self) -> RevChars<'a> {
2285 fn bytes(&self) -> Bytes<'a> {
2286 self.as_bytes().iter().map(|&b| b)
2290 fn bytes_rev(&self) -> RevBytes<'a> {
2295 fn char_indices(&self) -> CharOffsets<'a> {
2296 CharOffsets{string: *self, iter: self.chars()}
2300 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2301 self.char_indices().rev()
2305 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2308 only_ascii: sep.only_ascii(),
2310 allow_trailing_empty: true,
2316 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2317 -> CharSplitsN<'a, Sep> {
2319 iter: self.split(sep),
2326 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2327 -> CharSplits<'a, Sep> {
2329 allow_trailing_empty: false,
2335 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2336 self.split(sep).rev()
2340 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2341 -> CharSplitsN<'a, Sep> {
2343 iter: self.split(sep),
2350 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2351 assert!(!sep.is_empty())
2360 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2362 it: self.match_indices(sep),
2369 fn lines(&self) -> CharSplits<'a, char> {
2370 self.split_terminator('\n')
2373 fn lines_any(&self) -> AnyLines<'a> {
2374 self.lines().map(|line| {
2376 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2382 fn words(&self) -> Words<'a> {
2383 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2387 fn nfd_chars(&self) -> Normalizations<'a> {
2397 fn nfkd_chars(&self) -> Normalizations<'a> {
2407 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2410 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2413 fn char_len(&self) -> uint { self.chars().len() }
2416 fn slice(&self, begin: uint, end: uint) -> &'a str {
2417 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2418 unsafe { raw::slice_bytes(*self, begin, end) }
2422 fn slice_from(&self, begin: uint) -> &'a str {
2423 self.slice(begin, self.len())
2427 fn slice_to(&self, end: uint) -> &'a str {
2428 assert!(self.is_char_boundary(end));
2429 unsafe { raw::slice_bytes(*self, 0, end) }
2432 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2433 assert!(begin <= end);
2435 let mut begin_byte = None;
2436 let mut end_byte = None;
2438 // This could be even more efficient by not decoding,
2439 // only finding the char boundaries
2440 for (idx, _) in self.char_indices() {
2441 if count == begin { begin_byte = Some(idx); }
2442 if count == end { end_byte = Some(idx); break; }
2445 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2446 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2448 match (begin_byte, end_byte) {
2449 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2450 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2451 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2456 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2457 let n = needle.len();
2458 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2462 fn ends_with(&self, needle: &str) -> bool {
2463 let (m, n) = (self.len(), needle.len());
2464 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2467 fn escape_default(&self) -> ~str {
2468 let mut out = StrBuf::with_capacity(self.len());
2469 for c in self.chars() {
2470 c.escape_default(|c| out.push_char(c));
2475 fn escape_unicode(&self) -> ~str {
2476 let mut out = StrBuf::with_capacity(self.len());
2477 for c in self.chars() {
2478 c.escape_unicode(|c| out.push_char(c));
2484 fn trim(&self) -> &'a str {
2485 self.trim_left().trim_right()
2489 fn trim_left(&self) -> &'a str {
2490 self.trim_left_chars(&char::is_whitespace)
2494 fn trim_right(&self) -> &'a str {
2495 self.trim_right_chars(&char::is_whitespace)
2499 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2500 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2504 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2505 match self.find(|c: char| !to_trim.matches(c)) {
2507 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2512 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2513 match self.rfind(|c: char| !to_trim.matches(c)) {
2516 let next = self.char_range_at(last).next;
2517 unsafe { raw::slice_bytes(*self, 0u, next) }
2522 fn replace(&self, from: &str, to: &str) -> ~str {
2523 let mut result = StrBuf::new();
2524 let mut last_end = 0;
2525 for (start, end) in self.match_indices(from) {
2526 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2527 result.push_str(to);
2530 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2535 fn to_owned(&self) -> ~str {
2536 let len = self.len();
2538 let mut v = Vec::with_capacity(len);
2540 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2542 ::cast::transmute(v.move_iter().collect::<~[u8]>())
2546 fn to_utf16(&self) -> ~[u16] { // encodes every code point of `self` as UTF-16 code units
2547 let mut u = Vec::new();
2548 for ch in self.chars() {
2549 let mut buf = [0u16, ..2]; // two-unit scratch buffer; `encode_utf16` returns how many units it wrote
2550 let n = ch.encode_utf16(buf /* as mut slice! */);
2551 u.push_all(buf.slice_to(n)); // append only the units actually written
2553 u.move_iter().collect()
2557 fn is_char_boundary(&self, index: uint) -> bool {
2558 if index == self.len() { return true; } // the end of the string counts as a boundary
2559 let b = self[index];
2560 return b < 128u8 || b >= 192u8; // false only for bytes in 128..191 (`10xxxxxx`, UTF-8 continuation bytes)
2564 fn char_range_at(&self, i: uint) -> CharRange {
2565 if self[i] < 128u8 {
2566 return CharRange {ch: self[i] as char, next: i + 1 };
2569 // Multibyte case is a fn to allow char_range_at to inline cleanly
2570 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2571 let mut val = s[i] as u32;
2572 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2575 val = utf8_first_byte!(val, w);
2576 val = utf8_acc_cont_byte!(val, s[i + 1]);
2577 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2578 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2580 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2583 return multibyte_char_range_at(*self, i);
2587 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2588 let mut prev = start;
2590 prev = prev.saturating_sub(1);
2591 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2593 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2594 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2595 // while there is a previous byte == 10......
2596 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2600 let mut val = s[i] as u32;
2601 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2604 val = utf8_first_byte!(val, w);
2605 val = utf8_acc_cont_byte!(val, s[i + 1]);
2606 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2607 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2609 return CharRange {ch: unsafe { transmute(val) }, next: i};
2612 return multibyte_char_range_at_reverse(*self, prev);
2616 fn char_at(&self, i: uint) -> char {
2617 self.char_range_at(i).ch
2621 fn char_at_reverse(&self, i: uint) -> char {
2622 self.char_range_at_reverse(i).ch
2626 fn as_bytes(&self) -> &'a [u8] {
2627 unsafe { cast::transmute(*self) }
2630 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
2631 if search.only_ascii() {
2632 self.bytes().position(|b| search.matches(b as char))
2634 for (index, c) in self.char_indices() {
2635 if search.matches(c) { return Some(index); }
2641 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
2642 if search.only_ascii() {
2643 self.bytes().rposition(|b| search.matches(b as char))
2645 for (index, c) in self.char_indices_rev() {
2646 if search.matches(c) { return Some(index); }
2652 fn find_str(&self, needle: &str) -> Option<uint> {
2653 if needle.is_empty() {
2656 self.match_indices(needle)
2658 .map(|(start, _end)| start)
2662 fn repeat(&self, nn: uint) -> ~str {
2663 let mut ret = StrBuf::with_capacity(nn * self.len());
2664 for _ in range(0, nn) {
2665 ret.push_str(*self);
2671 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2672 if self.is_empty() {
2673 return (None, *self);
2675 let CharRange {ch, next} = self.char_range_at(0u);
2676 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2677 return (Some(ch), next_s);
2681 fn lev_distance(&self, t: &str) -> uint {
2682 let slen = self.len();
2685 if slen == 0 { return tlen; }
2686 if tlen == 0 { return slen; }
2688 let mut dcol = Vec::from_fn(tlen + 1, |x| x);
2690 for (i, sc) in self.chars().enumerate() {
2692 let mut current = i;
2693 *dcol.get_mut(0) = current + 1;
2695 for (j, tc) in t.chars().enumerate() {
2697 let next = *dcol.get(j + 1);
2700 *dcol.get_mut(j + 1) = current;
2702 *dcol.get_mut(j + 1) = ::cmp::min(current, next);
2703 *dcol.get_mut(j + 1) = ::cmp::min(*dcol.get(j + 1),
2711 return *dcol.get(tlen);
2714 fn subslice_offset(&self, inner: &str) -> uint {
2715 let a_start = self.as_ptr() as uint;
2716 let a_end = a_start + self.len();
2717 let b_start = inner.as_ptr() as uint;
2718 let b_end = b_start + inner.len();
2720 assert!(a_start <= b_start);
2721 assert!(b_end <= a_end);
2726 fn as_ptr(&self) -> *u8 {
2731 /// Methods for owned strings
2732 pub trait OwnedStr {
2733 /// Consumes the string, returning the underlying byte buffer.
2735 /// The buffer does not have a null terminator.
2736 fn into_bytes(self) -> ~[u8];
2738 /// Pushes the given string onto this string, returning the concatenation of the two strings.
2739 fn append(self, rhs: &str) -> ~str;
2742 impl OwnedStr for ~str {
2744 fn into_bytes(self) -> ~[u8] {
2745 unsafe { cast::transmute(self) }
2749 fn append(self, rhs: &str) -> ~str {
2750 let mut new_str = StrBuf::from_owned_str(self);
2751 new_str.push_str(rhs);
2752 new_str.into_owned()
2756 impl Clone for ~str {
2758 fn clone(&self) -> ~str {
2763 impl FromIterator<char> for ~str {
2765 fn from_iter<T: Iterator<char>>(iterator: T) -> ~str {
2766 let (lower, _) = iterator.size_hint();
2767 let mut buf = StrBuf::with_capacity(lower);
2768 buf.extend(iterator);
2773 // This works because every lifetime is a sub-lifetime of 'static
2774 impl<'a> Default for &'a str {
2775 fn default() -> &'a str { "" }
2778 impl Default for ~str {
2779 fn default() -> ~str { ~"" }
2784 use iter::AdditiveIterator;
2785 use default::Default;
2792 assert!((eq(&~"", &~"")));
2793 assert!((eq(&~"foo", &~"foo")));
2794 assert!((!eq(&~"foo", &~"bar")));
2798 fn test_eq_slice() {
2799 assert!((eq_slice("foobar".slice(0, 3), "foo")));
2800 assert!((eq_slice("barfoo".slice(3, 6), "foo")));
2801 assert!((!eq_slice("foo1", "foo2")));
2807 assert!("" <= "foo");
2808 assert!("foo" <= "foo");
2809 assert!("foo" != "bar");
2814 assert_eq!("".len(), 0u);
2815 assert_eq!("hello world".len(), 11u);
2816 assert_eq!("\x63".len(), 1u);
2817 assert_eq!("\xa2".len(), 2u);
2818 assert_eq!("\u03c0".len(), 2u);
2819 assert_eq!("\u2620".len(), 3u);
2820 assert_eq!("\U0001d11e".len(), 4u);
2822 assert_eq!("".char_len(), 0u);
2823 assert_eq!("hello world".char_len(), 11u);
2824 assert_eq!("\x63".char_len(), 1u);
2825 assert_eq!("\xa2".char_len(), 1u);
2826 assert_eq!("\u03c0".char_len(), 1u);
2827 assert_eq!("\u2620".char_len(), 1u);
2828 assert_eq!("\U0001d11e".char_len(), 1u);
2829 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2834 assert_eq!("hello".find('l'), Some(2u));
2835 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2836 assert!("hello".find('x').is_none());
2837 assert!("hello".find(|c:char| c == 'x').is_none());
2838 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2839 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2844 assert_eq!("hello".rfind('l'), Some(3u));
2845 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2846 assert!("hello".rfind('x').is_none());
2847 assert!("hello".rfind(|c:char| c == 'x').is_none());
2848 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2849 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2855 let s: ~str = empty.chars().collect();
2856 assert_eq!(empty, s);
2857 let data = ~"ประเทศไทย中";
2858 let s: ~str = data.chars().collect();
2859 assert_eq!(data, s);
2863 fn test_into_bytes() {
2865 let buf = data.into_bytes();
2866 assert_eq!(bytes!("asdf"), buf.as_slice());
2870 fn test_find_str() {
2872 assert_eq!("".find_str(""), Some(0u));
2873 assert!("banana".find_str("apple pie").is_none());
2875 let data = "abcabc";
2876 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2877 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
2878 assert!(data.slice(2u, 4u).find_str("ab").is_none());
2880 let mut data = ~"ประเทศไทย中华Việt Nam";
2882 assert!(data.find_str("ไท华").is_none());
2883 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
2884 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
2886 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
2887 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
2888 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
2889 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
2890 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
2892 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
2893 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
2894 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
2895 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
2896 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
2900 fn test_slice_chars() {
2901 fn t(a: &str, b: &str, start: uint) {
2902 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2905 t("hello", "llo", 2);
2906 t("hello", "el", 1);
2909 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2914 fn t(v: &[~str], s: &str) {
2915 assert_eq!(v.concat(), s.to_str());
2917 t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
2918 let v: &[~str] = [];
2925 fn t(v: &[~str], sep: &str, s: &str) {
2926 assert_eq!(v.connect(sep), s.to_str());
2928 t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
2929 " ", "you know I'm no good");
2930 let v: &[~str] = [];
2932 t([~"hi"], " ", "hi");
2936 fn test_concat_slices() {
2937 fn t(v: &[&str], s: &str) {
2938 assert_eq!(v.concat(), s.to_str());
2940 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2941 let v: &[&str] = [];
2947 fn test_connect_slices() {
2948 fn t(v: &[&str], sep: &str, s: &str) {
2949 assert_eq!(v.connect(sep), s.to_str());
2951 t(["you", "know", "I'm", "no", "good"],
2952 " ", "you know I'm no good");
2954 t(["hi"], " ", "hi");
2959 assert_eq!("x".repeat(4), ~"xxxx");
2960 assert_eq!("hi".repeat(4), ~"hihihihi");
2961 assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
2962 assert_eq!("".repeat(4), ~"");
2963 assert_eq!("hi".repeat(0), ~"");
2967 fn test_unsafe_slice() {
2968 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
2969 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
2970 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
2971 fn a_million_letter_a() -> ~str {
2973 let mut rs = StrBuf::new();
2975 rs.push_str("aaaaaaaaaa");
2980 fn half_a_million_letter_a() -> ~str {
2982 let mut rs = StrBuf::new();
2984 rs.push_str("aaaaa");
2989 let letters = a_million_letter_a();
2990 assert!(half_a_million_letter_a() ==
2991 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
fn test_starts_with() {
    // (haystack, prefix, expected result)
    let cases = [
        ("", "", true),
        ("abc", "", true),
        ("abc", "a", true),
        // A prefix longer than the haystack never matches.
        ("a", "abc", false),
        ("", "abc", false),
        // Multi-byte first character.
        ("ödd", "-", false),
        ("ödd", "öd", true),
    ];

    for &(haystack, prefix, expected) in cases.iter() {
        assert!(haystack.starts_with(prefix) == expected,
                "`{}`.starts_with(`{}`)", haystack, prefix);
    }
}
fn test_ends_with() {
    // (haystack, suffix, expected result)
    let cases = [
        ("", "", true),
        ("abc", "", true),
        ("abc", "c", true),
        // A suffix longer than the haystack never matches.
        ("a", "abc", false),
        ("", "abc", false),
        // Multi-byte last character.
        ("ddö", "-", false),
        ("ddö", "dö", true),
    ];

    for &(haystack, suffix, expected) in cases.iter() {
        assert!(haystack.ends_with(suffix) == expected,
                "`{}`.ends_with(`{}`)", haystack, suffix);
    }
}
fn test_is_empty() {
    // (input, expected result)
    let cases = [("", true), ("a", false)];

    for &(input, expected) in cases.iter() {
        assert!(input.is_empty() == expected, "`{}`.is_empty()", input);
    }
}
3025 assert_eq!("".replace(a, "b"), ~"");
3026 assert_eq!("a".replace(a, "b"), ~"b");
3027 assert_eq!("ab".replace(a, "b"), ~"bb");
3029 assert!(" test test ".replace(test, "toast") ==
3031 assert_eq!(" test test ".replace(test, ""), ~" ");
3035 fn test_replace_2a() {
3036 let data = ~"ประเทศไทย中华";
3037 let repl = ~"دولة الكويت";
3040 let a2 = ~"دولة الكويتทศไทย中华";
3041 assert_eq!(data.replace(a, repl), a2);
3045 fn test_replace_2b() {
3046 let data = ~"ประเทศไทย中华";
3047 let repl = ~"دولة الكويت";
3050 let b2 = ~"ปรدولة الكويتทศไทย中华";
3051 assert_eq!(data.replace(b, repl), b2);
3055 fn test_replace_2c() {
3056 let data = ~"ประเทศไทย中华";
3057 let repl = ~"دولة الكويت";
3060 let c2 = ~"ประเทศไทยدولة الكويت";
3061 assert_eq!(data.replace(c, repl), c2);
3065 fn test_replace_2d() {
3066 let data = ~"ประเทศไทย中华";
3067 let repl = ~"دولة الكويت";
3070 assert_eq!(data.replace(d, repl), data);
3075 assert_eq!("ab", "abc".slice(0, 2));
3076 assert_eq!("bc", "abc".slice(1, 3));
3077 assert_eq!("", "abc".slice(1, 1));
3078 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3080 let data = "ประเทศไทย中华";
3081 assert_eq!("ป", data.slice(0, 3));
3082 assert_eq!("ร", data.slice(3, 6));
3083 assert_eq!("", data.slice(3, 3));
3084 assert_eq!("华", data.slice(30, 33));
3086 fn a_million_letter_X() -> ~str {
3088 let mut rs = StrBuf::new();
3090 rs.push_str("华华华华华华华华华华");
3095 fn half_a_million_letter_X() -> ~str {
3097 let mut rs = StrBuf::new();
3099 rs.push_str("华华华华华");
3104 let letters = a_million_letter_X();
3105 assert!(half_a_million_letter_X() ==
3106 letters.slice(0u, 3u * 500000u).to_owned());
3111 let ss = "中华Việt Nam";
3113 assert_eq!("华", ss.slice(3u, 6u));
3114 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3116 assert_eq!("ab", "abc".slice(0u, 2u));
3117 assert_eq!("bc", "abc".slice(1u, 3u));
3118 assert_eq!("", "abc".slice(1u, 1u));
3120 assert_eq!("中", ss.slice(0u, 3u));
3121 assert_eq!("华V", ss.slice(3u, 7u));
3122 assert_eq!("", ss.slice(3u, 3u));
3137 fn test_slice_fail() {
3138 "中华Việt Nam".slice(0u, 2u);
3142 fn test_slice_from() {
3143 assert_eq!("abcd".slice_from(0), "abcd");
3144 assert_eq!("abcd".slice_from(2), "cd");
3145 assert_eq!("abcd".slice_from(4), "");
3148 fn test_slice_to() {
3149 assert_eq!("abcd".slice_to(0), "");
3150 assert_eq!("abcd".slice_to(2), "ab");
3151 assert_eq!("abcd".slice_to(4), "abcd");
3155 fn test_trim_left_chars() {
3156 let v: &[char] = &[];
3157 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3158 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3159 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3160 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3162 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3163 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3164 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
3168 fn test_trim_right_chars() {
3169 let v: &[char] = &[];
3170 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3171 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3172 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3173 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
3175 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3176 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3177 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
3181 fn test_trim_chars() {
3182 let v: &[char] = &[];
3183 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3184 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3185 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3186 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
3188 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3189 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3190 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
3194 fn test_trim_left() {
3195 assert_eq!("".trim_left(), "");
3196 assert_eq!("a".trim_left(), "a");
3197 assert_eq!(" ".trim_left(), "");
3198 assert_eq!(" blah".trim_left(), "blah");
3199 assert_eq!(" \u3000 wut".trim_left(), "wut");
3200 assert_eq!("hey ".trim_left(), "hey ");
3204 fn test_trim_right() {
3205 assert_eq!("".trim_right(), "");
3206 assert_eq!("a".trim_right(), "a");
3207 assert_eq!(" ".trim_right(), "");
3208 assert_eq!("blah ".trim_right(), "blah");
3209 assert_eq!("wut \u3000 ".trim_right(), "wut");
3210 assert_eq!(" hey".trim_right(), " hey");
3215 assert_eq!("".trim(), "");
3216 assert_eq!("a".trim(), "a");
3217 assert_eq!(" ".trim(), "");
3218 assert_eq!(" blah ".trim(), "blah");
3219 assert_eq!("\nwut \u3000 ".trim(), "wut");
3220 assert_eq!(" hey dude ".trim(), "hey dude");
3224 fn test_is_whitespace() {
3225 assert!("".is_whitespace());
3226 assert!(" ".is_whitespace());
3227 assert!("\u2009".is_whitespace()); // Thin space
3228 assert!(" \n\t ".is_whitespace());
3229 assert!(!" _ ".is_whitespace());
3233 fn test_slice_shift_char() {
3234 let data = "ประเทศไทย中";
3235 assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
3239 fn test_slice_shift_char_2() {
3241 assert_eq!(empty.slice_shift_char(), (None, ""));
3246 // deny overlong encodings
3247 assert!(!is_utf8([0xc0, 0x80]));
3248 assert!(!is_utf8([0xc0, 0xae]));
3249 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3250 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3251 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3252 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3253 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3256 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3257 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3259 assert!(is_utf8([0xC2, 0x80]));
3260 assert!(is_utf8([0xDF, 0xBF]));
3261 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3262 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3263 assert!(is_utf8([0xEE, 0x80, 0x80]));
3264 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3265 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3266 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3270 fn test_is_utf16() {
3271 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3279 // surrogate pairs (randomly generated with Python 3's
3280 // .encode('utf-16be'))
3281 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3282 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3283 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3285 // mixtures (also random)
3286 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3287 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3288 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3291 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3294 // surrogate + regular unit
3296 // surrogate + lead surrogate
3298 // unterminated surrogate
3300 // trail surrogate without a lead
3303 // random byte sequences that Python 3's .decode('utf-16be')
3305 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3306 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3307 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3308 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3309 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3310 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3311 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3312 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3313 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3314 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3315 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3316 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3317 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3318 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3319 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3320 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3321 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3322 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3323 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3324 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3325 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3329 fn test_raw_from_c_str() {
3331 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3333 let c = raw::from_c_str(b);
3334 assert_eq!(c, ~"AAAAAAA");
3339 fn test_as_bytes() {
3342 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3343 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3346 assert_eq!("".as_bytes(), &[]);
3347 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3348 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
3353 fn test_as_bytes_fail() {
3354 // Don't double free. (I'm not sure if this exercises the
3355 // original problem code path anymore.)
3357 let _bytes = s.as_bytes();
3363 let buf = "hello".as_ptr();
3365 assert_eq!(*buf.offset(0), 'h' as u8);
3366 assert_eq!(*buf.offset(1), 'e' as u8);
3367 assert_eq!(*buf.offset(2), 'l' as u8);
3368 assert_eq!(*buf.offset(3), 'l' as u8);
3369 assert_eq!(*buf.offset(4), 'o' as u8);
3374 fn test_subslice_offset() {
3375 let a = "kernelsprite";
3376 let b = a.slice(7, a.len());
3377 let c = a.slice(0, a.len() - 6);
3378 assert_eq!(a.subslice_offset(b), 7);
3379 assert_eq!(a.subslice_offset(c), 0);
3381 let string = "a\nb\nc";
3382 let lines: ~[&str] = string.lines().collect();
3383 assert_eq!(string.subslice_offset(lines[0]), 0);
3384 assert_eq!(string.subslice_offset(lines[1]), 2);
3385 assert_eq!(string.subslice_offset(lines[2]), 4);
3390 fn test_subslice_offset_2() {
3391 let a = "alchemiter";
3392 let b = "cruxtruder";
3393 a.subslice_offset(b);
3397 fn vec_str_conversions() {
3398 let s1: ~str = ~"All mimsy were the borogoves";
3400 let v: ~[u8] = s1.as_bytes().to_owned();
3401 let s2: ~str = from_utf8(v).unwrap().to_owned();
3402 let mut i: uint = 0u;
3403 let n1: uint = s1.len();
3404 let n2: uint = v.len();
3417 fn test_contains() {
3418 assert!("abcde".contains("bcd"));
3419 assert!("abcde".contains("abcd"));
3420 assert!("abcde".contains("bcde"));
3421 assert!("abcde".contains(""));
3422 assert!("".contains(""));
3423 assert!(!"abcde".contains("def"));
3424 assert!(!"".contains("a"));
3426 let data = ~"ประเทศไทย中华Việt Nam";
3427 assert!(data.contains("ประเ"));
3428 assert!(data.contains("ะเ"));
3429 assert!(data.contains("中华"));
3430 assert!(!data.contains("ไท华"));
3434 fn test_contains_char() {
3435 assert!("abc".contains_char('b'));
3436 assert!("a".contains_char('a'));
3437 assert!(!"abc".contains_char('d'));
3438 assert!(!"".contains_char('a'));
3445 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3446 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3447 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3448 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3451 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3452 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3453 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3454 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3455 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3458 (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3459 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3460 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3461 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3462 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3463 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3464 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3465 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3467 (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3468 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3469 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3470 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3471 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3472 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3473 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3474 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3475 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3476 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3477 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3479 // Issue #12318, even-numbered non-BMP planes
3481 ~[0xD840, 0xDC00])];
3483 for p in pairs.iter() {
3484 let (s, u) = (*p).clone();
3485 assert!(is_utf16(u));
3486 assert_eq!(s.to_utf16(), u);
3488 assert_eq!(from_utf16(u).unwrap(), s);
3489 assert_eq!(from_utf16_lossy(u), s);
3491 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3492 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
3497 fn test_utf16_invalid() {
3498 // completely positive cases tested above.
3500 assert_eq!(from_utf16([0xD800]), None);
3502 assert_eq!(from_utf16([0xD800, 0xD800]), None);
3505 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
3508 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
3512 fn test_utf16_lossy() {
3513 // completely positive cases tested above.
3515 assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
3517 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
3520 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
3523 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
3527 fn test_truncate_utf16_at_nul() {
3529 assert_eq!(truncate_utf16_at_nul(v), &[]);
3532 assert_eq!(truncate_utf16_at_nul(v), &[]);
3535 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3538 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3541 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3546 let s = ~"ศไทย中华Việt Nam";
3547 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3549 for ch in v.iter() {
3550 assert!(s.char_at(pos) == *ch);
3551 pos += from_char(*ch).len();
3556 fn test_char_at_reverse() {
3557 let s = ~"ศไทย中华Việt Nam";
3558 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3559 let mut pos = s.len();
3560 for ch in v.rev_iter() {
3561 assert!(s.char_at_reverse(pos) == *ch);
3562 pos -= from_char(*ch).len();
3567 fn test_escape_unicode() {
3568 assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3569 assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3570 assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3571 assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3572 assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3573 assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3574 assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3575 assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3576 assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
3580 fn test_escape_default() {
3581 assert_eq!("abc".escape_default(), ~"abc");
3582 assert_eq!("a c".escape_default(), ~"a c");
3583 assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3584 assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3585 assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
3586 assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
3587 assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
3588 assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
fn test_total_ord() {
    // Each comparison is wrapped in `assert!`. As bare expression statements
    // (`... == Greater;`) the results were discarded and the test could never
    // fail.

    // A string is greater than any of its proper prefixes, and vice versa.
    assert!("1234".cmp(& &"123") == Greater);
    assert!("123".cmp(& &"1234") == Less);
    assert!("1234".cmp(& &"1234") == Equal);

    // Ordering is decided at the first differing byte, not by length.
    assert!("12345555".cmp(& &"123456") == Less);
    assert!("22".cmp(& &"1234") == Greater);
}
3601 fn test_char_range_at() {
3602 let data = ~"b¢€𤭢𤭢€¢b";
3603 assert_eq!('b', data.char_range_at(0).ch);
3604 assert_eq!('¢', data.char_range_at(1).ch);
3605 assert_eq!('€', data.char_range_at(3).ch);
3606 assert_eq!('𤭢', data.char_range_at(6).ch);
3607 assert_eq!('𤭢', data.char_range_at(10).ch);
3608 assert_eq!('€', data.char_range_at(14).ch);
3609 assert_eq!('¢', data.char_range_at(17).ch);
3610 assert_eq!('b', data.char_range_at(19).ch);
3614 fn test_char_range_at_reverse_underflow() {
3615 assert_eq!("abc".char_range_at_reverse(0).next, 0);
3620 #![allow(unnecessary_allocation)]
3622 ($s1:expr, $s2:expr, $e:expr) => { {
3626 assert_eq!(s1 + s2, e.to_owned());
3627 assert_eq!(s1.to_owned() + s2, e.to_owned());
3631 t!("foo", "bar", "foobar");
3632 t!("foo", ~"bar", "foobar");
3633 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
3634 t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
3638 fn test_iterator() {
3640 let s = ~"ศไทย中华Việt Nam";
3641 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3644 let mut it = s.chars();
3647 assert_eq!(c, v[pos]);
3650 assert_eq!(pos, v.len());
3654 fn test_rev_iterator() {
3656 let s = ~"ศไทย中华Việt Nam";
3657 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3660 let mut it = s.chars_rev();
3663 assert_eq!(c, v[pos]);
3666 assert_eq!(pos, v.len());
3670 fn test_iterator_clone() {
3671 let s = "ศไทย中华Việt Nam";
3672 let mut it = s.chars();
3674 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
3678 fn test_bytesator() {
3679 let s = ~"ศไทย中华Việt Nam";
3681 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3682 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3687 for b in s.bytes() {
3688 assert_eq!(b, v[pos]);
3694 fn test_bytes_revator() {
3695 let s = ~"ศไทย中华Việt Nam";
3697 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3698 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3701 let mut pos = v.len();
3703 for b in s.bytes_rev() {
3705 assert_eq!(b, v[pos]);
3710 fn test_char_indicesator() {
3712 let s = "ศไทย中华Việt Nam";
3713 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
3714 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3717 let mut it = s.char_indices();
3720 assert_eq!(c, (p[pos], v[pos]));
3723 assert_eq!(pos, v.len());
3724 assert_eq!(pos, p.len());
3728 fn test_char_indices_revator() {
3730 let s = "ศไทย中华Việt Nam";
3731 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
3732 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3735 let mut it = s.char_indices_rev();
3738 assert_eq!(c, (p[pos], v[pos]));
3741 assert_eq!(pos, v.len());
3742 assert_eq!(pos, p.len());
3746 fn test_split_char_iterator() {
3747 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3749 let split: ~[&str] = data.split(' ').collect();
3750 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3752 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
3754 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3756 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
3757 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3759 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
3761 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3764 let split: ~[&str] = data.split('ä').collect();
3765 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3767 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
3769 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3771 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
3772 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3774 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
3776 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3780 fn test_splitn_char_iterator() {
3781 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3783 let split: ~[&str] = data.splitn(' ', 3).collect();
3784 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3786 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
3787 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3790 let split: ~[&str] = data.splitn('ä', 3).collect();
3791 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3793 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
3794 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3798 fn test_rsplitn_char_iterator() {
3799 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3801 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
3803 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3805 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
3807 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3810 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
3812 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3814 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
3816 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3820 fn test_split_char_iterator_no_trailing() {
3821 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3823 let split: ~[&str] = data.split('\n').collect();
3824 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3826 let split: ~[&str] = data.split_terminator('\n').collect();
3827 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3831 fn test_rev_split_char_iterator_no_trailing() {
3832 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3834 let mut split: ~[&str] = data.split('\n').rev().collect();
3836 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3838 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
3840 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3845 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
3846 let words: ~[&str] = data.words().collect();
3847 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
3851 fn test_nfd_chars() {
3852 assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
3853 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
3854 assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
3855 assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
3856 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3857 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3858 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
3859 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
3860 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3861 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
3865 fn test_nfkd_chars() {
3866 assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
3867 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
3868 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
3869 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
3870 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3871 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3872 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
3873 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
3874 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3875 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
3880 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3881 let lines: ~[&str] = data.lines().collect();
3882 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3884 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3885 let lines: ~[&str] = data.lines().collect();
3886 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3890 fn test_split_strator() {
3891 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3892 let v: ~[&str] = s.split_str(sep).collect();
3895 t("--1233345--", "12345", ~["--1233345--"]);
3896 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3897 t("::hello::there", "::", ~["", "hello", "there"]);
3898 t("hello::there::", "::", ~["hello", "there", ""]);
3899 t("::hello::there::", "::", ~["", "hello", "there", ""]);
3900 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3901 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3902 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3903 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3905 t("zz", "zz", ~["",""]);
3906 t("ok", "z", ~["ok"]);
3907 t("zzz", "zz", ~["","z"]);
3908 t("zzzzz", "zz", ~["","","z"]);
3912 fn test_str_default() {
3913 use default::Default;
3914 fn t<S: Default + Str>() {
3915 let s: S = Default::default();
3916 assert_eq!(s.as_slice(), "");
3924 fn test_str_container() {
3925 fn sum_len<S: Container>(v: &[S]) -> uint {
3926 v.iter().map(|x| x.len()).sum()
3930 assert_eq!(5, sum_len(["012", "", "34"]));
3931 assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
3932 assert_eq!(5, sum_len([s.as_slice()]));
3936 fn test_str_from_utf8() {
3937 let xs = bytes!("hello");
3938 assert_eq!(from_utf8(xs), Some("hello"));
3940 let xs = bytes!("ศไทย中华Việt Nam");
3941 assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
3943 let xs = bytes!("hello", 0xff);
3944 assert_eq!(from_utf8(xs), None);
3948 fn test_str_from_utf8_owned() {
3949 let xs = bytes!("hello").to_owned();
3950 assert_eq!(from_utf8_owned(xs), Some(~"hello"));
3952 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
3953 assert_eq!(from_utf8_owned(xs), Some(~"ศไทย中华Việt Nam"));
3955 let xs = bytes!("hello", 0xff).to_owned();
3956 assert_eq!(from_utf8_owned(xs), None);
3960 fn test_str_from_utf8_lossy() {
3961 let xs = bytes!("hello");
3962 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
3964 let xs = bytes!("ศไทย中华Việt Nam");
3965 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
3967 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
3968 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
3970 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
3971 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
3973 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
3974 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
3976 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
3977 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
3979 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
3980 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
3982 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
3983 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
3986 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
3987 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
3991 fn test_from_str() {
3992 let owned: Option<~str> = from_str(&"string");
3993 assert_eq!(owned, Some(~"string"));
// Exercises the trait-provided operations on both `Slice` and `Owned`
// values holding the same text: length, slicing, string conversion,
// formatting, ordering, default value, and cross-variant comparison.
3997 fn test_maybe_owned_traits() {
// Borrowed variant.
3998 let s = Slice("abcde");
3999 assert_eq!(s.len(), 5);
4000 assert_eq!(s.as_slice(), "abcde");
4001 assert_eq!(s.to_str(), ~"abcde");
4002 assert_eq!(format!("{}", s), ~"abcde");
// Ordering is checked against the *other* variant.
4003 assert!(s.lt(&Owned(~"bcdef")));
4004 assert_eq!(Slice(""), Default::default());
// Owned variant: the same checks, mirrored.
4006 let o = Owned(~"abcde");
4007 assert_eq!(o.len(), 5);
4008 assert_eq!(o.as_slice(), "abcde");
4009 assert_eq!(o.to_str(), ~"abcde");
4010 assert_eq!(format!("{}", o), ~"abcde");
4011 assert!(o.lt(&Slice("bcdef")));
4012 assert_eq!(Owned(~""), Default::default());
// Values with the same text compare as equal/equivalent in both directions,
// regardless of variant.
4014 assert!(s.cmp(&o) == Equal);
4015 assert!(s.equiv(&o));
4017 assert!(o.cmp(&s) == Equal);
4018 assert!(o.equiv(&s));
// Checks that `is_slice` and `is_owned` report the variant correctly:
// exactly one of the two is true for each value.
4022 fn test_maybe_owned_methods() {
4023 let s = Slice("abcde");
4024 assert!(s.is_slice());
4025 assert!(!s.is_owned());
4027 let o = Owned(~"abcde");
4028 assert!(!o.is_slice());
4029 assert!(o.is_owned());
// Checks that cloning either variant yields a value equal to the original
// text. All four Slice/Owned pairings are asserted, so this relies on
// equality holding across variants and does not pin which variant
// `clone` returns.
4033 fn test_maybe_owned_clone() {
4034 assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4035 assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4036 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4037 assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
// Checks that `into_owned` produces the same owned string from either
// variant.
4041 fn test_maybe_owned_into_owned() {
4042 assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4043 assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
// Checks `into_maybe_owned` on both a borrowed literal and an owned string.
// Each result is compared against both `Slice` and `Owned` with the same
// text, so only the contents are verified, not which variant is produced.
4047 fn test_into_maybe_owned() {
4048 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4049 assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4050 assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4051 assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4058 use self::test::Bencher;
// Benchmark: forward `chars()` iteration over a mixed-script string.
// The expected count is taken once from `char_len()` outside the timed
// closure; each iteration asserts the iterator's length matches it.
4063 fn char_iterator(b: &mut Bencher) {
4064 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4065 let len = s.char_len();
4067 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: forward `chars()` iteration over a longer string built from
// a repeated English sentence. The literal spans several source lines, so
// the line breaks are part of the string. The expected count comes from
// `char_len()` computed once outside the timed closure.
4071 fn char_iterator_ascii(b: &mut Bencher) {
4072 let s = "Mary had a little lamb, Little lamb
4073 Mary had a little lamb, Little lamb
4074 Mary had a little lamb, Little lamb
4075 Mary had a little lamb, Little lamb
4076 Mary had a little lamb, Little lamb
4077 Mary had a little lamb, Little lamb";
4078 let len = s.char_len();
4080 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: `chars_rev()` iteration over a mixed-script string, asserting
// on each run that it yields as many items as `char_len()` reported.
4084 fn char_iterator_rev(b: &mut Bencher) {
4085 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4086 let len = s.char_len();
4088 b.iter(|| assert_eq!(s.chars_rev().len(), len));
// Benchmark: `char_indices()` iteration over a mixed-script string,
// asserting on each run that it yields as many items as `char_len()`
// reported.
4092 fn char_indicesator(b: &mut Bencher) {
4093 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4094 let len = s.char_len();
4096 b.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: `char_indices_rev()` iteration over a mixed-script string,
// asserting on each run that it yields as many items as `char_len()`
// reported.
4100 fn char_indicesator_rev(b: &mut Bencher) {
4101 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4102 let len = s.char_len();
4104 b.iter(|| assert_eq!(s.char_indices_rev().len(), len));
// Benchmark: splitting a multi-byte string on the ASCII character 'V'.
// The haystack contains 'V' twice, hence the expected three pieces.
4108 fn split_unicode_ascii(b: &mut Bencher) {
4109 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4111 b.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: same split as on a bare 'V', but through a local `CharEq`
// wrapper whose `only_ascii` returns false.
// NOTE(review): presumably this keeps `split` off any ASCII-only fast
// path so the general path is what gets measured — confirm against the
// `split` implementation.
4115 fn split_unicode_not_ascii(b: &mut Bencher) {
// Newtype around the character to match.
4116 struct NotAscii(char);
4117 impl CharEq for NotAscii {
4118 fn matches(&self, c: char) -> bool {
4119 let NotAscii(cc) = *self;
// NOTE(review): the expression that uses `cc` is not visible here;
// presumably it compares `cc` with `c` — confirm.
4122 fn only_ascii(&self) -> bool { false }
4124 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4126 b.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splitting an English sentence on a space character.
// The expected piece count is computed once, outside the timed closure,
// with the same `split(' ')` call.
4131 fn split_ascii(b: &mut Bencher) {
4132 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4133 let len = s.split(' ').len();
4135 b.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splitting an English sentence on a space, but through a local
// `CharEq` wrapper whose `only_ascii` returns false. The expected piece
// count is taken from a plain `split(' ')` outside the timed closure.
// NOTE(review): presumably the wrapper keeps `split` off any ASCII-only
// fast path — confirm against the `split` implementation.
4139 fn split_not_ascii(b: &mut Bencher) {
// Newtype around the character to match.
4140 struct NotAscii(char);
4141 impl CharEq for NotAscii {
4143 fn matches(&self, c: char) -> bool {
4144 let NotAscii(cc) = *self;
// NOTE(review): the expression that uses `cc` is not visible here;
// presumably it compares `cc` with `c` — confirm.
4147 fn only_ascii(&self) -> bool { false }
4149 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4150 let len = s.split(' ').len();
4152 b.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splitting an English sentence using a named function as the
// separator predicate. The expected piece count is taken from a plain
// `split(' ')` outside the timed closure.
4156 fn split_extern_fn(b: &mut Bencher) {
4157 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4158 let len = s.split(' ').len();
// Predicate: true for a space character.
4159 fn pred(c: char) -> bool { c == ' ' }
4161 b.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splitting an English sentence using a closure as the
// separator predicate. The expected piece count is taken from a plain
// `split(' ')` outside the timed closure.
4165 fn split_closure(b: &mut Bencher) {
4166 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4167 let len = s.split(' ').len();
4169 b.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splitting an English sentence using a one-element slice of
// characters as the separator. The expected piece count is taken from a
// plain `split(' ')` outside the timed closure.
4173 fn split_slice(b: &mut Bencher) {
4174 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4175 let len = s.split(' ').len();
4177 b.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark setup: an English-text byte string, asserted to be exactly
// 100 bytes long so the input size is fixed.
// NOTE(review): the timed call is not visible in these lines; from the
// function name it is presumably `is_utf8(s)` — confirm.
4181 fn is_utf8_100_ascii(b: &mut Bencher) {
4183 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4184 Lorem ipsum dolor sit amet, consectetur. ");
4186 assert_eq!(100, s.len());
// Benchmark setup: a byte string of multi-byte characters from several
// scripts, asserted to be exactly 100 bytes long so the input size is fixed.
// NOTE(review): the timed call is not visible in these lines; from the
// function name it is presumably `is_utf8(s)` — confirm.
4193 fn is_utf8_100_multibyte(b: &mut Bencher) {
4194 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4195 assert_eq!(100, s.len());
// Benchmark: `from_utf8_lossy` on an English-text byte string. The input
// is asserted to be exactly 100 bytes long so the size is fixed.
4202 fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
4203 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4204 Lorem ipsum dolor sit amet, consectetur. ");
4206 assert_eq!(100, s.len());
// The result is discarded; only the conversion itself matters here.
4208 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on a byte string of multi-byte characters
// from several scripts. The input is asserted to be exactly 100 bytes long
// so the size is fixed.
4213 fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
4214 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4215 assert_eq!(100, s.len());
// The result is discarded; only the conversion itself matters here.
4217 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on mostly-ASCII input containing two
// malformed sequences: an overlong `0xC0 0x80` pair and a truncated
// three-byte sequence `0xE6 0x83`.
4222 fn from_utf8_lossy_invalid(b: &mut Bencher) {
4223 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
// The result is discarded; only the conversion itself matters here.
4225 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on input made entirely of bad bytes:
// 100 copies of 0xF5, which is never a legal UTF-8 lead byte.
4230 fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
4231 let s = Vec::from_elem(100, 0xF5u8);
// The result is discarded; only the conversion itself matters here.
4233 let _ = from_utf8_lossy(s.as_slice());
// Benchmark for `connect` on an array holding the same mixed-script string
// ten times. The assertion checks the joined length in bytes: ten copies
// of `s` plus nine separators.
// NOTE(review): the definition of `sep` and the enclosing timed closure
// are not visible in these lines — confirm against the full file.
4238 fn bench_connect(b: &mut Bencher) {
4239 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4241 let v = [s, s, s, s, s, s, s, s, s, s];
4243 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);