1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = ~"I am an owned string";
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
65 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
66 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
67 encoded UTF-8 sequences. Additionally, strings are not null-terminated
68 and can contain null codepoints.
70 The actual representation of strings has direct mappings to vectors:
72 * `~str` is the same as `~[u8]`
73 * `&str` is the same as `&[u8]`
82 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
83 use container::{Container, Mutable};
86 use iter::{Iterator, FromIterator, Extendable, range};
87 use iter::{Filter, AdditiveIterator, Map};
88 use iter::{Rev, DoubleEndedIterator, ExactSize};
91 use option::{None, Option, Some};
93 use from_str::FromStr;
95 use slice::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector};
103 Section: Creating a string
106 /// Consumes a vector of bytes to create a new utf-8 string.
107 /// Returns None if the vector contains invalid UTF-8.
108 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
110 Some(unsafe { raw::from_utf8_owned(vv) })
116 /// Converts a vector to a string slice without performing any allocations.
118 /// Once the slice has been validated as utf-8, it is transmuted in-place and
119 /// returned as a '&str' instead of a '&[u8]'
121 /// Returns None if the slice is not utf-8.
122 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
124 Some(unsafe { raw::from_utf8(v) })
128 impl FromStr for ~str {
130 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
133 /// Convert a byte to a UTF-8 string
137 /// Fails if invalid UTF-8
138 pub fn from_byte(b: u8) -> ~str {
140 unsafe { ::cast::transmute(~[b]) }
143 /// Convert a char to a string
144 pub fn from_char(ch: char) -> ~str {
145 let mut buf = StrBuf::new();
150 /// Convert a vector of chars to a string
151 pub fn from_chars(chs: &[char]) -> ~str {
152 chs.iter().map(|c| *c).collect()
155 /// Methods for vectors of strings
156 pub trait StrVector {
157 /// Concatenate a vector of strings.
158 fn concat(&self) -> ~str;
160 /// Concatenate a vector of strings, placing a given separator between each.
161 fn connect(&self, sep: &str) -> ~str;
164 impl<'a, S: Str> StrVector for &'a [S] {
165 fn concat(&self) -> ~str {
166 if self.is_empty() { return ~""; }
168 // `len` calculation may overflow, but push_str will check boundaries
169 let len = self.iter().map(|s| s.as_slice().len()).sum();
171 let mut result = StrBuf::with_capacity(len);
173 for s in self.iter() {
174 result.push_str(s.as_slice())
180 fn connect(&self, sep: &str) -> ~str {
181 if self.is_empty() { return ~""; }
184 if sep.is_empty() { return self.concat(); }
186 // this is wrong without the guarantee that `self` is non-empty
187 // `len` calculation may overflow, but push_str will check boundaries
188 let len = sep.len() * (self.len() - 1)
189 + self.iter().map(|s| s.as_slice().len()).sum();
190 let mut result = StrBuf::with_capacity(len);
191 let mut first = true;
193 for s in self.iter() {
197 result.push_str(sep);
199 result.push_str(s.as_slice());
205 impl<'a, S: Str> StrVector for Vec<S> {
207 fn concat(&self) -> ~str {
208 self.as_slice().concat()
212 fn connect(&self, sep: &str) -> ~str {
213 self.as_slice().connect(sep)
217 /// Something that can be used to compare against a character
219 /// Determine if the splitter should split at the given character
220 fn matches(&self, char) -> bool;
221 /// Indicate if this is only concerned about ASCII characters,
222 /// which can allow for a faster implementation.
223 fn only_ascii(&self) -> bool;
226 impl CharEq for char {
228 fn matches(&self, c: char) -> bool { *self == c }
230 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
233 impl<'a> CharEq for |char|: 'a -> bool {
235 fn matches(&self, c: char) -> bool { (*self)(c) }
237 fn only_ascii(&self) -> bool { false }
240 impl CharEq for extern "Rust" fn(char) -> bool {
242 fn matches(&self, c: char) -> bool { (*self)(c) }
244 fn only_ascii(&self) -> bool { false }
247 impl<'a, C: CharEq> CharEq for &'a [C] {
249 fn matches(&self, c: char) -> bool {
250 self.iter().any(|m| m.matches(c))
253 fn only_ascii(&self) -> bool {
254 self.iter().all(|m| m.only_ascii())
262 /// External iterator for a string's characters.
263 /// Use with the `std::iter` module.
265 pub struct Chars<'a> {
266 /// The slice remaining to be iterated
270 impl<'a> Iterator<char> for Chars<'a> {
272 fn next(&mut self) -> Option<char> {
273 // Decode the next codepoint, then update
274 // the slice to be just the remaining part
275 if self.string.len() != 0 {
276 let CharRange {ch, next} = self.string.char_range_at(0);
278 self.string = raw::slice_unchecked(self.string, next, self.string.len());
287 fn size_hint(&self) -> (uint, Option<uint>) {
288 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
292 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
294 fn next_back(&mut self) -> Option<char> {
295 if self.string.len() != 0 {
296 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
298 self.string = raw::slice_unchecked(self.string, 0, next);
307 /// External iterator for a string's characters and their byte offsets.
308 /// Use with the `std::iter` module.
310 pub struct CharOffsets<'a> {
311 /// The original string to be iterated
316 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
318 fn next(&mut self) -> Option<(uint, char)> {
319 // Compute the byte offset by using the pointer offset between
320 // the original string slice and the iterator's remaining part
321 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
322 self.iter.next().map(|ch| (offset, ch))
326 fn size_hint(&self) -> (uint, Option<uint>) {
327 self.iter.size_hint()
331 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
333 fn next_back(&mut self) -> Option<(uint, char)> {
334 self.iter.next_back().map(|ch| {
335 let offset = self.iter.string.len() +
336 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
342 /// External iterator for a string's characters in reverse order.
343 /// This is an alias for `Rev<Chars<'a>>`; use with the `std::iter` module.
344 pub type RevChars<'a> = Rev<Chars<'a>>;
346 /// External iterator for a string's characters and their byte offsets in reverse order.
347 /// This is an alias for `Rev<CharOffsets<'a>>`; use with the `std::iter` module.
348 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
350 /// External iterator for a string's bytes.
351 /// Use with the `std::iter` module.
353 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
355 /// External iterator for a string's bytes in reverse order.
356 /// This is an alias for `Rev<Bytes<'a>>`; use with the `std::iter` module.
357 pub type RevBytes<'a> = Rev<Bytes<'a>>;
359 /// An iterator over the substrings of a string, separated by `sep`.
361 pub struct CharSplits<'a, Sep> {
362 /// The slice remaining to be iterated
365 /// Whether an empty string at the end is allowed
366 allow_trailing_empty: bool,
371 /// An iterator over the substrings of a string, separated by `sep`,
372 /// starting from the back of the string (an alias for `Rev<CharSplits<'a, Sep>>`).
373 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
375 /// An iterator over the substrings of a string, separated by `sep`,
376 /// splitting at most `count` times.
378 pub struct CharSplitsN<'a, Sep> {
379 iter: CharSplits<'a, Sep>,
380 /// The number of splits remaining
385 /// An iterator over the words of a string, separated by a sequence of whitespace
387 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
389 /// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
390 pub type AnyLines<'a> =
391 Map<'a, &'a str, &'a str, CharSplits<'a, char>>; // a `Map` adaptor over a `CharSplits` whose separator is a single `char`
393 impl<'a, Sep> CharSplits<'a, Sep> {
395 fn get_end(&mut self) -> Option<&'a str> {
396 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
397 self.finished = true;
405 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
407 fn next(&mut self) -> Option<&'a str> {
408 if self.finished { return None }
410 let mut next_split = None;
412 for (idx, byte) in self.string.bytes().enumerate() {
413 if self.sep.matches(byte as char) && byte < 128u8 {
414 next_split = Some((idx, idx + 1));
419 for (idx, ch) in self.string.char_indices() {
420 if self.sep.matches(ch) {
421 next_split = Some((idx, self.string.char_range_at(idx).next));
427 Some((a, b)) => unsafe {
428 let elt = raw::slice_unchecked(self.string, 0, a);
429 self.string = raw::slice_unchecked(self.string, b, self.string.len());
432 None => self.get_end(),
437 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
438 for CharSplits<'a, Sep> {
440 fn next_back(&mut self) -> Option<&'a str> {
441 if self.finished { return None }
443 if !self.allow_trailing_empty {
444 self.allow_trailing_empty = true;
445 match self.next_back() {
446 Some(elt) if !elt.is_empty() => return Some(elt),
447 _ => if self.finished { return None }
450 let len = self.string.len();
451 let mut next_split = None;
454 for (idx, byte) in self.string.bytes().enumerate().rev() {
455 if self.sep.matches(byte as char) && byte < 128u8 {
456 next_split = Some((idx, idx + 1));
461 for (idx, ch) in self.string.char_indices_rev() {
462 if self.sep.matches(ch) {
463 next_split = Some((idx, self.string.char_range_at(idx).next));
469 Some((a, b)) => unsafe {
470 let elt = raw::slice_unchecked(self.string, b, len);
471 self.string = raw::slice_unchecked(self.string, 0, a);
474 None => { self.finished = true; Some(self.string) }
479 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
481 fn next(&mut self) -> Option<&'a str> {
484 if self.invert { self.iter.next_back() } else { self.iter.next() }
491 /// An iterator over the start and end indices of the matches of a
492 /// substring within a larger string
494 pub struct MatchIndices<'a> {
500 /// An iterator over the substrings of a string separated by a given
503 pub struct StrSplits<'a> {
504 it: MatchIndices<'a>,
509 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
511 fn next(&mut self) -> Option<(uint, uint)> {
512 // See Issue #1932 for why this is a naive search
513 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
514 let mut match_start = 0;
517 while self.position < h_len {
518 if self.haystack[self.position] == self.needle[match_i] {
519 if match_i == 0 { match_start = self.position; }
523 if match_i == n_len {
525 return Some((match_start, self.position));
528 // failed match, backtrack
531 self.position = match_start;
540 impl<'a> Iterator<&'a str> for StrSplits<'a> {
542 fn next(&mut self) -> Option<&'a str> {
543 if self.finished { return None; }
545 match self.it.next() {
546 Some((from, to)) => {
547 let ret = Some(self.it.haystack.slice(self.last_end, from));
552 self.finished = true;
553 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
559 // Helper functions used for Unicode normalization
560 fn canonical_sort(comb: &mut [(char, u8)]) {
564 let len = comb.len();
565 for i in range(0, len) {
566 let mut swapped = false;
567 for j in range(1, len-i) {
568 let class_a = *comb[j-1].ref1();
569 let class_b = *comb[j].ref1();
570 if class_a != 0 && class_b != 0 && class_a > class_b {
575 if !swapped { break; }
580 enum NormalizationForm {
585 /// External iterator for a string's normalization's characters.
586 /// Use with the `std::iter` module.
588 pub struct Normalizations<'a> {
589 kind: NormalizationForm,
591 buffer: ~[(char, u8)],
595 impl<'a> Iterator<char> for Normalizations<'a> {
597 fn next(&mut self) -> Option<char> {
598 use unicode::decompose::canonical_combining_class;
600 match self.buffer.head() {
606 Some(&(c, _)) if self.sorted => {
610 _ => self.sorted = false
613 let decomposer = match self.kind {
614 NFD => char::decompose_canonical,
615 NFKD => char::decompose_compatible
619 for ch in self.iter {
620 let buffer = &mut self.buffer;
621 let sorted = &mut self.sorted;
623 let class = canonical_combining_class(d);
624 if class == 0 && !*sorted {
625 canonical_sort(*buffer);
628 buffer.push((d, class));
635 canonical_sort(self.buffer);
639 match self.buffer.shift() {
644 Some((c, _)) => Some(c),
649 fn size_hint(&self) -> (uint, Option<uint>) {
650 let (lower, _) = self.iter.size_hint();
655 /// Replace all occurrences of one string with another
659 /// * s - The string containing substrings to replace
660 /// * from - The string to replace
661 /// * to - The replacement string
665 /// The original string with all occurrences of `from` replaced with `to`
666 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
667 let mut result = StrBuf::new();
668 let mut last_end = 0;
669 for (start, end) in s.match_indices(from) {
670 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
674 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
679 Section: Comparing strings
682 // share the implementation of the lang-item vs. non-lang-item
685 fn eq_slice_(a: &str, b: &str) -> bool {
686 a.len() == b.len() && unsafe {
687 libc::memcmp(a.as_ptr() as *libc::c_void,
688 b.as_ptr() as *libc::c_void,
689 a.len() as libc::size_t) == 0
693 /// Bytewise slice equality
697 pub fn eq_slice(a: &str, b: &str) -> bool {
701 /// Bytewise slice equality
704 pub fn eq_slice(a: &str, b: &str) -> bool {
708 /// Bytewise string equality
710 #[lang="uniq_str_eq"]
712 pub fn eq(a: &~str, b: &~str) -> bool {
718 pub fn eq(a: &~str, b: &~str) -> bool {
726 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
727 /// returning `true` in that case, or, if it is invalid, `false` with
728 /// `iter` reset such that it is pointing at the first byte in the
729 /// invalid sequence.
731 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
733 // save the current thing we're pointing at.
736 // restore the iterator we had at the start of this codepoint.
737 macro_rules! err ( () => { {*iter = old; return false} });
738 macro_rules! next ( () => {
741 // we needed data, but there was none: error!
746 let first = match iter.next() {
748 // we're at the end of the iterator and a codepoint
749 // boundary at the same time, so this string is valid.
753 // ASCII characters are always valid, so only large
754 // bytes need more examination.
756 let w = utf8_char_width(first);
757 let second = next!();
758 // 2-byte encoding is for codepoints \u0080 to \u07ff
759 // first C2 80 last DF BF
760 // 3-byte encoding is for codepoints \u0800 to \uffff
761 // first E0 A0 80 last EF BF BF
762 // excluding surrogate codepoints \ud800 to \udfff
763 // ED A0 80 to ED BF BF
764 // 4-byte encoding is for codepoints \u10000 to \u10ffff
765 // first F0 90 80 80 last F4 8F BF BF
767 // Use the UTF-8 syntax from the RFC
769 // https://tools.ietf.org/html/rfc3629
771 // UTF8-2 = %xC2-DF UTF8-tail
772 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
773 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
774 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
775 // %xF4 %x80-8F 2( UTF8-tail )
777 2 => if second & 192 != TAG_CONT_U8 {err!()},
779 match (first, second, next!() & 192) {
780 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
781 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
782 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
783 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
788 match (first, second, next!() & 192, next!() & 192) {
789 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
790 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
791 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
801 /// Determines if a vector of bytes contains valid UTF-8.
802 pub fn is_utf8(v: &[u8]) -> bool {
803 run_utf8_validation_iterator(&mut v.iter())
807 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
808 let mut it = v.iter();
810 let ok = run_utf8_validation_iterator(&mut it);
814 // work out how many valid bytes we've consumed
815 // (run_utf8_validation_iterator resets the iterator to just
816 // after the last good byte), which we can do because the
817 // vector iterator size_hint is exact.
818 let (remaining, _) = it.size_hint();
819 Some(v.len() - remaining)
823 /// Determines if a vector of `u16` contains valid UTF-16
824 pub fn is_utf16(v: &[u16]) -> bool {
825 let mut it = v.iter();
826 macro_rules! next ( ($ret:expr) => {
827 match it.next() { Some(u) => *u, None => return $ret }
833 match char::from_u32(u as u32) {
836 let u2 = next!(false);
837 if u < 0xD7FF || u > 0xDBFF ||
838 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
844 /// An iterator that decodes UTF-16 encoded codepoints from a vector
847 pub struct UTF16Items<'a> {
848 iter: slice::Items<'a, u16>
850 /// The possibilities for values decoded from a `u16` stream.
851 #[deriving(Eq, TotalEq, Clone, Show)]
853 /// A valid codepoint.
855 /// An invalid surrogate without its pair.
860 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
861 /// replacement character (U+FFFD).
863 pub fn to_char_lossy(&self) -> char {
866 LoneSurrogate(_) => '\uFFFD'
871 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
872 fn next(&mut self) -> Option<UTF16Item> {
873 let u = match self.iter.next() {
878 if u < 0xD800 || 0xDFFF < u {
880 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
881 } else if u >= 0xDC00 {
882 // a trailing surrogate
883 Some(LoneSurrogate(u))
885 // preserve state for rewinding.
888 let u2 = match self.iter.next() {
891 None => return Some(LoneSurrogate(u))
893 if u2 < 0xDC00 || u2 > 0xDFFF {
894 // not a trailing surrogate so we're not a valid
895 // surrogate pair, so rewind to redecode u2 next time.
897 return Some(LoneSurrogate(u))
900 // all ok, so let's decode it.
901 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
902 Some(ScalarValue(unsafe {cast::transmute(c)}))
907 fn size_hint(&self) -> (uint, Option<uint>) {
908 let (low, high) = self.iter.size_hint();
909 // we could be entirely valid surrogates (2 elements per
910 // char), or entirely non-surrogates (1 element per char)
915 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
916 /// returning invalid surrogates as `LoneSurrogate`s.
922 /// use std::str::{ScalarValue, LoneSurrogate};
924 /// // 𝄞mus<invalid>ic<invalid>
925 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
926 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
929 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
930 /// ~[ScalarValue('𝄞'),
931 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
932 /// LoneSurrogate(0xDD1E),
933 /// ScalarValue('i'), ScalarValue('c'),
934 /// LoneSurrogate(0xD834)]);
936 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
937 UTF16Items { iter : v.iter() }
940 /// Return a slice of `v` ending at (and not including) the first NUL
949 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
950 /// // no NULs so no change
951 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
955 /// assert_eq!(str::truncate_utf16_at_nul(v),
956 /// &['a' as u16, 'b' as u16]);
958 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
959 match v.iter().position(|c| *c == 0) {
960 // don't include the 0
961 Some(i) => v.slice_to(i),
966 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
967 /// if `v` contains any invalid data.
975 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
976 /// 0x0073, 0x0069, 0x0063];
977 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
979 /// // 𝄞mu<invalid>ic
981 /// assert_eq!(str::from_utf16(v), None);
983 pub fn from_utf16(v: &[u16]) -> Option<~str> {
984 let mut s = StrBuf::with_capacity(v.len() / 2);
985 for c in utf16_items(v) {
987 ScalarValue(c) => s.push_char(c),
988 LoneSurrogate(_) => return None
994 /// Decode a UTF-16 encoded vector `v` into a string, replacing
995 /// invalid data with the replacement character (U+FFFD).
1001 /// // 𝄞mus<invalid>ic<invalid>
1002 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1003 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1006 /// assert_eq!(str::from_utf16_lossy(v),
1007 /// ~"𝄞mus\uFFFDic\uFFFD");
1009 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1010 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1013 // https://tools.ietf.org/html/rfc3629
1014 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1015 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1016 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1017 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1018 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1019 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1020 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1021 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1022 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1023 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1024 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1025 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1026 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1027 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1028 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1029 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1030 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1033 /// Given a first byte, determine how many bytes are in this UTF-8 character
1035 pub fn utf8_char_width(b: u8) -> uint {
1036 return UTF8_CHAR_WIDTH[b as uint] as uint;
1039 /// Struct that contains a `char` and the index of the first byte of
1040 /// the next `char` in a string. This can be used as a data structure
1041 /// for iterating over the UTF-8 bytes of a string.
1042 pub struct CharRange {
1045 /// Index of the first byte of the next `char`
1049 // Return the initial codepoint accumulator for the first byte.
1050 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1051 // for width 3, and 3 bits for width 4
1052 macro_rules! utf8_first_byte(
1053 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1056 // return the value of $ch updated with continuation byte $byte
1057 macro_rules! utf8_acc_cont_byte(
1058 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
1061 static TAG_CONT_U8: u8 = 128u8; // 0b1000_0000: the UTF-8 validation code in this file compares `byte & 192` against this value to recognise continuation bytes
1063 /// Converts a vector of bytes to a new utf-8 string.
1064 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1069 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1070 /// let output = std::str::from_utf8_lossy(input);
1071 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1073 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1074 let firstbad = match first_non_utf8_index(v) {
1075 None => return Slice(unsafe { cast::transmute(v) }),
1079 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1080 let mut i = firstbad;
1081 let total = v.len();
1082 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1083 unsafe { *xs.unsafe_ref(i) }
1085 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1093 let mut res = StrBuf::with_capacity(total);
1097 res.push_bytes(v.slice_to(i))
1101 // subseqidx is the index of the first byte of the subsequence we're looking at.
1102 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1104 let mut subseqidx = firstbad;
1108 let byte = unsafe_get(v, i);
1111 macro_rules! error(() => ({
1113 if subseqidx != i_ {
1114 res.push_bytes(v.slice(subseqidx, i_));
1117 res.push_bytes(REPLACEMENT);
1122 // subseqidx handles this
1124 let w = utf8_char_width(byte);
1128 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1135 match (byte, safe_get(v, i, total)) {
1136 (0xE0 , 0xA0 .. 0xBF) => (),
1137 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1138 (0xED , 0x80 .. 0x9F) => (),
1139 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1146 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1153 match (byte, safe_get(v, i, total)) {
1154 (0xF0 , 0x90 .. 0xBF) => (),
1155 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1156 (0xF4 , 0x80 .. 0x8F) => (),
1163 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1168 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1181 if subseqidx < total {
1183 res.push_bytes(v.slice(subseqidx, total))
1186 Owned(res.into_owned())
1193 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1194 /// This can be useful as an optimization when an allocation is sometimes
1195 /// needed but not always.
1196 pub enum MaybeOwned<'a> {
1197 /// A borrowed string
1203 /// `SendStr` is a specialization of `MaybeOwned` (fixed to the `'static` lifetime) to be sendable
1204 pub type SendStr = MaybeOwned<'static>;
1206 impl<'a> MaybeOwned<'a> {
1207 /// Returns `true` if this `MaybeOwned` wraps an owned string
1209 pub fn is_owned(&self) -> bool {
1216 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1218 pub fn is_slice(&self) -> bool {
1226 /// Trait for moving into a `MaybeOwned`
1227 pub trait IntoMaybeOwned<'a> {
1228 /// Moves self into a `MaybeOwned`
1229 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1232 impl<'a> IntoMaybeOwned<'a> for ~str {
1234 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1237 impl<'a> IntoMaybeOwned<'a> for &'a str {
1239 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1242 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1244 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1247 impl<'a> Eq for MaybeOwned<'a> {
1249 fn eq(&self, other: &MaybeOwned) -> bool {
1250 self.as_slice() == other.as_slice()
1254 impl<'a> TotalEq for MaybeOwned<'a> {} // empty impl body: no methods are defined here. NOTE(review): presumably a marker layered on the `Eq` impl for `MaybeOwned` — confirm
1256 impl<'a> Ord for MaybeOwned<'a> {
1258 fn lt(&self, other: &MaybeOwned) -> bool {
1259 self.as_slice().lt(&other.as_slice())
1263 impl<'a> TotalOrd for MaybeOwned<'a> {
1265 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1266 self.as_slice().cmp(&other.as_slice())
1270 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1272 fn equiv(&self, other: &S) -> bool {
1273 self.as_slice() == other.as_slice()
1277 impl<'a> Str for MaybeOwned<'a> {
1279 fn as_slice<'b>(&'b self) -> &'b str {
1282 Owned(ref s) => s.as_slice()
1287 fn into_owned(self) -> ~str {
1289 Slice(s) => s.to_owned(),
1295 impl<'a> Container for MaybeOwned<'a> {
1297 fn len(&self) -> uint { self.as_slice().len() }
1300 impl<'a> Clone for MaybeOwned<'a> {
1302 fn clone(&self) -> MaybeOwned<'a> {
1304 Slice(s) => Slice(s),
1305 Owned(ref s) => Owned(s.to_owned())
1310 impl<'a> Default for MaybeOwned<'a> {
1312 fn default() -> MaybeOwned<'a> { Slice("") }
1315 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1317 fn hash(&self, hasher: &mut H) {
1319 Slice(s) => s.hash(hasher),
1320 Owned(ref s) => s.hash(hasher),
1325 impl<'a> fmt::Show for MaybeOwned<'a> {
1327 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1329 Slice(ref s) => s.fmt(f),
1330 Owned(ref s) => s.fmt(f)
1335 /// Unsafe operations
1338 use container::Container;
1342 use str::{is_utf8, OwnedStr, StrSlice};
1344 use slice::{MutableVector, ImmutableVector, OwnedVector};
1347 /// Create a Rust string from a *u8 buffer of the given length
1348 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1349 let mut v: ~[u8] = slice::with_capacity(len);
1350 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1353 assert!(is_utf8(v));
1354 ::cast::transmute(v)
1357 #[lang="strdup_uniq"]
1360 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1361 from_buf_len(ptr, len)
1364 /// Create a Rust string from a null-terminated C string
1365 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1370 curr = buf.offset(i);
1372 from_buf_len(buf as *u8, i as uint)
1375 /// Converts a slice of bytes to a string slice without checking
1376 /// that the string contains valid UTF-8.
1377 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1381 /// Converts an owned vector of bytes to a new owned string. This assumes
1382 /// that the utf-8-ness of the vector has already been validated
1384 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1388 /// Converts a byte to a string by handing the one-element vector `~[u]` to `from_utf8_owned`; no check is performed on this line. NOTE(review): `from_utf8_owned` is documented as assuming its input was already validated as UTF-8, so the caller presumably must guarantee that `u` alone is valid UTF-8 — confirm
1389 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1391 /// Form a slice from a C string. Unsafe because the caller must ensure the
1392 /// C string has the static lifetime, or else the return value may be
1393 /// invalidated later.
1394 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1398 while *curr != 0u8 {
1400 curr = s.offset(len as int);
1402 let v = Slice { data: s, len: len };
1403 assert!(is_utf8(::cast::transmute(v)));
1404 ::cast::transmute(v)
1407 /// Takes a bytewise (not UTF-8) slice from a string.
1409 /// Returns the substring from [`begin`..`end`).
1413 /// If begin is greater than end.
1414 /// If end is greater than the length of the string.
1416 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1417 assert!(begin <= end);
1418 assert!(end <= s.len());
1419 slice_unchecked(s, begin, end)
1422 /// Takes a bytewise (not UTF-8) slice from a string.
1424 /// Returns the substring from [`begin`..`end`).
1426 /// Caller must check slice boundaries!
1428 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1429 cast::transmute(Slice {
1430 data: s.as_ptr().offset(begin as int),
1435 /// Access the str in its vector representation.
1436 /// The caller must preserve the valid UTF-8 property when modifying.
1438 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1442 /// Sets the length of a string
1444 /// This will explicitly set the size of the string, without actually
1445 /// modifying its buffers, so it is up to the caller to ensure that
1446 /// the string is actually the specified size.
1448 fn test_from_buf_len() {
1450 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1452 let c = from_buf_len(b, 3u);
1453 assert_eq!(c, ~"AAA");
1459 Section: Trait implementations
1463 #[allow(missing_doc)]
1465 use container::Container;
1466 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1469 use option::{Some, None};
1470 use str::{Str, StrSlice, eq_slice};
1473 impl<'a> Add<&'a str,~str> for &'a str {
1475 fn add(&self, rhs: & &'a str) -> ~str {
1476 let mut ret = StrBuf::from_owned_str(self.to_owned());
1482 impl<'a> TotalOrd for &'a str {
1484 fn cmp(&self, other: & &'a str) -> Ordering {
1485 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1486 match s_b.cmp(&o_b) {
1487 Greater => return Greater,
1488 Less => return Less,
1493 self.len().cmp(&other.len())
1497 impl TotalOrd for ~str {
1499 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1502 impl<'a> Eq for &'a str {
1504 fn eq(&self, other: & &'a str) -> bool {
1505 eq_slice((*self), (*other))
1508 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1513 fn eq(&self, other: &~str) -> bool {
1514 eq_slice((*self), (*other))
1518 impl<'a> TotalEq for &'a str {}
1520 impl TotalEq for ~str {}
1522 impl<'a> Ord for &'a str {
1524 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1529 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1532 impl<'a, S: Str> Equiv<S> for &'a str {
1534 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1537 impl<'a, S: Str> Equiv<S> for ~str {
1539 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1546 /// Any string that can be represented as a slice
1548 /// Work with `self` as a slice.
1549 fn as_slice<'a>(&'a self) -> &'a str;
1551 /// Convert `self` into a ~str, not making a copy if possible.
1552 fn into_owned(self) -> ~str;
1554 /// Convert `self` into a `StrBuf`.
1556 fn to_strbuf(&self) -> StrBuf {
1557 StrBuf::from_str(self.as_slice())
1560 /// Convert `self` into a `StrBuf`, not making a copy if possible.
1562 fn into_strbuf(self) -> StrBuf {
1563 StrBuf::from_owned_str(self.into_owned())
1567 impl<'a> Str for &'a str {
1569 fn as_slice<'a>(&'a self) -> &'a str { *self }
1572 fn into_owned(self) -> ~str { self.to_owned() }
1575 impl<'a> Str for ~str {
1577 fn as_slice<'a>(&'a self) -> &'a str {
1578 let s: &'a str = *self; s
1582 fn into_owned(self) -> ~str { self }
1585 impl<'a> Container for &'a str {
1587 fn len(&self) -> uint {
1592 impl Container for ~str {
1594 fn len(&self) -> uint { self.as_slice().len() }
1597 impl Mutable for ~str {
1598 /// Remove all content, make the string empty
1600 fn clear(&mut self) {
1607 /// Methods for string slices
1608 pub trait StrSlice<'a> {
1609 /// Returns true if one string contains another
1613 /// - needle - The string to look for
1614 fn contains<'a>(&self, needle: &'a str) -> bool;
1616 /// Returns true if a string contains a char.
1620 /// - needle - The char to look for
1621 fn contains_char(&self, needle: char) -> bool;
1623 /// An iterator over the characters of `self`. Note, this iterates
1624 /// over unicode code-points, not unicode graphemes.
1629 /// let v: ~[char] = "abc åäö".chars().collect();
1630 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1632 fn chars(&self) -> Chars<'a>;
1634 /// An iterator over the characters of `self`, in reverse order.
1635 fn chars_rev(&self) -> RevChars<'a>;
1637 /// An iterator over the bytes of `self`
1638 fn bytes(&self) -> Bytes<'a>;
1640 /// An iterator over the bytes of `self`, in reverse order
1641 fn bytes_rev(&self) -> RevBytes<'a>;
1643 /// An iterator over the characters of `self` and their byte offsets.
1644 fn char_indices(&self) -> CharOffsets<'a>;
1646 /// An iterator over the characters of `self` and their byte offsets,
1647 /// in reverse order.
1648 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1650 /// An iterator over substrings of `self`, separated by characters
1651 /// matched by `sep`.
1656 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1657 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1659 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1660 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1662 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1663 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1665 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1667 /// An iterator over substrings of `self`, separated by characters
1668 /// matched by `sep`, restricted to splitting at most `count`
1674 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1675 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1677 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1678 /// assert_eq!(v, ~["abc", "def2ghi"]);
1680 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1681 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1683 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1685 /// An iterator over substrings of `self`, separated by characters
1686 /// matched by `sep`.
1688 /// Equivalent to `split`, except that the trailing substring
1689 /// is skipped if empty (terminator semantics).
1694 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1695 /// assert_eq!(v, ~["A", "B"]);
1697 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1698 /// assert_eq!(v, ~["A", "", "B", ""]);
1700 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1702 /// An iterator over substrings of `self`, separated by characters
1703 /// matched by `sep`, in reverse order.
1708 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1709 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1711 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1712 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1714 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1715 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1717 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1719 /// An iterator over substrings of `self`, separated by characters
1720 /// matched by `sep`, starting from the end of the string.
1721 /// Restricted to splitting at most `count` times.
1726 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1727 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1729 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1730 /// assert_eq!(v, ~["ghi", "abc1def"]);
1732 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1733 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1735 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1737 /// An iterator over the start and end indices of the disjoint
1738 /// matches of `sep` within `self`.
1740 /// That is, each returned value `(start, end)` satisfies
1741 /// `self.slice(start, end) == sep`. For matches of `sep` within
1742 /// `self` that overlap, only the indices corresponding to the
1743 /// first match are returned.
1748 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1749 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1751 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1752 /// assert_eq!(v, ~[(1,4), (4,7)]);
1754 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1755 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1757 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1759 /// An iterator over the substrings of `self` separated by `sep`.
1764 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1765 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1767 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1768 /// assert_eq!(v, ~["1", "", "2"]);
1770 fn split_str(&self, &'a str) -> StrSplits<'a>;
1772 /// An iterator over the lines of a string (subsequences separated
1773 /// by `\n`). This does not include the empty string after a
1779 /// let four_lines = "foo\nbar\n\nbaz\n";
1780 /// let v: ~[&str] = four_lines.lines().collect();
1781 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1783 fn lines(&self) -> CharSplits<'a, char>;
1785 /// An iterator over the lines of a string, separated by either
1786 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1787 /// empty trailing line.
1792 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1793 /// let v: ~[&str] = four_lines.lines_any().collect();
1794 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1796 fn lines_any(&self) -> AnyLines<'a>;
1798 /// An iterator over the words of a string (subsequences separated
1799 /// by any sequence of whitespace). Sequences of whitespace are
1800 /// collapsed, so empty "words" are not included.
1805 /// let some_words = " Mary had\ta little \n\t lamb";
1806 /// let v: ~[&str] = some_words.words().collect();
1807 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1809 fn words(&self) -> Words<'a>;
1811 /// An Iterator over the string in Unicode Normalization Form D
1812 /// (canonical decomposition).
1813 fn nfd_chars(&self) -> Normalizations<'a>;
1815 /// An Iterator over the string in Unicode Normalization Form KD
1816 /// (compatibility decomposition).
1817 fn nfkd_chars(&self) -> Normalizations<'a>;
1819 /// Returns true if the string contains only whitespace.
1821 /// Whitespace characters are determined by `char::is_whitespace`.
1826 /// assert!(" \t\n".is_whitespace());
1827 /// assert!("".is_whitespace());
1829 /// assert!( !"abc".is_whitespace());
1831 fn is_whitespace(&self) -> bool;
1833 /// Returns true if the string contains only alphanumeric code
1836 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1841 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1842 /// assert!("".is_alphanumeric());
1844 /// assert!( !" &*~".is_alphanumeric());
1846 fn is_alphanumeric(&self) -> bool;
1848 /// Returns the number of Unicode code points (`char`) that a
1851 /// This does not perform any normalization, and is `O(n)`, since
1852 /// UTF-8 is a variable width encoding of code points.
1854 /// *Warning*: The number of code points in a string does not directly
1855 /// correspond to the number of visible characters or width of the
1856 /// visible text due to composing characters, and double- and
1857 /// zero-width ones.
1859 /// See also `.len()` for the byte length.
1864 /// // composed forms of `ö` and `é`
1865 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1866 /// // decomposed forms of `ö` and `é`
1867 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1869 /// assert_eq!(c.char_len(), 15);
1870 /// assert_eq!(d.char_len(), 17);
1872 /// assert_eq!(c.len(), 21);
1873 /// assert_eq!(d.len(), 23);
1875 /// // the two strings *look* the same
1876 /// println!("{}", c);
1877 /// println!("{}", d);
1879 fn char_len(&self) -> uint;
1881 /// Returns a slice of the given string from the byte range
1882 /// [`begin`..`end`).
1884 /// This operation is `O(1)`.
1886 /// Fails when `begin` and `end` do not point to valid characters
1887 /// or point beyond the last character of the string.
1889 /// See also `slice_to` and `slice_from` for slicing prefixes and
1890 /// suffixes of strings, and `slice_chars` for slicing based on
1891 /// code point counts.
1896 /// let s = "Löwe 老虎 Léopard";
1897 /// assert_eq!(s.slice(0, 1), "L");
1899 /// assert_eq!(s.slice(1, 9), "öwe 老");
1901 /// // these will fail:
1902 /// // byte 2 lies within `ö`:
1903 /// // s.slice(2, 3);
1905 /// // byte 8 lies within `老`
1906 /// // s.slice(1, 8);
1908 /// // byte 100 is outside the string
1909 /// // s.slice(3, 100);
1911 fn slice(&self, begin: uint, end: uint) -> &'a str;
1913 /// Returns a slice of the string from `begin` to its end.
1915 /// Equivalent to `self.slice(begin, self.len())`.
1917 /// Fails when `begin` does not point to a valid character, or is
1920 /// See also `slice`, `slice_to` and `slice_chars`.
1921 fn slice_from(&self, begin: uint) -> &'a str;
1923 /// Returns a slice of the string from the beginning to byte
1926 /// Equivalent to `self.slice(0, end)`.
1928 /// Fails when `end` does not point to a valid character, or is
1931 /// See also `slice`, `slice_from` and `slice_chars`.
1932 fn slice_to(&self, end: uint) -> &'a str;
1934 /// Returns a slice of the string from the character range
1935 /// [`begin`..`end`).
1937 /// That is, start at the `begin`-th code point of the string and
1938 /// continue to the `end`-th code point. This does not detect or
1939 /// handle edge cases such as leaving a combining character as the
1940 /// first code point of the string.
1942 /// Due to the design of UTF-8, this operation is `O(end)`.
1943 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1944 /// variants that use byte indices rather than code point
1947 /// Fails if `begin` > `end` or if either `begin` or `end` is
1948 /// beyond the last character of the string.
1953 /// let s = "Löwe 老虎 Léopard";
1954 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
1955 /// assert_eq!(s.slice_chars(5, 7), "老虎");
1957 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
1959 /// Returns true if `needle` is a prefix of the string.
1960 fn starts_with(&self, needle: &str) -> bool;
1962 /// Returns true if `needle` is a suffix of the string.
1963 fn ends_with(&self, needle: &str) -> bool;
1965 /// Escape each char in `s` with `char::escape_default`.
1966 fn escape_default(&self) -> ~str;
1968 /// Escape each char in `s` with `char::escape_unicode`.
1969 fn escape_unicode(&self) -> ~str;
1971 /// Returns a string with leading and trailing whitespace removed.
1972 fn trim(&self) -> &'a str;
1974 /// Returns a string with leading whitespace removed.
1975 fn trim_left(&self) -> &'a str;
1977 /// Returns a string with trailing whitespace removed.
1978 fn trim_right(&self) -> &'a str;
1980 /// Returns a string with characters that match `to_trim` removed.
1984 /// * to_trim - a character matcher
1989 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1990 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1991 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1993 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
1995 /// Returns a string with leading characters that match `to_trim` removed.
1999 /// * to_trim - a character matcher
2004 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2005 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2006 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2008 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2010 /// Returns a string with trailing characters that match `to_trim` removed.
2014 /// * to_trim - a character matcher
2019 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2020 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2021 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2023 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2025 /// Replace all occurrences of one string with another.
2029 /// * `from` - The string to replace
2030 /// * `to` - The replacement string
2034 /// The original string with all occurrences of `from` replaced with `to`.
2039 /// let s = ~"Do you know the muffin man,
2040 /// The muffin man, the muffin man, ...";
2042 /// assert_eq!(s.replace("muffin man", "little lamb"),
2043 /// ~"Do you know the little lamb,
2044 /// The little lamb, the little lamb, ...");
2046 /// // not found, so no change.
2047 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2049 fn replace(&self, from: &str, to: &str) -> ~str;
2051 /// Copy a slice into a new owned str.
2052 fn to_owned(&self) -> ~str;
2054 /// Converts to a vector of `u16` encoded as UTF-16.
2055 fn to_utf16(&self) -> ~[u16];
2057 /// Check that `index`-th byte lies at the start and/or end of a
2058 /// UTF-8 code point sequence.
2060 /// The start and end of the string (when `index == self.len()`)
2061 /// are considered to be boundaries.
2063 /// Fails if `index` is greater than `self.len()`.
2068 /// let s = "Löwe 老虎 Léopard";
2069 /// assert!(s.is_char_boundary(0));
2071 /// assert!(s.is_char_boundary(6));
2072 /// assert!(s.is_char_boundary(s.len()));
2074 /// // second byte of `ö`
2075 /// assert!(!s.is_char_boundary(2));
2077 /// // third byte of `老`
2078 /// assert!(!s.is_char_boundary(8));
2080 fn is_char_boundary(&self, index: uint) -> bool;
2082 /// Pluck a character out of a string and return the index of the next
2085 /// This function can be used to iterate over the unicode characters of a
2090 /// This example manually iterates through the characters of a
2091 /// string; this should normally be done by `.chars()` or
2092 /// `.char_indices`.
2095 /// use std::str::CharRange;
2097 /// let s = "中华Việt Nam";
2099 /// while i < s.len() {
2100 /// let CharRange {ch, next} = s.char_range_at(i);
2101 /// println!("{}: {}", i, ch);
2123 /// * s - The string
2124 /// * i - The byte offset of the char to extract
2128 /// A record {ch: char, next: uint} containing the char value and the byte
2129 /// index of the next unicode character.
2133 /// If `i` is greater than or equal to the length of the string.
2134 /// If `i` is not the index of the beginning of a valid UTF-8 character.
2135 fn char_range_at(&self, start: uint) -> CharRange;
2137 /// Given a byte position and a str, return the previous char and its position.
2139 /// This function can be used to iterate over a unicode string in reverse.
2141 /// Returns 0 for next index if called on start index 0.
2142 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2144 /// Plucks the character starting at the `i`th byte of a string
2145 fn char_at(&self, i: uint) -> char;
2147 /// Plucks the character ending at the `i`th byte of a string
2148 fn char_at_reverse(&self, i: uint) -> char;
2150 /// Work with the byte buffer of a string as a byte slice.
2151 fn as_bytes(&self) -> &'a [u8];
2153 /// Returns the byte index of the first character of `self` that
2154 /// matches `search`.
2158 /// `Some` containing the byte index of the first matching character
2159 /// or `None` if there is no match.
2164 /// let s = "Löwe 老虎 Léopard";
2166 /// assert_eq!(s.find('L'), Some(0));
2167 /// assert_eq!(s.find('é'), Some(14));
2169 /// // the first space
2170 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2172 /// // neither are found
2173 /// assert_eq!(s.find(&['1', '2']), None);
2175 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2177 /// Returns the byte index of the last character of `self` that
2178 /// matches `search`.
2182 /// `Some` containing the byte index of the last matching character
2183 /// or `None` if there is no match.
2188 /// let s = "Löwe 老虎 Léopard";
2190 /// assert_eq!(s.rfind('L'), Some(13));
2191 /// assert_eq!(s.rfind('é'), Some(14));
2193 /// // the second space
2194 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2196 /// // searches for an occurrence of either `1` or `2`, but neither are found
2197 /// assert_eq!(s.rfind(&['1', '2']), None);
2199 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2201 /// Returns the byte index of the first matching substring
2205 /// * `needle` - The string to search for
2209 /// `Some` containing the byte index of the first matching substring
2210 /// or `None` if there is no match.
2215 /// let s = "Löwe 老虎 Léopard";
2217 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2218 /// assert_eq!(s.find_str("muffin man"), None);
2220 fn find_str(&self, &str) -> Option<uint>;
2222 /// Given a string, make a new string with repeated copies of it.
2223 fn repeat(&self, nn: uint) -> ~str;
2225 /// Retrieves the first character from a string slice and returns
2226 /// it. This does not allocate a new string; instead, it returns a
2227 /// slice that points one character beyond the character that was
2228 /// shifted. If the string does not contain any characters,
2229 /// a tuple of None and an empty string is returned instead.
2234 /// let s = "Löwe 老虎 Léopard";
2235 /// let (c, s1) = s.slice_shift_char();
2236 /// assert_eq!(c, Some('L'));
2237 /// assert_eq!(s1, "öwe 老虎 Léopard");
2239 /// let (c, s2) = s1.slice_shift_char();
2240 /// assert_eq!(c, Some('ö'));
2241 /// assert_eq!(s2, "we 老虎 Léopard");
2243 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2245 /// Levenshtein Distance between two strings.
2246 fn lev_distance(&self, t: &str) -> uint;
2248 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2250 /// Fails if `inner` is not a direct slice contained within self.
2255 /// let string = "a\nb\nc";
2256 /// let lines: ~[&str] = string.lines().collect();
2258 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2259 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2260 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2262 fn subslice_offset(&self, inner: &str) -> uint;
2264 /// Return an unsafe pointer to the string's buffer.
2266 /// The caller must ensure that the string outlives this pointer,
2267 /// and that it is not reallocated (e.g. by pushing to the
2269 fn as_ptr(&self) -> *u8;
2272 impl<'a> StrSlice<'a> for &'a str {
2274 fn contains<'a>(&self, needle: &'a str) -> bool {
2275 self.find_str(needle).is_some()
2279 fn contains_char(&self, needle: char) -> bool {
2280 self.find(needle).is_some()
2284 fn chars(&self) -> Chars<'a> {
2285 Chars{string: *self}
2289 fn chars_rev(&self) -> RevChars<'a> {
2294 fn bytes(&self) -> Bytes<'a> {
2295 self.as_bytes().iter().map(|&b| b)
2299 fn bytes_rev(&self) -> RevBytes<'a> {
2304 fn char_indices(&self) -> CharOffsets<'a> {
2305 CharOffsets{string: *self, iter: self.chars()}
2309 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2310 self.char_indices().rev()
2314 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2317 only_ascii: sep.only_ascii(),
2319 allow_trailing_empty: true,
2325 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2326 -> CharSplitsN<'a, Sep> {
2328 iter: self.split(sep),
2335 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2336 -> CharSplits<'a, Sep> {
2338 allow_trailing_empty: false,
2344 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2345 self.split(sep).rev()
2349 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2350 -> CharSplitsN<'a, Sep> {
2352 iter: self.split(sep),
2359 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2360 assert!(!sep.is_empty())
2369 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2371 it: self.match_indices(sep),
2378 fn lines(&self) -> CharSplits<'a, char> {
2379 self.split_terminator('\n')
2382 fn lines_any(&self) -> AnyLines<'a> {
2383 self.lines().map(|line| {
2385 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2391 fn words(&self) -> Words<'a> {
2392 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2396 fn nfd_chars(&self) -> Normalizations<'a> {
2406 fn nfkd_chars(&self) -> Normalizations<'a> {
2416 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2419 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2422 fn char_len(&self) -> uint { self.chars().len() }
2425 fn slice(&self, begin: uint, end: uint) -> &'a str {
2426 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2427 unsafe { raw::slice_bytes(*self, begin, end) }
2431 fn slice_from(&self, begin: uint) -> &'a str {
2432 self.slice(begin, self.len())
2436 fn slice_to(&self, end: uint) -> &'a str {
2437 assert!(self.is_char_boundary(end));
2438 unsafe { raw::slice_bytes(*self, 0, end) }
2441 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2442 assert!(begin <= end);
2444 let mut begin_byte = None;
2445 let mut end_byte = None;
2447 // This could be even more efficient by not decoding,
2448 // only finding the char boundaries
2449 for (idx, _) in self.char_indices() {
2450 if count == begin { begin_byte = Some(idx); }
2451 if count == end { end_byte = Some(idx); break; }
2454 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2455 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2457 match (begin_byte, end_byte) {
2458 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2459 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2460 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2465 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2466 let n = needle.len();
2467 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2471 fn ends_with(&self, needle: &str) -> bool {
2472 let (m, n) = (self.len(), needle.len());
2473 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2476 fn escape_default(&self) -> ~str {
2477 let mut out = StrBuf::with_capacity(self.len());
2478 for c in self.chars() {
2479 c.escape_default(|c| out.push_char(c));
2484 fn escape_unicode(&self) -> ~str {
2485 let mut out = StrBuf::with_capacity(self.len());
2486 for c in self.chars() {
2487 c.escape_unicode(|c| out.push_char(c));
2493 fn trim(&self) -> &'a str {
2494 self.trim_left().trim_right()
2498 fn trim_left(&self) -> &'a str {
2499 self.trim_left_chars(&char::is_whitespace)
2503 fn trim_right(&self) -> &'a str {
2504 self.trim_right_chars(&char::is_whitespace)
2508 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2509 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2513 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2514 match self.find(|c: char| !to_trim.matches(c)) {
2516 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2521 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2522 match self.rfind(|c: char| !to_trim.matches(c)) {
2525 let next = self.char_range_at(last).next;
2526 unsafe { raw::slice_bytes(*self, 0u, next) }
2531 fn replace(&self, from: &str, to: &str) -> ~str {
2532 let mut result = StrBuf::new();
2533 let mut last_end = 0;
2534 for (start, end) in self.match_indices(from) {
2535 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2536 result.push_str(to);
2539 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2544 fn to_owned(&self) -> ~str {
2545 let len = self.len();
2547 let mut v = slice::with_capacity(len);
2549 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2551 ::cast::transmute(v)
2555 fn to_utf16(&self) -> ~[u16] {
2557 for ch in self.chars() {
2558 let mut buf = [0u16, ..2];
2559 let n = ch.encode_utf16(buf /* as mut slice! */);
2560 u.push_all(buf.slice_to(n));
2566 fn is_char_boundary(&self, index: uint) -> bool {
2567 if index == self.len() { return true; }
2568 let b = self[index];
2569 return b < 128u8 || b >= 192u8;
2573 fn char_range_at(&self, i: uint) -> CharRange {
2574 if self[i] < 128u8 {
2575 return CharRange {ch: self[i] as char, next: i + 1 };
2578 // Multibyte case is a fn to allow char_range_at to inline cleanly
2579 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2580 let mut val = s[i] as u32;
2581 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2584 val = utf8_first_byte!(val, w);
2585 val = utf8_acc_cont_byte!(val, s[i + 1]);
2586 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2587 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2589 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2592 return multibyte_char_range_at(*self, i);
2596 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2597 let mut prev = start;
2599 prev = prev.saturating_sub(1);
2600 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2602 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2603 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2604 // while there is a previous byte == 10......
2605 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2609 let mut val = s[i] as u32;
2610 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2613 val = utf8_first_byte!(val, w);
2614 val = utf8_acc_cont_byte!(val, s[i + 1]);
2615 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2616 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2618 return CharRange {ch: unsafe { transmute(val) }, next: i};
2621 return multibyte_char_range_at_reverse(*self, prev);
2625 fn char_at(&self, i: uint) -> char {
2626 self.char_range_at(i).ch
2630 fn char_at_reverse(&self, i: uint) -> char {
2631 self.char_range_at_reverse(i).ch
2635 fn as_bytes(&self) -> &'a [u8] {
2636 unsafe { cast::transmute(*self) }
2639 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
2640 if search.only_ascii() {
2641 self.bytes().position(|b| search.matches(b as char))
2643 for (index, c) in self.char_indices() {
2644 if search.matches(c) { return Some(index); }
2650 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
2651 if search.only_ascii() {
2652 self.bytes().rposition(|b| search.matches(b as char))
2654 for (index, c) in self.char_indices_rev() {
2655 if search.matches(c) { return Some(index); }
2661 fn find_str(&self, needle: &str) -> Option<uint> {
2662 if needle.is_empty() {
2665 self.match_indices(needle)
2667 .map(|(start, _end)| start)
2671 fn repeat(&self, nn: uint) -> ~str {
2672 let mut ret = StrBuf::with_capacity(nn * self.len());
2673 for _ in range(0, nn) {
2674 ret.push_str(*self);
2680 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2681 if self.is_empty() {
2682 return (None, *self);
2684 let CharRange {ch, next} = self.char_range_at(0u);
2685 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2686 return (Some(ch), next_s);
2690 fn lev_distance(&self, t: &str) -> uint {
2691 let slen = self.len();
2694 if slen == 0 { return tlen; }
2695 if tlen == 0 { return slen; }
2697 let mut dcol = slice::from_fn(tlen + 1, |x| x);
2699 for (i, sc) in self.chars().enumerate() {
2701 let mut current = i;
2702 dcol[0] = current + 1;
2704 for (j, tc) in t.chars().enumerate() {
2706 let next = dcol[j + 1];
2709 dcol[j + 1] = current;
2711 dcol[j + 1] = ::cmp::min(current, next);
2712 dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
2722 fn subslice_offset(&self, inner: &str) -> uint {
2723 let a_start = self.as_ptr() as uint;
2724 let a_end = a_start + self.len();
2725 let b_start = inner.as_ptr() as uint;
2726 let b_end = b_start + inner.len();
2728 assert!(a_start <= b_start);
2729 assert!(b_end <= a_end);
2734 fn as_ptr(&self) -> *u8 {
2739 /// Methods for owned strings
2740 pub trait OwnedStr {
2741 /// Shorten a string to the specified length (which must be <= the current length and lie on a character boundary)
2742 fn truncate(&mut self, len: uint);
2744 /// Consumes the string, returning the underlying byte buffer.
2746 /// The buffer does not have a null terminator.
2747 fn into_bytes(self) -> ~[u8];
2749 /// Sets the length of a string
2751 /// This will explicitly set the size of the string, without actually
2752 /// modifying its buffers, so it is up to the caller to ensure that
2753 /// the string is actually the specified size.
2754 unsafe fn set_len(&mut self, new_len: uint);
2756 /// Pushes the given string onto this string, returning the concatenation of the two strings.
2757 fn append(self, rhs: &str) -> ~str;
2760 impl OwnedStr for ~str {
2762 fn truncate(&mut self, len: uint) {
2763 assert!(len <= self.len());
2764 assert!(self.is_char_boundary(len));
2765 unsafe { self.set_len(len); }
2769 fn into_bytes(self) -> ~[u8] {
2770 unsafe { cast::transmute(self) }
2774 unsafe fn set_len(&mut self, new_len: uint) {
2775 raw::as_owned_vec(self).set_len(new_len)
2779 fn append(self, rhs: &str) -> ~str {
2780 let mut new_str = StrBuf::from_owned_str(self);
2781 new_str.push_str(rhs);
2782 new_str.into_owned()
2786 impl Clone for ~str {
2788 fn clone(&self) -> ~str {
/// Builds a `~str` from an iterator of `char`s by collecting them into a
/// `StrBuf`.
2793 impl FromIterator<char> for ~str {
2795 fn from_iter<T: Iterator<char>>(iterator: T) -> ~str {
// Only the lower bound of the size hint is used; it is passed to
// `StrBuf::with_capacity`.
2796 let (lower, _) = iterator.size_hint();
2797 let mut buf = StrBuf::with_capacity(lower);
2798 buf.extend(iterator);
// NOTE(review): the line that turns `buf` into the returned `~str` is not
// visible in this excerpt.
/// The default string slice is the empty string literal `""`.
2803 // This works because every lifetime is a sub-lifetime of 'static
2804 impl<'a> Default for &'a str {
2805 fn default() -> &'a str { "" }
/// The default owned string is the empty owned string `~""`.
2808 impl Default for ~str {
2809 fn default() -> ~str { ~"" }
2814 use iter::AdditiveIterator;
2815 use default::Default;
2822 assert!((eq(&~"", &~"")));
2823 assert!((eq(&~"foo", &~"foo")));
2824 assert!((!eq(&~"foo", &~"bar")));
/// `eq_slice` compares contents: equal substrings taken from different
/// positions match, while strings that differ only in their last byte do not.
2828 fn test_eq_slice() {
2829 assert!((eq_slice("foobar".slice(0, 3), "foo")));
2830 assert!((eq_slice("barfoo".slice(3, 6), "foo")));
2831 assert!((!eq_slice("foo1", "foo2")));
2837 assert!("" <= "foo");
2838 assert!("foo" <= "foo");
2839 assert!("foo" != "bar");
2844 assert_eq!("".len(), 0u);
2845 assert_eq!("hello world".len(), 11u);
2846 assert_eq!("\x63".len(), 1u);
2847 assert_eq!("\xa2".len(), 2u);
2848 assert_eq!("\u03c0".len(), 2u);
2849 assert_eq!("\u2620".len(), 3u);
2850 assert_eq!("\U0001d11e".len(), 4u);
2852 assert_eq!("".char_len(), 0u);
2853 assert_eq!("hello world".char_len(), 11u);
2854 assert_eq!("\x63".char_len(), 1u);
2855 assert_eq!("\xa2".char_len(), 1u);
2856 assert_eq!("\u03c0".char_len(), 1u);
2857 assert_eq!("\u2620".char_len(), 1u);
2858 assert_eq!("\U0001d11e".char_len(), 1u);
2859 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2864 assert_eq!("hello".find('l'), Some(2u));
2865 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2866 assert!("hello".find('x').is_none());
2867 assert!("hello".find(|c:char| c == 'x').is_none());
2868 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2869 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2874 assert_eq!("hello".rfind('l'), Some(3u));
2875 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2876 assert!("hello".rfind('x').is_none());
2877 assert!("hello".rfind(|c:char| c == 'x').is_none());
2878 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2879 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2885 let s: ~str = empty.chars().collect();
2886 assert_eq!(empty, s);
2887 let data = ~"ประเทศไทย中";
2888 let s: ~str = data.chars().collect();
2889 assert_eq!(data, s);
2893 fn test_into_bytes() {
2895 let buf = data.into_bytes();
2896 assert_eq!(bytes!("asdf"), buf.as_slice());
2900 fn test_find_str() {
2902 assert_eq!("".find_str(""), Some(0u));
2903 assert!("banana".find_str("apple pie").is_none());
2905 let data = "abcabc";
2906 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2907 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
2908 assert!(data.slice(2u, 4u).find_str("ab").is_none());
2910 let mut data = ~"ประเทศไทย中华Việt Nam";
2912 assert!(data.find_str("ไท华").is_none());
2913 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
2914 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
2916 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
2917 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
2918 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
2919 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
2920 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
2922 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
2923 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
2924 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
2925 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
2926 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
/// `slice_chars` is indexed by character position, not by byte offset.
2930 fn test_slice_chars() {
// Helper: taking `b.char_len()` characters of `a` starting at character
// index `start` must yield `b`.
2931 fn t(a: &str, b: &str, start: uint) {
2932 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2935 t("hello", "llo", 2);
2936 t("hello", "el", 1);
// Multi-byte text: characters 2..8 of the Thai prefix.
2939 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2944 fn t(v: &[~str], s: &str) {
2945 assert_eq!(v.concat(), s.to_str());
2947 t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
2948 let v: &[~str] = [];
2955 fn t(v: &[~str], sep: &str, s: &str) {
2956 assert_eq!(v.connect(sep), s.to_str());
2958 t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
2959 " ", "you know I'm no good");
2960 let v: &[~str] = [];
2962 t([~"hi"], " ", "hi");
/// `concat` on a slice of `&str` joins the pieces with no separator.
2966 fn test_concat_slices() {
// Helper: concatenating `v` must equal `s`.
2967 fn t(v: &[&str], s: &str) {
2968 assert_eq!(v.concat(), s.to_str());
2970 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
// Empty input slice; the statement that uses `v` is not visible in this excerpt.
2971 let v: &[&str] = [];
/// `connect` on a slice of `&str` joins the pieces with `sep` between them.
2977 fn test_connect_slices() {
// Helper: joining `v` with `sep` must equal `s`.
2978 fn t(v: &[&str], sep: &str, s: &str) {
2979 assert_eq!(v.connect(sep), s.to_str());
2981 t(["you", "know", "I'm", "no", "good"],
2982 " ", "you know I'm no good");
// A single element comes back without any separator added.
2984 t(["hi"], " ", "hi");
2989 assert_eq!("x".repeat(4), ~"xxxx");
2990 assert_eq!("hi".repeat(4), ~"hihihihi");
2991 assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
2992 assert_eq!("".repeat(4), ~"");
2993 assert_eq!("hi".repeat(0), ~"");
2997 fn test_unsafe_slice() {
2998 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
2999 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
3000 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
3001 fn a_million_letter_a() -> ~str {
3003 let mut rs = StrBuf::new();
3005 rs.push_str("aaaaaaaaaa");
3010 fn half_a_million_letter_a() -> ~str {
3012 let mut rs = StrBuf::new();
3014 rs.push_str("aaaaa");
3019 let letters = a_million_letter_a();
3020 assert!(half_a_million_letter_a() ==
3021 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
/// `starts_with`: the empty prefix always matches, a prefix longer than the
/// string never does, and a multi-byte first character is handled.
3025 fn test_starts_with() {
3026 assert!(("".starts_with("")));
3027 assert!(("abc".starts_with("")));
3028 assert!(("abc".starts_with("a")));
3029 assert!((!"a".starts_with("abc")));
3030 assert!((!"".starts_with("abc")));
3031 assert!((!"ödd".starts_with("-")));
3032 assert!(("ödd".starts_with("öd")));
/// `ends_with`: the empty suffix always matches, a suffix longer than the
/// string never does, and a multi-byte last character is handled.
3036 fn test_ends_with() {
3037 assert!(("".ends_with("")));
3038 assert!(("abc".ends_with("")));
3039 assert!(("abc".ends_with("c")));
3040 assert!((!"a".ends_with("abc")));
3041 assert!((!"".ends_with("abc")));
3042 assert!((!"ddö".ends_with("-")));
3043 assert!(("ddö".ends_with("dö")));
/// `is_empty` is true for `""` and false for a one-character string.
3047 fn test_is_empty() {
3048 assert!("".is_empty());
3049 assert!(!"a".is_empty());
3055 assert_eq!("".replace(a, "b"), ~"");
3056 assert_eq!("a".replace(a, "b"), ~"b");
3057 assert_eq!("ab".replace(a, "b"), ~"bb");
3059 assert!(" test test ".replace(test, "toast") ==
3061 assert_eq!(" test test ".replace(test, ""), ~" ");
3065 fn test_replace_2a() {
3066 let data = ~"ประเทศไทย中华";
3067 let repl = ~"دولة الكويت";
3070 let a2 = ~"دولة الكويتทศไทย中华";
3071 assert_eq!(data.replace(a, repl), a2);
3075 fn test_replace_2b() {
3076 let data = ~"ประเทศไทย中华";
3077 let repl = ~"دولة الكويت";
3080 let b2 = ~"ปรدولة الكويتทศไทย中华";
3081 assert_eq!(data.replace(b, repl), b2);
3085 fn test_replace_2c() {
3086 let data = ~"ประเทศไทย中华";
3087 let repl = ~"دولة الكويت";
3090 let c2 = ~"ประเทศไทยدولة الكويت";
3091 assert_eq!(data.replace(c, repl), c2);
3095 fn test_replace_2d() {
3096 let data = ~"ประเทศไทย中华";
3097 let repl = ~"دولة الكويت";
3100 assert_eq!(data.replace(d, repl), data);
3105 assert_eq!("ab", "abc".slice(0, 2));
3106 assert_eq!("bc", "abc".slice(1, 3));
3107 assert_eq!("", "abc".slice(1, 1));
3108 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3110 let data = "ประเทศไทย中华";
3111 assert_eq!("ป", data.slice(0, 3));
3112 assert_eq!("ร", data.slice(3, 6));
3113 assert_eq!("", data.slice(3, 3));
3114 assert_eq!("华", data.slice(30, 33));
3116 fn a_million_letter_X() -> ~str {
3118 let mut rs = StrBuf::new();
3120 rs.push_str("华华华华华华华华华华");
3125 fn half_a_million_letter_X() -> ~str {
3127 let mut rs = StrBuf::new();
3129 rs.push_str("华华华华华");
3134 let letters = a_million_letter_X();
3135 assert!(half_a_million_letter_X() ==
3136 letters.slice(0u, 3u * 500000u).to_owned());
3141 let ss = "中华Việt Nam";
3143 assert_eq!("华", ss.slice(3u, 6u));
3144 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3146 assert_eq!("ab", "abc".slice(0u, 2u));
3147 assert_eq!("bc", "abc".slice(1u, 3u));
3148 assert_eq!("", "abc".slice(1u, 1u));
3150 assert_eq!("中", ss.slice(0u, 3u));
3151 assert_eq!("华V", ss.slice(3u, 7u));
3152 assert_eq!("", ss.slice(3u, 3u));
/// Slices bytes `0..2`, which ends inside the first character: the other
/// slice tests show `中` occupying bytes `0..3` of this string.
// NOTE(review): presumably annotated `#[should_fail]`; the attribute line is
// not visible in this excerpt -- confirm.
3167 fn test_slice_fail() {
3168 "中华Việt Nam".slice(0u, 2u);
/// `slice_from(n)` keeps everything from byte `n` on; `n == len` gives `""`.
3172 fn test_slice_from() {
3173 assert_eq!("abcd".slice_from(0), "abcd");
3174 assert_eq!("abcd".slice_from(2), "cd");
3175 assert_eq!("abcd".slice_from(4), "");
/// `slice_to(n)` keeps the first `n` bytes; `n == 0` gives `""`.
3178 fn test_slice_to() {
3179 assert_eq!("abcd".slice_to(0), "");
3180 assert_eq!("abcd".slice_to(2), "ab");
3181 assert_eq!("abcd".slice_to(4), "abcd");
/// `trim_left_chars` strips matching characters from the front only. The
/// matcher is exercised as a char slice, a single char and a closure.
3185 fn test_trim_left_chars() {
// An empty set of characters trims nothing.
3186 let v: &[char] = &[];
3187 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3188 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3189 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3190 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
// Matches in the middle and at the end of the string are left alone.
3192 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3193 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3194 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
/// `trim_right_chars` strips matching characters from the back only. The
/// matcher is exercised as a char slice, a single char and a closure.
3198 fn test_trim_right_chars() {
// An empty set of characters trims nothing.
3199 let v: &[char] = &[];
3200 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3201 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3202 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3203 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
// Matches at the front and in the middle of the string are left alone.
3205 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3206 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3207 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
/// `trim_chars` strips matching characters from both ends. The matcher is
/// exercised as a char slice, a single char and a closure.
3211 fn test_trim_chars() {
// An empty set of characters trims nothing.
3212 let v: &[char] = &[];
3213 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3214 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3215 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3216 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
// A match in the interior of the string survives.
3218 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3219 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3220 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
/// `trim_left` removes leading whitespace only; per the assertions below
/// `\u3000` counts as whitespace and trailing spaces are kept.
3224 fn test_trim_left() {
3225 assert_eq!("".trim_left(), "");
3226 assert_eq!("a".trim_left(), "a");
3227 assert_eq!(" ".trim_left(), "");
3228 assert_eq!(" blah".trim_left(), "blah");
3229 assert_eq!(" \u3000 wut".trim_left(), "wut");
3230 assert_eq!("hey ".trim_left(), "hey ");
/// `trim_right` removes trailing whitespace only; per the assertions below
/// `\u3000` counts as whitespace and leading spaces are kept.
3234 fn test_trim_right() {
3235 assert_eq!("".trim_right(), "");
3236 assert_eq!("a".trim_right(), "a");
3237 assert_eq!(" ".trim_right(), "");
3238 assert_eq!("blah ".trim_right(), "blah");
3239 assert_eq!("wut \u3000 ".trim_right(), "wut");
3240 assert_eq!(" hey".trim_right(), " hey");
3245 assert_eq!("".trim(), "");
3246 assert_eq!("a".trim(), "a");
3247 assert_eq!(" ".trim(), "");
3248 assert_eq!(" blah ".trim(), "blah");
3249 assert_eq!("\nwut \u3000 ".trim(), "wut");
3250 assert_eq!(" hey dude ".trim(), "hey dude");
/// `is_whitespace` holds for strings made only of whitespace characters.
3254 fn test_is_whitespace() {
// The empty string counts as whitespace.
3255 assert!("".is_whitespace());
3256 assert!(" ".is_whitespace());
3257 assert!("\u2009".is_whitespace()); // Thin space
3258 assert!(" \n\t ".is_whitespace());
// A single non-whitespace character makes the whole string fail the check.
3259 assert!(!" _ ".is_whitespace());
/// `slice_shift_char` returns the first character together with the rest of
/// the slice, here on text whose characters are all multi-byte.
3263 fn test_slice_shift_char() {
3264 let data = "ประเทศไทย中";
3265 assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
3269 fn test_slice_shift_char_2() {
3271 assert_eq!(empty.slice_shift_char(), (None, ""));
3276 // deny overlong encodings
3277 assert!(!is_utf8([0xc0, 0x80]));
3278 assert!(!is_utf8([0xc0, 0xae]));
3279 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3280 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3281 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3282 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3283 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3286 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3287 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3289 assert!(is_utf8([0xC2, 0x80]));
3290 assert!(is_utf8([0xDF, 0xBF]));
3291 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3292 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3293 assert!(is_utf8([0xEE, 0x80, 0x80]));
3294 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3295 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3296 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3300 fn test_is_utf16() {
3301 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3309 // surrogate pairs (randomly generated with Python 3's
3310 // .encode('utf-16be'))
3311 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3312 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3313 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3315 // mixtures (also random)
3316 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3317 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3318 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3321 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3324 // surrogate + regular unit
3326 // surrogate + lead surrogate
3328 // unterminated surrogate
3330 // trail surrogate without a lead
3333 // random byte sequences that Python 3's .decode('utf-16be')
3335 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3336 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3337 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3338 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3339 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3340 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3341 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3342 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3343 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3344 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3345 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3346 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3347 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3348 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3349 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3350 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3351 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3352 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3353 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3354 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3355 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3359 fn test_raw_from_c_str() {
3361 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3363 let c = raw::from_c_str(b);
3364 assert_eq!(c, ~"AAAAAAA");
3369 fn test_as_bytes() {
3372 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3373 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3376 assert_eq!("".as_bytes(), &[]);
3377 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3378 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
3383 fn test_as_bytes_fail() {
3384 // Don't double free. (I'm not sure if this exercises the
3385 // original problem code path anymore.)
3387 let _bytes = s.as_bytes();
3393 let buf = "hello".as_ptr();
3395 assert_eq!(*buf.offset(0), 'h' as u8);
3396 assert_eq!(*buf.offset(1), 'e' as u8);
3397 assert_eq!(*buf.offset(2), 'l' as u8);
3398 assert_eq!(*buf.offset(3), 'l' as u8);
3399 assert_eq!(*buf.offset(4), 'o' as u8);
/// `subslice_offset` reports the byte offset of a subslice inside the string
/// it was taken from.
3404 fn test_subslice_offset() {
3405 let a = "kernelsprite";
// `b` starts at byte 7 of `a`, `c` at byte 0.
3406 let b = a.slice(7, a.len());
3407 let c = a.slice(0, a.len() - 6);
3408 assert_eq!(a.subslice_offset(b), 7);
3409 assert_eq!(a.subslice_offset(c), 0);
// Slices yielded by `lines()`: each one-character line starts two bytes
// after the previous one (the character plus the `\n`).
3411 let string = "a\nb\nc";
3412 let mut lines = ~[];
3413 for line in string.lines() { lines.push(line) }
3414 assert_eq!(string.subslice_offset(lines[0]), 0);
3415 assert_eq!(string.subslice_offset(lines[1]), 2);
3416 assert_eq!(string.subslice_offset(lines[2]), 4);
/// Calls `subslice_offset` with a string that is a separate literal rather
/// than a subslice of the receiver.
// NOTE(review): presumably annotated `#[should_fail]`, since
// `subslice_offset` asserts containment; the attribute line is not visible
// in this excerpt -- confirm.
3421 fn test_subslice_offset_2() {
3422 let a = "alchemiter";
3423 let b = "cruxtruder";
3424 a.subslice_offset(b);
3428 fn vec_str_conversions() {
3429 let s1: ~str = ~"All mimsy were the borogoves";
3431 let v: ~[u8] = s1.as_bytes().to_owned();
3432 let s2: ~str = from_utf8(v).unwrap().to_owned();
3433 let mut i: uint = 0u;
3434 let n1: uint = s1.len();
3435 let n2: uint = v.len();
/// `contains` does substring search: matches at the start, middle and end,
/// the empty needle, and multi-byte text.
3448 fn test_contains() {
3449 assert!("abcde".contains("bcd"));
3450 assert!("abcde".contains("abcd"));
3451 assert!("abcde".contains("bcde"));
// The empty needle is found in any string, including the empty one.
3452 assert!("abcde".contains(""));
3453 assert!("".contains(""));
3454 assert!(!"abcde".contains("def"));
3455 assert!(!"".contains("a"));
3457 let data = ~"ประเทศไทย中华Việt Nam";
3458 assert!(data.contains("ประเ"));
3459 assert!(data.contains("ะเ"));
3460 assert!(data.contains("中华"));
// Each character occurs in `data`, but not as this contiguous sequence.
3461 assert!(!data.contains("ไท华"));
/// `contains_char` finds a single character; the empty string contains none.
3465 fn test_contains_char() {
3466 assert!("abc".contains_char('b'));
3467 assert!("a".contains_char('a'));
3468 assert!(!"abc".contains_char('d'));
3469 assert!(!"".contains_char('a'));
3476 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3477 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3478 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3479 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3482 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3483 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3484 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3485 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3486 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3489 (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3490 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3491 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3492 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3493 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3494 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3495 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3496 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3498 (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3499 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3500 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3501 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3502 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3503 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3504 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3505 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3506 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3507 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3508 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3510 // Issue #12318, even-numbered non-BMP planes
3512 ~[0xD840, 0xDC00])];
3514 for p in pairs.iter() {
3515 let (s, u) = (*p).clone();
3516 assert!(is_utf16(u));
3517 assert_eq!(s.to_utf16(), u);
3519 assert_eq!(from_utf16(u).unwrap(), s);
3520 assert_eq!(from_utf16_lossy(u), s);
3522 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3523 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
/// `from_utf16` returns `None` for input containing unpaired surrogates.
3528 fn test_utf16_invalid() {
3529 // completely positive cases tested above.
// A lone lead surrogate.
3531 assert_eq!(from_utf16([0xD800]), None);
// Two lead surrogates in a row.
3533 assert_eq!(from_utf16([0xD800, 0xD800]), None);
// A trail surrogate following an ordinary code unit.
3536 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
// A valid pair in the middle does not rescue the unpaired units around it.
3539 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
/// `from_utf16_lossy` replaces each unpaired surrogate with `\uFFFD` and
/// decodes everything else normally.
3543 fn test_utf16_lossy() {
3544 // completely positive cases tested above.
// A lone lead surrogate becomes one replacement character.
3546 assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
// Two lead surrogates become two replacement characters.
3548 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
// The ordinary unit is kept; the stray trail surrogate is replaced.
3551 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
// The valid pair in the middle decodes; the unpaired units around it are replaced.
3554 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
3558 fn test_truncate_utf16_at_nul() {
3560 assert_eq!(truncate_utf16_at_nul(v), &[]);
3563 assert_eq!(truncate_utf16_at_nul(v), &[]);
3566 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3569 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3572 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3577 let s = ~"ศไทย中华Việt Nam";
3578 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3580 for ch in v.iter() {
3581 assert!(s.char_at(pos) == *ch);
3582 pos += from_char(*ch).len();
/// Walks the string backwards from its end and checks that
/// `char_at_reverse(pos)` yields the character ending at byte `pos`.
3587 fn test_char_at_reverse() {
3588 let s = ~"ศไทย中华Việt Nam";
3589 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3590 let mut pos = s.len();
3591 for ch in v.rev_iter() {
3592 assert!(s.char_at_reverse(pos) == *ch);
// Step back by the encoded length of the character just checked.
3593 pos -= from_char(*ch).len();
/// `escape_unicode` escapes every character, printable ASCII included. The
/// expected strings use `\xNN` up to `0xff`, `\uNNNN` up to `0xffff` and
/// `\UNNNNNNNN` above that.
3598 fn test_escape_unicode() {
3599 assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3600 assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3601 assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3602 assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3603 assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3604 assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3605 assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3606 assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3607 assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
/// `escape_default` leaves letters and spaces alone, uses the short escapes
/// for `\r`, `\n`, `\t`, quotes and backslash, and falls back to `\u`/`\U`
/// escapes for the non-ASCII characters tested here.
3611 fn test_escape_default() {
3612 assert_eq!("abc".escape_default(), ~"abc");
3613 assert_eq!("a c".escape_default(), ~"a c");
3614 assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3615 assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3616 assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
3617 assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
3618 assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
3619 assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
/// Checks `cmp` on string slices: a string that extends an equal prefix
/// sorts after the shorter one, and otherwise the first differing byte
/// decides the ordering.
///
/// Every comparison is wrapped in `assert!`. A bare `a.cmp(b) == Greater;`
/// statement only computes a boolean and discards it, so it could never make
/// the test fail.
fn test_total_ord() {
    assert!("1234".cmp(& &"123") == Greater);
    assert!("123".cmp(& &"1234") == Less);
    assert!("1234".cmp(& &"1234") == Equal);
    // '5' < '6' at the sixth byte, so the longer string still sorts first.
    assert!("12345555".cmp(& &"123456") == Less);
    // '2' > '1' at the first byte outweighs the difference in length.
    assert!("22".cmp(& &"1234") == Greater);
}
/// `char_range_at(i).ch` decodes the character starting at byte `i`. The
/// offsets used below step by 1, 2, 3, 4, 4, 3, 2 and 1 bytes, covering
/// every encoded width.
3632 fn test_char_range_at() {
3633 let data = ~"b¢€𤭢𤭢€¢b";
3634 assert_eq!('b', data.char_range_at(0).ch);
3635 assert_eq!('¢', data.char_range_at(1).ch);
3636 assert_eq!('€', data.char_range_at(3).ch);
3637 assert_eq!('𤭢', data.char_range_at(6).ch);
3638 assert_eq!('𤭢', data.char_range_at(10).ch);
3639 assert_eq!('€', data.char_range_at(14).ch);
3640 assert_eq!('¢', data.char_range_at(17).ch);
3641 assert_eq!('b', data.char_range_at(19).ch);
/// Asking for the character before byte 0 must report `next == 0` rather
/// than wrapping below zero.
3645 fn test_char_range_at_reverse_underflow() {
3646 assert_eq!("abc".char_range_at_reverse(0).next, 0);
3651 #![allow(unnecessary_allocation)]
3653 ($s1:expr, $s2:expr, $e:expr) => { {
3657 assert_eq!(s1 + s2, e.to_owned());
3658 assert_eq!(s1.to_owned() + s2, e.to_owned());
3662 t!("foo", "bar", "foobar");
3663 t!("foo", ~"bar", "foobar");
3664 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
3665 t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
3669 fn test_iterator() {
3671 let s = ~"ศไทย中华Việt Nam";
3672 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3675 let mut it = s.chars();
3678 assert_eq!(c, v[pos]);
3681 assert_eq!(pos, v.len());
3685 fn test_rev_iterator() {
3687 let s = ~"ศไทย中华Việt Nam";
3688 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3691 let mut it = s.chars_rev();
3694 assert_eq!(c, v[pos]);
3697 assert_eq!(pos, v.len());
3701 fn test_iterator_clone() {
3702 let s = "ศไทย中华Việt Nam";
3703 let mut it = s.chars();
3705 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
3709 fn test_bytesator() {
3710 let s = ~"ศไทย中华Việt Nam";
3712 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3713 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3718 for b in s.bytes() {
3719 assert_eq!(b, v[pos]);
3725 fn test_bytes_revator() {
3726 let s = ~"ศไทย中华Việt Nam";
3728 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3729 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3732 let mut pos = v.len();
3734 for b in s.bytes_rev() {
3736 assert_eq!(b, v[pos]);
3741 fn test_char_indicesator() {
3743 let s = "ศไทย中华Việt Nam";
3744 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
3745 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3748 let mut it = s.char_indices();
3751 assert_eq!(c, (p[pos], v[pos]));
3754 assert_eq!(pos, v.len());
3755 assert_eq!(pos, p.len());
3759 fn test_char_indices_revator() {
3761 let s = "ศไทย中华Việt Nam";
3762 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
3763 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3766 let mut it = s.char_indices_rev();
3769 assert_eq!(c, (p[pos], v[pos]));
3772 assert_eq!(pos, v.len());
3773 assert_eq!(pos, p.len());
3777 fn test_split_char_iterator() {
3778 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3780 let split: ~[&str] = data.split(' ').collect();
3781 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3783 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
3785 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3787 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
3788 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3790 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
3792 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3795 let split: ~[&str] = data.split('ä').collect();
3796 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3798 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
3800 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3802 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
3803 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3805 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
3807 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
/// `splitn(sep, 3)` performs at most three splits, so four pieces come back
/// and the last one keeps any remaining separators.
3811 fn test_splitn_char_iterator() {
3812 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
// ASCII separator, given as a char and as a closure.
3814 let split: ~[&str] = data.splitn(' ', 3).collect();
3815 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3817 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
3818 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
// Multi-byte separator, given as a char and as a closure.
3821 let split: ~[&str] = data.splitn('ä', 3).collect();
3822 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3824 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
3825 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3829 fn test_rsplitn_char_iterator() {
3830 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3832 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
3834 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3836 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
3838 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3841 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
3843 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3845 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
3847 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
/// Contrasts `split` with `split_terminator` on input that ends with the
/// separator.
3851 fn test_split_char_iterator_no_trailing() {
3852 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
// `split` yields an empty final piece after the trailing separator ...
3854 let split: ~[&str] = data.split('\n').collect();
3855 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
// ... while `split_terminator` omits it. The leading empty piece stays.
3857 let split: ~[&str] = data.split_terminator('\n').collect();
3858 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3862 fn test_rev_split_char_iterator_no_trailing() {
3863 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3865 let mut split: ~[&str] = data.split('\n').rev().collect();
3867 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3869 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
3871 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3876 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
3877 let words: ~[&str] = data.words().collect();
3878 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
/// `nfd_chars` yields the decomposed form of each character. Per the
/// expectations below, `\u01c4` and `\u2026` are left unchanged here, unlike
/// in the `nfkd_chars` test.
3882 fn test_nfd_chars() {
3883 assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
3884 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
3885 assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
3886 assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
// Both inputs end up with the same mark order: `\u0323` before `\u0307`.
3887 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3888 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3889 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
3890 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
// Single characters that expand to three and to two characters.
3891 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3892 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
/// `nfkd_chars` goes further than `nfd_chars` on the same inputs: `\u01c4`
/// becomes `DZ\u030c` and `\u2026` becomes `...`.
3896 fn test_nfkd_chars() {
3897 assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
3898 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
3899 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
3900 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
// Both inputs end up with the same mark order: `\u0323` before `\u0307`.
3901 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3902 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3903 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
3904 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
// Single characters that expand to three and to two characters.
3905 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3906 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
3911 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3912 let lines: ~[&str] = data.lines().collect();
3913 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3915 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3916 let lines: ~[&str] = data.lines().collect();
3917 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3921 fn test_split_strator() {
3922 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3923 let v: ~[&str] = s.split_str(sep).collect();
3926 t("--1233345--", "12345", ~["--1233345--"]);
3927 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3928 t("::hello::there", "::", ~["", "hello", "there"]);
3929 t("hello::there::", "::", ~["hello", "there", ""]);
3930 t("::hello::there::", "::", ~["", "hello", "there", ""]);
3931 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3932 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3933 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3934 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3936 t("zz", "zz", ~["",""]);
3937 t("ok", "z", ~["ok"]);
3938 t("zzz", "zz", ~["","z"]);
3939 t("zzzzz", "zz", ~["","","z"]);
3943 fn test_str_default() {
3944 use default::Default;
3945 fn t<S: Default + Str>() {
3946 let s: S = Default::default();
3947 assert_eq!(s.as_slice(), "");
3955 fn test_str_container() {
3956 fn sum_len<S: Container>(v: &[S]) -> uint {
3957 v.iter().map(|x| x.len()).sum()
3961 assert_eq!(5, sum_len(["012", "", "34"]));
3962 assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
3963 assert_eq!(5, sum_len([s.as_slice()]));
/// `from_utf8` returns `Some(&str)` for valid bytes and `None` otherwise.
3967 fn test_str_from_utf8() {
3968 let xs = bytes!("hello");
3969 assert_eq!(from_utf8(xs), Some("hello"));
3971 let xs = bytes!("ศไทย中华Việt Nam");
3972 assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
// A trailing 0xff byte makes the whole input invalid.
3974 let xs = bytes!("hello", 0xff);
3975 assert_eq!(from_utf8(xs), None);
/// `from_utf8_owned` takes an owned byte vector and returns `Some(~str)` for
/// valid bytes and `None` otherwise.
3979 fn test_str_from_utf8_owned() {
3980 let xs = bytes!("hello").to_owned();
3981 assert_eq!(from_utf8_owned(xs), Some(~"hello"));
3983 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
3984 assert_eq!(from_utf8_owned(xs), Some(~"ศไทย中华Việt Nam"));
// A trailing 0xff byte makes the whole input invalid.
3986 let xs = bytes!("hello", 0xff).to_owned();
3987 assert_eq!(from_utf8_owned(xs), None);
/// `from_utf8_lossy` returns a borrowed `Slice` when the input is already
/// valid, and an `Owned` string with `\uFFFD` substituted when it is not.
3991 fn test_str_from_utf8_lossy() {
// Valid input, ASCII and multi-byte: returned as `Slice`.
3992 let xs = bytes!("hello");
3993 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
3995 let xs = bytes!("ศไทย中华Việt Nam");
3996 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// A lead byte with no continuation, and a stray 0xFF.
3998 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
3999 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
// 0xC0 0x80 yields two replacements; the truncated 0xE6 0x83 yields one.
4001 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4002 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
// 0xF5 is replaced on its own, and so is the 0x80 that follows it.
4004 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
4005 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
// Truncated 0xF1 sequences of one, two and three bytes each yield one replacement.
4007 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
4008 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
// 0xF4 0x80 yields one replacement, but 0xF4 0xBF yields two.
4010 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
4011 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
// 0xF0 0x80 0x80 0x80 yields one replacement per byte; 0xF0 0x90 0x80 0x80 decodes.
4013 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
4014 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
// Encoded surrogates: every one of the three bytes is replaced.
4017 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4018 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
4022 fn test_from_str() {
4023 let owned: Option<~str> = from_str(&"string");
4024 assert_eq!(owned, Some(~"string"));
4028 fn test_maybe_owned_traits() {
4029 let s = Slice("abcde");
4030 assert_eq!(s.len(), 5);
4031 assert_eq!(s.as_slice(), "abcde");
4032 assert_eq!(s.to_str(), ~"abcde");
4033 assert_eq!(format!("{}", s), ~"abcde");
4034 assert!(s.lt(&Owned(~"bcdef")));
4035 assert_eq!(Slice(""), Default::default());
4037 let o = Owned(~"abcde");
4038 assert_eq!(o.len(), 5);
4039 assert_eq!(o.as_slice(), "abcde");
4040 assert_eq!(o.to_str(), ~"abcde");
4041 assert_eq!(format!("{}", o), ~"abcde");
4042 assert!(o.lt(&Slice("bcdef")));
4043 assert_eq!(Owned(~""), Default::default());
4045 assert!(s.cmp(&o) == Equal);
4046 assert!(s.equiv(&o));
4048 assert!(o.cmp(&s) == Equal);
4049 assert!(o.equiv(&s));
4053 fn test_maybe_owned_methods() {
4054 let s = Slice("abcde");
4055 assert!(s.is_slice());
4056 assert!(!s.is_owned());
4058 let o = Owned(~"abcde");
4059 assert!(!o.is_slice());
4060 assert!(o.is_owned());
4064 fn test_maybe_owned_clone() {
4065 assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4066 assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4067 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4068 assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
4072 fn test_maybe_owned_into_owned() {
4073 assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4074 assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
4078 fn test_into_maybe_owned() {
4079 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4080 assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4081 assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4082 assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4089 use self::test::Bencher;
4094 fn char_iterator(b: &mut Bencher) {
4095 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4096 let len = s.char_len();
4098 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: counts the chars of an ASCII-only, multi-line string via `chars()`.
4102 fn char_iterator_ascii(b: &mut Bencher) {
// The literal spans several source lines with no `\` continuation, so the
// embedded line breaks are part of the string.
4103 let s = "Mary had a little lamb, Little lamb
4104 Mary had a little lamb, Little lamb
4105 Mary had a little lamb, Little lamb
4106 Mary had a little lamb, Little lamb
4107 Mary had a little lamb, Little lamb
4108 Mary had a little lamb, Little lamb";
// Reference count, computed once outside the timed closure.
4109 let len = s.char_len();
// Timed: iterate all chars and check the count against the reference.
4111 b.iter(|| assert_eq!(s.chars().len(), len));
4115 fn char_iterator_rev(b: &mut Bencher) {
4116 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4117 let len = s.char_len();
4119 b.iter(|| assert_eq!(s.chars_rev().len(), len));
4123 fn char_indicesator(b: &mut Bencher) {
4124 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4125 let len = s.char_len();
4127 b.iter(|| assert_eq!(s.char_indices().len(), len));
4131 fn char_indicesator_rev(b: &mut Bencher) {
4132 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4133 let len = s.char_len();
4135 b.iter(|| assert_eq!(s.char_indices_rev().len(), len));
4139 fn split_unicode_ascii(b: &mut Bencher) {
4140 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4142 b.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: splits multi-byte text on 'V' using a custom `CharEq` whose
// `only_ascii` always answers `false`.
4146 fn split_unicode_not_ascii(b: &mut Bencher) {
// Newtype around the char to look for.
4147 struct NotAscii(char);
4148 impl CharEq for NotAscii {
4149 fn matches(&self, c: char) -> bool {
// Unwrap the stored char.
// NOTE(review): the expression producing the `bool` result is not visible in
// this excerpt; presumably it compares `cc` with `c` -- confirm.
4150 let NotAscii(cc) = *self;
// Reports `false` even though the needle used below is ASCII.
4153 fn only_ascii(&self) -> bool { false }
4155 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
// Timed: the assertion expects three pieces ('V' occurs twice in `s`).
4157 b.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
4162 fn split_ascii(b: &mut Bencher) {
4163 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4164 let len = s.split(' ').len();
4166 b.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splits ASCII text on ' ' using a custom `CharEq` whose
// `only_ascii` always answers `false`.
4170 fn split_not_ascii(b: &mut Bencher) {
// Newtype around the char to look for.
4171 struct NotAscii(char);
4172 impl CharEq for NotAscii {
4174 fn matches(&self, c: char) -> bool {
// Unwrap the stored char.
// NOTE(review): the expression producing the `bool` result is not visible in
// this excerpt; presumably it compares `cc` with `c` -- confirm.
4175 let NotAscii(cc) = *self;
// Reports `false` even though the needle used below is ASCII.
4178 fn only_ascii(&self) -> bool { false }
4180 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference piece count, taken from the plain `char` split.
4181 let len = s.split(' ').len();
// Timed: the custom matcher must produce the same number of pieces.
4183 b.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
4187 fn split_extern_fn(b: &mut Bencher) {
4188 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4189 let len = s.split(' ').len();
4190 fn pred(c: char) -> bool { c == ' ' }
4192 b.iter(|| assert_eq!(s.split(pred).len(), len));
4196 fn split_closure(b: &mut Bencher) {
4197 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4198 let len = s.split(' ').len();
4200 b.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
4204 fn split_slice(b: &mut Bencher) {
4205 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4206 let len = s.split(' ').len();
4208 b.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark fixture: 100 bytes of ASCII text.
// NOTE(review): `b` is not used in the visible lines, so the timed call is
// missing from this excerpt; going by the function name it presumably
// measures `is_utf8` -- confirm against the full file.
4212 fn is_utf8_100_ascii(b: &mut Bencher) {
// The trailing `\` continues the literal onto the next source line.
4214 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4215 Lorem ipsum dolor sit amet, consectetur. ");
// Guard: the fixture must be exactly 100 bytes long.
4217 assert_eq!(100, s.len());
// Benchmark fixture: 100 bytes of multi-byte UTF-8 text in several scripts.
// NOTE(review): `b` is not used in the visible lines, so the timed call is
// missing from this excerpt; going by the function name it presumably
// measures `is_utf8` -- confirm against the full file.
4224 fn is_utf8_100_multibyte(b: &mut Bencher) {
4225 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guard: the fixture must be exactly 100 bytes long.
4226 assert_eq!(100, s.len());
// Benchmark: lossy decoding of 100 bytes of ASCII text.
// NOTE(review): `b` is not used in the visible lines; the `b.iter` wrapper
// around the decode call is presumably missing from this excerpt -- confirm.
4233 fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
// The trailing `\` continues the literal onto the next source line.
4234 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4235 Lorem ipsum dolor sit amet, consectetur. ");
// Guard: the fixture must be exactly 100 bytes long.
4237 assert_eq!(100, s.len());
// The result is discarded; only the decode call itself matters here.
4239 let _ = from_utf8_lossy(s);
// Benchmark: lossy decoding of 100 bytes of multi-byte UTF-8 text.
// NOTE(review): `b` is not used in the visible lines; the `b.iter` wrapper
// around the decode call is presumably missing from this excerpt -- confirm.
4244 fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
4245 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guard: the fixture must be exactly 100 bytes long.
4246 assert_eq!(100, s.len());
// The result is discarded; only the decode call itself matters here.
4248 let _ = from_utf8_lossy(s);
// Benchmark: lossy decoding of mostly-ASCII input containing two malformed
// sequences (0xC0 0x80 and 0xE6 0x83).
// NOTE(review): `b` is not used in the visible lines; the `b.iter` wrapper
// around the decode call is presumably missing from this excerpt -- confirm.
4253 fn from_utf8_lossy_invalid(b: &mut Bencher) {
4254 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
// The result is discarded; only the decode call itself matters here.
4256 let _ = from_utf8_lossy(s);
// Benchmark: lossy decoding of input made up entirely of invalid bytes.
// NOTE(review): `b` is not used in the visible lines; the `b.iter` wrapper
// around the decode call is presumably missing from this excerpt -- confirm.
4261 fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
// 100 copies of 0xF5, which is not a legal UTF-8 lead byte.
4262 let s = ::slice::from_elem(100, 0xF5u8);
// The result is discarded; only the decode call itself matters here.
4264 let _ = from_utf8_lossy(s);
// Benchmark: joins ten copies of a mixed-script string with a separator.
// NOTE(review): the binding of `sep` and any use of `b` are not visible in
// this excerpt -- confirm against the full file.
4269 fn bench_connect(b: &mut Bencher) {
4270 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Ten references to the same string.
4272 let v = [s, s, s, s, s, s, s, s, s, s];
// Expected byte length: ten copies of `s` plus nine separators between them.
4274 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);