1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = ~"I am an owned string";
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
64 let mut buf = ~"testing";
67 assert_eq!(buf, ~"testing 123");
72 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
73 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
74 encoded UTF-8 sequences. Additionally, strings are not null-terminated
75 and can contain null codepoints.
77 The actual representations of strings have direct mappings to vectors:
79 * `~str` is the same as `~[u8]`
80 * `&str` is the same as `&[u8]`
88 use clone::{Clone, DeepClone};
89 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
90 use container::{Container, Mutable};
92 use hash::{Hash, sip};
93 use iter::{Iterator, FromIterator, Extendable, range};
94 use iter::{Filter, AdditiveIterator, Map};
95 use iter::{Rev, DoubleEndedIterator, ExactSize};
98 use option::{None, Option, Some};
102 use from_str::FromStr;
104 use vec::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector};
106 use default::Default;
110 Section: Creating a string
113 /// Consumes a vector of bytes to create a new utf-8 string.
114 /// Returns None if the vector contains invalid UTF-8.
115 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
117 Some(unsafe { raw::from_utf8_owned(vv) })
123 /// Converts a vector to a string slice without performing any allocations.
125 /// Once the slice has been validated as utf-8, it is transmuted in-place and
126 /// returned as a '&str' instead of a '&[u8]'
128 /// Returns None if the slice is not utf-8.
129 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
131 Some(unsafe { raw::from_utf8(v) })
135 impl ToStr for ~str {
137 fn to_str(&self) -> ~str { self.to_owned() }
140 impl FromStr for ~str {
142 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
145 impl<'a> ToStr for &'a str {
147 fn to_str(&self) -> ~str { self.to_owned() }
150 /// Convert a byte to a UTF-8 string
154 /// Fails if invalid UTF-8
155 pub fn from_byte(b: u8) -> ~str {
157 unsafe { ::cast::transmute(~[b]) }
160 /// Convert a char to a string
161 pub fn from_char(ch: char) -> ~str {
167 /// Convert a vector of chars to a string
168 pub fn from_chars(chs: &[char]) -> ~str {
169 chs.iter().map(|c| *c).collect()
173 pub fn push_str(lhs: &mut ~str, rhs: &str) {
177 /// Methods for vectors of strings
178 pub trait StrVector {
179 /// Concatenate a vector of strings.
180 fn concat(&self) -> ~str;
182 /// Concatenate a vector of strings, placing a given separator between each.
183 fn connect(&self, sep: &str) -> ~str;
186 impl<'a, S: Str> StrVector for &'a [S] {
187 fn concat(&self) -> ~str {
188 if self.is_empty() { return ~""; }
190 // `len` calculation may overflow, but push_str will check boundaries
191 let len = self.iter().map(|s| s.as_slice().len()).sum();
193 let mut result = with_capacity(len);
195 for s in self.iter() {
196 result.push_str(s.as_slice())
201 fn connect(&self, sep: &str) -> ~str {
202 if self.is_empty() { return ~""; }
205 if sep.is_empty() { return self.concat(); }
207 // this is wrong without the guarantee that `self` is non-empty
208 // `len` calculation may overflow, but push_str will check boundaries
209 let len = sep.len() * (self.len() - 1)
210 + self.iter().map(|s| s.as_slice().len()).sum();
211 let mut result = with_capacity(len);
212 let mut first = true;
214 for s in self.iter() {
218 result.push_str(sep);
220 result.push_str(s.as_slice());
226 impl<'a, S: Str> StrVector for Vec<S> {
228 fn concat(&self) -> ~str {
229 self.as_slice().concat()
233 fn connect(&self, sep: &str) -> ~str {
234 self.as_slice().connect(sep)
238 /// Something that can be used to compare against a character
240 /// Determine if the splitter should split at the given character
241 fn matches(&self, char) -> bool;
242 /// Indicate if this is only concerned about ASCII characters,
243 /// which can allow for a faster implementation.
244 fn only_ascii(&self) -> bool;
247 impl CharEq for char {
249 fn matches(&self, c: char) -> bool { *self == c }
251 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
254 impl<'a> CharEq for 'a |char| -> bool {
256 fn matches(&self, c: char) -> bool { (*self)(c) }
258 fn only_ascii(&self) -> bool { false }
261 impl CharEq for extern "Rust" fn(char) -> bool {
263 fn matches(&self, c: char) -> bool { (*self)(c) }
265 fn only_ascii(&self) -> bool { false }
268 impl<'a, C: CharEq> CharEq for &'a [C] {
270 fn matches(&self, c: char) -> bool {
271 self.iter().any(|m| m.matches(c))
274 fn only_ascii(&self) -> bool {
275 self.iter().all(|m| m.only_ascii())
283 /// External iterator for a string's characters.
284 /// Use with the `std::iter` module.
286 pub struct Chars<'a> {
287 /// The slice remaining to be iterated
288 priv string: &'a str,
291 impl<'a> Iterator<char> for Chars<'a> {
293 fn next(&mut self) -> Option<char> {
294 // Decode the next codepoint, then update
295 // the slice to be just the remaining part
296 if self.string.len() != 0 {
297 let CharRange {ch, next} = self.string.char_range_at(0);
299 self.string = raw::slice_unchecked(self.string, next, self.string.len());
308 fn size_hint(&self) -> (uint, Option<uint>) {
309 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
313 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
315 fn next_back(&mut self) -> Option<char> {
316 if self.string.len() != 0 {
317 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
319 self.string = raw::slice_unchecked(self.string, 0, next);
328 /// External iterator for a string's characters and their byte offsets.
329 /// Use with the `std::iter` module.
331 pub struct CharOffsets<'a> {
332 /// The original string to be iterated
333 priv string: &'a str,
334 priv iter: Chars<'a>,
337 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
339 fn next(&mut self) -> Option<(uint, char)> {
340 // Compute the byte offset by using the pointer offset between
341 // the original string slice and the iterator's remaining part
342 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
343 self.iter.next().map(|ch| (offset, ch))
347 fn size_hint(&self) -> (uint, Option<uint>) {
348 self.iter.size_hint()
352 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
354 fn next_back(&mut self) -> Option<(uint, char)> {
355 self.iter.next_back().map(|ch| {
356 let offset = self.iter.string.len() +
357 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
363 /// External iterator for a string's characters in reverse order.
364 /// Use with the `std::iter` module.
///
/// This is a type alias for `Rev<Chars<'a>>`: the `Chars` iterator wrapped in
/// the `Rev` adaptor. It relies on `Chars` implementing `DoubleEndedIterator`.
365 pub type RevChars<'a> = Rev<Chars<'a>>;
367 /// External iterator for a string's characters and their byte offsets in reverse order.
368 /// Use with the `std::iter` module.
///
/// This is a type alias for `Rev<CharOffsets<'a>>`: the `CharOffsets` iterator
/// wrapped in the `Rev` adaptor, so it yields `(uint, char)` pairs.
369 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
371 /// External iterator for a string's bytes.
372 /// Use with the `std::iter` module.
374 Map<'a, &'a u8, u8, vec::Items<'a, u8>>;
376 /// External iterator for a string's bytes in reverse order.
377 /// Use with the `std::iter` module.
///
/// This is a type alias for `Rev<Bytes<'a>>`: the `Bytes` iterator wrapped in
/// the `Rev` adaptor.
378 pub type RevBytes<'a> = Rev<Bytes<'a>>;
380 /// An iterator over the substrings of a string, separated by `sep`.
382 pub struct CharSplits<'a, Sep> {
383 /// The slice remaining to be iterated
384 priv string: &'a str,
386 /// Whether an empty string at the end is allowed
387 priv allow_trailing_empty: bool,
388 priv only_ascii: bool,
392 /// An iterator over the substrings of a string, separated by `sep`,
393 /// starting from the back of the string.
///
/// This is a type alias for `Rev<CharSplits<'a, Sep>>`; `Sep` is the separator
/// type carried by the underlying `CharSplits`. It relies on `CharSplits`
/// implementing `DoubleEndedIterator` (which requires `Sep: CharEq`).
394 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
396 /// An iterator over the substrings of a string, separated by `sep`,
397 /// splitting at most `count` times.
399 pub struct CharSplitsN<'a, Sep> {
400 priv iter: CharSplits<'a, Sep>,
401 /// The number of splits remaining
406 /// An iterator over the words of a string, separated by a sequence of whitespace
408 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
410 /// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
///
/// This is a type alias for a `Map` adaptor layered over a
/// `CharSplits<'a, char>`, with `&'a str` as both of its item type parameters.
/// NOTE(review): presumably the split is on `\n` and the map step strips a
/// trailing `\r` -- the constructor is not visible here, confirm.
411 pub type AnyLines<'a> =
412 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
414 impl<'a, Sep> CharSplits<'a, Sep> {
416 fn get_end(&mut self) -> Option<&'a str> {
417 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
418 self.finished = true;
426 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
428 fn next(&mut self) -> Option<&'a str> {
429 if self.finished { return None }
431 let mut next_split = None;
433 for (idx, byte) in self.string.bytes().enumerate() {
434 if self.sep.matches(byte as char) && byte < 128u8 {
435 next_split = Some((idx, idx + 1));
440 for (idx, ch) in self.string.char_indices() {
441 if self.sep.matches(ch) {
442 next_split = Some((idx, self.string.char_range_at(idx).next));
448 Some((a, b)) => unsafe {
449 let elt = raw::slice_unchecked(self.string, 0, a);
450 self.string = raw::slice_unchecked(self.string, b, self.string.len());
453 None => self.get_end(),
458 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
459 for CharSplits<'a, Sep> {
461 fn next_back(&mut self) -> Option<&'a str> {
462 if self.finished { return None }
464 if !self.allow_trailing_empty {
465 self.allow_trailing_empty = true;
466 match self.next_back() {
467 Some(elt) if !elt.is_empty() => return Some(elt),
468 _ => if self.finished { return None }
471 let len = self.string.len();
472 let mut next_split = None;
475 for (idx, byte) in self.string.bytes().enumerate().rev() {
476 if self.sep.matches(byte as char) && byte < 128u8 {
477 next_split = Some((idx, idx + 1));
482 for (idx, ch) in self.string.char_indices_rev() {
483 if self.sep.matches(ch) {
484 next_split = Some((idx, self.string.char_range_at(idx).next));
490 Some((a, b)) => unsafe {
491 let elt = raw::slice_unchecked(self.string, b, len);
492 self.string = raw::slice_unchecked(self.string, 0, a);
495 None => { self.finished = true; Some(self.string) }
500 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
502 fn next(&mut self) -> Option<&'a str> {
505 if self.invert { self.iter.next_back() } else { self.iter.next() }
512 /// An iterator over the start and end indices of the matches of a
513 /// substring within a larger string
515 pub struct MatchIndices<'a> {
516 priv haystack: &'a str,
517 priv needle: &'a str,
521 /// An iterator over the substrings of a string separated by a given
524 pub struct StrSplits<'a> {
525 priv it: MatchIndices<'a>,
530 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
532 fn next(&mut self) -> Option<(uint, uint)> {
533 // See Issue #1932 for why this is a naive search
534 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
535 let mut match_start = 0;
538 while self.position < h_len {
539 if self.haystack[self.position] == self.needle[match_i] {
540 if match_i == 0 { match_start = self.position; }
544 if match_i == n_len {
546 return Some((match_start, self.position));
549 // failed match, backtrack
552 self.position = match_start;
561 impl<'a> Iterator<&'a str> for StrSplits<'a> {
563 fn next(&mut self) -> Option<&'a str> {
564 if self.finished { return None; }
566 match self.it.next() {
567 Some((from, to)) => {
568 let ret = Some(self.it.haystack.slice(self.last_end, from));
573 self.finished = true;
574 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
580 // Helper functions used for Unicode normalization
581 fn canonical_sort(comb: &mut [(char, u8)]) {
585 let len = comb.len();
586 for i in range(0, len) {
587 let mut swapped = false;
588 for j in range(1, len-i) {
589 let classA = *comb[j-1].ref1();
590 let classB = *comb[j].ref1();
591 if classA != 0 && classB != 0 && classA > classB {
596 if !swapped { break; }
601 enum NormalizationForm {
606 /// External iterator for a string's normalization's characters.
607 /// Use with the `std::iter` module.
609 pub struct Normalizations<'a> {
610 priv kind: NormalizationForm,
611 priv iter: Chars<'a>,
612 priv buffer: ~[(char, u8)],
616 impl<'a> Iterator<char> for Normalizations<'a> {
618 fn next(&mut self) -> Option<char> {
619 use unicode::decompose::canonical_combining_class;
621 match self.buffer.head() {
627 Some(&(c, _)) if self.sorted => {
631 _ => self.sorted = false
634 let decomposer = match self.kind {
635 NFD => char::decompose_canonical,
636 NFKD => char::decompose_compatible
640 for ch in self.iter {
641 let buffer = &mut self.buffer;
642 let sorted = &mut self.sorted;
644 let class = canonical_combining_class(d);
645 if class == 0 && !*sorted {
646 canonical_sort(*buffer);
649 buffer.push((d, class));
656 canonical_sort(self.buffer);
660 match self.buffer.shift() {
665 Some((c, _)) => Some(c),
670 fn size_hint(&self) -> (uint, Option<uint>) {
671 let (lower, _) = self.iter.size_hint();
676 /// Replace all occurrences of one string with another
680 /// * s - The string containing substrings to replace
681 /// * from - The string to replace
682 /// * to - The replacement string
686 /// The original string with all occurrences of `from` replaced with `to`
687 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
688 let mut result = ~"";
689 let mut last_end = 0;
690 for (start, end) in s.match_indices(from) {
691 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
695 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
700 Section: Comparing strings
703 // share the implementation of the lang-item vs. non-lang-item
706 fn eq_slice_(a: &str, b: &str) -> bool {
707 a.len() == b.len() && unsafe {
708 libc::memcmp(a.as_ptr() as *libc::c_void,
709 b.as_ptr() as *libc::c_void,
710 a.len() as libc::size_t) == 0
714 /// Bytewise slice equality
718 pub fn eq_slice(a: &str, b: &str) -> bool {
722 /// Bytewise slice equality
725 pub fn eq_slice(a: &str, b: &str) -> bool {
729 /// Bytewise string equality
731 #[lang="uniq_str_eq"]
733 pub fn eq(a: &~str, b: &~str) -> bool {
739 pub fn eq(a: &~str, b: &~str) -> bool {
747 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
748 /// returning `true` in that case, or, if it is invalid, `false` with
749 /// `iter` reset such that it is pointing at the first byte in the
750 /// invalid sequence.
752 fn run_utf8_validation_iterator(iter: &mut vec::Items<u8>) -> bool {
754 // save the current thing we're pointing at.
757 // restore the iterator we had at the start of this codepoint.
758 macro_rules! err ( () => { {*iter = old; return false} });
759 macro_rules! next ( () => {
762 // we needed data, but there was none: error!
767 let first = match iter.next() {
769 // we're at the end of the iterator and a codepoint
770 // boundary at the same time, so this string is valid.
774 // ASCII characters are always valid, so only large
775 // bytes need more examination.
777 let w = utf8_char_width(first);
778 let second = next!();
779 // 2-byte encoding is for codepoints \u0080 to \u07ff
780 // first C2 80 last DF BF
781 // 3-byte encoding is for codepoints \u0800 to \uffff
782 // first E0 A0 80 last EF BF BF
783 // excluding surrogate codepoints \ud800 to \udfff
784 // ED A0 80 to ED BF BF
785 // 4-byte encoding is for codepoints \u10000 to \u10ffff
786 // first F0 90 80 80 last F4 8F BF BF
788 // Use the UTF-8 syntax from the RFC
790 // https://tools.ietf.org/html/rfc3629
792 // UTF8-2 = %xC2-DF UTF8-tail
793 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
794 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
795 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
796 // %xF4 %x80-8F 2( UTF8-tail )
798 2 => if second & 192 != TAG_CONT_U8 {err!()},
800 match (first, second, next!() & 192) {
801 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
802 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
803 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
804 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
809 match (first, second, next!() & 192, next!() & 192) {
810 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
811 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
812 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
822 /// Determines if a vector of bytes contains valid UTF-8.
823 pub fn is_utf8(v: &[u8]) -> bool {
824 run_utf8_validation_iterator(&mut v.iter())
828 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
829 let mut it = v.iter();
831 let ok = run_utf8_validation_iterator(&mut it);
835 // work out how many valid bytes we've consumed
836 // (run_utf8_validation_iterator resets the iterator to just
837 // after the last good byte), which we can do because the
838 // vector iterator size_hint is exact.
839 let (remaining, _) = it.size_hint();
840 Some(v.len() - remaining)
844 /// Determines if a vector of `u16` contains valid UTF-16
845 pub fn is_utf16(v: &[u16]) -> bool {
846 let mut it = v.iter();
847 macro_rules! next ( ($ret:expr) => {
848 match it.next() { Some(u) => *u, None => return $ret }
854 match char::from_u32(u as u32) {
857 let u2 = next!(false);
858 if u < 0xD7FF || u > 0xDBFF ||
859 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
865 /// An iterator that decodes UTF-16 encoded codepoints from a vector
868 pub struct UTF16Items<'a> {
869 priv iter: vec::Items<'a, u16>
871 /// The possibilities for values decoded from a `u16` stream.
872 #[deriving(Eq, TotalEq, Clone)]
874 /// A valid codepoint.
876 /// An invalid surrogate without its pair.
881 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
882 /// replacement character (U+FFFD).
884 pub fn to_char_lossy(&self) -> char {
887 LoneSurrogate(_) => '\uFFFD'
892 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
893 fn next(&mut self) -> Option<UTF16Item> {
894 let u = match self.iter.next() {
899 if u < 0xD800 || 0xDFFF < u {
901 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
902 } else if u >= 0xDC00 {
903 // a trailing surrogate
904 Some(LoneSurrogate(u))
906 // preserve state for rewinding.
909 let u2 = match self.iter.next() {
912 None => return Some(LoneSurrogate(u))
914 if u2 < 0xDC00 || u2 > 0xDFFF {
915 // not a trailing surrogate so we're not a valid
916 // surrogate pair, so rewind to redecode u2 next time.
918 return Some(LoneSurrogate(u))
921 // all ok, so let's decode it.
922 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
923 Some(ScalarValue(unsafe {cast::transmute(c)}))
928 fn size_hint(&self) -> (uint, Option<uint>) {
929 let (low, high) = self.iter.size_hint();
930 // we could be entirely valid surrogates (2 elements per
931 // char), or entirely non-surrogates (1 element per char)
936 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
937 /// returning invalid surrogates as `LoneSurrogate`s.
943 /// use std::str::{ScalarValue, LoneSurrogate};
945 /// // 𝄞mus<invalid>ic<invalid>
946 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
947 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
950 /// assert_eq!(str::utf16_items(v).to_owned_vec(),
951 /// ~[ScalarValue('𝄞'),
952 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
953 /// LoneSurrogate(0xDD1E),
954 /// ScalarValue('i'), ScalarValue('c'),
955 /// LoneSurrogate(0xD834)]);
957 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
958 UTF16Items { iter : v.iter() }
961 /// Return a slice of `v` ending at (and not including) the first NUL
970 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
971 /// // no NULs so no change
972 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
976 /// assert_eq!(str::truncate_utf16_at_nul(v),
977 /// &['a' as u16, 'b' as u16]);
979 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
980 match v.iter().position(|c| *c == 0) {
981 // don't include the 0
982 Some(i) => v.slice_to(i),
987 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
988 /// if `v` contains any invalid data.
996 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
997 /// 0x0073, 0x0069, 0x0063];
998 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
1000 /// // 𝄞mu<invalid>ic
1002 /// assert_eq!(str::from_utf16(v), None);
1004 pub fn from_utf16(v: &[u16]) -> Option<~str> {
1005 let mut s = with_capacity(v.len() / 2);
1006 for c in utf16_items(v) {
1008 ScalarValue(c) => s.push_char(c),
1009 LoneSurrogate(_) => return None
1015 /// Decode a UTF-16 encoded vector `v` into a string, replacing
1016 /// invalid data with the replacement character (U+FFFD).
1022 /// // 𝄞mus<invalid>ic<invalid>
1023 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1024 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1027 /// assert_eq!(str::from_utf16_lossy(v),
1028 /// ~"𝄞mus\uFFFDic\uFFFD");
1030 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1031 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1034 /// Allocates a new string with the specified capacity. The string returned is
1035 /// the empty string, but has capacity for much more.
1037 pub fn with_capacity(capacity: uint) -> ~str {
1039 cast::transmute(vec::with_capacity::<~[u8]>(capacity))
1043 // https://tools.ietf.org/html/rfc3629
1044 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1045 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1046 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1047 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1048 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1049 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1050 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1051 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1052 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1053 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1054 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1055 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1056 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1057 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1058 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1059 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1060 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1063 /// Given a first byte, determine how many bytes are in this UTF-8 character
1065 pub fn utf8_char_width(b: u8) -> uint {
1066 return UTF8_CHAR_WIDTH[b] as uint;
1069 /// Struct that contains a `char` and the index of the first byte of
1070 /// the next `char` in a string. This can be used as a data structure
1071 /// for iterating over the UTF-8 bytes of a string.
1072 pub struct CharRange {
1075 /// Index of the first byte of the next `char`
1079 // Return the initial codepoint accumulator for the first byte.
1080 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1081 // for width 3, and 3 bits for width 4
1082 macro_rules! utf8_first_byte(
1083 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1086 // return the value of $ch updated with continuation byte $byte
1087 macro_rules! utf8_acc_cont_byte(
1088 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
// Tag value of a UTF-8 continuation byte (bit pattern `10xxxxxx`).
// Code in this file masks a byte with 192 (0b1100_0000) and compares the
// result against this constant to test for a continuation byte.
1091 static TAG_CONT_U8: u8 = 128u8;
1093 /// Converts a vector of bytes to a new utf-8 string.
1094 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1099 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1100 /// let output = std::str::from_utf8_lossy(input);
1101 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1103 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1104 let firstbad = match first_non_utf8_index(v) {
1105 None => return Slice(unsafe { cast::transmute(v) }),
1109 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1110 let mut i = firstbad;
1111 let total = v.len();
1112 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1113 unsafe { *xs.unsafe_ref(i) }
1115 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1122 let mut res = with_capacity(total);
1125 unsafe { raw::push_bytes(&mut res, v.slice_to(i)) };
1128 // subseqidx is the index of the first byte of the subsequence we're looking at.
1129 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1131 let mut subseqidx = firstbad;
1135 let byte = unsafe_get(v, i);
1138 macro_rules! error(() => ({
1140 if subseqidx != i_ {
1141 raw::push_bytes(&mut res, v.slice(subseqidx, i_));
1144 raw::push_bytes(&mut res, REPLACEMENT);
1149 // subseqidx handles this
1151 let w = utf8_char_width(byte);
1155 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1162 match (byte, safe_get(v, i, total)) {
1163 (0xE0 , 0xA0 .. 0xBF) => (),
1164 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1165 (0xED , 0x80 .. 0x9F) => (),
1166 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1173 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1180 match (byte, safe_get(v, i, total)) {
1181 (0xF0 , 0x90 .. 0xBF) => (),
1182 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1183 (0xF4 , 0x80 .. 0x8F) => (),
1190 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1195 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1208 if subseqidx < total {
1209 unsafe { raw::push_bytes(&mut res, v.slice(subseqidx, total)) };
1218 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1219 /// This can be useful as an optimization when an allocation is sometimes
1220 /// needed but not always.
1221 pub enum MaybeOwned<'a> {
1222 /// A borrowed string
1228 /// SendStr is a specialization of `MaybeOwned` to be sendable
///
/// It is a type alias that fixes the lifetime parameter of `MaybeOwned` to
/// `'static`, so a borrowed value stored in it must be a `&'static str`.
1229 pub type SendStr = MaybeOwned<'static>;
1231 impl<'a> MaybeOwned<'a> {
1232 /// Returns `true` if this `MaybeOwned` wraps an owned string
1234 pub fn is_owned(&self) -> bool {
1241 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1243 pub fn is_slice(&self) -> bool {
1251 /// Trait for moving into a `MaybeOwned`
1252 pub trait IntoMaybeOwned<'a> {
1253 /// Moves self into a `MaybeOwned`
1254 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1257 impl<'a> IntoMaybeOwned<'a> for ~str {
1259 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1262 impl<'a> IntoMaybeOwned<'a> for &'a str {
1264 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1267 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1269 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1272 impl<'a> ToStr for MaybeOwned<'a> {
1274 fn to_str(&self) -> ~str { self.as_slice().to_owned() }
1277 impl<'a> Eq for MaybeOwned<'a> {
1279 fn eq(&self, other: &MaybeOwned) -> bool {
1280 self.as_slice().equals(&other.as_slice())
1284 impl<'a> TotalEq for MaybeOwned<'a> {
1286 fn equals(&self, other: &MaybeOwned) -> bool {
1287 self.as_slice().equals(&other.as_slice())
1291 impl<'a> Ord for MaybeOwned<'a> {
1293 fn lt(&self, other: &MaybeOwned) -> bool {
1294 self.as_slice().lt(&other.as_slice())
1298 impl<'a> TotalOrd for MaybeOwned<'a> {
1300 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1301 self.as_slice().cmp(&other.as_slice())
1305 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1307 fn equiv(&self, other: &S) -> bool {
1308 self.as_slice().equals(&other.as_slice())
1312 impl<'a> Str for MaybeOwned<'a> {
1314 fn as_slice<'b>(&'b self) -> &'b str {
1317 Owned(ref s) => s.as_slice()
1322 fn into_owned(self) -> ~str {
1324 Slice(s) => s.to_owned(),
1330 impl<'a> Container for MaybeOwned<'a> {
1332 fn len(&self) -> uint { self.as_slice().len() }
1335 impl<'a> Clone for MaybeOwned<'a> {
1337 fn clone(&self) -> MaybeOwned<'a> {
1339 Slice(s) => Slice(s),
1340 Owned(ref s) => Owned(s.to_owned())
1345 impl<'a> DeepClone for MaybeOwned<'a> {
1347 fn deep_clone(&self) -> MaybeOwned<'a> {
1349 Slice(s) => Slice(s),
1350 Owned(ref s) => Owned(s.to_owned())
1355 impl<'a> Default for MaybeOwned<'a> {
1357 fn default() -> MaybeOwned<'a> { Slice("") }
1360 impl<'a> Hash for MaybeOwned<'a> {
1362 fn hash(&self, s: &mut sip::SipState) {
1363 self.as_slice().hash(s)
1367 impl<'a> fmt::Show for MaybeOwned<'a> {
1369 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1371 Slice(ref s) => s.fmt(f),
1372 Owned(ref s) => s.fmt(f)
1377 /// Unsafe operations
1380 use container::Container;
1384 use str::{is_utf8, OwnedStr, StrSlice};
1386 use vec::{MutableVector, ImmutableVector, OwnedVector};
1389 /// Create a Rust string from a *u8 buffer of the given length
1390 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1391 let mut v: ~[u8] = vec::with_capacity(len);
1392 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1395 assert!(is_utf8(v));
1396 ::cast::transmute(v)
1399 #[lang="strdup_uniq"]
1401 #[allow(missing_doc)]
1403 pub unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1404 from_buf_len(ptr, len)
1407 /// Create a Rust string from a null-terminated C string
1408 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1413 curr = buf.offset(i);
1415 from_buf_len(buf as *u8, i as uint)
1418 /// Converts a slice of bytes to a string slice without checking
1419 /// that the string contains valid UTF-8.
1420 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1424 /// Converts an owned vector of bytes to a new owned string. This assumes
1425 /// that the utf-8-ness of the vector has already been validated
1427 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1431 /// Converts a byte to a string.
///
/// The byte is wrapped in a one-element vector and passed to
/// `from_utf8_owned`. This function performs no UTF-8 check of its own, so
/// the caller must ensure that `u` is valid UTF-8 by itself.
1432 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1434 /// Form a slice from a C string. Unsafe because the caller must ensure the
1435 /// C string has the static lifetime, or else the return value may be
1436 /// invalidated later.
1437 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1441 while *curr != 0u8 {
1443 curr = s.offset(len as int);
1445 let v = Slice { data: s, len: len };
1446 assert!(is_utf8(::cast::transmute(v)));
1447 ::cast::transmute(v)
1450 /// Takes a bytewise (not UTF-8) slice from a string.
1452 /// Returns the substring from [`begin`..`end`).
///
/// Only the byte bounds are checked here; nothing verifies that `begin`
/// and `end` fall on character boundaries, so keeping the result valid
/// UTF-8 is the caller's responsibility.
///
/// Fails (via `assert!`):
1456 /// If begin is greater than end.
1457 /// If end is greater than the length of the string.
1459 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
// Bounds checks; together they also imply `begin <= s.len()`.
1460 assert!(begin <= end);
1461 assert!(end <= s.len());
// With the bounds established, the unchecked variant does the slicing.
1462 slice_unchecked(s, begin, end)
1465 /// Takes a bytewise (not UTF-8) slice from a string.
1467 /// Returns the substring from [`begin`..`end`).
1469 /// Caller must check slice boundaries!
1471 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1472 cast::transmute(Slice {
1473 data: s.as_ptr().offset(begin as int),
1478 /// Appends a byte to a string.
1479 /// The caller must preserve the valid UTF-8 property.
1481 pub unsafe fn push_byte(s: &mut ~str, b: u8) {
1482 as_owned_vec(s).push(b)
1485 /// Appends a vector of bytes to a string.
1486 /// The caller must preserve the valid UTF-8 property.
1488 pub unsafe fn push_bytes(s: &mut ~str, bytes: &[u8]) {
1489 vec::bytes::push_bytes(as_owned_vec(s), bytes);
1492 /// Removes the last byte from a string and returns it.
1493 /// The caller must preserve the valid UTF-8 property.
1494 pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
1496 assert!((len > 0u));
1497 let b = s[len - 1u];
1502 /// Removes the first byte from a string and returns it.
1503 /// The caller must preserve the valid UTF-8 property.
1504 pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
1506 assert!((len > 0u));
1508 *s = s.slice(1, len).to_owned();
1512 /// Access the str in its vector representation.
1513 /// The caller must preserve the valid UTF-8 property when modifying.
1515 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1519 /// Sets the length of a string
1521 /// This will explicitly set the size of the string, without actually
1522 /// modifying its buffers, so it is up to the caller to ensure that
1523 /// the string is actually the specified size.
1525 fn test_from_buf_len() {
1527 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1529 let c = from_buf_len(b, 3u);
1530 assert_eq!(c, ~"AAA");
1536 Section: Trait implementations
1540 #[allow(missing_doc)]
1542 use container::Container;
1543 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1546 use option::{Some, None};
1547 use str::{Str, StrSlice, OwnedStr, eq_slice};
1549 impl<'a> Add<&'a str,~str> for &'a str {
1551 fn add(&self, rhs: & &'a str) -> ~str {
1552 let mut ret = self.to_owned();
// Byte-wise lexicographic ordering of string slices.
1558 impl<'a> TotalOrd for &'a str {
1560 fn cmp(&self, other: & &'a str) -> Ordering {
// Walk both strings in lock-step; `zip` stops at the shorter one, so only
// the common-length prefix is compared in this loop.
1561 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1562 match s_b.cmp(&o_b) {
// The first differing byte decides the result.
1563 Greater => return Greater,
1564 Less => return Less,
// No difference inside the common prefix: the lengths decide.
1569 self.len().cmp(&other.len())
1573 impl TotalOrd for ~str {
1575 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1578 impl<'a> Eq for &'a str {
1580 fn eq(&self, other: & &'a str) -> bool {
1581 eq_slice((*self), (*other))
1584 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1589 fn eq(&self, other: &~str) -> bool {
1590 eq_slice((*self), (*other))
1594 impl<'a> TotalEq for &'a str {
1596 fn equals(&self, other: & &'a str) -> bool {
1597 eq_slice((*self), (*other))
1601 impl TotalEq for ~str {
1603 fn equals(&self, other: &~str) -> bool {
1604 eq_slice((*self), (*other))
1608 impl<'a> Ord for &'a str {
1610 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1615 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1618 impl<'a, S: Str> Equiv<S> for &'a str {
1620 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1623 impl<'a, S: Str> Equiv<S> for ~str {
1625 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1632 /// Any string that can be represented as a slice
1634 /// Work with `self` as a slice.
1635 fn as_slice<'a>(&'a self) -> &'a str;
1637 /// Convert `self` into a ~str, not making a copy if possible
1638 fn into_owned(self) -> ~str;
1641 impl<'a> Str for &'a str {
1643 fn as_slice<'a>(&'a self) -> &'a str { *self }
1646 fn into_owned(self) -> ~str { self.to_owned() }
1649 impl<'a> Str for ~str {
1651 fn as_slice<'a>(&'a self) -> &'a str {
1652 let s: &'a str = *self; s
1656 fn into_owned(self) -> ~str { self }
1659 impl<'a> Container for &'a str {
1661 fn len(&self) -> uint {
1666 impl Container for ~str {
1668 fn len(&self) -> uint { self.as_slice().len() }
1671 impl Mutable for ~str {
1672 /// Remove all content, make the string empty
1674 fn clear(&mut self) {
1681 /// Methods for string slices
1682 pub trait StrSlice<'a> {
1683 /// Returns true if one string contains another
1687 /// - needle - The string to look for
1688 fn contains<'a>(&self, needle: &'a str) -> bool;
1690 /// Returns true if a string contains a char.
1694 /// - needle - The char to look for
1695 fn contains_char(&self, needle: char) -> bool;
1697 /// An iterator over the characters of `self`. Note, this iterates
1698 /// over unicode code-points, not unicode graphemes.
1703 /// let v: ~[char] = "abc åäö".chars().collect();
1704 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1706 fn chars(&self) -> Chars<'a>;
1708 /// An iterator over the characters of `self`, in reverse order.
1709 fn chars_rev(&self) -> RevChars<'a>;
1711 /// An iterator over the bytes of `self`
1712 fn bytes(&self) -> Bytes<'a>;
1714 /// An iterator over the bytes of `self`, in reverse order
1715 fn bytes_rev(&self) -> RevBytes<'a>;
1717 /// An iterator over the characters of `self` and their byte offsets.
1718 fn char_indices(&self) -> CharOffsets<'a>;
1720 /// An iterator over the characters of `self` and their byte offsets,
1721 /// in reverse order.
1722 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1724 /// An iterator over substrings of `self`, separated by characters
1725 /// matched by `sep`.
1730 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1731 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1733 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1734 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1736 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1737 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1739 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1741 /// An iterator over substrings of `self`, separated by characters
1742 /// matched by `sep`, restricted to splitting at most `count`
1748 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1749 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1751 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1752 /// assert_eq!(v, ~["abc", "def2ghi"]);
1754 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1755 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1757 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1759 /// An iterator over substrings of `self`, separated by characters
1760 /// matched by `sep`.
1762 /// Equivalent to `split`, except that the trailing substring
1763 /// is skipped if empty (terminator semantics).
1768 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1769 /// assert_eq!(v, ~["A", "B"]);
1771 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1772 /// assert_eq!(v, ~["A", "", "B", ""]);
1774 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1776 /// An iterator over substrings of `self`, separated by characters
1777 /// matched by `sep`, in reverse order.
1782 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1783 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1785 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1786 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1788 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1789 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1791 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1793 /// An iterator over substrings of `self`, separated by characters
1794 /// matched by `sep`, starting from the end of the string.
1795 /// Restricted to splitting at most `count` times.
1800 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1801 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1803 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1804 /// assert_eq!(v, ~["ghi", "abc1def"]);
1806 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1807 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1809 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1811 /// An iterator over the start and end indices of the disjoint
1812 /// matches of `sep` within `self`.
1814 /// That is, each returned value `(start, end)` satisfies
1815 /// `self.slice(start, end) == sep`. For matches of `sep` within
1816 /// `self` that overlap, only the indices corresponding to the
1817 /// first match are returned.
1822 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1823 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1825 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1826 /// assert_eq!(v, ~[(1,4), (4,7)]);
1828 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1829 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1831 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1833 /// An iterator over the substrings of `self` separated by `sep`.
1838 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1839 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1841 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1842 /// assert_eq!(v, ~["1", "", "2"]);
1844 fn split_str(&self, &'a str) -> StrSplits<'a>;
1846 /// An iterator over the lines of a string (subsequences separated
1847 /// by `\n`). This does not include the empty string after a
1853 /// let four_lines = "foo\nbar\n\nbaz\n";
1854 /// let v: ~[&str] = four_lines.lines().collect();
1855 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1857 fn lines(&self) -> CharSplits<'a, char>;
1859 /// An iterator over the lines of a string, separated by either
1860 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1861 /// empty trailing line.
1866 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1867 /// let v: ~[&str] = four_lines.lines_any().collect();
1868 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1870 fn lines_any(&self) -> AnyLines<'a>;
1872 /// An iterator over the words of a string (subsequences separated
1873 /// by any sequence of whitespace). Sequences of whitespace are
1874 /// collapsed, so empty "words" are not included.
1879 /// let some_words = " Mary had\ta little \n\t lamb";
1880 /// let v: ~[&str] = some_words.words().collect();
1881 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1883 fn words(&self) -> Words<'a>;
1885 /// An Iterator over the string in Unicode Normalization Form D
1886 /// (canonical decomposition).
1887 fn nfd_chars(&self) -> Normalizations<'a>;
1889 /// An Iterator over the string in Unicode Normalization Form KD
1890 /// (compatibility decomposition).
1891 fn nfkd_chars(&self) -> Normalizations<'a>;
1893 /// Returns true if the string contains only whitespace.
1895 /// Whitespace characters are determined by `char::is_whitespace`.
1900 /// assert!(" \t\n".is_whitespace());
1901 /// assert!("".is_whitespace());
1903 /// assert!( !"abc".is_whitespace());
1905 fn is_whitespace(&self) -> bool;
1907 /// Returns true if the string contains only alphanumeric code
1910 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1915 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1916 /// assert!("".is_alphanumeric());
1918 /// assert!( !" &*~".is_alphanumeric());
1920 fn is_alphanumeric(&self) -> bool;
1922 /// Returns the number of Unicode code points (`char`) that a
1925 /// This does not perform any normalization, and is `O(n)`, since
1926 /// UTF-8 is a variable width encoding of code points.
1928 /// *Warning*: The number of code points in a string does not directly
1929 /// correspond to the number of visible characters or width of the
1930 /// visible text due to composing characters, and double- and
1931 /// zero-width ones.
1933 /// See also `.len()` for the byte length.
1938 /// // composed forms of `ö` and `é`
1939 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1940 /// // decomposed forms of `ö` and `é`
1941 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1943 /// assert_eq!(c.char_len(), 15);
1944 /// assert_eq!(d.char_len(), 17);
1946 /// assert_eq!(c.len(), 21);
1947 /// assert_eq!(d.len(), 23);
1949 /// // the two strings *look* the same
1950 /// println!("{}", c);
1951 /// println!("{}", d);
1953 fn char_len(&self) -> uint;
1955 /// Returns a slice of the given string from the byte range
1956 /// [`begin`..`end`).
1958 /// This operation is `O(1)`.
1960 /// Fails when `begin` and `end` do not point to valid characters
1961 /// or point beyond the last character of the string.
1963 /// See also `slice_to` and `slice_from` for slicing prefixes and
1964 /// suffixes of strings, and `slice_chars` for slicing based on
1965 /// code point counts.
1970 /// let s = "Löwe 老虎 Léopard";
1971 /// assert_eq!(s.slice(0, 1), "L");
1973 /// assert_eq!(s.slice(1, 9), "öwe 老");
1975 /// // these will fail:
1976 /// // byte 2 lies within `ö`:
1977 /// // s.slice(2, 3);
1979 /// // byte 8 lies within `老`
1980 /// // s.slice(1, 8);
1982 /// // byte 100 is outside the string
1983 /// // s.slice(3, 100);
1985 fn slice(&self, begin: uint, end: uint) -> &'a str;
1987 /// Returns a slice of the string from `begin` to its end.
1989 /// Equivalent to `self.slice(begin, self.len())`.
1991 /// Fails when `begin` does not point to a valid character, or is
1994 /// See also `slice`, `slice_to` and `slice_chars`.
1995 fn slice_from(&self, begin: uint) -> &'a str;
1997 /// Returns a slice of the string from the beginning to byte
2000 /// Equivalent to `self.slice(0, end)`.
2002 /// Fails when `end` does not point to a valid character, or is
2005 /// See also `slice`, `slice_from` and `slice_chars`.
2006 fn slice_to(&self, end: uint) -> &'a str;
2008 /// Returns a slice of the string from the character range
2009 /// [`begin`..`end`).
2011 /// That is, start at the `begin`-th code point of the string and
2012 /// continue to the `end`-th code point. This does not detect or
2013 /// handle edge cases such as leaving a combining character as the
2014 /// first code point of the string.
2016 /// Due to the design of UTF-8, this operation is `O(end)`, since code
2017 /// points are counted from the start of the string. See `slice`, `slice_to` and `slice_from` for `O(1)`
2018 /// variants that use byte indices rather than code point
2021 /// Fails if `begin` > `end` or if either `begin` or `end` is
2022 /// beyond the last character of the string.
2027 /// let s = "Löwe 老虎 Léopard";
2028 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
2029 /// assert_eq!(s.slice_chars(5, 7), "老虎");
2031 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
2033 /// Returns true if `needle` is a prefix of the string.
2034 fn starts_with(&self, needle: &str) -> bool;
2036 /// Returns true if `needle` is a suffix of the string.
2037 fn ends_with(&self, needle: &str) -> bool;
2039 /// Escape each char in `s` with `char::escape_default`.
2040 fn escape_default(&self) -> ~str;
2042 /// Escape each char in `s` with `char::escape_unicode`.
2043 fn escape_unicode(&self) -> ~str;
2045 /// Returns a string with leading and trailing whitespace removed.
2046 fn trim(&self) -> &'a str;
2048 /// Returns a string with leading whitespace removed.
2049 fn trim_left(&self) -> &'a str;
2051 /// Returns a string with trailing whitespace removed.
2052 fn trim_right(&self) -> &'a str;
2054 /// Returns a string with characters that match `to_trim` removed.
2058 /// * to_trim - a character matcher
2063 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
2064 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
2065 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
2067 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2069 /// Returns a string with leading characters that match `to_trim` removed.
2073 /// * to_trim - a character matcher
2078 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2079 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2080 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2082 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2084 /// Returns a string with trailing characters that match `to_trim` removed.
2088 /// * to_trim - a character matcher
2093 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2094 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2095 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2097 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2099 /// Replace all occurrences of one string with another.
2103 /// * `from` - The string to replace
2104 /// * `to` - The replacement string
2108 /// The original string with all occurrences of `from` replaced with `to`.
2113 /// let s = ~"Do you know the muffin man,
2114 /// The muffin man, the muffin man, ...";
2116 /// assert_eq!(s.replace("muffin man", "little lamb"),
2117 /// ~"Do you know the little lamb,
2118 /// The little lamb, the little lamb, ...");
2120 /// // not found, so no change.
2121 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2123 fn replace(&self, from: &str, to: &str) -> ~str;
2125 /// Copy a slice into a new owned str.
2126 fn to_owned(&self) -> ~str;
2128 /// Converts to a vector of `u16` encoded as UTF-16.
2129 fn to_utf16(&self) -> ~[u16];
2131 /// Check that `index`-th byte lies at the start and/or end of a
2132 /// UTF-8 code point sequence.
2134 /// The start and end of the string (when `index == self.len()`)
2135 /// are considered to be boundaries.
2137 /// Fails if `index` is greater than `self.len()`.
2142 /// let s = "Löwe 老虎 Léopard";
2143 /// assert!(s.is_char_boundary(0));
2145 /// assert!(s.is_char_boundary(6));
2146 /// assert!(s.is_char_boundary(s.len()));
2148 /// // second byte of `ö`
2149 /// assert!(!s.is_char_boundary(2));
2151 /// // third byte of `老`
2152 /// assert!(!s.is_char_boundary(8));
2154 fn is_char_boundary(&self, index: uint) -> bool;
2156 /// Pluck a character out of a string and return the index of the next
2159 /// This function can be used to iterate over the unicode characters of a
2164 /// This example manually iterates through the characters of a
2165 /// string; this should normally be done by `.chars()` or
2166 /// `.char_indices`.
2169 /// use std::str::CharRange;
2171 /// let s = "中华Việt Nam";
2173 /// while i < s.len() {
2174 /// let CharRange {ch, next} = s.char_range_at(i);
2175 /// println!("{}: {}", i, ch);
2197 /// * s - The string
2198 /// * i - The byte offset of the char to extract
2202 /// A record {ch: char, next: uint} containing the char value and the byte
2203 /// index of the next unicode character.
2207 /// If `i` is greater than or equal to the length of the string.
2208 /// If `i` is not the index of the beginning of a valid UTF-8 character.
2209 fn char_range_at(&self, start: uint) -> CharRange;
2211 /// Given a byte position and a str, return the previous char and its position.
2213 /// This function can be used to iterate over a unicode string in reverse.
2215 /// Returns 0 for next index if called on start index 0.
2216 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2218 /// Plucks the character starting at the `i`th byte of a string
2219 fn char_at(&self, i: uint) -> char;
2221 /// Plucks the character ending at the `i`th byte of a string
2222 fn char_at_reverse(&self, i: uint) -> char;
2224 /// Work with the byte buffer of a string as a byte slice.
2225 fn as_bytes(&self) -> &'a [u8];
2227 /// Returns the byte index of the first character of `self` that
2228 /// matches `search`.
2232 /// `Some` containing the byte index of the first matching character
2233 /// or `None` if there is no match
2238 /// let s = "Löwe 老虎 Léopard";
2240 /// assert_eq!(s.find('L'), Some(0));
2241 /// assert_eq!(s.find('é'), Some(14));
2243 /// // the first space
2244 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2246 /// // neither are found
2247 /// assert_eq!(s.find(&['1', '2']), None);
2249 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2251 /// Returns the byte index of the last character of `self` that
2252 /// matches `search`.
2256 /// `Some` containing the byte index of the last matching character
2257 /// or `None` if there is no match.
2262 /// let s = "Löwe 老虎 Léopard";
2264 /// assert_eq!(s.rfind('L'), Some(13));
2265 /// assert_eq!(s.rfind('é'), Some(14));
2267 /// // the second space
2268 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2270 /// // searches for an occurrence of either `1` or `2`, but neither are found
2271 /// assert_eq!(s.rfind(&['1', '2']), None);
2273 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2275 /// Returns the byte index of the first matching substring
2279 /// * `needle` - The string to search for
2283 /// `Some` containing the byte index of the first matching substring
2284 /// or `None` if there is no match.
2289 /// let s = "Löwe 老虎 Léopard";
2291 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2292 /// assert_eq!(s.find_str("muffin man"), None);
2294 fn find_str(&self, &str) -> Option<uint>;
2296 /// Given a string, make a new string with repeated copies of it.
2297 fn repeat(&self, nn: uint) -> ~str;
2299 /// Retrieves the first character from a string slice and returns
2300 /// it. This does not allocate a new string; instead, it returns a
2301 /// slice that points one character beyond the character that was
2306 /// If the string does not contain any characters.
2311 /// let s = "Löwe 老虎 Léopard";
2312 /// let (c, s1) = s.slice_shift_char();
2313 /// assert_eq!(c, 'L');
2314 /// assert_eq!(s1, "öwe 老虎 Léopard");
2316 /// let (c, s2) = s1.slice_shift_char();
2317 /// assert_eq!(c, 'ö');
2318 /// assert_eq!(s2, "we 老虎 Léopard");
2320 fn slice_shift_char(&self) -> (char, &'a str);
2322 /// Levenshtein Distance between two strings.
2323 fn lev_distance(&self, t: &str) -> uint;
2325 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2327 /// Fails if `inner` is not a direct slice contained within self.
2332 /// let string = "a\nb\nc";
2333 /// let lines: ~[&str] = string.lines().collect();
2335 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2336 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2337 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2339 fn subslice_offset(&self, inner: &str) -> uint;
2341 /// Return an unsafe pointer to the string's buffer.
2343 /// The caller must ensure that the string outlives this pointer,
2344 /// and that it is not reallocated (e.g. by pushing to the
2346 fn as_ptr(&self) -> *u8;
2349 impl<'a> StrSlice<'a> for &'a str {
2351 fn contains<'a>(&self, needle: &'a str) -> bool {
2352 self.find_str(needle).is_some()
2356 fn contains_char(&self, needle: char) -> bool {
2357 self.find(needle).is_some()
2361 fn chars(&self) -> Chars<'a> {
2362 Chars{string: *self}
2366 fn chars_rev(&self) -> RevChars<'a> {
2371 fn bytes(&self) -> Bytes<'a> {
2372 self.as_bytes().iter().map(|&b| b)
2376 fn bytes_rev(&self) -> RevBytes<'a> {
2381 fn char_indices(&self) -> CharOffsets<'a> {
2382 CharOffsets{string: *self, iter: self.chars()}
2386 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2387 self.char_indices().rev()
2391 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2394 only_ascii: sep.only_ascii(),
2396 allow_trailing_empty: true,
2402 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2403 -> CharSplitsN<'a, Sep> {
2405 iter: self.split(sep),
2412 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2413 -> CharSplits<'a, Sep> {
2415 allow_trailing_empty: false,
2421 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2422 self.split(sep).rev()
2426 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2427 -> CharSplitsN<'a, Sep> {
2429 iter: self.split(sep),
2436 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2437 assert!(!sep.is_empty())
2446 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2448 it: self.match_indices(sep),
2455 fn lines(&self) -> CharSplits<'a, char> {
2456 self.split_terminator('\n')
2459 fn lines_any(&self) -> AnyLines<'a> {
2460 self.lines().map(|line| {
2462 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2468 fn words(&self) -> Words<'a> {
2469 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2473 fn nfd_chars(&self) -> Normalizations<'a> {
2483 fn nfkd_chars(&self) -> Normalizations<'a> {
2493 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2496 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2499 fn char_len(&self) -> uint { self.chars().len() }
2502 fn slice(&self, begin: uint, end: uint) -> &'a str {
2503 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2504 unsafe { raw::slice_bytes(*self, begin, end) }
2508 fn slice_from(&self, begin: uint) -> &'a str {
2509 self.slice(begin, self.len())
2513 fn slice_to(&self, end: uint) -> &'a str {
2514 assert!(self.is_char_boundary(end));
2515 unsafe { raw::slice_bytes(*self, 0, end) }
// Translates the code point indices `begin` and `end` into byte offsets and
// slices between them. The walk always starts at the beginning of the
// string and stops once the `end`-th code point has been located.
2518 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2519 assert!(begin <= end);
// Byte offsets of the two code points; `None` until they are found.
2521 let mut begin_byte = None;
2522 let mut end_byte = None;
2524 // This could be even more efficient by not decoding,
2525 // only finding the char boundaries
2526 for (idx, _) in self.char_indices() {
// `count` is presumably the number of code points visited so far (its
// declaration and increment are not visible here) — TODO confirm.
2527 if count == begin { begin_byte = Some(idx); }
2528 if count == end { end_byte = Some(idx); break; }
// An index equal to the final `count` was never seen by the loop; it maps
// to the byte length of the string.
2531 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2532 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2534 match (begin_byte, end_byte) {
2535 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2536 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
// Both offsets came from `char_indices` or are `self.len()`.
2537 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2542 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2543 let n = needle.len();
2544 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2548 fn ends_with(&self, needle: &str) -> bool {
2549 let (m, n) = (self.len(), needle.len());
2550 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2553 fn escape_default(&self) -> ~str {
2554 let mut out = with_capacity(self.len());
2555 for c in self.chars() {
2556 c.escape_default(|c| out.push_char(c));
2561 fn escape_unicode(&self) -> ~str {
2562 let mut out = with_capacity(self.len());
2563 for c in self.chars() {
2564 c.escape_unicode(|c| out.push_char(c));
2570 fn trim(&self) -> &'a str {
2571 self.trim_left().trim_right()
2575 fn trim_left(&self) -> &'a str {
2576 self.trim_left_chars(&char::is_whitespace)
2580 fn trim_right(&self) -> &'a str {
2581 self.trim_right_chars(&char::is_whitespace)
2585 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2586 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2590 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2591 match self.find(|c: char| !to_trim.matches(c)) {
2593 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2598 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2599 match self.rfind(|c: char| !to_trim.matches(c)) {
2602 let next = self.char_range_at(last).next;
2603 unsafe { raw::slice_bytes(*self, 0u, next) }
2608 fn replace(&self, from: &str, to: &str) -> ~str {
2609 let mut result = ~"";
2610 let mut last_end = 0;
2611 for (start, end) in self.match_indices(from) {
2612 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2613 result.push_str(to);
2616 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2621 fn to_owned(&self) -> ~str {
2622 let len = self.len();
2624 let mut v = vec::with_capacity(len);
2626 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2628 ::cast::transmute(v)
2632 fn to_utf16(&self) -> ~[u16] {
2634 for ch in self.chars() {
2635 // Arithmetic with u32 literals is easier on the eyes than chars.
2636 let mut ch = ch as u32;
2638 if (ch & 0xFFFF_u32) == ch {
2639 // The BMP falls through (assuming non-surrogate, as it
2641 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
2644 // Supplementary planes break into surrogates.
2645 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
2647 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
2648 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2649 u.push_all([w1, w2])
// Reports whether byte offset `index` is the start of a code point or the
// end of the string.
2656 fn is_char_boundary(&self, index: uint) -> bool {
// One past the last byte counts as a boundary.
2657 if index == self.len() { return true; }
2658 let b = self[index];
// Bytes below 128 are ASCII and bytes of 192 and above are UTF-8 lead
// bytes; what remains (128..191, bit pattern 10xxxxxx) are continuation
// bytes, which sit inside a code point.
2659 return b < 128u8 || b >= 192u8;
// Decodes the code point that starts at byte offset `i` and returns it
// together with the offset of the byte that follows it.
2663 fn char_range_at(&self, i: uint) -> CharRange {
// Fast path: a byte below 128 is a complete one-byte (ASCII) code point.
2664 if self[i] < 128u8 {
2665 return CharRange {ch: self[i] as char, next: i + 1 };
2668 // Multibyte case is a fn to allow char_range_at to inline cleanly
2669 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2670 let mut val = s[i] as u32;
// Encoded width of the sequence, looked up by its first byte.
2671 let w = UTF8_CHAR_WIDTH[val] as uint;
// Start from the first byte, then fold in one continuation byte per
// remaining position of the `w`-byte sequence.
2674 val = utf8_first_byte!(val, w);
2675 val = utf8_acc_cont_byte!(val, s[i + 1]);
2676 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2677 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
// The accumulated value is reinterpreted as a `char` without a validity
// check; the next code point starts `w` bytes further on.
2679 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2682 return multibyte_char_range_at(*self, i);
2686 fn char_at(&self, i: uint) -> char { self.char_range_at(i).ch }
// Decodes the code point that ends just before byte offset `start`. In the
// returned range, `next` is the offset at which that code point begins.
2689 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2690 let mut prev = start;
// Step back one byte; the saturating subtraction keeps a `start` of 0 at 0.
2692 prev = prev.saturating_sub(1);
// Fast path: a byte below 128 is a complete one-byte (ASCII) code point.
2693 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2695 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2696 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2697 // while there is a previous byte == 10......
// The loop body is not visible here; presumably it moves `i` back by one
// until the first byte of the sequence is reached — TODO confirm.
2698 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2702 let mut val = s[i] as u32;
// Encoded width of the sequence, looked up by its first byte.
2703 let w = UTF8_CHAR_WIDTH[val] as uint;
// Decode forwards from the first byte, one continuation byte at a time.
2706 val = utf8_first_byte!(val, w);
2707 val = utf8_acc_cont_byte!(val, s[i + 1]);
2708 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2709 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
// `next` is the start of the decoded code point, so a caller can keep
// walking backwards from there.
2711 return CharRange {ch: unsafe { transmute(val) }, next: i};
2714 return multibyte_char_range_at_reverse(*self, prev);
2718 fn char_at(&self, i: uint) -> char {
2719 self.char_range_at(i).ch
2723 fn char_at_reverse(&self, i: uint) -> char {
2724 self.char_range_at_reverse(i).ch
2728 fn as_bytes(&self) -> &'a [u8] {
2729 unsafe { cast::transmute(*self) }
2732 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
2733 if search.only_ascii() {
2734 self.bytes().position(|b| search.matches(b as char))
2736 for (index, c) in self.char_indices() {
2737 if search.matches(c) { return Some(index); }
2743 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
2744 if search.only_ascii() {
2745 self.bytes().rposition(|b| search.matches(b as char))
2747 for (index, c) in self.char_indices_rev() {
2748 if search.matches(c) { return Some(index); }
2754 fn find_str(&self, needle: &str) -> Option<uint> {
2755 if needle.is_empty() {
2758 self.match_indices(needle)
2760 .map(|(start, _end)| start)
2764 fn repeat(&self, nn: uint) -> ~str {
2765 let mut ret = with_capacity(nn * self.len());
2766 for _ in range(0, nn) {
2767 ret.push_str(*self);
2773 fn slice_shift_char(&self) -> (char, &'a str) {
2774 let CharRange {ch, next} = self.char_range_at(0u);
2775 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2776 return (ch, next_s);
// Levenshtein distance between `self` and `t`, computed with a single
// column (`dcol`) that is updated in place.
// NOTE(review): `slen` is a byte length, while the loops below step through
// chars. For non-ASCII input the two counts differ. `tlen` is defined on a
// line not visible here (presumably `t.len()`), so confirm that the table
// size and the early returns are meant to use byte lengths.
2779 fn lev_distance(&self, t: &str) -> uint {
2780 let slen = self.len();
// If either side is empty, the distance is the length of the other side.
2783 if slen == 0 { return tlen; }
2784 if tlen == 0 { return slen; }
// Initial column: entry `x` starts out holding `x`.
2786 let mut dcol = vec::from_fn(tlen + 1, |x| x);
2788 for (i, sc) in self.chars().enumerate() {
// `current` starts as the old value of `dcol[0]` (the diagonal neighbour
// of the first cell); `dcol[0]` itself then advances to `i + 1`.
2790 let mut current = i;
2791 dcol[0] = current + 1;
2793 for (j, tc) in t.chars().enumerate() {
// Remember the old value of this cell before it is overwritten.
2795 let next = dcol[j + 1];
// Diagonal value carried over unchanged (presumably the branch taken when
// `sc == tc`; the condition itself is not visible here).
2798 dcol[j + 1] = current;
// Otherwise: one more than the smallest of the diagonal value, the old
// value of this cell, and the already-updated neighbour `dcol[j]`.
2800 dcol[j + 1] = ::cmp::min(current, next);
2801 dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
// Locates `inner` within `self` by comparing raw addresses.
2811 fn subslice_offset(&self, inner: &str) -> uint {
// [a_start, a_end) is the address range of `self`, [b_start, b_end) that of `inner`.
2812 let a_start = self.as_ptr() as uint;
2813 let a_end = a_start + self.len();
2814 let b_start = inner.as_ptr() as uint;
2815 let b_end = b_start + inner.len();
// Fail unless `inner` lies entirely inside the memory occupied by `self`.
2817 assert!(a_start <= b_start);
2818 assert!(b_end <= a_end);
// Returns a raw `*u8` pointer for this string; `subslice_offset` casts it to
// `uint` to obtain the slice's start address.
2823 fn as_ptr(&self) -> *u8 {
2828 /// Methods for owned strings (`~str`)
2829 pub trait OwnedStr {
2830 /// Appends a string slice to the back of a string, without overallocating.
2831 fn push_str_no_overallocate(&mut self, rhs: &str);
2833 /// Appends a string slice to the back of a string
2834 fn push_str(&mut self, rhs: &str);
2836 /// Appends a character to the back of a string
2837 fn push_char(&mut self, c: char);
2839 /// Remove the final character from a string and return it
2843 /// If the string does not contain any characters
2844 fn pop_char(&mut self) -> char;
2846 /// Remove the first character from a string and return it
2850 /// If the string does not contain any characters
2851 fn shift_char(&mut self) -> char;
2853 /// Prepend a char to a string
2854 fn unshift_char(&mut self, ch: char);
2856 /// Insert a new sub-string at the given position in a string, in O(n + m) time
2857 /// (with n and m the lengths of the string and the substring.)
2858 /// This fails if `position` is not at a character boundary.
2859 fn insert(&mut self, position: uint, substring: &str);
2861 /// Insert a char at the given position in a string, in O(n + m) time
2862 /// (with n the length of the string and m the encoded length of the char.)
2863 /// This fails if `position` is not at a character boundary.
2864 fn insert_char(&mut self, position: uint, ch: char);
2866 /// Concatenate two strings together: consumes `self` and returns it with `rhs` appended.
2867 fn append(self, rhs: &str) -> ~str;
2869 /// Reserves capacity for exactly `n` bytes in the given string.
2871 /// Assuming single-byte characters, the resulting string will be large
2872 /// enough to hold a string of length `n`.
2874 /// If the capacity for `s` is already equal to or greater than the requested
2875 /// capacity, then no action is taken.
2880 /// * n - The number of bytes to reserve space for
2881 fn reserve_exact(&mut self, n: uint);
2883 /// Reserves capacity for at least `n` bytes in the given string.
2885 /// Assuming single-byte characters, the resulting string will be large
2886 /// enough to hold a string of length `n`.
2888 /// This function will over-allocate in order to amortize the allocation costs
2889 /// in scenarios where the caller may need to repeatedly reserve additional
2892 /// If the capacity for `s` is already equal to or greater than the requested
2893 /// capacity, then no action is taken.
2898 /// * n - The number of bytes to reserve space for
2899 fn reserve(&mut self, n: uint);
2901 /// Returns the number of single-byte characters the string can hold without
2903 fn capacity(&self) -> uint;
2905 /// Shorten a string to the specified length (which must be <= the current length
/// and must fall on a character boundary)
2906 fn truncate(&mut self, len: uint);
2908 /// Consumes the string, returning the underlying byte buffer.
2910 /// The buffer does not have a null terminator.
2911 fn into_bytes(self) -> ~[u8];
2913 /// Sets the length of a string
2915 /// This will explicitly set the size of the string, without actually
2916 /// modifying its buffers, so it is up to the caller to ensure that
2917 /// the string is actually the specified size.
2918 unsafe fn set_len(&mut self, new_len: uint);
// Implementation of `OwnedStr` for `~str`. Most methods operate on the
// string's underlying byte vector, reached through `raw::as_owned_vec` or a
// transmute.
2921 impl OwnedStr for ~str {
// Reserves exactly `self.len() + rhs.len()` bytes before appending.
2923 fn push_str_no_overallocate(&mut self, rhs: &str) {
2924 let new_cap = self.len() + rhs.len();
2925 self.reserve_exact(new_cap);
// Appends the bytes of `rhs` through `raw::push_bytes`.
2930 fn push_str(&mut self, rhs: &str) {
2932 raw::push_bytes(self, rhs.as_bytes());
// Encodes `c` directly into the spare room at the end of the buffer.
2937 fn push_char(&mut self, c: char) {
2938 let cur_len = self.len();
2939 // may use up to 4 bytes.
2941 let v = raw::as_owned_vec(self);
2942 v.reserve_additional(4);
2944 // Attempt to not use an intermediate buffer by just pushing bytes
2945 // directly onto this string.
// `write_ptr` addresses the first byte past the current contents; the 4-byte
// window over it is what `encode_utf8` writes into.
2946 let write_ptr = v.as_mut_ptr().offset(cur_len as int);
2947 let used = vec::raw::mut_buf_as_slice(write_ptr, 4, |slc| c.encode_utf8(slc));
// Only the `used` bytes reported by the encoder become part of the string.
2949 v.set_len(cur_len + used);
// Looks up the character that ends at `end` and shrinks the string to the
// offset where that character starts.
2954 fn pop_char(&mut self) -> char {
2955 let end = self.len();
2957 let CharRange {ch, next} = self.char_range_at_reverse(end);
2958 unsafe { self.set_len(next); }
// Reads the first character, then replaces the string with a newly allocated
// copy of everything from `next` onwards.
2963 fn shift_char(&mut self) -> char {
2964 let CharRange {ch, next} = self.char_range_at(0u);
2965 *self = self.slice(next, self.len()).to_owned();
// Builds a new string: `ch` first, followed by the current contents.
2970 fn unshift_char(&mut self, ch: char) {
2971 // This could be more efficient.
2972 let mut new_str = ~"";
2973 new_str.push_char(ch);
2974 new_str.push_str(*self);
// Builds a new string: prefix up to `position`, then `substring`, then the
// rest of the original from `position`.
2979 fn insert(&mut self, position: uint, substring: &str) {
2980 // This could be more efficient.
2981 let mut new_str = self.slice_to(position).to_owned();
2982 new_str.push_str(substring);
2983 new_str.push_str(self.slice_from(position));
// Same construction as `insert`, with a single char in the middle.
2988 fn insert_char(&mut self, position: uint, ch: char) {
2989 // This could be more efficient.
2990 let mut new_str = self.slice_to(position).to_owned();
2991 new_str.push_char(ch);
2992 new_str.push_str(self.slice_from(position));
// Takes ownership of `self` and appends `rhs` using the exact-reservation path.
2997 fn append(self, rhs: &str) -> ~str {
2998 let mut new_str = self;
2999 new_str.push_str_no_overallocate(rhs);
// Forwards to `reserve_exact` on the underlying byte vector.
3004 fn reserve_exact(&mut self, n: uint) {
3006 raw::as_owned_vec(self).reserve_exact(n)
// Forwards to `reserve` on the underlying byte vector.
3011 fn reserve(&mut self, n: uint) {
3013 raw::as_owned_vec(self).reserve(n)
// Reinterprets `&~str` as `&~[u8]` in order to inspect the byte vector.
3018 fn capacity(&self) -> uint {
3020 let buf: &~[u8] = cast::transmute(self);
// Fails unless `len` is within the current length and lies on a character
// boundary; after that only the recorded length changes.
3026 fn truncate(&mut self, len: uint) {
3027 assert!(len <= self.len());
3028 assert!(self.is_char_boundary(len));
3029 unsafe { self.set_len(len); }
// Reinterprets the owned string as its byte vector by transmute.
3033 fn into_bytes(self) -> ~[u8] {
3034 unsafe { cast::transmute(self) }
// Forwards to `set_len` on the underlying byte vector; the caller is
// responsible for `new_len` being valid.
3038 unsafe fn set_len(&mut self, new_len: uint) {
3039 raw::as_owned_vec(self).set_len(new_len)
// `Clone` for owned strings: `clone` produces another `~str`.
3043 impl Clone for ~str {
3045 fn clone(&self) -> ~str {
// `DeepClone` for owned strings: `deep_clone` produces another `~str`.
3050 impl DeepClone for ~str {
3052 fn deep_clone(&self) -> ~str {
// Builds an owned string from an iterator of chars.
3057 impl FromIterator<char> for ~str {
3059 fn from_iterator<T: Iterator<char>>(iterator: &mut T) -> ~str {
// Pre-size the buffer from the iterator's lower size bound, then let `extend`
// append the characters. The bound counts chars while capacity is in bytes,
// so for non-ASCII input this is only a starting size.
3060 let (lower, _) = iterator.size_hint();
3061 let mut buf = with_capacity(lower);
3062 buf.extend(iterator);
// Appends every char produced by an iterator to an owned string.
3067 impl Extendable<char> for ~str {
3069 fn extend<T: Iterator<char>>(&mut self, iterator: &mut T) {
// Reserve room for the current contents plus the iterator's lower size bound
// before walking the iterator.
3070 let (lower, _) = iterator.size_hint();
3071 let reserve = lower + self.len();
3072 self.reserve(reserve);
3073 for ch in *iterator {
// The default borrowed string is the empty string literal.
3079 // This works because every lifetime is a sub-lifetime of 'static
3080 impl<'a> Default for &'a str {
3081 fn default() -> &'a str { "" }
// The default owned string is an empty `~str`.
3084 impl Default for ~str {
3085 fn default() -> ~str { ~"" }
3090 use iter::AdditiveIterator;
3096 assert!((eq(&~"", &~"")));
3097 assert!((eq(&~"foo", &~"foo")));
3098 assert!((!eq(&~"foo", &~"bar")));
// `eq_slice` compares by content: equal substrings cut from different strings
// match, and strings that differ in their last byte do not.
3102 fn test_eq_slice() {
3103 assert!((eq_slice("foobar".slice(0, 3), "foo")));
3104 assert!((eq_slice("barfoo".slice(3, 6), "foo")));
3105 assert!((!eq_slice("foo1", "foo2")));
3111 assert!("" <= "foo");
3112 assert!("foo" <= "foo");
3113 assert!("foo" != "bar");
3118 assert_eq!("".len(), 0u);
3119 assert_eq!("hello world".len(), 11u);
3120 assert_eq!("\x63".len(), 1u);
3121 assert_eq!("\xa2".len(), 2u);
3122 assert_eq!("\u03c0".len(), 2u);
3123 assert_eq!("\u2620".len(), 3u);
3124 assert_eq!("\U0001d11e".len(), 4u);
3126 assert_eq!("".char_len(), 0u);
3127 assert_eq!("hello world".char_len(), 11u);
3128 assert_eq!("\x63".char_len(), 1u);
3129 assert_eq!("\xa2".char_len(), 1u);
3130 assert_eq!("\u03c0".char_len(), 1u);
3131 assert_eq!("\u2620".char_len(), 1u);
3132 assert_eq!("\U0001d11e".char_len(), 1u);
3133 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
3138 assert_eq!("hello".find('l'), Some(2u));
3139 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
3140 assert!("hello".find('x').is_none());
3141 assert!("hello".find(|c:char| c == 'x').is_none());
3142 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
3143 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
3148 assert_eq!("hello".rfind('l'), Some(3u));
3149 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
3150 assert!("hello".rfind('x').is_none());
3151 assert!("hello".rfind(|c:char| c == 'x').is_none());
3152 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
3153 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
// Checks the string's contents after successive `push_str` calls, including
// one that appends multi-byte text.
3157 fn test_push_str() {
3160 assert_eq!(s.slice_from(0), "");
3162 assert_eq!(s.slice_from(0), "abc");
3163 s.push_str("ประเทศไทย中华Việt Nam");
3164 assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
3171 assert_eq!(s.slice_from(0), "");
3172 s = s.append("abc");
3173 assert_eq!(s.slice_from(0), "abc");
3174 s = s.append("ประเทศไทย中华Việt Nam");
3175 assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
// `pop_char` removes and returns the final multi-byte character and leaves
// the preceding text intact.
3179 fn test_pop_char() {
3180 let mut data = ~"ประเทศไทย中华";
3181 let cc = data.pop_char();
3182 assert_eq!(~"ประเทศไทย中", data);
3183 assert_eq!('华', cc);
// Popping the only character of a string leaves it empty.
3187 fn test_pop_char_2() {
3188 let mut data2 = ~"华";
3189 let cc2 = data2.pop_char();
3190 assert_eq!(~"", data2);
3191 assert_eq!('华', cc2);
// Calls `pop_char` and ignores the result.
// NOTE(review): judging by the name, the call itself is expected to fail --
// confirm against how `data` is set up and how the test is annotated.
3196 fn test_pop_char_fail() {
3198 let _cc3 = data.pop_char();
// Pushes characters of every encoded width (see the per-line notes) and checks
// the resulting string once at the end.
3202 fn test_push_char() {
3203 let mut data = ~"ประเทศไทย中";
3204 data.push_char('华');
3205 data.push_char('b'); // 1 byte
3206 data.push_char('¢'); // 2 byte
3207 data.push_char('€'); // 3 byte
3208 data.push_char('𤭢'); // 4 byte
3209 assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
// `shift_char` removes and returns the first character.
3213 fn test_shift_char() {
3214 let mut data = ~"ประเทศไทย中";
3215 let cc = data.shift_char();
3216 assert_eq!(~"ระเทศไทย中", data);
3217 assert_eq!('ป', cc);
// `unshift_char` places the new character in front of the existing text.
3221 fn test_unshift_char() {
3222 let mut data = ~"ประเทศไทย中";
3223 data.unshift_char('华');
3224 assert_eq!(~"华ประเทศไทย中", data);
// Inserts at byte offset 15; in the expected result the new character follows
// the first five characters of the original.
3228 fn test_insert_char() {
3229 let mut data = ~"ประเทศไทย中";
3230 data.insert_char(15, '华');
3231 assert_eq!(~"ประเท华ศไทย中", data);
3236 let mut data = ~"ประเทศไทย中";
3237 data.insert(15, "华中");
3238 assert_eq!(~"ประเท华中ศไทย中", data);
3244 let s: ~str = empty.chars().collect();
3245 assert_eq!(empty, s);
3246 let data = ~"ประเทศไทย中";
3247 let s: ~str = data.chars().collect();
3248 assert_eq!(data, s);
3253 let data = ~"ประเทศไทย中";
3254 let mut cpy = data.clone();
3256 let mut it = other.chars();
3257 cpy.extend(&mut it);
3258 assert_eq!(cpy, data + other);
3263 let mut empty = ~"";
3265 assert_eq!("", empty.as_slice());
3266 let mut data = ~"ประเทศไทย中";
3268 assert_eq!("", data.as_slice());
3269 data.push_char('华');
3270 assert_eq!("华", data.as_slice());
// `into_bytes` consumes the string; the resulting buffer is compared with the
// byte literal for "asdf".
3274 fn test_into_bytes() {
3276 let buf = data.into_bytes();
3277 assert_eq!(bytes!("asdf"), buf.as_slice());
// `find_str` reports byte offsets relative to the slice it is called on; the
// `N - M` expected values spell that out for slices that start at offset M.
3281 fn test_find_str() {
3283 assert_eq!("".find_str(""), Some(0u));
3284 assert!("banana".find_str("apple pie").is_none());
3286 let data = "abcabc";
3287 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
3288 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
3289 assert!(data.slice(2u, 4u).find_str("ab").is_none());
3291 let mut data = ~"ประเทศไทย中华Việt Nam";
3293 assert!(data.find_str("ไท华").is_none());
3294 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
3295 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
// Searches within the byte range 0..43.
3297 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
3298 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
3299 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
3300 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
3301 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
// The same needles within the byte range 43..86.
3303 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
3304 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
3305 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
3306 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
3307 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
// `slice_chars` takes character positions rather than byte offsets.
3311 fn test_slice_chars() {
// Helper: the `b.char_len()` characters of `a` beginning at character `start`
// must equal `b`.
3312 fn t(a: &str, b: &str, start: uint) {
3313 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
3316 t("hello", "llo", 2);
3317 t("hello", "el", 1);
3320 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
3325 fn t(v: &[~str], s: &str) {
3326 assert_eq!(v.concat(), s.to_str());
3328 t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
3329 let v: &[~str] = [];
3336 fn t(v: &[~str], sep: &str, s: &str) {
3337 assert_eq!(v.connect(sep), s.to_str());
3339 t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
3340 " ", "you know I'm no good");
3341 let v: &[~str] = [];
3343 t([~"hi"], " ", "hi");
// `concat` on a vector of `&str` joins the pieces with nothing in between.
3347 fn test_concat_slices() {
3348 fn t(v: &[&str], s: &str) {
3349 assert_eq!(v.concat(), s.to_str());
3351 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
3352 let v: &[&str] = [];
// `connect` on a vector of `&str` joins the pieces with `sep` between them; a
// single-element vector yields that element unchanged.
3358 fn test_connect_slices() {
3359 fn t(v: &[&str], sep: &str, s: &str) {
3360 assert_eq!(v.connect(sep), s.to_str());
3362 t(["you", "know", "I'm", "no", "good"],
3363 " ", "you know I'm no good");
3365 t(["hi"], " ", "hi");
3370 assert_eq!("x".repeat(4), ~"xxxx");
3371 assert_eq!("hi".repeat(4), ~"hihihihi");
3372 assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
3373 assert_eq!("".repeat(4), ~"");
3374 assert_eq!("hi".repeat(0), ~"");
// `raw::slice_bytes` slices by byte range; checked on a short literal and on a
// large generated string.
3378 fn test_unsafe_slice() {
3379 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
3380 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
3381 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
// 100000 pushes of ten 'a's.
3382 fn a_million_letter_a() -> ~str {
3385 while i < 100000 { rs.push_str("aaaaaaaaaa"); i += 1; }
// 100000 pushes of five 'a's.
3388 fn half_a_million_letter_a() -> ~str {
3391 while i < 100000 { rs.push_str("aaaaa"); i += 1; }
// The first 500000 bytes of the long string must equal the short one.
3394 let letters = a_million_letter_a();
3395 assert!(half_a_million_letter_a() ==
3396 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
// Covers the empty prefix, a prefix longer than the string, and a prefix that
// contains a non-ASCII character.
3400 fn test_starts_with() {
3401 assert!(("".starts_with("")));
3402 assert!(("abc".starts_with("")));
3403 assert!(("abc".starts_with("a")));
3404 assert!((!"a".starts_with("abc")));
3405 assert!((!"".starts_with("abc")));
3406 assert!((!"ödd".starts_with("-")));
3407 assert!(("ödd".starts_with("öd")));
// Covers the empty suffix, a suffix longer than the string, and a suffix that
// contains a non-ASCII character.
3411 fn test_ends_with() {
3412 assert!(("".ends_with("")));
3413 assert!(("abc".ends_with("")));
3414 assert!(("abc".ends_with("c")));
3415 assert!((!"a".ends_with("abc")));
3416 assert!((!"".ends_with("abc")));
3417 assert!((!"ddö".ends_with("-")));
3418 assert!(("ddö".ends_with("dö")));
// Only the empty string reports `is_empty`.
3422 fn test_is_empty() {
3423 assert!("".is_empty());
3424 assert!(!"a".is_empty());
3430 assert_eq!("".replace(a, "b"), ~"");
3431 assert_eq!("a".replace(a, "b"), ~"b");
3432 assert_eq!("ab".replace(a, "b"), ~"bb");
3434 assert!(" test test ".replace(test, "toast") ==
3436 assert_eq!(" test test ".replace(test, ""), ~" ");
// Replacement at the start of a multi-byte string: the expected value `A` is
// `repl` followed by the tail of `data`.
3440 fn test_replace_2a() {
3441 let data = ~"ประเทศไทย中华";
3442 let repl = ~"دولة الكويت";
3445 let A = ~"دولة الكويتทศไทย中华";
3446 assert_eq!(data.replace(a, repl), A);
// Replacement in the middle of a multi-byte string: the expected value `B`
// keeps the first two characters and the tail of `data` around `repl`.
3450 fn test_replace_2b() {
3451 let data = ~"ประเทศไทย中华";
3452 let repl = ~"دولة الكويت";
3455 let B = ~"ปรدولة الكويتทศไทย中华";
3456 assert_eq!(data.replace(b, repl), B);
// Replacement at the end of a multi-byte string: the expected value `C` is the
// head of `data` followed by `repl`.
3460 fn test_replace_2c() {
3461 let data = ~"ประเทศไทย中华";
3462 let repl = ~"دولة الكويت";
3465 let C = ~"ประเทศไทยدولة الكويت";
3466 assert_eq!(data.replace(c, repl), C);
// A pattern that does not occur: `replace` must return the input unchanged.
3470 fn test_replace_2d() {
3471 let data = ~"ประเทศไทย中华";
3472 let repl = ~"دولة الكويت";
3475 assert_eq!(data.replace(d, repl), data);
3480 assert_eq!("ab", "abc".slice(0, 2));
3481 assert_eq!("bc", "abc".slice(1, 3));
3482 assert_eq!("", "abc".slice(1, 1));
3483 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3485 let data = "ประเทศไทย中华";
3486 assert_eq!("ป", data.slice(0, 3));
3487 assert_eq!("ร", data.slice(3, 6));
3488 assert_eq!("", data.slice(3, 3));
3489 assert_eq!("华", data.slice(30, 33));
3491 fn a_million_letter_X() -> ~str {
3495 push_str(&mut rs, "华华华华华华华华华华");
3500 fn half_a_million_letter_X() -> ~str {
3503 while i < 100000 { push_str(&mut rs, "华华华华华"); i += 1; }
3506 let letters = a_million_letter_X();
3507 assert!(half_a_million_letter_X() ==
3508 letters.slice(0u, 3u * 500000u).to_owned());
3513 let ss = "中华Việt Nam";
3515 assert_eq!("华", ss.slice(3u, 6u));
3516 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3518 assert_eq!("ab", "abc".slice(0u, 2u));
3519 assert_eq!("bc", "abc".slice(1u, 3u));
3520 assert_eq!("", "abc".slice(1u, 1u));
3522 assert_eq!("中", ss.slice(0u, 3u));
3523 assert_eq!("华V", ss.slice(3u, 7u));
3524 assert_eq!("", ss.slice(3u, 3u));
// Slices up to byte offset 2 and ignores the result.
// NOTE(review): presumably expected to fail because offset 2 falls inside the
// first multi-byte character -- confirm how the test is annotated.
3539 fn test_slice_fail() {
3540 "中华Việt Nam".slice(0u, 2u);
// `slice_from(n)` keeps everything from byte `n`; `n == len` gives "".
3544 fn test_slice_from() {
3545 assert_eq!("abcd".slice_from(0), "abcd");
3546 assert_eq!("abcd".slice_from(2), "cd");
3547 assert_eq!("abcd".slice_from(4), "");
// `slice_to(n)` keeps everything before byte `n`; `n == 0` gives "".
3550 fn test_slice_to() {
3551 assert_eq!("abcd".slice_to(0), "");
3552 assert_eq!("abcd".slice_to(2), "ab");
3553 assert_eq!("abcd".slice_to(4), "abcd");
// `trim_left_chars` strips matching characters from the front only. Matchers
// used: an empty char slice, a char slice, a single char and a closure.
3557 fn test_trim_left_chars() {
3558 let v: &[char] = &[];
3559 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3560 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3561 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3562 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3564 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3565 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3566 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
// `trim_right_chars` strips matching characters from the back only. Matchers
// used: an empty char slice, a char slice, a single char and a closure.
3570 fn test_trim_right_chars() {
3571 let v: &[char] = &[];
3572 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3573 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3574 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3575 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
3577 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3578 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3579 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
// `trim_chars` strips matching characters from both ends and leaves interior
// matches (the '1' inside "foo1bar") alone.
3583 fn test_trim_chars() {
3584 let v: &[char] = &[];
3585 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3586 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3587 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3588 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
3590 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3591 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3592 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
// `trim_left` removes leading whitespace (including U+3000) and keeps trailing
// whitespace.
3596 fn test_trim_left() {
3597 assert_eq!("".trim_left(), "");
3598 assert_eq!("a".trim_left(), "a");
3599 assert_eq!(" ".trim_left(), "");
3600 assert_eq!(" blah".trim_left(), "blah");
3601 assert_eq!(" \u3000 wut".trim_left(), "wut");
3602 assert_eq!("hey ".trim_left(), "hey ");
// `trim_right` removes trailing whitespace (including U+3000) and keeps
// leading whitespace.
3606 fn test_trim_right() {
3607 assert_eq!("".trim_right(), "");
3608 assert_eq!("a".trim_right(), "a");
3609 assert_eq!(" ".trim_right(), "");
3610 assert_eq!("blah ".trim_right(), "blah");
3611 assert_eq!("wut \u3000 ".trim_right(), "wut");
3612 assert_eq!(" hey".trim_right(), " hey");
3617 assert_eq!("".trim(), "");
3618 assert_eq!("a".trim(), "a");
3619 assert_eq!(" ".trim(), "");
3620 assert_eq!(" blah ".trim(), "blah");
3621 assert_eq!("\nwut \u3000 ".trim(), "wut");
3622 assert_eq!(" hey dude ".trim(), "hey dude");
// `is_whitespace` is true for the empty string and for strings made only of
// whitespace, and false as soon as one other character appears.
3626 fn test_is_whitespace() {
3627 assert!("".is_whitespace());
3628 assert!(" ".is_whitespace());
3629 assert!("\u2009".is_whitespace()); // Thin space
3630 assert!(" \n\t ".is_whitespace());
3631 assert!(!" _ ".is_whitespace());
// `raw::push_byte` appends a single byte; afterwards the string reads "ABCD".
3635 fn test_push_byte() {
3637 unsafe{raw::push_byte(&mut s, 'D' as u8)};
3638 assert_eq!(s, ~"ABCD");
// `raw::shift_byte` returns a byte (65 is ASCII 'A') and leaves "BC" behind.
3642 fn test_shift_byte() {
3644 let b = unsafe{raw::shift_byte(&mut s)};
3645 assert_eq!(s, ~"BC");
3646 assert_eq!(b, 65u8);
// `raw::pop_byte` returns a byte (67 is ASCII 'C') and leaves "AB" behind.
3650 fn test_pop_byte() {
3652 let b = unsafe{raw::pop_byte(&mut s)};
3653 assert_eq!(s, ~"AB");
3654 assert_eq!(b, 67u8);
3659 // deny overlong encodings
3660 assert!(!is_utf8([0xc0, 0x80]));
3661 assert!(!is_utf8([0xc0, 0xae]));
3662 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3663 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3664 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3665 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3666 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3669 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3670 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3672 assert!(is_utf8([0xC2, 0x80]));
3673 assert!(is_utf8([0xDF, 0xBF]));
3674 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3675 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3676 assert!(is_utf8([0xEE, 0x80, 0x80]));
3677 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3678 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3679 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
// Exercises `is_utf16` with sequences that must be accepted (`pos!`) and
// sequences that must be rejected (`neg!`).
3683 fn test_is_utf16() {
// `pos!` asserts that every listed sequence is accepted.
3684 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3692 // surrogate pairs (randomly generated with Python 3's
3693 // .encode('utf-16be'))
3694 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3695 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3696 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3698 // mixtures (also random)
3699 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3700 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3701 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
// `neg!` asserts that every listed sequence is rejected.
3704 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3707 // surrogate + regular unit
3709 // surrogate + lead surrogate
3711 // unterminated surrogate
3713 // trail surrogate without a lead
3716 // random byte sequences that Python 3's .decode('utf-16be')
3718 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3719 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3720 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3721 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3722 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3723 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3724 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3725 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3726 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3727 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3728 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3729 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3730 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3731 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3732 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3733 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3734 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3735 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3736 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3737 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3738 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
// The buffer holds seven 65s (ASCII 'A') followed by a 0; `raw::from_c_str`
// must yield exactly the seven 'A's.
3742 fn test_raw_from_c_str() {
3744 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3746 let c = raw::from_c_str(b);
3747 assert_eq!(c, ~"AAAAAAA");
// `as_bytes` exposes the string's encoded bytes: empty for "", one byte per
// ASCII char, and the table `v` for the multi-byte sample.
3752 fn test_as_bytes() {
3755 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3756 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3759 assert_eq!("".as_bytes(), &[]);
3760 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3761 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v);
// Regression-style test around `as_bytes` (see the note below).
// NOTE(review): presumably the test is expected to fail after taking the
// byte view -- confirm how it is annotated.
3766 fn test_as_bytes_fail() {
3767 // Don't double free. (I'm not sure if this exercises the
3768 // original problem code path anymore.)
3770 let _bytes = s.as_bytes();
3776 let buf = "hello".as_ptr();
3778 assert_eq!(*buf.offset(0), 'h' as u8);
3779 assert_eq!(*buf.offset(1), 'e' as u8);
3780 assert_eq!(*buf.offset(2), 'l' as u8);
3781 assert_eq!(*buf.offset(3), 'l' as u8);
3782 assert_eq!(*buf.offset(4), 'o' as u8);
// `subslice_offset` returns the byte offset of a sub-slice within its parent.
3787 fn test_subslice_offset() {
3788 let a = "kernelsprite";
3789 let b = a.slice(7, a.len());
3790 let c = a.slice(0, a.len() - 6);
3791 assert_eq!(a.subslice_offset(b), 7);
3792 assert_eq!(a.subslice_offset(c), 0);
// Slices yielded by `lines()` point into the original string, so their
// offsets are where each line starts: 0, 2 and 4.
3794 let string = "a\nb\nc";
3795 let mut lines = ~[];
3796 for line in string.lines() { lines.push(line) }
3797 assert_eq!(string.subslice_offset(lines[0]), 0);
3798 assert_eq!(string.subslice_offset(lines[1]), 2);
3799 assert_eq!(string.subslice_offset(lines[2]), 4);
// Calls `subslice_offset` with two unrelated literals.
// NOTE(review): `subslice_offset` asserts that `inner` lies inside `self`, so
// this call is presumably expected to fail -- confirm how the test is annotated.
3804 fn test_subslice_offset_2() {
3805 let a = "alchemiter";
3806 let b = "cruxtruder";
3807 a.subslice_offset(b);
// Round trip: string -> owned byte vector -> string via `from_utf8`.
3811 fn vec_str_conversions() {
3812 let s1: ~str = ~"All mimsy were the borogoves";
3814 let v: ~[u8] = s1.as_bytes().to_owned();
3815 let s2: ~str = from_utf8(v).unwrap().to_owned();
// Index and lengths for comparing the string with the byte vector.
3816 let mut i: uint = 0u;
3817 let n1: uint = s1.len();
3818 let n2: uint = v.len();
// `contains` for substrings: matches at the start, middle and end, the empty
// needle, and misses; then the same on multi-byte text.
3831 fn test_contains() {
3832 assert!("abcde".contains("bcd"));
3833 assert!("abcde".contains("abcd"));
3834 assert!("abcde".contains("bcde"));
3835 assert!("abcde".contains(""));
3836 assert!("".contains(""));
3837 assert!(!"abcde".contains("def"));
3838 assert!(!"".contains("a"));
3840 let data = ~"ประเทศไทย中华Việt Nam";
3841 assert!(data.contains("ประเ"));
3842 assert!(data.contains("ะเ"));
3843 assert!(data.contains("中华"));
3844 assert!(!data.contains("ไท华"));
// `contains_char` for present and absent characters, including on "".
3848 fn test_contains_char() {
3849 assert!("abc".contains_char('b'));
3850 assert!("a".contains_char('a'));
3851 assert!(!"abc".contains_char('d'));
3852 assert!(!"".contains_char('a'));
3859 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3860 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3861 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3862 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3865 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3866 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3867 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3868 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3869 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3872 (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3873 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3874 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3875 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3876 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3877 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3878 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3879 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3881 (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3882 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3883 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3884 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3885 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3886 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3887 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3888 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3889 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3890 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3891 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3893 // Issue #12318, even-numbered non-BMP planes
3895 ~[0xD840, 0xDC00])];
3897 for p in pairs.iter() {
3898 let (s, u) = (*p).clone();
3899 assert!(is_utf16(u));
3900 assert_eq!(s.to_utf16(), u);
3902 assert_eq!(from_utf16(u).unwrap(), s);
3903 assert_eq!(from_utf16_lossy(u), s);
3905 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3906 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
// `from_utf16` must return `None` for each of these malformed inputs.
3911 fn test_utf16_invalid() {
3912 // completely positive cases tested above.
3914 assert_eq!(from_utf16([0xD800]), None);
3916 assert_eq!(from_utf16([0xD800, 0xD800]), None);
3919 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
3922 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
// Same malformed inputs as the strict decoder test: `from_utf16_lossy` emits
// U+FFFD for each undecodable unit and keeps the decodable parts.
3926 fn test_utf16_lossy() {
3927 // completely positive cases tested above.
3929 assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
3931 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
3934 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
3937 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
// Each case compares the slice returned by `truncate_utf16_at_nul` for an
// input `v` with the expected units.
3941 fn test_truncate_utf16_at_nul() {
3943 assert_eq!(truncate_utf16_at_nul(v), &[]);
3946 assert_eq!(truncate_utf16_at_nul(v), &[]);
3949 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3952 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3955 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3960 let s = ~"ศไทย中华Việt Nam";
3961 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3963 for ch in v.iter() {
3964 assert!(s.char_at(pos) == *ch);
3965 pos += from_char(*ch).len();
// Walks the string backwards with `char_at_reverse`, starting at `s.len()`.
3970 fn test_char_at_reverse() {
3971 let s = ~"ศไทย中华Việt Nam";
3972 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3973 let mut pos = s.len();
3974 for ch in v.rev_iter() {
3975 assert!(s.char_at_reverse(pos) == *ch);
// Step back by the encoded byte length of the character just checked.
3976 pos -= from_char(*ch).len();
// `escape_unicode` escapes every character: `\x` with two hex digits up to
// 0xff, `\u` with four digits up to 0xffff, and `\U` with eight digits beyond.
3981 fn test_escape_unicode() {
3982 assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3983 assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3984 assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3985 assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3986 assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3987 assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3988 assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3989 assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3990 assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
// `escape_default` leaves letters and spaces alone, backslash-escapes
// \r, \n, \t, quotes and the backslash, and writes the non-ASCII samples as
// `\u`/`\U` escapes.
3994 fn test_escape_default() {
3995 assert_eq!("abc".escape_default(), ~"abc");
3996 assert_eq!("a c".escape_default(), ~"a c");
3997 assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3998 assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3999 assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
4000 assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
4001 assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
4002 assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
// Checks the three-way ordering of string slices: a string sorts after its
// own proper prefix, identical strings compare equal, and otherwise the first
// differing byte decides.
fn test_total_ord() {
    // Every comparison is wrapped in `assert!`. As bare `==` expression
    // statements the results were discarded, so the test could never fail.
    assert!("1234".cmp(& &"123") == Greater);
    assert!("123".cmp(& &"1234") == Less);
    assert!("1234".cmp(& &"1234") == Equal);
    assert!("12345555".cmp(& &"123456") == Less);
    assert!("22".cmp(& &"1234") == Greater);
}
// The sample string holds characters of encoded width 1, 2, 3, 4, 4, 3, 2, 1,
// so successive characters start at byte offsets 0, 1, 3, 6, 10, 14, 17, 19.
4015 fn test_char_range_at() {
4016 let data = ~"b¢€𤭢𤭢€¢b";
4017 assert_eq!('b', data.char_range_at(0).ch);
4018 assert_eq!('¢', data.char_range_at(1).ch);
4019 assert_eq!('€', data.char_range_at(3).ch);
4020 assert_eq!('𤭢', data.char_range_at(6).ch);
4021 assert_eq!('𤭢', data.char_range_at(10).ch);
4022 assert_eq!('€', data.char_range_at(14).ch);
4023 assert_eq!('¢', data.char_range_at(17).ch);
4024 assert_eq!('b', data.char_range_at(19).ch);
// Asking for the character before offset 0 must not step below index 0:
// `next` stays 0.
4028 fn test_char_range_at_reverse_underflow() {
4029 assert_eq!("abc".char_range_at_reverse(0).next, 0);
4034 #[allow(unnecessary_allocation)];
4036 ($s1:expr, $s2:expr, $e:expr) => { {
4040 assert_eq!(s1 + s2, e.to_owned());
4041 assert_eq!(s1.to_owned() + s2, e.to_owned());
4045 t!("foo", "bar", "foobar");
4046 t!("foo", ~"bar", "foobar");
4047 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
4048 t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
// `chars()` must yield the characters of `s` in the order listed in `v`, and
// exactly `v.len()` of them.
4052 fn test_iterator() {
4054 let s = ~"ศไทย中华Việt Nam";
4055 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
4058 let mut it = s.chars();
4061 assert_eq!(c, v[pos]);
4064 assert_eq!(pos, v.len());
// `chars_rev()` must yield the characters of `s` back to front; `v` lists them
// in that reversed order.
4068 fn test_rev_iterator() {
4070 let s = ~"ศไทย中华Việt Nam";
4071 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
4074 let mut it = s.chars_rev();
4077 assert_eq!(c, v[pos]);
4080 assert_eq!(pos, v.len());
// A cloned char iterator must produce the same items as the iterator it was
// cloned from.
4084 fn test_iterator_clone() {
4085 let s = "ศไทย中华Việt Nam";
4086 let mut it = s.chars();
4088 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
// `bytes()` must yield the encoded bytes of `s` in order; `v` is the expected
// byte table.
4092 fn test_bytesator() {
4093 let s = ~"ศไทย中华Việt Nam";
4095 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
4096 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
4101 for b in s.bytes() {
4102 assert_eq!(b, v[pos]);
// `bytes_rev()` must yield the encoded bytes of `s` back to front; `pos`
// starts at the end of the expected byte table `v`.
4108 fn test_bytes_revator() {
4109 let s = ~"ศไทย中华Việt Nam";
4111 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
4112 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
4115 let mut pos = v.len();
4117 for b in s.bytes_rev() {
4119 assert_eq!(b, v[pos]);
// `char_indices()` yields (byte offset, char) pairs: `p` holds the expected
// offsets and `v` the expected characters, position by position.
4124 fn test_char_indicesator() {
4126 let s = "ศไทย中华Việt Nam";
4127 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
4128 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
4131 let mut it = s.char_indices();
4134 assert_eq!(c, (p[pos], v[pos]));
4137 assert_eq!(pos, v.len());
4138 assert_eq!(pos, p.len());
// `char_indices_rev()` must yield the same `(offset, char)` pairs as the forward
// iterator, but from the end of the string: `p` and `v` are listed last-to-first.
// Both tables must be fully consumed.
4142 fn test_char_indices_revator() {
4144 let s = "ศไทย中华Việt Nam";
4145 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
4146 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
4149 let mut it = s.char_indices_rev();
4152 assert_eq!(c, (p[pos], v[pos]));
4155 assert_eq!(pos, v.len());
4156 assert_eq!(pos, p.len());
// Exercises `split` and `rsplit` with a char separator and with an equivalent
// closure predicate, first for an ASCII separator (' ') and then for a
// multi-byte one ('ä'). Every variant is checked against the same expected pieces.
4160 fn test_split_char_iterator() {
4161 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4163 let split: ~[&str] = data.split(' ').collect();
4164 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4166 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
4168 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4170 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
4171 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4173 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
4175 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4178 let split: ~[&str] = data.split('ä').collect();
4179 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4181 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
4183 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4185 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
4186 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4188 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
4190 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
// `splitn(sep, 3)` performs at most three splits from the front: the expected
// vectors have four pieces, the last of which still contains separators.
// Checked for a char and a closure, with ASCII (' ') and multi-byte ('ä') separators.
4194 fn test_splitn_char_iterator() {
4195 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4197 let split: ~[&str] = data.splitn(' ', 3).collect();
4198 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
4200 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
4201 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
4204 let split: ~[&str] = data.splitn('ä', 3).collect();
4205 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
4207 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
4208 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
// `rsplitn(sep, 3)` performs at most three splits counted from the end: in the
// expected vectors it is the leading piece that still contains separators.
// Checked for a char and a closure, with ASCII (' ') and multi-byte ('ä') separators.
4212 fn test_rsplitn_char_iterator() {
4213 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4215 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
4217 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
4219 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
4221 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
4224 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
4226 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
4228 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
4230 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
// Contrasts `split` with `split_terminator` on input that ends with the
// separator: `split('\n')` yields a trailing empty piece, `split_terminator('\n')`
// does not. The leading empty piece is kept by both.
4234 fn test_split_char_iterator_no_trailing() {
4235 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4237 let split: ~[&str] = data.split('\n').collect();
4238 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
4240 let split: ~[&str] = data.split_terminator('\n').collect();
4241 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
// Same `split` / `split_terminator` contrast as the forward test, but the
// iterators are driven through `.rev()` before being collected.
4245 fn test_rev_split_char_iterator_no_trailing() {
4246 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4248 let mut split: ~[&str] = data.split('\n').rev().collect();
4250 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
4252 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
4254 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
// `words()` must split on runs of whitespace (newline, space and tab all occur
// in `data`) and yield no empty pieces for leading or trailing whitespace.
4259 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
4260 let words: ~[&str] = data.words().collect();
4261 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
// Checks `nfd_chars()` (canonical decomposition, NFD) on a set of samples:
// plain ASCII is unchanged, precomposed letters are split into base + combining
// mark, compatibility-only characters are left alone, combining marks are put
// into a fixed order, and Hangul syllables are split into jamo.
4265 fn test_nfd_chars() {
4266 assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
// U+1E0B decomposes to `d` + U+0307; U+01C4 is left as is under NFD.
4267 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
// U+2026 (horizontal ellipsis) is unchanged under NFD.
4268 assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
// U+2126 (ohm sign) maps to U+03A9 (Greek capital omega).
4269 assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
// Both spellings below must end up as `d`, U+0323, U+0307 in that order.
4270 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
4271 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
// Already-decomposed input, and a combining mark with no preceding base, pass through.
4272 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
4273 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
// Hangul syllables: one with three jamo, one with two.
4274 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
4275 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
// Checks `nfkd_chars()` (compatibility decomposition, NFKD) on the same samples
// used for NFD. The expectations differ only where a compatibility mapping
// exists: U+01C4 and U+2026 below.
4279 fn test_nfkd_chars() {
4280 assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
// Unlike NFD, U+01C4 is expanded to "DZ" + U+030C.
4281 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
// Unlike NFD, U+2026 (horizontal ellipsis) becomes three ASCII dots.
4282 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
4283 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
// Both spellings below must end up as `d`, U+0323, U+0307 in that order.
4284 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
4285 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
4286 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
4287 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
// Hangul syllables are split into jamo exactly as under NFD.
4288 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
4289 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
// `lines()` must keep empty lines (leading and interior) but must not produce an
// extra empty line for a trailing "\n": both inputs below give the same result.
4294 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
4295 let lines: ~[&str] = data.lines().collect();
4296 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
4298 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
4299 let lines: ~[&str] = data.lines().collect();
4300 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
// Exercises `split_str` (splitting on a string separator). The helper `t`
// collects `s.split_str(sep)` into `v`; its third argument `u` carries the
// pieces each call expects.
4304 fn test_split_strator() {
4305 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
4306 let v: ~[&str] = s.split_str(sep).collect();
// Separator absent: the whole input comes back as a single piece.
4309 t("--1233345--", "12345", ~["--1233345--"]);
// Separator in the middle, at the start, at the end, and at both ends:
// leading/trailing separators produce empty pieces.
4310 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
4311 t("::hello::there", "::", ~["", "hello", "there"]);
4312 t("hello::there::", "::", ~["hello", "there", ""]);
4313 t("::hello::there::", "::", ~["", "hello", "there", ""]);
// Multi-byte separator inside multi-byte text.
4314 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
4315 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
4316 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
4317 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
// Input made only of separator characters: per the expected pieces, matches are
// taken left to right without overlapping, and a leftover "z" is kept.
4319 t("zz", "zz", ~["",""]);
4320 t("ok", "z", ~["ok"]);
4321 t("zzz", "zz", ~["","z"]);
4322 t("zzzzz", "zz", ~["","","z"]);
// The `Default` value of a string type must be the empty string. The generic
// helper `t` builds `S::default()` for any `S: Default + Str` and compares its
// slice with "".
4326 fn test_str_default() {
4327 use default::Default;
4328 fn t<S: Default + Str>() {
4329 let s: S = Default::default();
4330 assert_eq!(s.as_slice(), "");
// String types must be usable through the `Container` trait: `sum_len` adds up
// `len()` over a slice of any `S: Container`, and is called with borrowed
// strings, owned strings, and a slice taken from `s`.
4338 fn test_str_container() {
4339 fn sum_len<S: Container>(v: &[S]) -> uint {
4340 v.iter().map(|x| x.len()).sum()
4344 assert_eq!(5, sum_len(["012", "", "34"]));
4345 assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
4346 assert_eq!(5, sum_len([s.as_slice()]));
// Truncation test for an owned string: the asserts below expect the contents to
// go from "12345" to "123" to "". A second string is then created and its buffer
// pointer is captured in `p_` via `as_ptr()`.
4350 fn test_str_truncate() {
4351 let mut s = ~"12345";
4353 assert_eq!(s.as_slice(), "12345");
4355 assert_eq!(s.as_slice(), "123");
4357 assert_eq!(s.as_slice(), "");
4359 let mut s = ~"12345";
4363 let p_ = s.as_ptr();
// Starts from a 5-byte owned string; per the test name, the case under test is
// truncation with an invalid length.
4369 fn test_str_truncate_invalid_len() {
4370 let mut s = ~"12345";
// Starts from a string holding the single char U+00FC, which takes two bytes in
// UTF-8; per the test name, the case under test is a truncation that would fall
// inside that code point.
4376 fn test_str_truncate_split_codepoint() {
4377 let mut s = ~"\u00FC"; // ü
// `from_utf8` on a byte slice: valid ASCII and valid multi-byte input give
// `Some(&str)`; input containing the byte 0xff gives `None`.
4382 fn test_str_from_utf8() {
4383 let xs = bytes!("hello");
4384 assert_eq!(from_utf8(xs), Some("hello"));
4386 let xs = bytes!("ศไทย中华Việt Nam");
4387 assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
4389 let xs = bytes!("hello", 0xff);
4390 assert_eq!(from_utf8(xs), None);
// `from_utf8_owned` on an owned byte vector: valid ASCII and valid multi-byte
// input give `Some(~str)`; input containing the byte 0xff gives `None`.
4394 fn test_str_from_utf8_owned() {
4395 let xs = bytes!("hello").to_owned();
4396 assert_eq!(from_utf8_owned(xs), Some(~"hello"));
4398 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
4399 assert_eq!(from_utf8_owned(xs), Some(~"ศไทย中华Việt Nam"));
4401 let xs = bytes!("hello", 0xff).to_owned();
4402 assert_eq!(from_utf8_owned(xs), None);
// `from_utf8_lossy` must return the input as a borrowed `Slice` when it is
// already valid UTF-8, and otherwise an `Owned` string in which invalid bytes
// are replaced by U+FFFD. The cases pin down how many replacement characters
// each kind of malformed sequence produces.
4406 fn test_str_from_utf8_lossy() {
// Valid input (ASCII, then multi-byte) is returned without copying.
4407 let xs = bytes!("hello");
4408 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
4410 let xs = bytes!("ศไทย中华Việt Nam");
4411 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// A lead byte with no continuation (0xC2) and a stray 0xFF: one U+FFFD each.
4413 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
4414 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
// 0xC0 0x80 yields two U+FFFD; the truncated sequence 0xE6 0x83 yields one.
4416 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4417 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
// 0xF5 is replaced on its own, and the 0x80 that follows it is replaced separately.
4419 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
4420 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
// Truncated sequences starting with 0xF1 collapse to a single U+FFFD,
// however many continuation bytes were present.
4422 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
4423 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
// After 0xF4, the byte 0x80 is absorbed into one U+FFFD but 0xBF is replaced separately.
4425 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
4426 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
// 0xF0 0x80 0x80 0x80 gives four U+FFFD; 0xF0 0x90 0x80 0x80 decodes to U+10000.
4428 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
4429 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
// Encoded surrogates (U+D800 and U+DFFF): every byte becomes its own U+FFFD.
4432 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4433 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
// `from_str` targeting `~str` must succeed and return an owned copy of the input.
4437 fn test_from_str() {
4438 let owned: Option<~str> = from_str(&"string");
4439 assert_eq!(owned, Some(~"string"));
// Trait coverage for the maybe-owned string type: both the `Slice` and `Owned`
// variants must answer `len`, `as_slice`, `to_str`, formatting, ordering and
// `Default` identically, and must compare as equal to each other.
4443 fn test_maybe_owned_traits() {
// Borrowed variant.
4444 let s = Slice("abcde");
4445 assert_eq!(s.len(), 5);
4446 assert_eq!(s.as_slice(), "abcde");
4447 assert_eq!(s.to_str(), ~"abcde");
4448 assert_eq!(format!("{}", s), ~"abcde");
4449 assert!(s.lt(&Owned(~"bcdef")));
4450 assert_eq!(Slice(""), Default::default());
// Owned variant: same checks.
4452 let o = Owned(~"abcde");
4453 assert_eq!(o.len(), 5);
4454 assert_eq!(o.as_slice(), "abcde");
4455 assert_eq!(o.to_str(), ~"abcde");
4456 assert_eq!(format!("{}", o), ~"abcde");
4457 assert!(o.lt(&Slice("bcdef")));
4458 assert_eq!(Owned(~""), Default::default());
// Cross-variant comparison, in both directions.
4460 assert_eq!(s.cmp(&o), Equal);
4461 assert!(s.equals(&o));
4462 assert!(s.equiv(&o));
4464 assert_eq!(o.cmp(&s), Equal);
4465 assert!(o.equals(&s));
4466 assert!(o.equiv(&s));
// `is_slice` / `is_owned` must report the variant: exactly one of them is true
// for a `Slice`, and exactly the other for an `Owned`.
4470 fn test_maybe_owned_methods() {
4471 let s = Slice("abcde");
4472 assert!(s.is_slice());
4473 assert!(!s.is_owned());
4475 let o = Owned(~"abcde");
4476 assert!(!o.is_slice());
4477 assert!(o.is_owned());
// `clone` and `deep_clone` must preserve the string contents. Every
// source/expected combination of `Slice` and `Owned` is asserted equal, so the
// comparison depends on content only, not on the variant.
4481 fn test_maybe_owned_clone() {
4482 assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4483 assert_eq!(Owned(~"abcde"), Slice("abcde").deep_clone());
4485 assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4486 assert_eq!(Owned(~"abcde"), Owned(~"abcde").deep_clone());
4488 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4489 assert_eq!(Slice("abcde"), Slice("abcde").deep_clone());
4491 assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
4492 assert_eq!(Slice("abcde"), Owned(~"abcde").deep_clone());
// `into_owned` must produce the same `~str` whether it starts from the `Slice`
// or the `Owned` variant.
4496 fn test_maybe_owned_into_owned() {
4497 assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4498 assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
// `into_maybe_owned` on a borrowed and on an owned string: each result is
// compared against both a `Slice` and an `Owned` with the same contents, so
// only the text is being checked, not which variant is produced.
4502 fn test_into_maybe_owned() {
4503 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4504 assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4505 assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4506 assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4513 use self::test::BenchHarness;
// Benchmark: counts the items of `s.chars()` on mixed multi-byte/ASCII text and
// checks the count against `char_len()`, which is computed once outside the
// timed closure.
4518 fn char_iterator(bh: &mut BenchHarness) {
4519 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4520 let len = s.char_len();
4522 bh.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: same measurement as `char_iterator`, but on an ASCII-only,
// multi-line string literal.
4526 fn char_iterator_ascii(bh: &mut BenchHarness) {
4527 let s = "Mary had a little lamb, Little lamb
4528 Mary had a little lamb, Little lamb
4529 Mary had a little lamb, Little lamb
4530 Mary had a little lamb, Little lamb
4531 Mary had a little lamb, Little lamb
4532 Mary had a little lamb, Little lamb";
4533 let len = s.char_len();
4535 bh.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: counts the items of the reverse char iterator `s.chars_rev()` and
// checks the count against `char_len()`.
4539 fn char_iterator_rev(bh: &mut BenchHarness) {
4540 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4541 let len = s.char_len();
4543 bh.iter(|| assert_eq!(s.chars_rev().len(), len));
// Benchmark: counts the items of `s.char_indices()` and checks the count
// against `char_len()`.
4547 fn char_indicesator(bh: &mut BenchHarness) {
4548 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4549 let len = s.char_len();
4551 bh.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: counts the items of `s.char_indices_rev()` and checks the count
// against `char_len()`.
4555 fn char_indicesator_rev(bh: &mut BenchHarness) {
4556 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4557 let len = s.char_len();
4559 bh.iter(|| assert_eq!(s.char_indices_rev().len(), len));
// Benchmark: splits mostly multi-byte text on the ASCII char 'V'. The text
// contains 'V' twice, hence the expected three pieces.
4563 fn split_unicode_ascii(bh: &mut BenchHarness) {
4564 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4566 bh.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: same split as `split_unicode_ascii`, but through the local
// `NotAscii` wrapper whose `CharEq::only_ascii` returns false -- presumably to
// keep `split` off any ASCII-only fast path; confirm against the `split` code.
4570 fn split_unicode_not_ascii(bh: &mut BenchHarness) {
4571 struct NotAscii(char);
4572 impl CharEq for NotAscii {
4573 fn matches(&self, c: char) -> bool {
4574 let NotAscii(cc) = *self;
4577 fn only_ascii(&self) -> bool { false }
4579 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4581 bh.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splits ASCII text on a space char. The expected piece count `len`
// is computed once outside the timed closure.
4586 fn split_ascii(bh: &mut BenchHarness) {
4587 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4588 let len = s.split(' ').len();
4590 bh.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splits ASCII text on a space, but through the local `NotAscii`
// wrapper whose `CharEq::only_ascii` returns false -- presumably to keep `split`
// off any ASCII-only fast path; confirm against the `split` code.
4594 fn split_not_ascii(bh: &mut BenchHarness) {
4595 struct NotAscii(char);
4596 impl CharEq for NotAscii {
4598 fn matches(&self, c: char) -> bool {
4599 let NotAscii(cc) = *self;
4602 fn only_ascii(&self) -> bool { false }
4604 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4605 let len = s.split(' ').len();
4607 bh.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splits ASCII text using the named function `pred` as the
// separator predicate. The piece count must match that of splitting on ' '.
4611 fn split_extern_fn(bh: &mut BenchHarness) {
4612 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4613 let len = s.split(' ').len();
4614 fn pred(c: char) -> bool { c == ' ' }
4616 bh.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splits ASCII text using a closure as the separator predicate.
// The piece count must match that of splitting on ' '.
4620 fn split_closure(bh: &mut BenchHarness) {
4621 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4622 let len = s.split(' ').len();
4624 bh.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splits ASCII text using a one-element char slice (`&[' ']`) as the
// separator. The piece count must match that of splitting on ' '.
4628 fn split_slice(bh: &mut BenchHarness) {
4629 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4630 let len = s.split(' ').len();
4632 bh.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark input: an all-ASCII byte buffer, asserted to be exactly 100 bytes
// long so the benchmark size matches its name.
4636 fn is_utf8_100_ascii(bh: &mut BenchHarness) {
4638 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4639 Lorem ipsum dolor sit amet, consectetur. ");
4641 assert_eq!(100, s.len());
// Benchmark input: a byte buffer of mostly multi-byte text in several scripts,
// asserted to be exactly 100 bytes long so the benchmark size matches its name.
4648 fn is_utf8_100_multibyte(bh: &mut BenchHarness) {
4649 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4650 assert_eq!(100, s.len());
// Benchmark: runs `from_utf8_lossy` on a 100-byte all-ASCII buffer (length
// asserted). The result is discarded.
4657 fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
4658 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4659 Lorem ipsum dolor sit amet, consectetur. ");
4661 assert_eq!(100, s.len());
4663 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` on a 100-byte buffer of mostly multi-byte
// text (length asserted). The result is discarded.
4668 fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
4669 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4670 assert_eq!(100, s.len());
4672 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` on a short buffer that mixes ASCII text with
// malformed sequences (0xC0 0x80 and a truncated 0xE6 0x83). The result is discarded.
4677 fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
4678 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4680 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` on 100 copies of the byte 0xF5, which is
// never valid in UTF-8, so every input byte is invalid. The result is discarded.
4685 fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
4686 let s = ::vec::from_elem(100, 0xF5u8);
4688 let _ = from_utf8_lossy(s);
// Benchmark driven by the harness `bh`.
// NOTE(review): judging by the name it measures string creation via
// `with_capacity` -- confirm against the body.
4693 fn bench_with_capacity(bh: &mut BenchHarness) {
// Benchmark driven by the harness `bh`. The input `s` mixes multi-byte and ASCII text.
// NOTE(review): judging by the name it measures `push_str` -- confirm against the body.
4700 fn bench_push_str(bh: &mut BenchHarness) {
4701 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4709 fn bench_connect(bh: &mut BenchHarness) {
4710 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4712 let v = [s, s, s, s, s, s, s, s, s, s];
4714 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);