1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = ~"I am an owned string";
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor on this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
65 Rust's string type, `str`, is a sequence of Unicode codepoints encoded as a
66 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
67 encoded UTF-8 sequences. Additionally, strings are not null-terminated
68 and can contain null codepoints.
70 The actual representations of strings have direct mappings to vectors:
72 * `~str` is the same as `~[u8]`
73 * `&str` is the same as `&[u8]`
82 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
83 use container::{Container, Mutable};
86 use iter::{Iterator, FromIterator, Extendable, range};
87 use iter::{Filter, AdditiveIterator, Map};
88 use iter::{Rev, DoubleEndedIterator, ExactSize};
91 use option::{None, Option, Some};
93 use from_str::FromStr;
95 use slice::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector};
103 Section: Creating a string
106 /// Consumes a vector of bytes to create a new utf-8 string.
107 /// Returns None if the vector contains invalid UTF-8.
108 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
110 Some(unsafe { raw::from_utf8_owned(vv) })
116 /// Converts a vector to a string slice without performing any allocations.
118 /// Once the slice has been validated as utf-8, it is transmuted in-place and
119 /// returned as a '&str' instead of a '&[u8]'
121 /// Returns None if the slice is not utf-8.
122 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
124 Some(unsafe { raw::from_utf8(v) })
128 impl FromStr for ~str {
130 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
133 /// Convert a byte to a UTF-8 string
137 /// Fails if invalid UTF-8
138 pub fn from_byte(b: u8) -> ~str {
140 unsafe { ::cast::transmute(~[b]) }
143 /// Convert a char to a string
144 pub fn from_char(ch: char) -> ~str {
145 let mut buf = StrBuf::new();
150 /// Convert a vector of chars to a string
151 pub fn from_chars(chs: &[char]) -> ~str {
152 chs.iter().map(|c| *c).collect()
155 /// Methods for vectors of strings
156 pub trait StrVector {
157 /// Concatenate a vector of strings.
158 fn concat(&self) -> ~str;
160 /// Concatenate a vector of strings, placing a given separator between each.
161 fn connect(&self, sep: &str) -> ~str;
164 impl<'a, S: Str> StrVector for &'a [S] {
165 fn concat(&self) -> ~str {
166 if self.is_empty() { return ~""; }
168 // `len` calculation may overflow, but push_str will check boundaries
169 let len = self.iter().map(|s| s.as_slice().len()).sum();
171 let mut result = StrBuf::with_capacity(len);
173 for s in self.iter() {
174 result.push_str(s.as_slice())
180 fn connect(&self, sep: &str) -> ~str {
181 if self.is_empty() { return ~""; }
184 if sep.is_empty() { return self.concat(); }
186 // this is wrong without the guarantee that `self` is non-empty
187 // `len` calculation may overflow, but push_str will check boundaries
188 let len = sep.len() * (self.len() - 1)
189 + self.iter().map(|s| s.as_slice().len()).sum();
190 let mut result = StrBuf::with_capacity(len);
191 let mut first = true;
193 for s in self.iter() {
197 result.push_str(sep);
199 result.push_str(s.as_slice());
205 impl<'a, S: Str> StrVector for Vec<S> {
207 fn concat(&self) -> ~str {
208 self.as_slice().concat()
212 fn connect(&self, sep: &str) -> ~str {
213 self.as_slice().connect(sep)
217 /// Something that can be used to compare against a character
219 /// Determine if the splitter should split at the given character
220 fn matches(&self, char) -> bool;
221 /// Indicate if this is only concerned about ASCII characters,
222 /// which can allow for a faster implementation.
223 fn only_ascii(&self) -> bool;
226 impl CharEq for char {
228 fn matches(&self, c: char) -> bool { *self == c }
230 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
233 impl<'a> CharEq for |char|: 'a -> bool {
235 fn matches(&self, c: char) -> bool { (*self)(c) }
237 fn only_ascii(&self) -> bool { false }
240 impl CharEq for extern "Rust" fn(char) -> bool {
242 fn matches(&self, c: char) -> bool { (*self)(c) }
244 fn only_ascii(&self) -> bool { false }
247 impl<'a, C: CharEq> CharEq for &'a [C] {
249 fn matches(&self, c: char) -> bool {
250 self.iter().any(|m| m.matches(c))
253 fn only_ascii(&self) -> bool {
254 self.iter().all(|m| m.only_ascii())
262 /// External iterator for a string's characters.
263 /// Use with the `std::iter` module.
265 pub struct Chars<'a> {
266 /// The slice remaining to be iterated
270 impl<'a> Iterator<char> for Chars<'a> {
272 fn next(&mut self) -> Option<char> {
273 // Decode the next codepoint, then update
274 // the slice to be just the remaining part
275 if self.string.len() != 0 {
276 let CharRange {ch, next} = self.string.char_range_at(0);
278 self.string = raw::slice_unchecked(self.string, next, self.string.len());
287 fn size_hint(&self) -> (uint, Option<uint>) {
288 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
292 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
294 fn next_back(&mut self) -> Option<char> {
295 if self.string.len() != 0 {
296 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
298 self.string = raw::slice_unchecked(self.string, 0, next);
307 /// External iterator for a string's characters and their byte offsets.
308 /// Use with the `std::iter` module.
310 pub struct CharOffsets<'a> {
311 /// The original string to be iterated
316 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
318 fn next(&mut self) -> Option<(uint, char)> {
319 // Compute the byte offset by using the pointer offset between
320 // the original string slice and the iterator's remaining part
321 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
322 self.iter.next().map(|ch| (offset, ch))
326 fn size_hint(&self) -> (uint, Option<uint>) {
327 self.iter.size_hint()
331 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
333 fn next_back(&mut self) -> Option<(uint, char)> {
334 self.iter.next_back().map(|ch| {
335 let offset = self.iter.string.len() +
336 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
342 /// External iterator for a string's characters in reverse order.
343 /// Use with the `std::iter` module.
344 pub type RevChars<'a> = Rev<Chars<'a>>;
346 /// External iterator for a string's characters and their byte offsets in reverse order.
347 /// Use with the `std::iter` module.
348 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
350 /// External iterator for a string's bytes.
351 /// Use with the `std::iter` module.
353 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
355 /// External iterator for a string's bytes in reverse order.
356 /// Use with the `std::iter` module.
357 pub type RevBytes<'a> = Rev<Bytes<'a>>;
359 /// An iterator over the substrings of a string, separated by `sep`.
361 pub struct CharSplits<'a, Sep> {
362 /// The slice remaining to be iterated
365 /// Whether an empty string at the end is allowed
366 allow_trailing_empty: bool,
371 /// An iterator over the substrings of a string, separated by `sep`,
372 /// starting from the back of the string.
373 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
375 /// An iterator over the substrings of a string, separated by `sep`,
376 /// splitting at most `count` times.
378 pub struct CharSplitsN<'a, Sep> {
379 iter: CharSplits<'a, Sep>,
380 /// The number of splits remaining
385 /// An iterator over the words of a string, separated by a sequence of whitespace
387 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
389 /// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
390 pub type AnyLines<'a> =
391 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
393 impl<'a, Sep> CharSplits<'a, Sep> {
395 fn get_end(&mut self) -> Option<&'a str> {
396 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
397 self.finished = true;
405 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
407 fn next(&mut self) -> Option<&'a str> {
408 if self.finished { return None }
410 let mut next_split = None;
412 for (idx, byte) in self.string.bytes().enumerate() {
413 if self.sep.matches(byte as char) && byte < 128u8 {
414 next_split = Some((idx, idx + 1));
419 for (idx, ch) in self.string.char_indices() {
420 if self.sep.matches(ch) {
421 next_split = Some((idx, self.string.char_range_at(idx).next));
427 Some((a, b)) => unsafe {
428 let elt = raw::slice_unchecked(self.string, 0, a);
429 self.string = raw::slice_unchecked(self.string, b, self.string.len());
432 None => self.get_end(),
437 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
438 for CharSplits<'a, Sep> {
440 fn next_back(&mut self) -> Option<&'a str> {
441 if self.finished { return None }
443 if !self.allow_trailing_empty {
444 self.allow_trailing_empty = true;
445 match self.next_back() {
446 Some(elt) if !elt.is_empty() => return Some(elt),
447 _ => if self.finished { return None }
450 let len = self.string.len();
451 let mut next_split = None;
454 for (idx, byte) in self.string.bytes().enumerate().rev() {
455 if self.sep.matches(byte as char) && byte < 128u8 {
456 next_split = Some((idx, idx + 1));
461 for (idx, ch) in self.string.char_indices_rev() {
462 if self.sep.matches(ch) {
463 next_split = Some((idx, self.string.char_range_at(idx).next));
469 Some((a, b)) => unsafe {
470 let elt = raw::slice_unchecked(self.string, b, len);
471 self.string = raw::slice_unchecked(self.string, 0, a);
474 None => { self.finished = true; Some(self.string) }
479 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
481 fn next(&mut self) -> Option<&'a str> {
484 if self.invert { self.iter.next_back() } else { self.iter.next() }
491 /// An iterator over the start and end indices of the matches of a
492 /// substring within a larger string
494 pub struct MatchIndices<'a> {
500 /// An iterator over the substrings of a string separated by a given
503 pub struct StrSplits<'a> {
504 it: MatchIndices<'a>,
509 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
511 fn next(&mut self) -> Option<(uint, uint)> {
512 // See Issue #1932 for why this is a naive search
513 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
514 let mut match_start = 0;
517 while self.position < h_len {
518 if self.haystack[self.position] == self.needle[match_i] {
519 if match_i == 0 { match_start = self.position; }
523 if match_i == n_len {
525 return Some((match_start, self.position));
528 // failed match, backtrack
531 self.position = match_start;
540 impl<'a> Iterator<&'a str> for StrSplits<'a> {
542 fn next(&mut self) -> Option<&'a str> {
543 if self.finished { return None; }
545 match self.it.next() {
546 Some((from, to)) => {
547 let ret = Some(self.it.haystack.slice(self.last_end, from));
552 self.finished = true;
553 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
559 // Helper functions used for Unicode normalization
560 fn canonical_sort(comb: &mut [(char, u8)]) {
564 let len = comb.len();
565 for i in range(0, len) {
566 let mut swapped = false;
567 for j in range(1, len-i) {
568 let class_a = *comb[j-1].ref1();
569 let class_b = *comb[j].ref1();
570 if class_a != 0 && class_b != 0 && class_a > class_b {
575 if !swapped { break; }
580 enum NormalizationForm {
585 /// External iterator for a string's normalization's characters.
586 /// Use with the `std::iter` module.
588 pub struct Normalizations<'a> {
589 kind: NormalizationForm,
591 buffer: ~[(char, u8)],
595 impl<'a> Iterator<char> for Normalizations<'a> {
597 fn next(&mut self) -> Option<char> {
598 use unicode::decompose::canonical_combining_class;
600 match self.buffer.head() {
606 Some(&(c, _)) if self.sorted => {
610 _ => self.sorted = false
613 let decomposer = match self.kind {
614 NFD => char::decompose_canonical,
615 NFKD => char::decompose_compatible
619 for ch in self.iter {
620 let buffer = &mut self.buffer;
621 let sorted = &mut self.sorted;
623 let class = canonical_combining_class(d);
624 if class == 0 && !*sorted {
625 canonical_sort(*buffer);
628 buffer.push((d, class));
635 canonical_sort(self.buffer);
639 match self.buffer.shift() {
644 Some((c, _)) => Some(c),
649 fn size_hint(&self) -> (uint, Option<uint>) {
650 let (lower, _) = self.iter.size_hint();
655 /// Replace all occurrences of one string with another
659 /// * s - The string containing substrings to replace
660 /// * from - The string to replace
661 /// * to - The replacement string
665 /// The original string with all occurrences of `from` replaced with `to`
666 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
667 let mut result = StrBuf::new();
668 let mut last_end = 0;
669 for (start, end) in s.match_indices(from) {
670 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
674 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
679 Section: Comparing strings
682 // share the implementation of the lang-item vs. non-lang-item
685 fn eq_slice_(a: &str, b: &str) -> bool {
686 a.len() == b.len() && unsafe {
687 libc::memcmp(a.as_ptr() as *libc::c_void,
688 b.as_ptr() as *libc::c_void,
689 a.len() as libc::size_t) == 0
693 /// Bytewise slice equality
697 pub fn eq_slice(a: &str, b: &str) -> bool {
701 /// Bytewise slice equality
704 pub fn eq_slice(a: &str, b: &str) -> bool {
708 /// Bytewise string equality
710 #[lang="uniq_str_eq"]
712 pub fn eq(a: &~str, b: &~str) -> bool {
718 pub fn eq(a: &~str, b: &~str) -> bool {
726 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
727 /// returning `true` in that case, or, if it is invalid, `false` with
728 /// `iter` reset such that it is pointing at the first byte in the
729 /// invalid sequence.
731 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
733 // save the current thing we're pointing at.
736 // restore the iterator we had at the start of this codepoint.
737 macro_rules! err ( () => { {*iter = old; return false} });
738 macro_rules! next ( () => {
741 // we needed data, but there was none: error!
746 let first = match iter.next() {
748 // we're at the end of the iterator and a codepoint
749 // boundary at the same time, so this string is valid.
753 // ASCII characters are always valid, so only large
754 // bytes need more examination.
756 let w = utf8_char_width(first);
757 let second = next!();
758 // 2-byte encoding is for codepoints \u0080 to \u07ff
759 // first C2 80 last DF BF
760 // 3-byte encoding is for codepoints \u0800 to \uffff
761 // first E0 A0 80 last EF BF BF
762 // excluding surrogate codepoints \ud800 to \udfff
763 // ED A0 80 to ED BF BF
764 // 4-byte encoding is for codepoints \u10000 to \u10ffff
765 // first F0 90 80 80 last F4 8F BF BF
767 // Use the UTF-8 syntax from the RFC
769 // https://tools.ietf.org/html/rfc3629
771 // UTF8-2 = %xC2-DF UTF8-tail
772 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
773 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
774 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
775 // %xF4 %x80-8F 2( UTF8-tail )
777 2 => if second & 192 != TAG_CONT_U8 {err!()},
779 match (first, second, next!() & 192) {
780 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
781 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
782 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
783 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
788 match (first, second, next!() & 192, next!() & 192) {
789 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
790 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
791 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
801 /// Determines if a vector of bytes contains valid UTF-8.
802 pub fn is_utf8(v: &[u8]) -> bool {
803 run_utf8_validation_iterator(&mut v.iter())
807 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
808 let mut it = v.iter();
810 let ok = run_utf8_validation_iterator(&mut it);
814 // work out how many valid bytes we've consumed
815 // (run_utf8_validation_iterator resets the iterator to just
816 // after the last good byte), which we can do because the
817 // vector iterator size_hint is exact.
818 let (remaining, _) = it.size_hint();
819 Some(v.len() - remaining)
823 /// Determines if a vector of `u16` contains valid UTF-16
824 pub fn is_utf16(v: &[u16]) -> bool {
825 let mut it = v.iter();
826 macro_rules! next ( ($ret:expr) => {
827 match it.next() { Some(u) => *u, None => return $ret }
833 match char::from_u32(u as u32) {
836 let u2 = next!(false);
837 if u < 0xD7FF || u > 0xDBFF ||
838 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
844 /// An iterator that decodes UTF-16 encoded codepoints from a vector
847 pub struct UTF16Items<'a> {
848 iter: slice::Items<'a, u16>
850 /// The possibilities for values decoded from a `u16` stream.
851 #[deriving(Eq, TotalEq, Clone, Show)]
853 /// A valid codepoint.
855 /// An invalid surrogate without its pair.
860 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
861 /// replacement character (U+FFFD).
863 pub fn to_char_lossy(&self) -> char {
866 LoneSurrogate(_) => '\uFFFD'
871 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
872 fn next(&mut self) -> Option<UTF16Item> {
873 let u = match self.iter.next() {
878 if u < 0xD800 || 0xDFFF < u {
880 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
881 } else if u >= 0xDC00 {
882 // a trailing surrogate
883 Some(LoneSurrogate(u))
885 // preserve state for rewinding.
888 let u2 = match self.iter.next() {
891 None => return Some(LoneSurrogate(u))
893 if u2 < 0xDC00 || u2 > 0xDFFF {
894 // not a trailing surrogate so we're not a valid
895 // surrogate pair, so rewind to redecode u2 next time.
897 return Some(LoneSurrogate(u))
900 // all ok, so let's decode it.
901 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
902 Some(ScalarValue(unsafe {cast::transmute(c)}))
907 fn size_hint(&self) -> (uint, Option<uint>) {
908 let (low, high) = self.iter.size_hint();
909 // we could be entirely valid surrogates (2 elements per
910 // char), or entirely non-surrogates (1 element per char)
915 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
916 /// returning invalid surrogates as `LoneSurrogate`s.
922 /// use std::str::{ScalarValue, LoneSurrogate};
924 /// // 𝄞mus<invalid>ic<invalid>
925 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
926 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
929 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
930 /// ~[ScalarValue('𝄞'),
931 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
932 /// LoneSurrogate(0xDD1E),
933 /// ScalarValue('i'), ScalarValue('c'),
934 /// LoneSurrogate(0xD834)]);
936 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
937 UTF16Items { iter : v.iter() }
940 /// Return a slice of `v` ending at (and not including) the first NUL
949 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
950 /// // no NULs so no change
951 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
955 /// assert_eq!(str::truncate_utf16_at_nul(v),
956 /// &['a' as u16, 'b' as u16]);
958 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
959 match v.iter().position(|c| *c == 0) {
960 // don't include the 0
961 Some(i) => v.slice_to(i),
966 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
967 /// if `v` contains any invalid data.
975 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
976 /// 0x0073, 0x0069, 0x0063];
977 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
979 /// // 𝄞mu<invalid>ic
981 /// assert_eq!(str::from_utf16(v), None);
983 pub fn from_utf16(v: &[u16]) -> Option<~str> {
984 let mut s = StrBuf::with_capacity(v.len() / 2);
985 for c in utf16_items(v) {
987 ScalarValue(c) => s.push_char(c),
988 LoneSurrogate(_) => return None
994 /// Decode a UTF-16 encoded vector `v` into a string, replacing
995 /// invalid data with the replacement character (U+FFFD).
1001 /// // 𝄞mus<invalid>ic<invalid>
1002 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1003 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1006 /// assert_eq!(str::from_utf16_lossy(v),
1007 /// ~"𝄞mus\uFFFDic\uFFFD");
1009 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1010 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1013 // https://tools.ietf.org/html/rfc3629
1014 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1015 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1016 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1017 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1018 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1019 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1020 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1021 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1022 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1023 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1024 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1025 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1026 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1027 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1028 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1029 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1030 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1033 /// Given a first byte, determine how many bytes are in this UTF-8 character
1035 pub fn utf8_char_width(b: u8) -> uint {
1036 return UTF8_CHAR_WIDTH[b as uint] as uint;
1039 /// Struct that contains a `char` and the index of the first byte of
1040 /// the next `char` in a string. This can be used as a data structure
1041 /// for iterating over the UTF-8 bytes of a string.
1042 pub struct CharRange {
1045 /// Index of the first byte of the next `char`
1049 // Return the initial codepoint accumulator for the first byte.
1050 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1051 // for width 3, and 3 bits for width 4
1052 macro_rules! utf8_first_byte(
1053 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1056 // return the value of $ch updated with continuation byte $byte
1057 macro_rules! utf8_acc_cont_byte(
1058 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
1061 static TAG_CONT_U8: u8 = 128u8;
1063 /// Converts a vector of bytes to a new utf-8 string.
1064 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1069 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1070 /// let output = std::str::from_utf8_lossy(input);
1071 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1073 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1074 let firstbad = match first_non_utf8_index(v) {
1075 None => return Slice(unsafe { cast::transmute(v) }),
1079 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1080 let mut i = firstbad;
1081 let total = v.len();
1082 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1083 unsafe { *xs.unsafe_ref(i) }
1085 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1093 let mut res = StrBuf::with_capacity(total);
1097 res.push_bytes(v.slice_to(i))
1101 // subseqidx is the index of the first byte of the subsequence we're looking at.
1102 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1104 let mut subseqidx = firstbad;
1108 let byte = unsafe_get(v, i);
1111 macro_rules! error(() => ({
1113 if subseqidx != i_ {
1114 res.push_bytes(v.slice(subseqidx, i_));
1117 res.push_bytes(REPLACEMENT);
1122 // subseqidx handles this
1124 let w = utf8_char_width(byte);
1128 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1135 match (byte, safe_get(v, i, total)) {
1136 (0xE0 , 0xA0 .. 0xBF) => (),
1137 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1138 (0xED , 0x80 .. 0x9F) => (),
1139 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1146 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1153 match (byte, safe_get(v, i, total)) {
1154 (0xF0 , 0x90 .. 0xBF) => (),
1155 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1156 (0xF4 , 0x80 .. 0x8F) => (),
1163 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1168 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1181 if subseqidx < total {
1183 res.push_bytes(v.slice(subseqidx, total))
1186 Owned(res.into_owned())
1193 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1194 /// This can be useful as an optimization when an allocation is sometimes
1195 /// needed but not always.
1196 pub enum MaybeOwned<'a> {
1197 /// A borrowed string
1203 /// SendStr is a specialization of `MaybeOwned` to be sendable
1204 pub type SendStr = MaybeOwned<'static>;
1206 impl<'a> MaybeOwned<'a> {
1207 /// Returns `true` if this `MaybeOwned` wraps an owned string
1209 pub fn is_owned(&self) -> bool {
1216 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1218 pub fn is_slice(&self) -> bool {
1226 /// Trait for moving into a `MaybeOwned`
1227 pub trait IntoMaybeOwned<'a> {
1228 /// Moves self into a `MaybeOwned`
1229 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1232 impl<'a> IntoMaybeOwned<'a> for ~str {
1234 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1237 impl<'a> IntoMaybeOwned<'a> for &'a str {
1239 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1242 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1244 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1247 impl<'a> Eq for MaybeOwned<'a> {
1249 fn eq(&self, other: &MaybeOwned) -> bool {
1250 self.as_slice() == other.as_slice()
1254 impl<'a> TotalEq for MaybeOwned<'a> {}
1256 impl<'a> Ord for MaybeOwned<'a> {
1258 fn lt(&self, other: &MaybeOwned) -> bool {
1259 self.as_slice().lt(&other.as_slice())
1263 impl<'a> TotalOrd for MaybeOwned<'a> {
1265 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1266 self.as_slice().cmp(&other.as_slice())
1270 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1272 fn equiv(&self, other: &S) -> bool {
1273 self.as_slice() == other.as_slice()
1277 impl<'a> Str for MaybeOwned<'a> {
1279 fn as_slice<'b>(&'b self) -> &'b str {
1282 Owned(ref s) => s.as_slice()
1287 fn into_owned(self) -> ~str {
1289 Slice(s) => s.to_owned(),
1295 impl<'a> Container for MaybeOwned<'a> {
1297 fn len(&self) -> uint { self.as_slice().len() }
1300 impl<'a> Clone for MaybeOwned<'a> {
1302 fn clone(&self) -> MaybeOwned<'a> {
1304 Slice(s) => Slice(s),
1305 Owned(ref s) => Owned(s.to_owned())
1310 impl<'a> Default for MaybeOwned<'a> {
1312 fn default() -> MaybeOwned<'a> { Slice("") }
1315 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1317 fn hash(&self, hasher: &mut H) {
1319 Slice(s) => s.hash(hasher),
1320 Owned(ref s) => s.hash(hasher),
1325 impl<'a> fmt::Show for MaybeOwned<'a> {
1327 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1329 Slice(ref s) => s.fmt(f),
1330 Owned(ref s) => s.fmt(f)
1335 /// Unsafe operations
1338 use container::Container;
1342 use str::{is_utf8, OwnedStr, StrSlice};
1344 use slice::{MutableVector, ImmutableVector, OwnedVector};
1347 /// Create a Rust string from a *u8 buffer of the given length
1348 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1349 let mut v: ~[u8] = slice::with_capacity(len);
1350 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1353 assert!(is_utf8(v));
1354 ::cast::transmute(v)
1357 #[lang="strdup_uniq"]
1360 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1361 from_buf_len(ptr, len)
1364 /// Create a Rust string from a null-terminated C string
1365 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1370 curr = buf.offset(i);
1372 from_buf_len(buf as *u8, i as uint)
1375 /// Converts a slice of bytes to a string slice without checking
1376 /// that the string contains valid UTF-8.
1377 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1381 /// Converts an owned vector of bytes to a new owned string. This assumes
1382 /// that the utf-8-ness of the vector has already been validated
1384 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1388 /// Converts a byte to a string without checking that the byte is valid UTF-8.
1389 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1391 /// Form a slice from a C string. Unsafe because the caller must ensure the
1392 /// C string has the static lifetime, or else the return value may be
1393 /// invalidated later.
1394 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1398 while *curr != 0u8 {
1400 curr = s.offset(len as int);
1402 let v = Slice { data: s, len: len };
1403 assert!(is_utf8(::cast::transmute(v)));
1404 ::cast::transmute(v)
1407 /// Takes a bytewise (not UTF-8) slice from a string.
1409 /// Returns the substring from [`begin`..`end`).
1413 /// If begin is greater than end.
1414 /// If end is greater than the length of the string.
1416 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1417 assert!(begin <= end);
1418 assert!(end <= s.len());
1419 slice_unchecked(s, begin, end)
1422 /// Takes a bytewise (not UTF-8) slice from a string.
1424 /// Returns the substring from [`begin`..`end`).
1426 /// Caller must check slice boundaries!
1428 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1429 cast::transmute(Slice {
1430 data: s.as_ptr().offset(begin as int),
1435 /// Access the str in its vector representation.
1436 /// The caller must preserve the valid UTF-8 property when modifying.
1438 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1442 /// Sets the length of a string
1444 /// This will explicitly set the size of the string, without actually
1445 /// modifying its buffers, so it is up to the caller to ensure that
1446 /// the string is actually the specified size.
1448 fn test_from_buf_len() {
1450 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1452 let c = from_buf_len(b, 3u);
1453 assert_eq!(c, ~"AAA");
1459 Section: Trait implementations
1463 #[allow(missing_doc)]
1465 use container::Container;
1466 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1469 use option::{Some, None};
1470 use str::{Str, StrSlice, eq_slice};
1473 impl<'a> Add<&'a str,~str> for &'a str {
1475 fn add(&self, rhs: & &'a str) -> ~str {
1476 let mut ret = StrBuf::from_owned_str(self.to_owned());
1482 impl<'a> TotalOrd for &'a str {
1484 fn cmp(&self, other: & &'a str) -> Ordering {
1485 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1486 match s_b.cmp(&o_b) {
1487 Greater => return Greater,
1488 Less => return Less,
1493 self.len().cmp(&other.len())
1497 impl TotalOrd for ~str {
1499 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1502 impl<'a> Eq for &'a str {
1504 fn eq(&self, other: & &'a str) -> bool {
1505 eq_slice((*self), (*other))
1508 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1513 fn eq(&self, other: &~str) -> bool {
1514 eq_slice((*self), (*other))
1518 impl<'a> TotalEq for &'a str {}
1520 impl TotalEq for ~str {}
1522 impl<'a> Ord for &'a str {
1524 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1529 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1532 impl<'a, S: Str> Equiv<S> for &'a str {
1534 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1537 impl<'a, S: Str> Equiv<S> for ~str {
1539 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1546 /// Any string that can be represented as a slice
1548 /// Work with `self` as a slice.
1549 fn as_slice<'a>(&'a self) -> &'a str;
1551 /// Convert `self` into a ~str, not making a copy if possible.
1552 fn into_owned(self) -> ~str;
1554 /// Convert `self` into a `StrBuf`.
1556 fn to_strbuf(&self) -> StrBuf {
1557 StrBuf::from_str(self.as_slice())
1560 /// Convert `self` into a `StrBuf`, not making a copy if possible.
1562 fn into_strbuf(self) -> StrBuf {
1563 StrBuf::from_owned_str(self.into_owned())
1567 impl<'a> Str for &'a str {
1569 fn as_slice<'a>(&'a self) -> &'a str { *self }
1572 fn into_owned(self) -> ~str { self.to_owned() }
1575 impl<'a> Str for ~str {
1577 fn as_slice<'a>(&'a self) -> &'a str {
1578 let s: &'a str = *self; s
1582 fn into_owned(self) -> ~str { self }
1585 impl<'a> Container for &'a str {
1587 fn len(&self) -> uint {
1592 impl Container for ~str {
1594 fn len(&self) -> uint { self.as_slice().len() }
1597 impl Mutable for ~str {
1598 /// Remove all content, make the string empty
1600 fn clear(&mut self) {
1607 /// Methods for string slices
1608 pub trait StrSlice<'a> {
1609 /// Returns true if one string contains another
1613 /// - needle - The string to look for
1614 fn contains<'a>(&self, needle: &'a str) -> bool;
1616 /// Returns true if a string contains a char.
1620 /// - needle - The char to look for
1621 fn contains_char(&self, needle: char) -> bool;
1623 /// An iterator over the characters of `self`. Note, this iterates
1624 /// over unicode code-points, not unicode graphemes.
1629 /// let v: ~[char] = "abc åäö".chars().collect();
1630 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1632 fn chars(&self) -> Chars<'a>;
1634 /// An iterator over the characters of `self`, in reverse order.
1635 fn chars_rev(&self) -> RevChars<'a>;
1637 /// An iterator over the bytes of `self`
1638 fn bytes(&self) -> Bytes<'a>;
1640 /// An iterator over the bytes of `self`, in reverse order
1641 fn bytes_rev(&self) -> RevBytes<'a>;
1643 /// An iterator over the characters of `self` and their byte offsets.
1644 fn char_indices(&self) -> CharOffsets<'a>;
1646 /// An iterator over the characters of `self` and their byte offsets,
1647 /// in reverse order.
1648 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1650 /// An iterator over substrings of `self`, separated by characters
1651 /// matched by `sep`.
1656 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1657 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1659 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1660 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1662 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1663 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1665 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1667 /// An iterator over substrings of `self`, separated by characters
1668 /// matched by `sep`, restricted to splitting at most `count`
1674 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1675 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1677 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1678 /// assert_eq!(v, ~["abc", "def2ghi"]);
1680 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1681 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1683 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1685 /// An iterator over substrings of `self`, separated by characters
1686 /// matched by `sep`.
1688 /// Equivalent to `split`, except that the trailing substring
1689 /// is skipped if empty (terminator semantics).
1694 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1695 /// assert_eq!(v, ~["A", "B"]);
1697 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1698 /// assert_eq!(v, ~["A", "", "B", ""]);
1700 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1702 /// An iterator over substrings of `self`, separated by characters
1703 /// matched by `sep`, in reverse order.
1708 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1709 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1711 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1712 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1714 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1715 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1717 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1719 /// An iterator over substrings of `self`, separated by characters
1720 /// matched by `sep`, starting from the end of the string.
1721 /// Restricted to splitting at most `count` times.
1726 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1727 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1729 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1730 /// assert_eq!(v, ~["ghi", "abc1def"]);
1732 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1733 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1735 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1737 /// An iterator over the start and end indices of the disjoint
1738 /// matches of `sep` within `self`.
1740 /// That is, each returned value `(start, end)` satisfies
1741 /// `self.slice(start, end) == sep`. For matches of `sep` within
1742 /// `self` that overlap, only the indices corresponding to the
1743 /// first match are returned.
1748 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1749 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1751 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1752 /// assert_eq!(v, ~[(1,4), (4,7)]);
1754 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1755 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1757 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1759 /// An iterator over the substrings of `self` separated by `sep`.
1764 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1765 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1767 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1768 /// assert_eq!(v, ~["1", "", "2"]);
1770 fn split_str(&self, &'a str) -> StrSplits<'a>;
1772 /// An iterator over the lines of a string (subsequences separated
1773 /// by `\n`). This does not include the empty string after a
1779 /// let four_lines = "foo\nbar\n\nbaz\n";
1780 /// let v: ~[&str] = four_lines.lines().collect();
1781 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1783 fn lines(&self) -> CharSplits<'a, char>;
1785 /// An iterator over the lines of a string, separated by either
1786 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1787 /// empty trailing line.
1792 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1793 /// let v: ~[&str] = four_lines.lines_any().collect();
1794 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1796 fn lines_any(&self) -> AnyLines<'a>;
1798 /// An iterator over the words of a string (subsequences separated
1799 /// by any sequence of whitespace). Sequences of whitespace are
1800 /// collapsed, so empty "words" are not included.
1805 /// let some_words = " Mary had\ta little \n\t lamb";
1806 /// let v: ~[&str] = some_words.words().collect();
1807 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1809 fn words(&self) -> Words<'a>;
1811 /// An Iterator over the string in Unicode Normalization Form D
1812 /// (canonical decomposition).
1813 fn nfd_chars(&self) -> Normalizations<'a>;
1815 /// An Iterator over the string in Unicode Normalization Form KD
1816 /// (compatibility decomposition).
1817 fn nfkd_chars(&self) -> Normalizations<'a>;
1819 /// Returns true if the string contains only whitespace.
1821 /// Whitespace characters are determined by `char::is_whitespace`.
1826 /// assert!(" \t\n".is_whitespace());
1827 /// assert!("".is_whitespace());
1829 /// assert!( !"abc".is_whitespace());
1831 fn is_whitespace(&self) -> bool;
1833 /// Returns true if the string contains only alphanumeric code
1836 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1841 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1842 /// assert!("".is_alphanumeric());
1844 /// assert!( !" &*~".is_alphanumeric());
1846 fn is_alphanumeric(&self) -> bool;
1848 /// Returns the number of Unicode code points (`char`) that a
1851 /// This does not perform any normalization, and is `O(n)`, since
1852 /// UTF-8 is a variable width encoding of code points.
1854 /// *Warning*: The number of code points in a string does not directly
1855 /// correspond to the number of visible characters or width of the
1856 /// visible text due to composing characters, and double- and
1857 /// zero-width ones.
1859 /// See also `.len()` for the byte length.
1864 /// // composed forms of `ö` and `é`
1865 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1866 /// // decomposed forms of `ö` and `é`
1867 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1869 /// assert_eq!(c.char_len(), 15);
1870 /// assert_eq!(d.char_len(), 17);
1872 /// assert_eq!(c.len(), 21);
1873 /// assert_eq!(d.len(), 23);
1875 /// // the two strings *look* the same
1876 /// println!("{}", c);
1877 /// println!("{}", d);
1879 fn char_len(&self) -> uint;
1881 /// Returns a slice of the given string from the byte range
1882 /// [`begin`..`end`).
1884 /// This operation is `O(1)`.
1886 /// Fails when `begin` and `end` do not point to valid characters
1887 /// or point beyond the last character of the string.
1889 /// See also `slice_to` and `slice_from` for slicing prefixes and
1890 /// suffixes of strings, and `slice_chars` for slicing based on
1891 /// code point counts.
1896 /// let s = "Löwe 老虎 Léopard";
1897 /// assert_eq!(s.slice(0, 1), "L");
1899 /// assert_eq!(s.slice(1, 9), "öwe 老");
1901 /// // these will fail:
1902 /// // byte 2 lies within `ö`:
1903 /// // s.slice(2, 3);
1905 /// // byte 8 lies within `老`
1906 /// // s.slice(1, 8);
1908 /// // byte 100 is outside the string
1909 /// // s.slice(3, 100);
1911 fn slice(&self, begin: uint, end: uint) -> &'a str;
1913 /// Returns a slice of the string from `begin` to its end.
1915 /// Equivalent to `self.slice(begin, self.len())`.
1917 /// Fails when `begin` does not point to a valid character, or is
1920 /// See also `slice`, `slice_to` and `slice_chars`.
1921 fn slice_from(&self, begin: uint) -> &'a str;
1923 /// Returns a slice of the string from the beginning to byte
1926 /// Equivalent to `self.slice(0, end)`.
1928 /// Fails when `end` does not point to a valid character, or is
1931 /// See also `slice`, `slice_from` and `slice_chars`.
1932 fn slice_to(&self, end: uint) -> &'a str;
1934 /// Returns a slice of the string from the character range
1935 /// [`begin`..`end`).
1937 /// That is, start at the `begin`-th code point of the string and
1938 /// continue to the `end`-th code point. This does not detect or
1939 /// handle edge cases such as leaving a combining character as the
1940 /// first code point of the string.
1942 /// Due to the design of UTF-8, this operation is `O(end)`.
1943 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1944 /// variants that use byte indices rather than code point
1947 /// Fails if `begin` > `end` or if either `begin` or `end` is
1948 /// beyond the last character of the string.
1953 /// let s = "Löwe 老虎 Léopard";
1954 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
1955 /// assert_eq!(s.slice_chars(5, 7), "老虎");
1957 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
1959 /// Returns true if `needle` is a prefix of the string.
1960 fn starts_with(&self, needle: &str) -> bool;
1962 /// Returns true if `needle` is a suffix of the string.
1963 fn ends_with(&self, needle: &str) -> bool;
1965 /// Escape each char in `self` with `char::escape_default`.
1966 fn escape_default(&self) -> ~str;
1968 /// Escape each char in `self` with `char::escape_unicode`.
1969 fn escape_unicode(&self) -> ~str;
1971 /// Returns a string with leading and trailing whitespace removed.
1972 fn trim(&self) -> &'a str;
1974 /// Returns a string with leading whitespace removed.
1975 fn trim_left(&self) -> &'a str;
1977 /// Returns a string with trailing whitespace removed.
1978 fn trim_right(&self) -> &'a str;
1980 /// Returns a string with characters that match `to_trim` removed.
1984 /// * to_trim - a character matcher
1989 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1990 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1991 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1993 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
1995 /// Returns a string with leading characters that match `to_trim` removed.
1999 /// * to_trim - a character matcher
2004 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2005 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2006 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2008 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2010 /// Returns a string with trailing characters that match `to_trim` removed.
2014 /// * to_trim - a character matcher
2019 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2020 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2021 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2023 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2025 /// Replace all occurrences of one string with another.
2029 /// * `from` - The string to replace
2030 /// * `to` - The replacement string
2034 /// The original string with all occurrences of `from` replaced with `to`.
2039 /// let s = ~"Do you know the muffin man,
2040 /// The muffin man, the muffin man, ...";
2042 /// assert_eq!(s.replace("muffin man", "little lamb"),
2043 /// ~"Do you know the little lamb,
2044 /// The little lamb, the little lamb, ...");
2046 /// // not found, so no change.
2047 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2049 fn replace(&self, from: &str, to: &str) -> ~str;
2051 /// Copy a slice into a new owned str.
2052 fn to_owned(&self) -> ~str;
2054 /// Converts to a vector of `u16` encoded as UTF-16.
2055 fn to_utf16(&self) -> ~[u16];
2057 /// Check that the `index`-th byte lies at the start and/or end of a
2058 /// UTF-8 code point sequence.
2060 /// The start and end of the string (when `index == self.len()`)
2061 /// are considered to be boundaries.
2063 /// Fails if `index` is greater than `self.len()`.
2068 /// let s = "Löwe 老虎 Léopard";
2069 /// assert!(s.is_char_boundary(0));
2071 /// assert!(s.is_char_boundary(6));
2072 /// assert!(s.is_char_boundary(s.len()));
2074 /// // second byte of `ö`
2075 /// assert!(!s.is_char_boundary(2));
2077 /// // third byte of `老`
2078 /// assert!(!s.is_char_boundary(8));
2080 fn is_char_boundary(&self, index: uint) -> bool;
2082 /// Pluck a character out of a string and return the index of the next
2085 /// This function can be used to iterate over the unicode characters of a
2090 /// This example manually iterates through the characters of a
2091 /// string; this should normally be done by `.chars()` or
2092 /// `.char_indices`.
2095 /// use std::str::CharRange;
2097 /// let s = "中华Việt Nam";
2099 /// while i < s.len() {
2100 /// let CharRange {ch, next} = s.char_range_at(i);
2101 /// println!("{}: {}", i, ch);
2123 /// * self - The string
2124 /// * start - The byte offset of the char to extract
2128 /// A record {ch: char, next: uint} containing the char value and the byte
2129 /// index of the next unicode character.
2133 /// If `start` is greater than or equal to the length of the string.
2134 /// If `start` is not the index of the beginning of a valid UTF-8 character.
2135 fn char_range_at(&self, start: uint) -> CharRange;
2137 /// Given a byte position and a str, return the previous char and its position.
2139 /// This function can be used to iterate over a unicode string in reverse.
2141 /// Returns 0 for next index if called on start index 0.
2142 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2144 /// Plucks the character starting at the `i`th byte of a string
2145 fn char_at(&self, i: uint) -> char;
2147 /// Plucks the character ending at the `i`th byte of a string
2148 fn char_at_reverse(&self, i: uint) -> char;
2150 /// Work with the byte buffer of a string as a byte slice.
2151 fn as_bytes(&self) -> &'a [u8];
2153 /// Returns the byte index of the first character of `self` that
2154 /// matches `search`.
2158 /// `Some` containing the byte index of the first matching character
2159 /// or `None` if there is no match.
2164 /// let s = "Löwe 老虎 Léopard";
2166 /// assert_eq!(s.find('L'), Some(0));
2167 /// assert_eq!(s.find('é'), Some(14));
2169 /// // the first space
2170 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2172 /// // neither are found
2173 /// assert_eq!(s.find(&['1', '2']), None);
2175 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2177 /// Returns the byte index of the last character of `self` that
2178 /// matches `search`.
2182 /// `Some` containing the byte index of the last matching character
2183 /// or `None` if there is no match.
2188 /// let s = "Löwe 老虎 Léopard";
2190 /// assert_eq!(s.rfind('L'), Some(13));
2191 /// assert_eq!(s.rfind('é'), Some(14));
2193 /// // the second space
2194 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2196 /// // searches for an occurrence of either `1` or `2`, but neither are found
2197 /// assert_eq!(s.rfind(&['1', '2']), None);
2199 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2201 /// Returns the byte index of the first matching substring
2205 /// * `needle` - The string to search for
2209 /// `Some` containing the byte index of the first matching substring
2210 /// or `None` if there is no match.
2215 /// let s = "Löwe 老虎 Léopard";
2217 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2218 /// assert_eq!(s.find_str("muffin man"), None);
2220 fn find_str(&self, &str) -> Option<uint>;
2222 /// Given a string, make a new string with repeated copies of it.
2223 fn repeat(&self, nn: uint) -> ~str;
2225 /// Retrieves the first character from a string slice and returns
2226 /// it. This does not allocate a new string; instead, it returns a
2227 /// slice that points one character beyond the character that was
2228 /// shifted. If the string does not contain any characters,
2229 /// a tuple of None and an empty string is returned instead.
2234 /// let s = "Löwe 老虎 Léopard";
2235 /// let (c, s1) = s.slice_shift_char();
2236 /// assert_eq!(c, Some('L'));
2237 /// assert_eq!(s1, "öwe 老虎 Léopard");
2239 /// let (c, s2) = s1.slice_shift_char();
2240 /// assert_eq!(c, Some('ö'));
2241 /// assert_eq!(s2, "we 老虎 Léopard");
2243 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2245 /// Levenshtein Distance between two strings.
2246 fn lev_distance(&self, t: &str) -> uint;
2248 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2250 /// Fails if `inner` is not a direct slice contained within self.
2255 /// let string = "a\nb\nc";
2256 /// let lines: ~[&str] = string.lines().collect();
2258 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2259 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2260 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2262 fn subslice_offset(&self, inner: &str) -> uint;
2264 /// Return an unsafe pointer to the string's buffer.
2266 /// The caller must ensure that the string outlives this pointer,
2267 /// and that it is not reallocated (e.g. by pushing to the
2269 fn as_ptr(&self) -> *u8;
2272 impl<'a> StrSlice<'a> for &'a str {
2274 fn contains<'a>(&self, needle: &'a str) -> bool {
2275 self.find_str(needle).is_some()
2279 fn contains_char(&self, needle: char) -> bool {
2280 self.find(needle).is_some()
2284 fn chars(&self) -> Chars<'a> {
2285 Chars{string: *self}
2289 fn chars_rev(&self) -> RevChars<'a> {
2294 fn bytes(&self) -> Bytes<'a> {
2295 self.as_bytes().iter().map(|&b| b)
2299 fn bytes_rev(&self) -> RevBytes<'a> {
2304 fn char_indices(&self) -> CharOffsets<'a> {
2305 CharOffsets{string: *self, iter: self.chars()}
2309 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2310 self.char_indices().rev()
2314 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2317 only_ascii: sep.only_ascii(),
2319 allow_trailing_empty: true,
2325 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2326 -> CharSplitsN<'a, Sep> {
2328 iter: self.split(sep),
2335 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2336 -> CharSplits<'a, Sep> {
2338 allow_trailing_empty: false,
2344 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2345 self.split(sep).rev()
2349 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2350 -> CharSplitsN<'a, Sep> {
2352 iter: self.split(sep),
2359 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2360 assert!(!sep.is_empty())
2369 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2371 it: self.match_indices(sep),
2378 fn lines(&self) -> CharSplits<'a, char> {
2379 self.split_terminator('\n')
2382 fn lines_any(&self) -> AnyLines<'a> {
2383 self.lines().map(|line| {
2385 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2391 fn words(&self) -> Words<'a> {
2392 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2396 fn nfd_chars(&self) -> Normalizations<'a> {
2406 fn nfkd_chars(&self) -> Normalizations<'a> {
2416 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2419 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2422 fn char_len(&self) -> uint { self.chars().len() }
2425 fn slice(&self, begin: uint, end: uint) -> &'a str {
2426 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2427 unsafe { raw::slice_bytes(*self, begin, end) }
2431 fn slice_from(&self, begin: uint) -> &'a str {
2432 self.slice(begin, self.len())
2436 fn slice_to(&self, end: uint) -> &'a str {
2437 assert!(self.is_char_boundary(end));
2438 unsafe { raw::slice_bytes(*self, 0, end) }
2441 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2442 assert!(begin <= end);
2444 let mut begin_byte = None;
2445 let mut end_byte = None;
2447 // This could be even more efficient by not decoding,
2448 // only finding the char boundaries
2449 for (idx, _) in self.char_indices() {
2450 if count == begin { begin_byte = Some(idx); }
2451 if count == end { end_byte = Some(idx); break; }
2454 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2455 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2457 match (begin_byte, end_byte) {
2458 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2459 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2460 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2465 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2466 let n = needle.len();
2467 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2471 fn ends_with(&self, needle: &str) -> bool {
2472 let (m, n) = (self.len(), needle.len());
2473 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2476 fn escape_default(&self) -> ~str {
2477 let mut out = StrBuf::with_capacity(self.len());
2478 for c in self.chars() {
2479 c.escape_default(|c| out.push_char(c));
2484 fn escape_unicode(&self) -> ~str {
2485 let mut out = StrBuf::with_capacity(self.len());
2486 for c in self.chars() {
2487 c.escape_unicode(|c| out.push_char(c));
2493 fn trim(&self) -> &'a str {
2494 self.trim_left().trim_right()
2498 fn trim_left(&self) -> &'a str {
2499 self.trim_left_chars(&char::is_whitespace)
2503 fn trim_right(&self) -> &'a str {
2504 self.trim_right_chars(&char::is_whitespace)
2508 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2509 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2513 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2514 match self.find(|c: char| !to_trim.matches(c)) {
2516 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2521 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2522 match self.rfind(|c: char| !to_trim.matches(c)) {
2525 let next = self.char_range_at(last).next;
2526 unsafe { raw::slice_bytes(*self, 0u, next) }
2531 fn replace(&self, from: &str, to: &str) -> ~str {
2532 let mut result = StrBuf::new();
2533 let mut last_end = 0;
2534 for (start, end) in self.match_indices(from) {
2535 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2536 result.push_str(to);
2539 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2544 fn to_owned(&self) -> ~str {
2545 let len = self.len();
2547 let mut v = slice::with_capacity(len);
2549 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2551 ::cast::transmute(v)
2555 fn to_utf16(&self) -> ~[u16] {
2557 for ch in self.chars() {
2558 // Arithmetic with u32 literals is easier on the eyes than chars.
2559 let mut ch = ch as u32;
2561 if (ch & 0xFFFF_u32) == ch {
2562 // The BMP falls through (assuming non-surrogate, as it
2564 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
2567 // Supplementary planes break into surrogates.
2568 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
2570 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
2571 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2572 u.push_all([w1, w2])
2579 fn is_char_boundary(&self, index: uint) -> bool {
2580 if index == self.len() { return true; }
2581 let b = self[index];
2582 return b < 128u8 || b >= 192u8;
2586 fn char_range_at(&self, i: uint) -> CharRange {
2587 if self[i] < 128u8 {
2588 return CharRange {ch: self[i] as char, next: i + 1 };
2591 // Multibyte case is a fn to allow char_range_at to inline cleanly
2592 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2593 let mut val = s[i] as u32;
2594 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2597 val = utf8_first_byte!(val, w);
2598 val = utf8_acc_cont_byte!(val, s[i + 1]);
2599 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2600 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2602 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2605 return multibyte_char_range_at(*self, i);
2609 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2610 let mut prev = start;
2612 prev = prev.saturating_sub(1);
2613 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2615 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2616 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2617 // while there is a previous byte == 10......
2618 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2622 let mut val = s[i] as u32;
2623 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2626 val = utf8_first_byte!(val, w);
2627 val = utf8_acc_cont_byte!(val, s[i + 1]);
2628 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2629 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2631 return CharRange {ch: unsafe { transmute(val) }, next: i};
2634 return multibyte_char_range_at_reverse(*self, prev);
2638 fn char_at(&self, i: uint) -> char {
2639 self.char_range_at(i).ch
2643 fn char_at_reverse(&self, i: uint) -> char {
2644 self.char_range_at_reverse(i).ch
2648 fn as_bytes(&self) -> &'a [u8] {
2649 unsafe { cast::transmute(*self) }
2652 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
2653 if search.only_ascii() {
2654 self.bytes().position(|b| search.matches(b as char))
2656 for (index, c) in self.char_indices() {
2657 if search.matches(c) { return Some(index); }
2663 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
2664 if search.only_ascii() {
2665 self.bytes().rposition(|b| search.matches(b as char))
2667 for (index, c) in self.char_indices_rev() {
2668 if search.matches(c) { return Some(index); }
2674 fn find_str(&self, needle: &str) -> Option<uint> {
2675 if needle.is_empty() {
2678 self.match_indices(needle)
2680 .map(|(start, _end)| start)
2684 fn repeat(&self, nn: uint) -> ~str {
2685 let mut ret = StrBuf::with_capacity(nn * self.len());
2686 for _ in range(0, nn) {
2687 ret.push_str(*self);
2693 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2694 if self.is_empty() {
2695 return (None, *self);
2697 let CharRange {ch, next} = self.char_range_at(0u);
2698 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2699 return (Some(ch), next_s);
2703 fn lev_distance(&self, t: &str) -> uint {
2704 let slen = self.len();
2707 if slen == 0 { return tlen; }
2708 if tlen == 0 { return slen; }
2710 let mut dcol = slice::from_fn(tlen + 1, |x| x);
2712 for (i, sc) in self.chars().enumerate() {
2714 let mut current = i;
2715 dcol[0] = current + 1;
2717 for (j, tc) in t.chars().enumerate() {
2719 let next = dcol[j + 1];
2722 dcol[j + 1] = current;
2724 dcol[j + 1] = ::cmp::min(current, next);
2725 dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
2735 fn subslice_offset(&self, inner: &str) -> uint {
2736 let a_start = self.as_ptr() as uint;
2737 let a_end = a_start + self.len();
2738 let b_start = inner.as_ptr() as uint;
2739 let b_end = b_start + inner.len();
2741 assert!(a_start <= b_start);
2742 assert!(b_end <= a_end);
2747 fn as_ptr(&self) -> *u8 {
2752 /// Methods for owned strings
2753 pub trait OwnedStr {
2754 /// Shorten a string to the specified length (which must be <= the current length)
2755 fn truncate(&mut self, len: uint);
2757 /// Consumes the string, returning the underlying byte buffer.
2759 /// The buffer does not have a null terminator.
2760 fn into_bytes(self) -> ~[u8];
2762 /// Sets the length of a string
2764 /// This will explicitly set the size of the string, without actually
2765 /// modifying its buffers, so it is up to the caller to ensure that
2766 /// the string is actually the specified size.
2767 unsafe fn set_len(&mut self, new_len: uint);
2769 /// Pushes the given string onto this string, returning the concatenation of the two strings.
2770 fn append(self, rhs: &str) -> ~str;
2773 impl OwnedStr for ~str {
2775 fn truncate(&mut self, len: uint) {
2776 assert!(len <= self.len());
2777 assert!(self.is_char_boundary(len));
2778 unsafe { self.set_len(len); }
2782 fn into_bytes(self) -> ~[u8] {
2783 unsafe { cast::transmute(self) }
2787 unsafe fn set_len(&mut self, new_len: uint) {
2788 raw::as_owned_vec(self).set_len(new_len)
2792 fn append(self, rhs: &str) -> ~str {
2793 let mut new_str = StrBuf::from_owned_str(self);
2794 new_str.push_str(rhs);
2795 new_str.into_owned()
2799 impl Clone for ~str {
2801 fn clone(&self) -> ~str {
2806 impl FromIterator<char> for ~str {
2808 fn from_iter<T: Iterator<char>>(iterator: T) -> ~str {
2809 let (lower, _) = iterator.size_hint();
2810 let mut buf = StrBuf::with_capacity(lower);
2811 buf.extend(iterator);
2816 // This works because every lifetime is a sub-lifetime of 'static
2817 impl<'a> Default for &'a str {
2818 fn default() -> &'a str { "" }
2821 impl Default for ~str {
2822 fn default() -> ~str { ~"" }
2827 use iter::AdditiveIterator;
2828 use default::Default;
2835 assert!((eq(&~"", &~"")));
2836 assert!((eq(&~"foo", &~"foo")));
2837 assert!((!eq(&~"foo", &~"bar")));
#[test]
fn test_eq_slice() {
    // Equal contents reached through different sub-slices compare equal.
    let head = "foobar".slice(0, 3);
    let tail = "barfoo".slice(3, 6);
    assert!(eq_slice(head, "foo"));
    assert!(eq_slice(tail, "foo"));

    // Same length, different contents.
    assert!(!eq_slice("foo1", "foo2"));
}
2850 assert!("" <= "foo");
2851 assert!("foo" <= "foo");
2852 assert!("foo" != "bar");
2857 assert_eq!("".len(), 0u);
2858 assert_eq!("hello world".len(), 11u);
2859 assert_eq!("\x63".len(), 1u);
2860 assert_eq!("\xa2".len(), 2u);
2861 assert_eq!("\u03c0".len(), 2u);
2862 assert_eq!("\u2620".len(), 3u);
2863 assert_eq!("\U0001d11e".len(), 4u);
2865 assert_eq!("".char_len(), 0u);
2866 assert_eq!("hello world".char_len(), 11u);
2867 assert_eq!("\x63".char_len(), 1u);
2868 assert_eq!("\xa2".char_len(), 1u);
2869 assert_eq!("\u03c0".char_len(), 1u);
2870 assert_eq!("\u2620".char_len(), 1u);
2871 assert_eq!("\U0001d11e".char_len(), 1u);
2872 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2877 assert_eq!("hello".find('l'), Some(2u));
2878 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2879 assert!("hello".find('x').is_none());
2880 assert!("hello".find(|c:char| c == 'x').is_none());
2881 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2882 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2887 assert_eq!("hello".rfind('l'), Some(3u));
2888 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2889 assert!("hello".rfind('x').is_none());
2890 assert!("hello".rfind(|c:char| c == 'x').is_none());
2891 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2892 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2898 let s: ~str = empty.chars().collect();
2899 assert_eq!(empty, s);
2900 let data = ~"ประเทศไทย中";
2901 let s: ~str = data.chars().collect();
2902 assert_eq!(data, s);
2906 fn test_into_bytes() {
2908 let buf = data.into_bytes();
2909 assert_eq!(bytes!("asdf"), buf.as_slice());
2913 fn test_find_str() {
2915 assert_eq!("".find_str(""), Some(0u));
2916 assert!("banana".find_str("apple pie").is_none());
2918 let data = "abcabc";
2919 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2920 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
2921 assert!(data.slice(2u, 4u).find_str("ab").is_none());
2923 let mut data = ~"ประเทศไทย中华Việt Nam";
2925 assert!(data.find_str("ไท华").is_none());
2926 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
2927 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
2929 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
2930 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
2931 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
2932 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
2933 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
2935 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
2936 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
2937 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
2938 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
2939 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
2943 fn test_slice_chars() {
2944 fn t(a: &str, b: &str, start: uint) {
2945 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2948 t("hello", "llo", 2);
2949 t("hello", "el", 1);
2952 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2957 fn t(v: &[~str], s: &str) {
2958 assert_eq!(v.concat(), s.to_str());
2960 t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
2961 let v: &[~str] = [];
2968 fn t(v: &[~str], sep: &str, s: &str) {
2969 assert_eq!(v.connect(sep), s.to_str());
2971 t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
2972 " ", "you know I'm no good");
2973 let v: &[~str] = [];
2975 t([~"hi"], " ", "hi");
2979 fn test_concat_slices() {
2980 fn t(v: &[&str], s: &str) {
2981 assert_eq!(v.concat(), s.to_str());
2983 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2984 let v: &[&str] = [];
2990 fn test_connect_slices() {
2991 fn t(v: &[&str], sep: &str, s: &str) {
2992 assert_eq!(v.connect(sep), s.to_str());
2994 t(["you", "know", "I'm", "no", "good"],
2995 " ", "you know I'm no good");
2997 t(["hi"], " ", "hi");
3002 assert_eq!("x".repeat(4), ~"xxxx");
3003 assert_eq!("hi".repeat(4), ~"hihihihi");
3004 assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
3005 assert_eq!("".repeat(4), ~"");
3006 assert_eq!("hi".repeat(0), ~"");
3010 fn test_unsafe_slice() {
3011 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
3012 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
3013 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
3014 fn a_million_letter_a() -> ~str {
3016 let mut rs = StrBuf::new();
3018 rs.push_str("aaaaaaaaaa");
3023 fn half_a_million_letter_a() -> ~str {
3025 let mut rs = StrBuf::new();
3027 rs.push_str("aaaaa");
3032 let letters = a_million_letter_a();
3033 assert!(half_a_million_letter_a() ==
3034 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
#[test]
fn test_starts_with() {
    // (haystack, prefix, expected result)
    let cases = [
        ("", "", true),
        ("abc", "", true),
        ("abc", "a", true),
        ("a", "abc", false),
        ("", "abc", false),
        ("ödd", "-", false),
        ("ödd", "öd", true),
    ];
    for case in cases.iter() {
        let (text, prefix, expected) = *case;
        assert_eq!(text.starts_with(prefix), expected);
    }
}
#[test]
fn test_ends_with() {
    // (haystack, suffix, expected result)
    let cases = [
        ("", "", true),
        ("abc", "", true),
        ("abc", "c", true),
        ("a", "abc", false),
        ("", "abc", false),
        ("ddö", "-", false),
        ("ddö", "dö", true),
    ];
    for case in cases.iter() {
        let (text, suffix, expected) = *case;
        assert_eq!(text.ends_with(suffix), expected);
    }
}
#[test]
fn test_is_empty() {
    // Only the zero-length string counts as empty.
    assert_eq!("".is_empty(), true);
    assert_eq!("a".is_empty(), false);
}
3068 assert_eq!("".replace(a, "b"), ~"");
3069 assert_eq!("a".replace(a, "b"), ~"b");
3070 assert_eq!("ab".replace(a, "b"), ~"bb");
3072 assert!(" test test ".replace(test, "toast") ==
3074 assert_eq!(" test test ".replace(test, ""), ~" ");
3078 fn test_replace_2a() {
3079 let data = ~"ประเทศไทย中华";
3080 let repl = ~"دولة الكويت";
3083 let a2 = ~"دولة الكويتทศไทย中华";
3084 assert_eq!(data.replace(a, repl), a2);
3088 fn test_replace_2b() {
3089 let data = ~"ประเทศไทย中华";
3090 let repl = ~"دولة الكويت";
3093 let b2 = ~"ปรدولة الكويتทศไทย中华";
3094 assert_eq!(data.replace(b, repl), b2);
3098 fn test_replace_2c() {
3099 let data = ~"ประเทศไทย中华";
3100 let repl = ~"دولة الكويت";
3103 let c2 = ~"ประเทศไทยدولة الكويت";
3104 assert_eq!(data.replace(c, repl), c2);
3108 fn test_replace_2d() {
3109 let data = ~"ประเทศไทย中华";
3110 let repl = ~"دولة الكويت";
3113 assert_eq!(data.replace(d, repl), data);
3118 assert_eq!("ab", "abc".slice(0, 2));
3119 assert_eq!("bc", "abc".slice(1, 3));
3120 assert_eq!("", "abc".slice(1, 1));
3121 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3123 let data = "ประเทศไทย中华";
3124 assert_eq!("ป", data.slice(0, 3));
3125 assert_eq!("ร", data.slice(3, 6));
3126 assert_eq!("", data.slice(3, 3));
3127 assert_eq!("华", data.slice(30, 33));
3129 fn a_million_letter_X() -> ~str {
3131 let mut rs = StrBuf::new();
3133 rs.push_str("华华华华华华华华华华");
3138 fn half_a_million_letter_X() -> ~str {
3140 let mut rs = StrBuf::new();
3142 rs.push_str("华华华华华");
3147 let letters = a_million_letter_X();
3148 assert!(half_a_million_letter_X() ==
3149 letters.slice(0u, 3u * 500000u).to_owned());
3154 let ss = "中华Việt Nam";
3156 assert_eq!("华", ss.slice(3u, 6u));
3157 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3159 assert_eq!("ab", "abc".slice(0u, 2u));
3160 assert_eq!("bc", "abc".slice(1u, 3u));
3161 assert_eq!("", "abc".slice(1u, 1u));
3163 assert_eq!("中", ss.slice(0u, 3u));
3164 assert_eq!("华V", ss.slice(3u, 7u));
3165 assert_eq!("", ss.slice(3u, 3u));
3180 fn test_slice_fail() {
3181 "中华Việt Nam".slice(0u, 2u);
#[test]
fn test_slice_from() {
    // (start index, expected remainder of "abcd")
    let cases = [(0u, "abcd"), (2u, "cd"), (4u, "")];
    for case in cases.iter() {
        let (start, expected) = *case;
        assert_eq!("abcd".slice_from(start), expected);
    }
}
#[test]
fn test_slice_to() {
    // (end index, expected prefix of "abcd")
    let cases = [(0u, ""), (2u, "ab"), (4u, "abcd")];
    for case in cases.iter() {
        let (end, expected) = *case;
        assert_eq!("abcd".slice_to(end), expected);
    }
}
3198 fn test_trim_left_chars() {
3199 let v: &[char] = &[];
3200 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3201 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3202 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3203 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3205 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3206 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3207 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
3211 fn test_trim_right_chars() {
3212 let v: &[char] = &[];
3213 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3214 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3215 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3216 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
3218 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3219 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3220 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
3224 fn test_trim_chars() {
3225 let v: &[char] = &[];
3226 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3227 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3228 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3229 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
3231 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3232 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3233 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
3237 fn test_trim_left() {
3238 assert_eq!("".trim_left(), "");
3239 assert_eq!("a".trim_left(), "a");
3240 assert_eq!(" ".trim_left(), "");
3241 assert_eq!(" blah".trim_left(), "blah");
3242 assert_eq!(" \u3000 wut".trim_left(), "wut");
3243 assert_eq!("hey ".trim_left(), "hey ");
3247 fn test_trim_right() {
3248 assert_eq!("".trim_right(), "");
3249 assert_eq!("a".trim_right(), "a");
3250 assert_eq!(" ".trim_right(), "");
3251 assert_eq!("blah ".trim_right(), "blah");
3252 assert_eq!("wut \u3000 ".trim_right(), "wut");
3253 assert_eq!(" hey".trim_right(), " hey");
3258 assert_eq!("".trim(), "");
3259 assert_eq!("a".trim(), "a");
3260 assert_eq!(" ".trim(), "");
3261 assert_eq!(" blah ".trim(), "blah");
3262 assert_eq!("\nwut \u3000 ".trim(), "wut");
3263 assert_eq!(" hey dude ".trim(), "hey dude");
3267 fn test_is_whitespace() {
3268 assert!("".is_whitespace());
3269 assert!(" ".is_whitespace());
3270 assert!("\u2009".is_whitespace()); // Thin space
3271 assert!(" \n\t ".is_whitespace());
3272 assert!(!" _ ".is_whitespace());
3276 fn test_slice_shift_char() {
3277 let data = "ประเทศไทย中";
3278 assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
3282 fn test_slice_shift_char_2() {
3284 assert_eq!(empty.slice_shift_char(), (None, ""));
3289 // deny overlong encodings
3290 assert!(!is_utf8([0xc0, 0x80]));
3291 assert!(!is_utf8([0xc0, 0xae]));
3292 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3293 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3294 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3295 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3296 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3299 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3300 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3302 assert!(is_utf8([0xC2, 0x80]));
3303 assert!(is_utf8([0xDF, 0xBF]));
3304 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3305 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3306 assert!(is_utf8([0xEE, 0x80, 0x80]));
3307 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3308 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3309 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3313 fn test_is_utf16() {
3314 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3322 // surrogate pairs (randomly generated with Python 3's
3323 // .encode('utf-16be'))
3324 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3325 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3326 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3328 // mixtures (also random)
3329 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3330 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3331 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3334 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3337 // surrogate + regular unit
3339 // surrogate + lead surrogate
3341 // unterminated surrogate
3343 // trail surrogate without a lead
3346 // random byte sequences that Python 3's .decode('utf-16be')
3348 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3349 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3350 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3351 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3352 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3353 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3354 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3355 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3356 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3357 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3358 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3359 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3360 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3361 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3362 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3363 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3364 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3365 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3366 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3367 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3368 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3372 fn test_raw_from_c_str() {
3374 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3376 let c = raw::from_c_str(b);
3377 assert_eq!(c, ~"AAAAAAA");
3382 fn test_as_bytes() {
3385 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3386 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3389 assert_eq!("".as_bytes(), &[]);
3390 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3391 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
3396 fn test_as_bytes_fail() {
3397 // Don't double free. (I'm not sure if this exercises the
3398 // original problem code path anymore.)
3400 let _bytes = s.as_bytes();
3406 let buf = "hello".as_ptr();
3408 assert_eq!(*buf.offset(0), 'h' as u8);
3409 assert_eq!(*buf.offset(1), 'e' as u8);
3410 assert_eq!(*buf.offset(2), 'l' as u8);
3411 assert_eq!(*buf.offset(3), 'l' as u8);
3412 assert_eq!(*buf.offset(4), 'o' as u8);
3417 fn test_subslice_offset() {
3418 let a = "kernelsprite";
3419 let b = a.slice(7, a.len());
3420 let c = a.slice(0, a.len() - 6);
3421 assert_eq!(a.subslice_offset(b), 7);
3422 assert_eq!(a.subslice_offset(c), 0);
3424 let string = "a\nb\nc";
3425 let mut lines = ~[];
3426 for line in string.lines() { lines.push(line) }
3427 assert_eq!(string.subslice_offset(lines[0]), 0);
3428 assert_eq!(string.subslice_offset(lines[1]), 2);
3429 assert_eq!(string.subslice_offset(lines[2]), 4);
3434 fn test_subslice_offset_2() {
3435 let a = "alchemiter";
3436 let b = "cruxtruder";
3437 a.subslice_offset(b);
3441 fn vec_str_conversions() {
3442 let s1: ~str = ~"All mimsy were the borogoves";
3444 let v: ~[u8] = s1.as_bytes().to_owned();
3445 let s2: ~str = from_utf8(v).unwrap().to_owned();
3446 let mut i: uint = 0u;
3447 let n1: uint = s1.len();
3448 let n2: uint = v.len();
3461 fn test_contains() {
3462 assert!("abcde".contains("bcd"));
3463 assert!("abcde".contains("abcd"));
3464 assert!("abcde".contains("bcde"));
3465 assert!("abcde".contains(""));
3466 assert!("".contains(""));
3467 assert!(!"abcde".contains("def"));
3468 assert!(!"".contains("a"));
3470 let data = ~"ประเทศไทย中华Việt Nam";
3471 assert!(data.contains("ประเ"));
3472 assert!(data.contains("ะเ"));
3473 assert!(data.contains("中华"));
3474 assert!(!data.contains("ไท华"));
#[test]
fn test_contains_char() {
    // (haystack, character, expected result)
    let cases = [
        ("abc", 'b', true),
        ("a", 'a', true),
        ("abc", 'd', false),
        ("", 'a', false),
    ];
    for case in cases.iter() {
        let (haystack, needle, expected) = *case;
        assert_eq!(haystack.contains_char(needle), expected);
    }
}
3489 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3490 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3491 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3492 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3495 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3496 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3497 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3498 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3499 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3502 (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3503 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3504 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3505 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3506 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3507 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3508 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3509 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3511 (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3512 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3513 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3514 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3515 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3516 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3517 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3518 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3519 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3520 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3521 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3523 // Issue #12318, even-numbered non-BMP planes
3525 ~[0xD840, 0xDC00])];
3527 for p in pairs.iter() {
3528 let (s, u) = (*p).clone();
3529 assert!(is_utf16(u));
3530 assert_eq!(s.to_utf16(), u);
3532 assert_eq!(from_utf16(u).unwrap(), s);
3533 assert_eq!(from_utf16_lossy(u), s);
3535 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3536 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
3541 fn test_utf16_invalid() {
3542 // completely positive cases tested above.
3544 assert_eq!(from_utf16([0xD800]), None);
3546 assert_eq!(from_utf16([0xD800, 0xD800]), None);
3549 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
3552 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
3556 fn test_utf16_lossy() {
3557 // completely positive cases tested above.
3559 assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
3561 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
3564 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
3567 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
3571 fn test_truncate_utf16_at_nul() {
3573 assert_eq!(truncate_utf16_at_nul(v), &[]);
3576 assert_eq!(truncate_utf16_at_nul(v), &[]);
3579 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3582 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3585 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3590 let s = ~"ศไทย中华Việt Nam";
3591 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3593 for ch in v.iter() {
3594 assert!(s.char_at(pos) == *ch);
3595 pos += from_char(*ch).len();
3600 fn test_char_at_reverse() {
3601 let s = ~"ศไทย中华Việt Nam";
3602 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3603 let mut pos = s.len();
3604 for ch in v.rev_iter() {
3605 assert!(s.char_at_reverse(pos) == *ch);
3606 pos -= from_char(*ch).len();
#[test]
fn test_escape_unicode() {
    // (input, expected escaped form)
    let cases = [
        ("abc", "\\x61\\x62\\x63"),
        ("a c", "\\x61\\x20\\x63"),
        ("\r\n\t", "\\x0d\\x0a\\x09"),
        ("'\"\\", "\\x27\\x22\\x5c"),
        ("\x00\x01\xfe\xff", "\\x00\\x01\\xfe\\xff"),
        ("\u0100\uffff", "\\u0100\\uffff"),
        ("\U00010000\U0010ffff", "\\U00010000\\U0010ffff"),
        ("ab\ufb00", "\\x61\\x62\\ufb00"),
        ("\U0001d4ea\r", "\\U0001d4ea\\x0d"),
    ];
    for case in cases.iter() {
        let (input, expected) = *case;
        assert_eq!(input.escape_unicode(), expected.to_owned());
    }
}
#[test]
fn test_escape_default() {
    // (input, expected escaped form)
    let cases = [
        ("abc", "abc"),
        ("a c", "a c"),
        ("\r\n\t", "\\r\\n\\t"),
        ("'\"\\", "\\'\\\"\\\\"),
        ("\u0100\uffff", "\\u0100\\uffff"),
        ("\U00010000\U0010ffff", "\\U00010000\\U0010ffff"),
        ("ab\ufb00", "ab\\ufb00"),
        ("\U0001d4ea\r", "\\U0001d4ea\\r"),
    ];
    for case in cases.iter() {
        let (input, expected) = *case;
        assert_eq!(input.escape_default(), expected.to_owned());
    }
}
#[test]
fn test_total_ord() {
    // Each comparison must be asserted; a bare `a.cmp(b) == X;`
    // statement evaluates the comparison and throws the result away,
    // so the test could never fail.

    // A string is greater than any of its proper prefixes.
    assert!("1234".cmp(& &"123") == Greater);
    assert!("123".cmp(& &"1234") == Less);
    assert!("1234".cmp(& &"1234") == Equal);
    // The first differing character decides, regardless of length.
    assert!("12345555".cmp(& &"123456") == Less);
    assert!("22".cmp(& &"1234") == Greater);
}
3645 fn test_char_range_at() {
3646 let data = ~"b¢€𤭢𤭢€¢b";
3647 assert_eq!('b', data.char_range_at(0).ch);
3648 assert_eq!('¢', data.char_range_at(1).ch);
3649 assert_eq!('€', data.char_range_at(3).ch);
3650 assert_eq!('𤭢', data.char_range_at(6).ch);
3651 assert_eq!('𤭢', data.char_range_at(10).ch);
3652 assert_eq!('€', data.char_range_at(14).ch);
3653 assert_eq!('¢', data.char_range_at(17).ch);
3654 assert_eq!('b', data.char_range_at(19).ch);
#[test]
fn test_char_range_at_reverse_underflow() {
    // Stepping backwards from offset 0 must report 0 as the next offset.
    let range = "abc".char_range_at_reverse(0);
    assert_eq!(range.next, 0);
}
3664 #![allow(unnecessary_allocation)]
3666 ($s1:expr, $s2:expr, $e:expr) => { {
3670 assert_eq!(s1 + s2, e.to_owned());
3671 assert_eq!(s1.to_owned() + s2, e.to_owned());
3675 t!("foo", "bar", "foobar");
3676 t!("foo", ~"bar", "foobar");
3677 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
3678 t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
3682 fn test_iterator() {
3684 let s = ~"ศไทย中华Việt Nam";
3685 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3688 let mut it = s.chars();
3691 assert_eq!(c, v[pos]);
3694 assert_eq!(pos, v.len());
3698 fn test_rev_iterator() {
3700 let s = ~"ศไทย中华Việt Nam";
3701 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3704 let mut it = s.chars_rev();
3707 assert_eq!(c, v[pos]);
3710 assert_eq!(pos, v.len());
3714 fn test_iterator_clone() {
3715 let s = "ศไทย中华Việt Nam";
3716 let mut it = s.chars();
3718 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
3722 fn test_bytesator() {
3723 let s = ~"ศไทย中华Việt Nam";
3725 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3726 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3731 for b in s.bytes() {
3732 assert_eq!(b, v[pos]);
3738 fn test_bytes_revator() {
3739 let s = ~"ศไทย中华Việt Nam";
3741 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3742 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3745 let mut pos = v.len();
3747 for b in s.bytes_rev() {
3749 assert_eq!(b, v[pos]);
3754 fn test_char_indicesator() {
3756 let s = "ศไทย中华Việt Nam";
3757 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
3758 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3761 let mut it = s.char_indices();
3764 assert_eq!(c, (p[pos], v[pos]));
3767 assert_eq!(pos, v.len());
3768 assert_eq!(pos, p.len());
3772 fn test_char_indices_revator() {
3774 let s = "ศไทย中华Việt Nam";
3775 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
3776 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3779 let mut it = s.char_indices_rev();
3782 assert_eq!(c, (p[pos], v[pos]));
3785 assert_eq!(pos, v.len());
3786 assert_eq!(pos, p.len());
3790 fn test_split_char_iterator() {
3791 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3793 let split: ~[&str] = data.split(' ').collect();
3794 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3796 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
3798 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3800 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
3801 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3803 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
3805 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3808 let split: ~[&str] = data.split('ä').collect();
3809 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3811 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
3813 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3815 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
3816 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3818 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
3820 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3824 fn test_splitn_char_iterator() {
3825 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3827 let split: ~[&str] = data.splitn(' ', 3).collect();
3828 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3830 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
3831 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3834 let split: ~[&str] = data.splitn('ä', 3).collect();
3835 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3837 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
3838 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3842 fn test_rsplitn_char_iterator() {
3843 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3845 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
3847 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3849 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
3851 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3854 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
3856 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3858 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
3860 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3864 fn test_split_char_iterator_no_trailing() {
3865 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3867 let split: ~[&str] = data.split('\n').collect();
3868 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3870 let split: ~[&str] = data.split_terminator('\n').collect();
3871 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3875 fn test_rev_split_char_iterator_no_trailing() {
3876 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3878 let mut split: ~[&str] = data.split('\n').rev().collect();
3880 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3882 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
3884 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3889 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
3890 let words: ~[&str] = data.words().collect();
3891 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
3895 fn test_nfd_chars() {
3896 assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
3897 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
3898 assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
3899 assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
3900 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3901 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3902 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
3903 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
3904 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3905 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
3909 fn test_nfkd_chars() {
3910 assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
3911 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
3912 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
3913 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
3914 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3915 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3916 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
3917 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
3918 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3919 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
3924 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3925 let lines: ~[&str] = data.lines().collect();
3926 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3928 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3929 let lines: ~[&str] = data.lines().collect();
3930 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3934 fn test_split_strator() {
3935 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3936 let v: ~[&str] = s.split_str(sep).collect();
3939 t("--1233345--", "12345", ~["--1233345--"]);
3940 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3941 t("::hello::there", "::", ~["", "hello", "there"]);
3942 t("hello::there::", "::", ~["hello", "there", ""]);
3943 t("::hello::there::", "::", ~["", "hello", "there", ""]);
3944 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3945 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3946 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3947 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3949 t("zz", "zz", ~["",""]);
3950 t("ok", "z", ~["ok"]);
3951 t("zzz", "zz", ~["","z"]);
3952 t("zzzzz", "zz", ~["","","z"]);
3956 fn test_str_default() {
3957 use default::Default;
3958 fn t<S: Default + Str>() {
3959 let s: S = Default::default();
3960 assert_eq!(s.as_slice(), "");
3968 fn test_str_container() {
3969 fn sum_len<S: Container>(v: &[S]) -> uint {
3970 v.iter().map(|x| x.len()).sum()
3974 assert_eq!(5, sum_len(["012", "", "34"]));
3975 assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
3976 assert_eq!(5, sum_len([s.as_slice()]));
#[test]
fn test_str_from_utf8() {
    // Valid ASCII and valid multi-byte input both decode to a borrowed slice.
    assert_eq!(from_utf8(bytes!("hello")), Some("hello"));
    assert_eq!(from_utf8(bytes!("ศไทย中华Việt Nam")), Some("ศไทย中华Việt Nam"));

    // A stray 0xff byte makes the input invalid.
    assert_eq!(from_utf8(bytes!("hello", 0xff)), None);
}
#[test]
fn test_str_from_utf8_owned() {
    // Valid ASCII and valid multi-byte buffers are converted into owned strings.
    assert_eq!(from_utf8_owned(bytes!("hello").to_owned()), Some(~"hello"));
    assert_eq!(from_utf8_owned(bytes!("ศไทย中华Việt Nam").to_owned()),
               Some(~"ศไทย中华Việt Nam"));

    // A stray 0xff byte makes the buffer invalid.
    assert_eq!(from_utf8_owned(bytes!("hello", 0xff).to_owned()), None);
}
// Checks `from_utf8_lossy`. Valid input is expected back as `Slice(..)`; input
// with malformed sequences is expected as `Owned(..)` with U+FFFD (`\uFFFD`)
// standing in for the bad bytes.
4004 fn test_str_from_utf8_lossy() {
// Valid ASCII: expected as a `Slice`.
4005 let xs = bytes!("hello");
4006 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
// Valid multi-byte text: also expected as a `Slice`.
4008 let xs = bytes!("ศไทย中华Việt Nam");
4009 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// A lead byte (0xC2) with no continuation, and 0xFF, which is never valid in
// UTF-8: one replacement character each.
4011 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
4012 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
// Overlong encoding 0xC0 0x80: one replacement per byte. The truncated
// three-byte sequence 0xE6 0x83 is expected to collapse into a single one.
4014 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4015 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
// 0xF5 is not a valid lead byte, so it and a following 0x80 are each replaced.
4017 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
4018 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
// Four-byte lead 0xF1 truncated after 1, 2 and 3 bytes: one replacement per
// truncated sequence.
4020 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
4021 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
// Lead 0xF4: a truncated sequence gives one replacement; the out-of-range
// second byte 0xBF is expected to give two.
4023 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
4024 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
// Overlong four-byte form 0xF0 0x80 0x80 0x80: one replacement per byte.
// 0xF0 0x90 0x80 0x80 is the valid encoding of U+10000 and must be kept.
4026 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
4027 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
// Encoded surrogates (0xED 0xA0 0x80 and 0xED 0xBF 0xBF): one replacement per byte.
4030 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4031 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
// Checks that `from_str` can produce an owned string: parsing "string" is
// expected to yield `Some(~"string")`.
4035 fn test_from_str() {
4036 let owned: Option<~str> = from_str(&"string");
4037 assert_eq!(owned, Some(~"string"));
// Exercises the trait surface of both `Slice` and `Owned` values holding the
// same text: length, slicing, string conversion, formatting, ordering,
// defaults, and cross-variant comparison.
4041 fn test_maybe_owned_traits() {
// Borrowed variant.
4042 let s = Slice("abcde");
4043 assert_eq!(s.len(), 5);
4044 assert_eq!(s.as_slice(), "abcde");
4045 assert_eq!(s.to_str(), ~"abcde");
4046 assert_eq!(format!("{}", s), ~"abcde");
// Ordering is checked against the opposite variant.
4047 assert!(s.lt(&Owned(~"bcdef")));
4048 assert_eq!(Slice(""), Default::default());
// Owned variant: the same checks, mirrored.
4050 let o = Owned(~"abcde");
4051 assert_eq!(o.len(), 5);
4052 assert_eq!(o.as_slice(), "abcde");
4053 assert_eq!(o.to_str(), ~"abcde");
4054 assert_eq!(format!("{}", o), ~"abcde");
4055 assert!(o.lt(&Slice("bcdef")));
4056 assert_eq!(Owned(~""), Default::default());
// The two variants must compare as equal in both directions.
4058 assert!(s.cmp(&o) == Equal);
4059 assert!(s.equiv(&o));
4061 assert!(o.cmp(&s) == Equal);
4062 assert!(o.equiv(&s));
// Checks the variant predicates: `is_slice()` and `is_owned()` must each be
// true for exactly their own variant.
4066 fn test_maybe_owned_methods() {
4067 let s = Slice("abcde");
4068 assert!(s.is_slice());
4069 assert!(!s.is_owned());
4071 let o = Owned(~"abcde");
4072 assert!(!o.is_slice());
4073 assert!(o.is_owned());
// Checks that cloning either variant yields a value equal to both an `Owned`
// and a `Slice` holding the same text.
// NOTE(review): since both expectations pass for the same clone, equality here
// evidently compares contents only; these assertions do not pin which variant
// `clone()` returns.
4077 fn test_maybe_owned_clone() {
4078 assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4079 assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4080 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4081 assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
// Checks that `into_owned()` yields the same owned string from either variant.
4085 fn test_maybe_owned_into_owned() {
4086 assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4087 assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
// Checks `into_maybe_owned()` on both a borrowed literal and an owned string.
// Each result is compared against both a `Slice` and an `Owned` with the same
// text.
// NOTE(review): because every result is asserted equal to both variants, these
// checks pin the contents only, not which variant is produced.
4091 fn test_into_maybe_owned() {
4092 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4093 assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4094 assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4095 assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4102 use self::test::Bencher;
// Benchmark: walks `chars()` over mixed multi-byte/ASCII text and checks the
// number of items against the precomputed `char_len()`.
4107 fn char_iterator(b: &mut Bencher) {
4108 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Computed once, outside the timed closure.
4109 let len = s.char_len();
4111 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: same as the `chars()` benchmark on mixed text, but over a longer,
// ASCII-only, multi-line string literal.
4115 fn char_iterator_ascii(b: &mut Bencher) {
4116 let s = "Mary had a little lamb, Little lamb
4117 Mary had a little lamb, Little lamb
4118 Mary had a little lamb, Little lamb
4119 Mary had a little lamb, Little lamb
4120 Mary had a little lamb, Little lamb
4121 Mary had a little lamb, Little lamb";
// Computed once, outside the timed closure.
4122 let len = s.char_len();
4124 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: walks `chars_rev()` over mixed multi-byte/ASCII text and checks
// the number of items against `char_len()`.
4128 fn char_iterator_rev(b: &mut Bencher) {
4129 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4130 let len = s.char_len();
4132 b.iter(|| assert_eq!(s.chars_rev().len(), len));
// Benchmark: walks `char_indices()` over mixed multi-byte/ASCII text and checks
// the number of items against `char_len()`.
// NOTE(review): the name `char_indicesator` reads like the residue of a
// mechanical rename -- consider a clearer name if nothing selects this
// benchmark by its current one.
4136 fn char_indicesator(b: &mut Bencher) {
4137 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4138 let len = s.char_len();
4140 b.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: walks `char_indices_rev()` over mixed multi-byte/ASCII text and
// checks the number of items against `char_len()`.
// NOTE(review): the name `char_indicesator_rev` reads like the residue of a
// mechanical rename -- consider a clearer name if nothing selects this
// benchmark by its current one.
4144 fn char_indicesator_rev(b: &mut Bencher) {
4145 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4146 let len = s.char_len();
4148 b.iter(|| assert_eq!(s.char_indices_rev().len(), len));
// Benchmark: splits multi-byte text on the ASCII character 'V'. The text
// contains 'V' twice, so three pieces are expected.
4152 fn split_unicode_ascii(b: &mut Bencher) {
4153 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4155 b.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: splits the same multi-byte text on 'V', but through a custom
// `CharEq` wrapper whose `only_ascii()` returns false.
4159 fn split_unicode_not_ascii(b: &mut Bencher) {
// Wrapper around a single char used as the split predicate.
4160 struct NotAscii(char);
4161 impl CharEq for NotAscii {
4162 fn matches(&self, c: char) -> bool {
// Unpack the wrapped char.
// NOTE(review): the comparison that follows is not visible in this listing --
// presumably it tests the wrapped char against `c`; confirm.
4163 let NotAscii(cc) = *self;
// Always false, unlike what a bare ASCII char would presumably report.
4166 fn only_ascii(&self) -> bool { false }
4168 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4170 b.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splits ASCII text on a space character. The expected piece count
// is computed once, outside the timed closure, with the same call.
4175 fn split_ascii(b: &mut Bencher) {
4176 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4177 let len = s.split(' ').len();
4179 b.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splits ASCII text on a space through a custom `CharEq` wrapper
// whose `only_ascii()` returns false. The piece count must match a plain
// `split(' ')`.
4183 fn split_not_ascii(b: &mut Bencher) {
// Wrapper around a single char used as the split predicate.
4184 struct NotAscii(char);
4185 impl CharEq for NotAscii {
4187 fn matches(&self, c: char) -> bool {
// Unpack the wrapped char.
// NOTE(review): the comparison that follows is not visible in this listing --
// presumably it tests the wrapped char against `c`; confirm.
4188 let NotAscii(cc) = *self;
4191 fn only_ascii(&self) -> bool { false }
4193 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain char separator.
4194 let len = s.split(' ').len();
4196 b.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splits ASCII text with a named function as the predicate. The
// piece count must match a plain `split(' ')`.
4200 fn split_extern_fn(b: &mut Bencher) {
4201 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain char separator.
4202 let len = s.split(' ').len();
// Predicate: true only for the space character.
4203 fn pred(c: char) -> bool { c == ' ' }
4205 b.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splits ASCII text with a closure as the predicate. The piece
// count must match a plain `split(' ')`.
4209 fn split_closure(b: &mut Bencher) {
4210 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain char separator.
4211 let len = s.split(' ').len();
4213 b.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splits ASCII text with a one-element char slice as the separator
// set. The piece count must match a plain `split(' ')`.
4217 fn split_slice(b: &mut Bencher) {
4218 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain char separator.
4219 let len = s.split(' ').len();
4221 b.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark set-up: builds an ASCII byte string and asserts that it is exactly
// 100 bytes long, so the input size stays fixed.
// NOTE(review): the timed closure is not visible in this listing -- going by
// the function name it presumably runs a UTF-8 validity check over `s`; confirm.
4225 fn is_utf8_100_ascii(b: &mut Bencher) {
4227 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4228 Lorem ipsum dolor sit amet, consectetur. ");
4230 assert_eq!(100, s.len());
// Benchmark set-up: builds a byte string of multi-byte characters from several
// scripts and asserts that it is exactly 100 bytes long.
// NOTE(review): the timed closure is not visible in this listing -- going by
// the function name it presumably runs a UTF-8 validity check over `s`; confirm.
4237 fn is_utf8_100_multibyte(b: &mut Bencher) {
4238 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4239 assert_eq!(100, s.len());
// Benchmark: runs `from_utf8_lossy` over a 100-byte ASCII input.
4246 fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
4247 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4248 Lorem ipsum dolor sit amet, consectetur. ");
// Guards the input size the benchmark name promises.
4250 assert_eq!(100, s.len());
// The result is discarded; only the conversion itself matters here.
// NOTE(review): the enclosing `b.iter` closure is not visible in this listing.
4252 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a 100-byte input made of multi-byte
// characters from several scripts.
4257 fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
4258 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guards the input size the benchmark name promises.
4259 assert_eq!(100, s.len());
// The result is discarded; only the conversion itself matters here.
// NOTE(review): the enclosing `b.iter` closure is not visible in this listing.
4261 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a short input that mixes ASCII text
// with malformed sequences (0xC0 0x80 and a truncated 0xE6 0x83).
4266 fn from_utf8_lossy_invalid(b: &mut Bencher) {
4267 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
// The result is discarded; only the conversion itself matters here.
// NOTE(review): the enclosing `b.iter` closure is not visible in this listing.
4269 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over 100 copies of the byte 0xF5, so every
// byte of the input is invalid.
4274 fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
4275 let s = ::slice::from_elem(100, 0xF5u8);
// The result is discarded; only the conversion itself matters here.
// NOTE(review): the enclosing `b.iter` closure is not visible in this listing.
4277 let _ = from_utf8_lossy(s);
// Benchmark: joins ten copies of `s` with the separator `sep` via `connect`
// and checks the byte length of the result.
4282 fn bench_connect(b: &mut Bencher) {
4283 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// NOTE(review): the binding of `sep` is not visible in this listing -- confirm
// its value.
4285 let v = [s, s, s, s, s, s, s, s, s, s];
// Ten pieces and nine separators between them.
4287 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);