1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = "I am an owned string".to_owned();
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 take on this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
65 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
66 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
67 encoded UTF-8 sequences. Additionally, strings are not null-terminated
68 and can contain null codepoints.
70 The actual representation of strings has direct mappings to vectors:
72 * `~str` is the same as `~[u8]`
73 * `&str` is the same as `&[u8]`
82 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
83 use container::Container;
86 use iter::{Iterator, FromIterator, Extendable, range};
87 use iter::{Filter, AdditiveIterator, Map};
88 use iter::{Rev, DoubleEndedIterator, ExactSize};
91 use option::{None, Option, Some};
93 use from_str::FromStr;
95 use slice::{OwnedVector, ImmutableVector, MutableVector};
103 Section: Creating a string
106 /// Consumes a vector of bytes to create a new utf-8 string.
107 /// Returns None if the vector contains invalid UTF-8.
108 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
110 Some(unsafe { raw::from_utf8_owned(vv) })
116 /// Converts a vector to a string slice without performing any allocations.
118 /// Once the slice has been validated as utf-8, it is transmuted in-place and
119 /// returned as a '&str' instead of a '&[u8]'
121 /// Returns None if the slice is not utf-8.
122 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
124 Some(unsafe { raw::from_utf8(v) })
128 impl FromStr for ~str {
130 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
133 /// Convert a byte to a UTF-8 string
137 /// Fails if invalid UTF-8
138 pub fn from_byte(b: u8) -> ~str {
140 unsafe { ::cast::transmute(~[b]) }
143 /// Convert a char to a string
144 pub fn from_char(ch: char) -> ~str {
145 let mut buf = StrBuf::new();
150 /// Convert a vector of chars to a string
151 pub fn from_chars(chs: &[char]) -> ~str {
152 chs.iter().map(|c| *c).collect()
155 /// Methods for vectors of strings
156 pub trait StrVector {
157 /// Concatenate a vector of strings.
158 fn concat(&self) -> ~str;
160 /// Concatenate a vector of strings, placing a given separator between each.
161 fn connect(&self, sep: &str) -> ~str;
164 impl<'a, S: Str> StrVector for &'a [S] {
165 fn concat(&self) -> ~str {
166 if self.is_empty() { return "".to_owned(); }
168 // `len` calculation may overflow, but push_str will check boundaries
169 let len = self.iter().map(|s| s.as_slice().len()).sum();
171 let mut result = StrBuf::with_capacity(len);
173 for s in self.iter() {
174 result.push_str(s.as_slice())
180 fn connect(&self, sep: &str) -> ~str {
181 if self.is_empty() { return "".to_owned(); }
184 if sep.is_empty() { return self.concat(); }
186 // this is wrong without the guarantee that `self` is non-empty
187 // `len` calculation may overflow, but push_str will check boundaries
188 let len = sep.len() * (self.len() - 1)
189 + self.iter().map(|s| s.as_slice().len()).sum();
190 let mut result = StrBuf::with_capacity(len);
191 let mut first = true;
193 for s in self.iter() {
197 result.push_str(sep);
199 result.push_str(s.as_slice());
205 impl<'a, S: Str> StrVector for Vec<S> {
207 fn concat(&self) -> ~str {
208 self.as_slice().concat()
212 fn connect(&self, sep: &str) -> ~str {
213 self.as_slice().connect(sep)
217 /// Something that can be used to compare against a character
219 /// Determine if the splitter should split at the given character
220 fn matches(&self, char) -> bool;
221 /// Indicate if this is only concerned about ASCII characters,
222 /// which can allow for a faster implementation.
223 fn only_ascii(&self) -> bool;
226 impl CharEq for char {
228 fn matches(&self, c: char) -> bool { *self == c }
231 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
234 impl<'a> CharEq for |char|: 'a -> bool {
236 fn matches(&self, c: char) -> bool { (*self)(c) }
239 fn only_ascii(&self) -> bool { false }
242 impl CharEq for extern "Rust" fn(char) -> bool {
244 fn matches(&self, c: char) -> bool { (*self)(c) }
247 fn only_ascii(&self) -> bool { false }
250 impl<'a, C: CharEq> CharEq for &'a [C] {
252 fn matches(&self, c: char) -> bool {
253 self.iter().any(|m| m.matches(c))
257 fn only_ascii(&self) -> bool {
258 self.iter().all(|m| m.only_ascii())
266 /// External iterator for a string's characters.
267 /// Use with the `std::iter` module.
269 pub struct Chars<'a> {
270 /// The slice remaining to be iterated
274 impl<'a> Iterator<char> for Chars<'a> {
276 fn next(&mut self) -> Option<char> {
277 // Decode the next codepoint, then update
278 // the slice to be just the remaining part
279 if self.string.len() != 0 {
280 let CharRange {ch, next} = self.string.char_range_at(0);
282 self.string = raw::slice_unchecked(self.string, next, self.string.len());
291 fn size_hint(&self) -> (uint, Option<uint>) {
292 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
296 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
298 fn next_back(&mut self) -> Option<char> {
299 if self.string.len() != 0 {
300 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
302 self.string = raw::slice_unchecked(self.string, 0, next);
311 /// External iterator for a string's characters and their byte offsets.
312 /// Use with the `std::iter` module.
314 pub struct CharOffsets<'a> {
315 /// The original string to be iterated
320 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
322 fn next(&mut self) -> Option<(uint, char)> {
323 // Compute the byte offset by using the pointer offset between
324 // the original string slice and the iterator's remaining part
325 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
326 self.iter.next().map(|ch| (offset, ch))
330 fn size_hint(&self) -> (uint, Option<uint>) {
331 self.iter.size_hint()
335 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
337 fn next_back(&mut self) -> Option<(uint, char)> {
338 self.iter.next_back().map(|ch| {
339 let offset = self.iter.string.len() +
340 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
346 /// External iterator for a string's characters in reverse order.
347 /// Use with the `std::iter` module.
348 pub type RevChars<'a> = Rev<Chars<'a>>;
350 /// External iterator for a string's characters and their byte offsets in reverse order.
351 /// Use with the `std::iter` module.
352 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
354 /// External iterator for a string's bytes.
355 /// Use with the `std::iter` module.
357 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
359 /// External iterator for a string's bytes in reverse order.
360 /// Use with the `std::iter` module.
361 pub type RevBytes<'a> = Rev<Bytes<'a>>;
363 /// An iterator over the substrings of a string, separated by `sep`.
365 pub struct CharSplits<'a, Sep> {
366 /// The slice remaining to be iterated
369 /// Whether an empty string at the end is allowed
370 allow_trailing_empty: bool,
375 /// An iterator over the substrings of a string, separated by `sep`,
376 /// starting from the back of the string.
377 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
379 /// An iterator over the substrings of a string, separated by `sep`,
380 /// splitting at most `count` times.
382 pub struct CharSplitsN<'a, Sep> {
383 iter: CharSplits<'a, Sep>,
384 /// The number of splits remaining
389 /// An iterator over the words of a string, separated by a sequence of whitespace
391 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
393 /// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
394 pub type AnyLines<'a> =
395 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
397 impl<'a, Sep> CharSplits<'a, Sep> {
399 fn get_end(&mut self) -> Option<&'a str> {
400 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
401 self.finished = true;
409 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
411 fn next(&mut self) -> Option<&'a str> {
412 if self.finished { return None }
414 let mut next_split = None;
416 for (idx, byte) in self.string.bytes().enumerate() {
417 if self.sep.matches(byte as char) && byte < 128u8 {
418 next_split = Some((idx, idx + 1));
423 for (idx, ch) in self.string.char_indices() {
424 if self.sep.matches(ch) {
425 next_split = Some((idx, self.string.char_range_at(idx).next));
431 Some((a, b)) => unsafe {
432 let elt = raw::slice_unchecked(self.string, 0, a);
433 self.string = raw::slice_unchecked(self.string, b, self.string.len());
436 None => self.get_end(),
441 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
442 for CharSplits<'a, Sep> {
444 fn next_back(&mut self) -> Option<&'a str> {
445 if self.finished { return None }
447 if !self.allow_trailing_empty {
448 self.allow_trailing_empty = true;
449 match self.next_back() {
450 Some(elt) if !elt.is_empty() => return Some(elt),
451 _ => if self.finished { return None }
454 let len = self.string.len();
455 let mut next_split = None;
458 for (idx, byte) in self.string.bytes().enumerate().rev() {
459 if self.sep.matches(byte as char) && byte < 128u8 {
460 next_split = Some((idx, idx + 1));
465 for (idx, ch) in self.string.char_indices_rev() {
466 if self.sep.matches(ch) {
467 next_split = Some((idx, self.string.char_range_at(idx).next));
473 Some((a, b)) => unsafe {
474 let elt = raw::slice_unchecked(self.string, b, len);
475 self.string = raw::slice_unchecked(self.string, 0, a);
478 None => { self.finished = true; Some(self.string) }
483 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
485 fn next(&mut self) -> Option<&'a str> {
488 if self.invert { self.iter.next_back() } else { self.iter.next() }
495 /// An iterator over the start and end indices of the matches of a
496 /// substring within a larger string
498 pub struct MatchIndices<'a> {
504 /// An iterator over the substrings of a string separated by a given
507 pub struct StrSplits<'a> {
508 it: MatchIndices<'a>,
513 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
515 fn next(&mut self) -> Option<(uint, uint)> {
516 // See Issue #1932 for why this is a naive search
517 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
518 let mut match_start = 0;
521 while self.position < h_len {
522 if self.haystack[self.position] == self.needle[match_i] {
523 if match_i == 0 { match_start = self.position; }
527 if match_i == n_len {
529 return Some((match_start, self.position));
532 // failed match, backtrack
535 self.position = match_start;
544 impl<'a> Iterator<&'a str> for StrSplits<'a> {
546 fn next(&mut self) -> Option<&'a str> {
547 if self.finished { return None; }
549 match self.it.next() {
550 Some((from, to)) => {
551 let ret = Some(self.it.haystack.slice(self.last_end, from));
556 self.finished = true;
557 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
563 // Helper functions used for Unicode normalization
564 fn canonical_sort(comb: &mut [(char, u8)]) {
568 let len = comb.len();
569 for i in range(0, len) {
570 let mut swapped = false;
571 for j in range(1, len-i) {
572 let class_a = *comb[j-1].ref1();
573 let class_b = *comb[j].ref1();
574 if class_a != 0 && class_b != 0 && class_a > class_b {
579 if !swapped { break; }
584 enum NormalizationForm {
589 /// External iterator for a string's normalization's characters.
590 /// Use with the `std::iter` module.
592 pub struct Normalizations<'a> {
593 kind: NormalizationForm,
595 buffer: Vec<(char, u8)>,
599 impl<'a> Iterator<char> for Normalizations<'a> {
601 fn next(&mut self) -> Option<char> {
602 use unicode::decompose::canonical_combining_class;
604 match self.buffer.as_slice().head() {
610 Some(&(c, _)) if self.sorted => {
614 _ => self.sorted = false
617 let decomposer = match self.kind {
618 NFD => char::decompose_canonical,
619 NFKD => char::decompose_compatible
623 for ch in self.iter {
624 let buffer = &mut self.buffer;
625 let sorted = &mut self.sorted;
627 let class = canonical_combining_class(d);
628 if class == 0 && !*sorted {
629 canonical_sort(buffer.as_mut_slice());
632 buffer.push((d, class));
639 canonical_sort(self.buffer.as_mut_slice());
643 match self.buffer.shift() {
648 Some((c, _)) => Some(c),
653 fn size_hint(&self) -> (uint, Option<uint>) {
654 let (lower, _) = self.iter.size_hint();
659 /// Replace all occurrences of one string with another
663 /// * s - The string containing substrings to replace
664 /// * from - The string to replace
665 /// * to - The replacement string
669 /// The original string with all occurrences of `from` replaced with `to`
670 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
671 let mut result = StrBuf::new();
672 let mut last_end = 0;
673 for (start, end) in s.match_indices(from) {
674 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
678 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
683 Section: Comparing strings
686 // share the implementation of the lang-item vs. non-lang-item
689 fn eq_slice_(a: &str, b: &str) -> bool {
690 a.len() == b.len() && unsafe {
691 libc::memcmp(a.as_ptr() as *libc::c_void,
692 b.as_ptr() as *libc::c_void,
693 a.len() as libc::size_t) == 0
697 /// Bytewise slice equality
701 pub fn eq_slice(a: &str, b: &str) -> bool {
705 /// Bytewise slice equality
708 pub fn eq_slice(a: &str, b: &str) -> bool {
712 /// Bytewise string equality
714 #[lang="uniq_str_eq"]
716 pub fn eq(a: &~str, b: &~str) -> bool {
722 pub fn eq(a: &~str, b: &~str) -> bool {
730 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
731 /// returning `true` in that case, or, if it is invalid, `false` with
732 /// `iter` reset such that it is pointing at the first byte in the
733 /// invalid sequence.
735 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
737 // save the current thing we're pointing at.
740 // restore the iterator we had at the start of this codepoint.
741 macro_rules! err ( () => { {*iter = old; return false} });
742 macro_rules! next ( () => {
745 // we needed data, but there was none: error!
750 let first = match iter.next() {
752 // we're at the end of the iterator and a codepoint
753 // boundary at the same time, so this string is valid.
757 // ASCII characters are always valid, so only large
758 // bytes need more examination.
760 let w = utf8_char_width(first);
761 let second = next!();
762 // 2-byte encoding is for codepoints \u0080 to \u07ff
763 // first C2 80 last DF BF
764 // 3-byte encoding is for codepoints \u0800 to \uffff
765 // first E0 A0 80 last EF BF BF
766 // excluding surrogate codepoints \ud800 to \udfff
767 // ED A0 80 to ED BF BF
768 // 4-byte encoding is for codepoints \u10000 to \u10ffff
769 // first F0 90 80 80 last F4 8F BF BF
771 // Use the UTF-8 syntax from the RFC
773 // https://tools.ietf.org/html/rfc3629
775 // UTF8-2 = %xC2-DF UTF8-tail
776 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
777 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
778 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
779 // %xF4 %x80-8F 2( UTF8-tail )
781 2 => if second & 192 != TAG_CONT_U8 {err!()},
783 match (first, second, next!() & 192) {
784 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
785 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
786 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
787 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
792 match (first, second, next!() & 192, next!() & 192) {
793 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
794 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
795 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
805 /// Determines if a vector of bytes contains valid UTF-8.
806 pub fn is_utf8(v: &[u8]) -> bool {
807 run_utf8_validation_iterator(&mut v.iter())
811 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
812 let mut it = v.iter();
814 let ok = run_utf8_validation_iterator(&mut it);
818 // work out how many valid bytes we've consumed
819 // (run_utf8_validation_iterator resets the iterator to just
820 // after the last good byte), which we can do because the
821 // vector iterator size_hint is exact.
822 let (remaining, _) = it.size_hint();
823 Some(v.len() - remaining)
827 /// Determines if a vector of `u16` contains valid UTF-16
828 pub fn is_utf16(v: &[u16]) -> bool {
829 let mut it = v.iter();
830 macro_rules! next ( ($ret:expr) => {
831 match it.next() { Some(u) => *u, None => return $ret }
837 match char::from_u32(u as u32) {
840 let u2 = next!(false);
841 if u < 0xD7FF || u > 0xDBFF ||
842 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
848 /// An iterator that decodes UTF-16 encoded codepoints from a vector
851 pub struct UTF16Items<'a> {
852 iter: slice::Items<'a, u16>
854 /// The possibilities for values decoded from a `u16` stream.
855 #[deriving(Eq, TotalEq, Clone, Show)]
857 /// A valid codepoint.
859 /// An invalid surrogate without its pair.
864 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
865 /// replacement character (U+FFFD).
867 pub fn to_char_lossy(&self) -> char {
870 LoneSurrogate(_) => '\uFFFD'
875 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
876 fn next(&mut self) -> Option<UTF16Item> {
877 let u = match self.iter.next() {
882 if u < 0xD800 || 0xDFFF < u {
884 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
885 } else if u >= 0xDC00 {
886 // a trailing surrogate
887 Some(LoneSurrogate(u))
889 // preserve state for rewinding.
892 let u2 = match self.iter.next() {
895 None => return Some(LoneSurrogate(u))
897 if u2 < 0xDC00 || u2 > 0xDFFF {
898 // not a trailing surrogate so we're not a valid
899 // surrogate pair, so rewind to redecode u2 next time.
901 return Some(LoneSurrogate(u))
904 // all ok, so let's decode it.
905 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
906 Some(ScalarValue(unsafe {cast::transmute(c)}))
911 fn size_hint(&self) -> (uint, Option<uint>) {
912 let (low, high) = self.iter.size_hint();
913 // we could be entirely valid surrogates (2 elements per
914 // char), or entirely non-surrogates (1 element per char)
919 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
920 /// returning invalid surrogates as `LoneSurrogate`s.
926 /// use std::str::{ScalarValue, LoneSurrogate};
928 /// // 𝄞mus<invalid>ic<invalid>
929 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
930 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
933 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
934 /// ~[ScalarValue('𝄞'),
935 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
936 /// LoneSurrogate(0xDD1E),
937 /// ScalarValue('i'), ScalarValue('c'),
938 /// LoneSurrogate(0xD834)]);
940 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
941 UTF16Items { iter : v.iter() }
944 /// Return a slice of `v` ending at (and not including) the first NUL
953 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
954 /// // no NULs so no change
955 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
959 /// assert_eq!(str::truncate_utf16_at_nul(v),
960 /// &['a' as u16, 'b' as u16]);
962 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
963 match v.iter().position(|c| *c == 0) {
964 // don't include the 0
965 Some(i) => v.slice_to(i),
970 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
971 /// if `v` contains any invalid data.
979 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
980 /// 0x0073, 0x0069, 0x0063];
981 /// assert_eq!(str::from_utf16(v), Some("𝄞music".to_owned()));
983 /// // 𝄞mu<invalid>ic
985 /// assert_eq!(str::from_utf16(v), None);
987 pub fn from_utf16(v: &[u16]) -> Option<~str> {
988 let mut s = StrBuf::with_capacity(v.len() / 2);
989 for c in utf16_items(v) {
991 ScalarValue(c) => s.push_char(c),
992 LoneSurrogate(_) => return None
998 /// Decode a UTF-16 encoded vector `v` into a string, replacing
999 /// invalid data with the replacement character (U+FFFD).
1005 /// // 𝄞mus<invalid>ic<invalid>
1006 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1007 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1010 /// assert_eq!(str::from_utf16_lossy(v),
1011 /// "𝄞mus\uFFFDic\uFFFD".to_owned());
1013 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1014 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1017 // https://tools.ietf.org/html/rfc3629
1018 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1019 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1020 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1021 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1022 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1023 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1024 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1025 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1026 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1027 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1028 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1029 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1030 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1031 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1032 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1033 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1034 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1037 /// Given a first byte, determine how many bytes are in this UTF-8 character
1039 pub fn utf8_char_width(b: u8) -> uint {
1040 return UTF8_CHAR_WIDTH[b as uint] as uint;
1043 /// Struct that contains a `char` and the index of the first byte of
1044 /// the next `char` in a string. This can be used as a data structure
1045 /// for iterating over the UTF-8 bytes of a string.
1046 pub struct CharRange {
1049 /// Index of the first byte of the next `char`
1053 // Return the initial codepoint accumulator for the first byte.
1054 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1055 // for width 3, and 3 bits for width 4
1056 macro_rules! utf8_first_byte(
1057 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1060 // return the value of $ch updated with continuation byte $byte
1061 macro_rules! utf8_acc_cont_byte(
1062 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
1065 static TAG_CONT_U8: u8 = 128u8;
1067 /// Converts a vector of bytes to a new utf-8 string.
1068 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1073 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1074 /// let output = std::str::from_utf8_lossy(input);
1075 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1077 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1078 let firstbad = match first_non_utf8_index(v) {
1079 None => return Slice(unsafe { cast::transmute(v) }),
1083 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1084 let mut i = firstbad;
1085 let total = v.len();
1086 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1087 unsafe { *xs.unsafe_ref(i) }
1089 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1097 let mut res = StrBuf::with_capacity(total);
1101 res.push_bytes(v.slice_to(i))
1105 // subseqidx is the index of the first byte of the subsequence we're looking at.
1106 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1108 let mut subseqidx = firstbad;
1112 let byte = unsafe_get(v, i);
1115 macro_rules! error(() => ({
1117 if subseqidx != i_ {
1118 res.push_bytes(v.slice(subseqidx, i_));
1121 res.push_bytes(REPLACEMENT);
1126 // subseqidx handles this
1128 let w = utf8_char_width(byte);
1132 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1139 match (byte, safe_get(v, i, total)) {
1140 (0xE0 , 0xA0 .. 0xBF) => (),
1141 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1142 (0xED , 0x80 .. 0x9F) => (),
1143 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1150 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1157 match (byte, safe_get(v, i, total)) {
1158 (0xF0 , 0x90 .. 0xBF) => (),
1159 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1160 (0xF4 , 0x80 .. 0x8F) => (),
1167 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1172 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1185 if subseqidx < total {
1187 res.push_bytes(v.slice(subseqidx, total))
1190 Owned(res.into_owned())
1197 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1198 /// This can be useful as an optimization when an allocation is sometimes
1199 /// needed but not always.
1200 pub enum MaybeOwned<'a> {
1201 /// A borrowed string
1207 /// SendStr is a specialization of `MaybeOwned` to be sendable
1208 pub type SendStr = MaybeOwned<'static>;
1210 impl<'a> MaybeOwned<'a> {
1211 /// Returns `true` if this `MaybeOwned` wraps an owned string
1213 pub fn is_owned(&self) -> bool {
1220 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1222 pub fn is_slice(&self) -> bool {
1230 /// Trait for moving into a `MaybeOwned`
1231 pub trait IntoMaybeOwned<'a> {
1232 /// Moves self into a `MaybeOwned`
1233 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1236 impl<'a> IntoMaybeOwned<'a> for ~str {
1238 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1241 impl<'a> IntoMaybeOwned<'a> for &'a str {
1243 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1246 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1248 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1251 impl<'a> Eq for MaybeOwned<'a> {
1253 fn eq(&self, other: &MaybeOwned) -> bool {
1254 self.as_slice() == other.as_slice()
1258 impl<'a> TotalEq for MaybeOwned<'a> {}
1260 impl<'a> Ord for MaybeOwned<'a> {
1262 fn lt(&self, other: &MaybeOwned) -> bool {
1263 self.as_slice().lt(&other.as_slice())
1267 impl<'a> TotalOrd for MaybeOwned<'a> {
1269 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1270 self.as_slice().cmp(&other.as_slice())
1274 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1276 fn equiv(&self, other: &S) -> bool {
1277 self.as_slice() == other.as_slice()
1281 impl<'a> Str for MaybeOwned<'a> {
1283 fn as_slice<'b>(&'b self) -> &'b str {
1286 Owned(ref s) => s.as_slice()
1291 fn into_owned(self) -> ~str {
1293 Slice(s) => s.to_owned(),
1299 impl<'a> Container for MaybeOwned<'a> {
1301 fn len(&self) -> uint { self.as_slice().len() }
1304 impl<'a> Clone for MaybeOwned<'a> {
1306 fn clone(&self) -> MaybeOwned<'a> {
1308 Slice(s) => Slice(s),
1309 Owned(ref s) => Owned(s.to_owned())
1314 impl<'a> Default for MaybeOwned<'a> {
1316 fn default() -> MaybeOwned<'a> { Slice("") }
1319 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1321 fn hash(&self, hasher: &mut H) {
1323 Slice(s) => s.hash(hasher),
1324 Owned(ref s) => s.hash(hasher),
1329 impl<'a> fmt::Show for MaybeOwned<'a> {
1331 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1333 Slice(ref s) => s.fmt(f),
1334 Owned(ref s) => s.fmt(f)
1339 /// Unsafe operations
1342 use container::Container;
1348 use slice::{MutableVector, ImmutableVector, OwnedVector, Vector};
1349 use str::{is_utf8, StrSlice};
1352 /// Create a Rust string from a *u8 buffer of the given length
1353 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1354 let mut v = Vec::with_capacity(len);
1355 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1358 assert!(is_utf8(v.as_slice()));
1359 ::cast::transmute(v.move_iter().collect::<~[u8]>())
1362 #[lang="strdup_uniq"]
1365 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1366 from_buf_len(ptr, len)
1369 /// Create a Rust string from a null-terminated C string
1370 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1375 curr = buf.offset(i);
1377 from_buf_len(buf as *u8, i as uint)
1380 /// Converts a slice of bytes to a string slice without checking
1381 /// that the string contains valid UTF-8.
1382 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1386 /// Converts an owned vector of bytes to a new owned string. This assumes
1387 /// that the utf-8-ness of the vector has already been validated
1389 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1393 /// Converts a byte to a string without checking that the byte is valid UTF-8.
1394 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1396 /// Form a slice from a C string. Unsafe because the caller must ensure the
1397 /// C string has the static lifetime, or else the return value may be
1398 /// invalidated later.
1399 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1403 while *curr != 0u8 {
1405 curr = s.offset(len as int);
1407 let v = Slice { data: s, len: len };
1408 assert!(is_utf8(::cast::transmute(v)));
1409 ::cast::transmute(v)
1412 /// Takes a bytewise (not UTF-8) slice from a string.
1414 /// Returns the substring from [`begin`..`end`).
1418 /// If begin is greater than end.
1419 /// If end is greater than the length of the string.
1421 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1422 assert!(begin <= end);
1423 assert!(end <= s.len());
1424 slice_unchecked(s, begin, end)
1427 /// Takes a bytewise (not UTF-8) slice from a string.
1429 /// Returns the substring from [`begin`..`end`).
1431 /// Caller must check slice boundaries!
1433 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1434 cast::transmute(Slice {
1435 data: s.as_ptr().offset(begin as int),
1440 /// Access the str in its vector representation.
1441 /// The caller must preserve the valid UTF-8 property when modifying.
1443 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1447 /// Sets the length of a string
1449 /// This will explicitly set the size of the string, without actually
1450 /// modifying its buffers, so it is up to the caller to ensure that
1451 /// the string is actually the specified size.
1453 fn test_from_buf_len() {
1455 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1457 let c = from_buf_len(b, 3u);
1458 assert_eq!(c, "AAA".to_owned());
1464 Section: Trait implementations
1468 #[allow(missing_doc)]
1470 use container::Container;
1471 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1474 use option::{Some, None};
1475 use str::{Str, StrSlice, eq_slice};
// `+` on two string slices: the result type is a new owned string.
1478 impl<'a> Add<&'a str,~str> for &'a str {
1480 fn add(&self, rhs: & &'a str) -> ~str {
// Start from an owned copy of the left-hand side, held in a growable buffer.
1481 let mut ret = StrBuf::from_owned_str(self.to_owned());
// Total ordering for string slices: bytewise lexicographic comparison, with
// the length deciding when one string is a prefix of the other.
1487 impl<'a> TotalOrd for &'a str {
1489 fn cmp(&self, other: & &'a str) -> Ordering {
// Compare the bytes pairwise; `zip` stops at the end of the shorter string.
1490 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1491 match s_b.cmp(&o_b) {
// The first differing byte decides the ordering.
1492 Greater => return Greater,
1493 Less => return Less,
// No difference in the shared prefix: the shorter string sorts first.
1498 self.len().cmp(&other.len())
// Owned strings are ordered by comparing their borrowed slices.
1502 impl TotalOrd for ~str {
1504 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
// Equality for string slices, delegated to `eq_slice`.
1507 impl<'a> Eq for &'a str {
1509 fn eq(&self, other: & &'a str) -> bool {
1510 eq_slice((*self), (*other))
// Inequality is defined as the negation of `eq`.
1513 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
// Equality for owned strings, delegated to `eq_slice` on the dereferenced
// operands.
1518 fn eq(&self, other: &~str) -> bool {
1519 eq_slice((*self), (*other))
// Empty impl: no `TotalEq` methods are defined here for string slices.
1523 impl<'a> TotalEq for &'a str {}
// Empty impl: no `TotalEq` methods are defined here for owned strings.
1525 impl TotalEq for ~str {}
// Partial ordering for string slices, derived from the total ordering.
1527 impl<'a> Ord for &'a str {
// `a < b` exactly when `cmp` reports `Less`.
1529 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
// Owned strings: `a < b` exactly when `cmp` reports `Less`.
1534 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
// A string slice is equivalent to any `Str` value whose slice view has the
// same contents according to `eq_slice`.
1537 impl<'a, S: Str> Equiv<S> for &'a str {
1539 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
// An owned string is equivalent to any `Str` value whose slice view has the
// same contents according to `eq_slice`.
1542 impl<'a, S: Str> Equiv<S> for ~str {
1544 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1551 /// Any string that can be represented as a slice.
1553 /// Work with `self` as a slice.
1554 fn as_slice<'a>(&'a self) -> &'a str;
1556 /// Convert `self` into a `~str`, not making a copy if possible.
1557 fn into_owned(self) -> ~str;
1559 /// Convert `self` into a `StrBuf`.
///
/// The default implementation builds the buffer from `self.as_slice()`.
1561 fn to_strbuf(&self) -> StrBuf {
1562 StrBuf::from_str(self.as_slice())
1565 /// Convert `self` into a `StrBuf`, not making a copy if possible.
///
/// The default implementation goes through `into_owned()`, so it copies
/// only when `into_owned()` does.
1567 fn into_strbuf(self) -> StrBuf {
1568 StrBuf::from_owned_str(self.into_owned())
// `Str` for borrowed slices.
1572 impl<'a> Str for &'a str {
// A slice is already its own slice view.
1574 fn as_slice<'a>(&'a self) -> &'a str { *self }
// Owning a borrowed slice goes through `to_owned()`.
1577 fn into_owned(self) -> ~str { self.to_owned() }
// `Str` for owned strings.
1580 impl<'a> Str for ~str {
1582 fn as_slice<'a>(&'a self) -> &'a str {
// Borrow the owned string as a slice tied to the lifetime of `self`.
1583 let s: &'a str = *self; s
// Already owned: the value is returned as-is.
1587 fn into_owned(self) -> ~str { self }
// `Container` for string slices; `len` is documented elsewhere in this file
// as the byte length.
1590 impl<'a> Container for &'a str {
1592 fn len(&self) -> uint {
// `Container` for owned strings: the length is that of the borrowed slice.
1597 impl Container for ~str {
1599 fn len(&self) -> uint { self.as_slice().len() }
1602 /// Methods for string slices
1603 pub trait StrSlice<'a> {
1604 /// Returns true if one string contains another
1608 /// - needle - The string to look for
1609 fn contains<'a>(&self, needle: &'a str) -> bool;
1611 /// Returns true if a string contains a char.
1615 /// - needle - The char to look for
1616 fn contains_char(&self, needle: char) -> bool;
1618 /// An iterator over the characters of `self`. Note, this iterates
1619 /// over unicode code-points, not unicode graphemes.
1624 /// let v: ~[char] = "abc åäö".chars().collect();
1625 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1627 fn chars(&self) -> Chars<'a>;
1629 /// An iterator over the characters of `self`, in reverse order.
1630 fn chars_rev(&self) -> RevChars<'a>;
1632 /// An iterator over the bytes of `self`
1633 fn bytes(&self) -> Bytes<'a>;
1635 /// An iterator over the bytes of `self`, in reverse order
1636 fn bytes_rev(&self) -> RevBytes<'a>;
1638 /// An iterator over the characters of `self` and their byte offsets.
1639 fn char_indices(&self) -> CharOffsets<'a>;
1641 /// An iterator over the characters of `self` and their byte offsets,
1642 /// in reverse order.
1643 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1645 /// An iterator over substrings of `self`, separated by characters
1646 /// matched by `sep`.
1651 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1652 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1654 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1655 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1657 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1658 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1660 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1662 /// An iterator over substrings of `self`, separated by characters
1663 /// matched by `sep`, restricted to splitting at most `count`
1669 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1670 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1672 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1673 /// assert_eq!(v, ~["abc", "def2ghi"]);
1675 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1676 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1678 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1680 /// An iterator over substrings of `self`, separated by characters
1681 /// matched by `sep`.
1683 /// Equivalent to `split`, except that the trailing substring
1684 /// is skipped if empty (terminator semantics).
1689 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1690 /// assert_eq!(v, ~["A", "B"]);
1692 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1693 /// assert_eq!(v, ~["A", "", "B", ""]);
1695 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1697 /// An iterator over substrings of `self`, separated by characters
1698 /// matched by `sep`, in reverse order.
1703 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1704 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1706 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1707 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1709 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1710 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1712 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1714 /// An iterator over substrings of `self`, separated by characters
1715 /// matched by `sep`, starting from the end of the string.
1716 /// Restricted to splitting at most `count` times.
1721 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1722 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1724 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1725 /// assert_eq!(v, ~["ghi", "abc1def"]);
1727 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1728 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1730 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1732 /// An iterator over the start and end indices of the disjoint
1733 /// matches of `sep` within `self`.
1735 /// That is, each returned value `(start, end)` satisfies
1736 /// `self.slice(start, end) == sep`. For matches of `sep` within
1737 /// `self` that overlap, only the indices corresponding to the
1738 /// first match are returned.
1743 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1744 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1746 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1747 /// assert_eq!(v, ~[(1,4), (4,7)]);
1749 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1750 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1752 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1754 /// An iterator over the substrings of `self` separated by `sep`.
1759 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1760 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1762 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1763 /// assert_eq!(v, ~["1", "", "2"]);
1765 fn split_str(&self, &'a str) -> StrSplits<'a>;
1767 /// An iterator over the lines of a string (subsequences separated
1768 /// by `\n`). This does not include the empty string after a
1774 /// let four_lines = "foo\nbar\n\nbaz\n";
1775 /// let v: ~[&str] = four_lines.lines().collect();
1776 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1778 fn lines(&self) -> CharSplits<'a, char>;
1780 /// An iterator over the lines of a string, separated by either
1781 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1782 /// empty trailing line.
1787 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1788 /// let v: ~[&str] = four_lines.lines_any().collect();
1789 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1791 fn lines_any(&self) -> AnyLines<'a>;
1793 /// An iterator over the words of a string (subsequences separated
1794 /// by any sequence of whitespace). Sequences of whitespace are
1795 /// collapsed, so empty "words" are not included.
1800 /// let some_words = " Mary had\ta little \n\t lamb";
1801 /// let v: ~[&str] = some_words.words().collect();
1802 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1804 fn words(&self) -> Words<'a>;
1806 /// An Iterator over the string in Unicode Normalization Form D
1807 /// (canonical decomposition).
1808 fn nfd_chars(&self) -> Normalizations<'a>;
1810 /// An Iterator over the string in Unicode Normalization Form KD
1811 /// (compatibility decomposition).
1812 fn nfkd_chars(&self) -> Normalizations<'a>;
1814 /// Returns true if the string contains only whitespace.
1816 /// Whitespace characters are determined by `char::is_whitespace`.
1821 /// assert!(" \t\n".is_whitespace());
1822 /// assert!("".is_whitespace());
1824 /// assert!( !"abc".is_whitespace());
1826 fn is_whitespace(&self) -> bool;
1828 /// Returns true if the string contains only alphanumeric code
1831 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1836 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1837 /// assert!("".is_alphanumeric());
1839 /// assert!( !" &*~".is_alphanumeric());
1841 fn is_alphanumeric(&self) -> bool;
1843 /// Returns the number of Unicode code points (`char`) that a
1846 /// This does not perform any normalization, and is `O(n)`, since
1847 /// UTF-8 is a variable width encoding of code points.
1849 /// *Warning*: The number of code points in a string does not directly
1850 /// correspond to the number of visible characters or width of the
1851 /// visible text due to composing characters, and double- and
1852 /// zero-width ones.
1854 /// See also `.len()` for the byte length.
1859 /// // composed forms of `ö` and `é`
1860 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1861 /// // decomposed forms of `ö` and `é`
1862 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1864 /// assert_eq!(c.char_len(), 15);
1865 /// assert_eq!(d.char_len(), 17);
1867 /// assert_eq!(c.len(), 21);
1868 /// assert_eq!(d.len(), 23);
1870 /// // the two strings *look* the same
1871 /// println!("{}", c);
1872 /// println!("{}", d);
1874 fn char_len(&self) -> uint;
1876 /// Returns a slice of the given string from the byte range
1877 /// [`begin`..`end`).
1879 /// This operation is `O(1)`.
1881 /// Fails when `begin` and `end` do not point to valid characters
1882 /// or point beyond the last character of the string.
1884 /// See also `slice_to` and `slice_from` for slicing prefixes and
1885 /// suffixes of strings, and `slice_chars` for slicing based on
1886 /// code point counts.
1891 /// let s = "Löwe 老虎 Léopard";
1892 /// assert_eq!(s.slice(0, 1), "L");
1894 /// assert_eq!(s.slice(1, 9), "öwe 老");
1896 /// // these will fail:
1897 /// // byte 2 lies within `ö`:
1898 /// // s.slice(2, 3);
1900 /// // byte 8 lies within `老`
1901 /// // s.slice(1, 8);
1903 /// // byte 100 is outside the string
1904 /// // s.slice(3, 100);
1906 fn slice(&self, begin: uint, end: uint) -> &'a str;
1908 /// Returns a slice of the string from `begin` to its end.
1910 /// Equivalent to `self.slice(begin, self.len())`.
1912 /// Fails when `begin` does not point to a valid character, or is
1915 /// See also `slice`, `slice_to` and `slice_chars`.
1916 fn slice_from(&self, begin: uint) -> &'a str;
1918 /// Returns a slice of the string from the beginning to byte
1921 /// Equivalent to `self.slice(0, end)`.
1923 /// Fails when `end` does not point to a valid character, or is
1926 /// See also `slice`, `slice_from` and `slice_chars`.
1927 fn slice_to(&self, end: uint) -> &'a str;
1929 /// Returns a slice of the string from the character range
1930 /// [`begin`..`end`).
1932 /// That is, start at the `begin`-th code point of the string and
1933 /// continue to the `end`-th code point. This does not detect or
1934 /// handle edge cases such as leaving a combining character as the
1935 /// first code point of the string.
1937 /// Due to the design of UTF-8, this operation is `O(end)`.
1938 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1939 /// variants that use byte indices rather than code point
1942 /// Fails if `begin` > `end` or if either `begin` or `end` is
1943 /// beyond the last character of the string.
1948 /// let s = "Löwe 老虎 Léopard";
1949 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
1950 /// assert_eq!(s.slice_chars(5, 7), "老虎");
1952 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
1954 /// Returns true if `needle` is a prefix of the string.
1955 fn starts_with(&self, needle: &str) -> bool;
1957 /// Returns true if `needle` is a suffix of the string.
1958 fn ends_with(&self, needle: &str) -> bool;
1960 /// Escape each char in `self` with `char::escape_default`.
1961 fn escape_default(&self) -> ~str;
1963 /// Escape each char in `self` with `char::escape_unicode`.
1964 fn escape_unicode(&self) -> ~str;
1966 /// Returns a string with leading and trailing whitespace removed.
1967 fn trim(&self) -> &'a str;
1969 /// Returns a string with leading whitespace removed.
1970 fn trim_left(&self) -> &'a str;
1972 /// Returns a string with trailing whitespace removed.
1973 fn trim_right(&self) -> &'a str;
1975 /// Returns a string with characters that match `to_trim` removed.
1979 /// * to_trim - a character matcher
1984 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1985 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1986 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1988 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
1990 /// Returns a string with leading characters that match `to_trim` removed.
1994 /// * to_trim - a character matcher
1999 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2000 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2001 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2003 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2005 /// Returns a string with trailing characters that match `to_trim` removed.
2009 /// * to_trim - a character matcher
2014 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2015 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2016 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2018 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2020 /// Replace all occurrences of one string with another.
2024 /// * `from` - The string to replace
2025 /// * `to` - The replacement string
2029 /// The original string with all occurrences of `from` replaced with `to`.
2034 /// let s = ~"Do you know the muffin man,
2035 /// The muffin man, the muffin man, ...";
2037 /// assert_eq!(s.replace("muffin man", "little lamb"),
2038 /// ~"Do you know the little lamb,
2039 /// The little lamb, the little lamb, ...");
2041 /// // not found, so no change.
2042 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2044 fn replace(&self, from: &str, to: &str) -> ~str;
2046 /// Copy a slice into a new owned str.
2047 fn to_owned(&self) -> ~str;
2049 /// Converts to a vector of `u16` encoded as UTF-16.
2050 fn to_utf16(&self) -> ~[u16];
2052 /// Check that `index`-th byte lies at the start and/or end of a
2053 /// UTF-8 code point sequence.
2055 /// The start and end of the string (when `index == self.len()`)
2056 /// are considered to be boundaries.
2058 /// Fails if `index` is greater than `self.len()`.
2063 /// let s = "Löwe 老虎 Léopard";
2064 /// assert!(s.is_char_boundary(0));
2066 /// assert!(s.is_char_boundary(6));
2067 /// assert!(s.is_char_boundary(s.len()));
2069 /// // second byte of `ö`
2070 /// assert!(!s.is_char_boundary(2));
2072 /// // third byte of `老`
2073 /// assert!(!s.is_char_boundary(8));
2075 fn is_char_boundary(&self, index: uint) -> bool;
2077 /// Pluck a character out of a string and return the index of the next
2080 /// This function can be used to iterate over the unicode characters of a
2085 /// This example manually iterates through the characters of a
2086 /// string; this should normally be done by `.chars()` or
2087 /// `.char_indices`.
2090 /// use std::str::CharRange;
2092 /// let s = "中华Việt Nam";
2094 /// while i < s.len() {
2095 /// let CharRange {ch, next} = s.char_range_at(i);
2096 /// println!("{}: {}", i, ch);
2118 /// * self - The string
2119 /// * start - The byte offset of the char to extract
2123 /// A record {ch: char, next: uint} containing the char value and the byte
2124 /// index of the next unicode character.
2128 /// If `start` is greater than or equal to the length of the string.
2129 /// If `start` is not the index of the beginning of a valid UTF-8 character.
2130 fn char_range_at(&self, start: uint) -> CharRange;
2132 /// Given a byte position and a str, return the previous char and its position.
2134 /// This function can be used to iterate over a unicode string in reverse.
2136 /// Returns 0 for next index if called on start index 0.
2137 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2139 /// Plucks the character starting at the `i`th byte of a string
2140 fn char_at(&self, i: uint) -> char;
2142 /// Plucks the character ending at the `i`th byte of a string
2143 fn char_at_reverse(&self, i: uint) -> char;
2145 /// Work with the byte buffer of a string as a byte slice.
2146 fn as_bytes(&self) -> &'a [u8];
2148 /// Returns the byte index of the first character of `self` that
2149 /// matches `search`.
2153 /// `Some` containing the byte index of the first matching character
2154 /// or `None` if there is no match.
2159 /// let s = "Löwe 老虎 Léopard";
2161 /// assert_eq!(s.find('L'), Some(0));
2162 /// assert_eq!(s.find('é'), Some(14));
2164 /// // the first space
2165 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2167 /// // neither are found
2168 /// assert_eq!(s.find(&['1', '2']), None);
2170 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2172 /// Returns the byte index of the last character of `self` that
2173 /// matches `search`.
2177 /// `Some` containing the byte index of the last matching character
2178 /// or `None` if there is no match.
2183 /// let s = "Löwe 老虎 Léopard";
2185 /// assert_eq!(s.rfind('L'), Some(13));
2186 /// assert_eq!(s.rfind('é'), Some(14));
2188 /// // the second space
2189 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2191 /// // searches for an occurrence of either `1` or `2`, but neither are found
2192 /// assert_eq!(s.rfind(&['1', '2']), None);
2194 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2196 /// Returns the byte index of the first matching substring
2200 /// * `needle` - The string to search for
2204 /// `Some` containing the byte index of the first matching substring
2205 /// or `None` if there is no match.
2210 /// let s = "Löwe 老虎 Léopard";
2212 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2213 /// assert_eq!(s.find_str("muffin man"), None);
2215 fn find_str(&self, &str) -> Option<uint>;
2217 /// Given a string, make a new string with repeated copies of it.
2218 fn repeat(&self, nn: uint) -> ~str;
2220 /// Retrieves the first character from a string slice and returns
2221 /// it. This does not allocate a new string; instead, it returns a
2222 /// slice that points one character beyond the character that was
2223 /// shifted. If the string does not contain any characters,
2224 /// a tuple of None and an empty string is returned instead.
2229 /// let s = "Löwe 老虎 Léopard";
2230 /// let (c, s1) = s.slice_shift_char();
2231 /// assert_eq!(c, Some('L'));
2232 /// assert_eq!(s1, "öwe 老虎 Léopard");
2234 /// let (c, s2) = s1.slice_shift_char();
2235 /// assert_eq!(c, Some('ö'));
2236 /// assert_eq!(s2, "we 老虎 Léopard");
2238 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2240 /// Levenshtein Distance between two strings.
2241 fn lev_distance(&self, t: &str) -> uint;
2243 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2245 /// Fails if `inner` is not a direct slice contained within self.
2250 /// let string = "a\nb\nc";
2251 /// let lines: ~[&str] = string.lines().collect();
2253 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2254 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2255 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2257 fn subslice_offset(&self, inner: &str) -> uint;
2259 /// Return an unsafe pointer to the string's buffer.
2261 /// The caller must ensure that the string outlives this pointer,
2262 /// and that it is not reallocated (e.g. by pushing to the
2264 fn as_ptr(&self) -> *u8;
2267 impl<'a> StrSlice<'a> for &'a str {
// A needle is contained exactly when `find_str` locates it.
2269 fn contains<'a>(&self, needle: &'a str) -> bool {
2270 self.find_str(needle).is_some()
// A char is contained exactly when `find` locates it.
2274 fn contains_char(&self, needle: char) -> bool {
2275 self.find(needle).is_some()
// The char iterator starts out holding the whole slice.
2279 fn chars(&self) -> Chars<'a> {
2280 Chars{string: *self}
// Reverse-order counterpart of `chars()`, per the trait documentation.
2284 fn chars_rev(&self) -> RevChars<'a> {
// Iterate over the byte view, copying each byte out of its reference.
2289 fn bytes(&self) -> Bytes<'a> {
2290 self.as_bytes().iter().map(|&b| b)
// Reverse-order counterpart of `bytes()`, per the trait documentation.
2294 fn bytes_rev(&self) -> RevBytes<'a> {
// Pairs a fresh `chars()` iterator with the slice it was created from.
2299 fn char_indices(&self) -> CharOffsets<'a> {
2300 CharOffsets{string: *self, iter: self.chars()}
// Simply the reversal of `char_indices()`.
2304 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2305 self.char_indices().rev()
// Builds the splitting iterator for `sep`.
2309 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
// Record whether the separator can only ever match ASCII characters.
2312 only_ascii: sep.only_ascii(),
// Plain `split` keeps a trailing empty substring (contrast `split_terminator`).
2314 allow_trailing_empty: true,
// Count-limited splitting, layered on top of the iterator from `split`.
2320 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2321 -> CharSplitsN<'a, Sep> {
2323 iter: self.split(sep),
// Terminator semantics: a trailing empty substring is not produced.
2330 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2331 -> CharSplits<'a, Sep> {
2333 allow_trailing_empty: false,
// Simply the reversal of `split(sep)`.
2339 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2340 self.split(sep).rev()
// Count-limited splitting from the end; like `splitn`, it wraps the
// iterator from `split`.
2344 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2345 -> CharSplitsN<'a, Sep> {
2347 iter: self.split(sep),
// Iterator over the positions of `sep` within `self`.
2354 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
// An empty separator is rejected up front (the assertion fails).
2355 assert!(!sep.is_empty())
// Splitting on a string separator is driven by `match_indices(sep)`.
2364 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2366 it: self.match_indices(sep),
// Lines are the pieces terminated by '\n'.
2373 fn lines(&self) -> CharSplits<'a, char> {
2374 self.split_terminator('\n')
// Start from `lines()` and post-process each line.
2377 fn lines_any(&self) -> AnyLines<'a> {
2378 self.lines().map(|line| {
// A non-empty line ending in '\r' has that final byte sliced off.
2380 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
// Split on every whitespace char and drop the empty pieces that runs of
// whitespace produce.
2386 fn words(&self) -> Words<'a> {
2387 self.split(char::is_whitespace).filter(|s| !s.is_empty())
// Normalization Form D (canonical decomposition), per the trait documentation.
2391 fn nfd_chars(&self) -> Normalizations<'a> {
// Normalization Form KD (compatibility decomposition), per the trait documentation.
2401 fn nfkd_chars(&self) -> Normalizations<'a> {
// True when every char satisfies `char::is_whitespace`.
2411 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
// True when every char satisfies `char::is_alphanumeric`.
2414 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
// Length of the `chars()` iterator, i.e. the number of code points.
2417 fn char_len(&self) -> uint { self.chars().len() }
// Byte-range slice, checked against char boundaries.
2420 fn slice(&self, begin: uint, end: uint) -> &'a str {
// Both ends must sit on char boundaries before the raw byte slice is taken.
2421 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2422 unsafe { raw::slice_bytes(*self, begin, end) }
// Suffix slice: delegates to `slice` with the string length as the end.
2426 fn slice_from(&self, begin: uint) -> &'a str {
2427 self.slice(begin, self.len())
// Prefix slice: only `end` needs a boundary check because the start is 0.
2431 fn slice_to(&self, end: uint) -> &'a str {
2432 assert!(self.is_char_boundary(end));
2433 unsafe { raw::slice_bytes(*self, 0, end) }
// Slice by code point positions: translate `begin` and `end` into byte
// offsets, then take the raw byte slice.
2436 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2437 assert!(begin <= end);
// Byte offsets of the `begin`-th and `end`-th chars, filled in once found.
2439 let mut begin_byte = None;
2440 let mut end_byte = None;
2442 // This could be even more efficient by not decoding,
2443 // only finding the char boundaries
2444 for (idx, _) in self.char_indices() {
// `count` presumably tracks how many chars have been seen so far (TODO confirm).
2445 if count == begin { begin_byte = Some(idx); }
// Nothing past `end` is needed, so the scan stops there.
2446 if count == end { end_byte = Some(idx); break; }
// A position equal to the final count maps to the end of the string.
2449 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2450 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
// Any offset still unset means that position lies beyond the last char.
2452 match (begin_byte, end_byte) {
2453 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2454 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2455 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
// Prefix test on the raw bytes.
2460 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2461 let n = needle.len();
// The length guard runs first, so the prefix slice is only taken when
// `self` has at least `n` bytes.
2462 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
// Suffix test on the raw bytes.
2466 fn ends_with(&self, needle: &str) -> bool {
2467 let (m, n) = (self.len(), needle.len());
// The length guard runs first, so `m - n` is only computed when `m >= n`.
2468 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
// Escapes every char with `char::escape_default` into a new buffer.
2471 fn escape_default(&self) -> ~str {
// The byte length of `self` is used as the initial capacity.
2472 let mut out = StrBuf::with_capacity(self.len());
2473 for c in self.chars() {
// Each char produced by the escape callback is appended to the output.
2474 c.escape_default(|c| out.push_char(c));
// Escapes every char with `char::escape_unicode` into a new buffer.
2479 fn escape_unicode(&self) -> ~str {
// The byte length of `self` is used as the initial capacity.
2480 let mut out = StrBuf::with_capacity(self.len());
2481 for c in self.chars() {
// Each char produced by the escape callback is appended to the output.
2482 c.escape_unicode(|c| out.push_char(c));
// Trim the left side first, then the right side of the result.
2488 fn trim(&self) -> &'a str {
2489 self.trim_left().trim_right()
// Leading-whitespace trim, expressed via the generic char matcher version.
2493 fn trim_left(&self) -> &'a str {
2494 self.trim_left_chars(&char::is_whitespace)
// Trailing-whitespace trim, expressed via the generic char matcher version.
2498 fn trim_right(&self) -> &'a str {
2499 self.trim_right_chars(&char::is_whitespace)
// Trim matching chars from the left, then from the right of the result.
2503 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2504 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
// Drops the leading chars that match `to_trim`.
2508 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
// Locate the first char that does NOT match.
2509 match self.find(|c: char| !to_trim.matches(c)) {
// Keep everything from that char to the end of the string.
2511 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
// Drops the trailing chars that match `to_trim`.
2516 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
// Locate the last char that does NOT match.
2517 match self.rfind(|c: char| !to_trim.matches(c)) {
// `last` is where that char starts; keep up to the byte index just past it.
2520 let next = self.char_range_at(last).next;
2521 unsafe { raw::slice_bytes(*self, 0u, next) }
// Builds a new string in which every match of `from` is replaced by `to`.
2526 fn replace(&self, from: &str, to: &str) -> ~str {
2527 let mut result = StrBuf::new();
// Byte index up to which the input has already been copied.
2528 let mut last_end = 0;
2529 for (start, end) in self.match_indices(from) {
// Copy the unmatched text before this match, then the replacement.
2530 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2531 result.push_str(to);
// Copy whatever remains from `last_end` to the end of the string.
2534 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
// Copies the bytes of the slice into a freshly allocated owned string.
2539 fn to_owned(&self) -> ~str {
2540 let len = self.len();
// Reserve room for exactly `len` bytes.
2542 let mut v = Vec::with_capacity(len);
// Raw copy of `len` bytes from the string buffer into the vector's buffer.
2544 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
// Collect into an owned byte vector and reinterpret it as an owned string.
2546 ::cast::transmute(v.move_iter().collect::<~[u8]>())
2550 fn to_utf16(&self) -> ~[u16] {
2551 let mut u = Vec::new();;
2552 for ch in self.chars() {
2553 let mut buf = [0u16, ..2];
2554 let n = ch.encode_utf16(buf /* as mut slice! */);
2555 u.push_all(buf.slice_to(n));
2557 u.move_iter().collect()
// Tests whether byte position `index` is a code point boundary.
2561 fn is_char_boundary(&self, index: uint) -> bool {
// The end of the string always counts as a boundary.
2562 if index == self.len() { return true; }
2563 let b = self[index];
// Bytes in 128..191 are excluded; every other byte value is accepted as
// the start of a character.
2564 return b < 128u8 || b >= 192u8;
// Decodes the char starting at byte `i` and reports where the next one starts.
2568 fn char_range_at(&self, i: uint) -> CharRange {
// Fast path: a byte below 128 is a complete one-byte char.
2569 if self[i] < 128u8 {
2570 return CharRange {ch: self[i] as char, next: i + 1 };
2573 // Multibyte case is a fn to allow char_range_at to inline cleanly
2574 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2575 let mut val = s[i] as u32;
// Width of the sequence, looked up from the first byte.
2576 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
// Fold the first byte and the following bytes into `val`; the third and
// fourth bytes are read only for widths above 2 and 3.
2579 val = utf8_first_byte!(val, w);
2580 val = utf8_acc_cont_byte!(val, s[i + 1]);
2581 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2582 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
// The next char starts `w` bytes after `i`.
2584 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2587 return multibyte_char_range_at(*self, i);
// Decodes the char that ends just before byte `start`; `next` in the result
// is the byte index at which that char begins.
2591 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2592 let mut prev = start;
// Step back one byte; saturating, so a `start` of 0 stays at 0.
2594 prev = prev.saturating_sub(1);
// Fast path: a byte below 128 is a complete one-byte char.
2595 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2597 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2598 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2599 // while there is a previous byte == 10......
2600 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
// `i` now indexes the first byte of the sequence; decode forward from it.
2604 let mut val = s[i] as u32;
2605 let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2608 val = utf8_first_byte!(val, w);
2609 val = utf8_acc_cont_byte!(val, s[i + 1]);
2610 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2611 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2613 return CharRange {ch: unsafe { transmute(val) }, next: i};
2616 return multibyte_char_range_at_reverse(*self, prev);
// Only the char part of `char_range_at(i)` is returned.
2620 fn char_at(&self, i: uint) -> char {
2621 self.char_range_at(i).ch
// Only the char part of `char_range_at_reverse(i)` is returned.
2625 fn char_at_reverse(&self, i: uint) -> char {
2626 self.char_range_at_reverse(i).ch
// Reinterprets the string slice as a byte slice without copying.
2630 fn as_bytes(&self) -> &'a [u8] {
2631 unsafe { cast::transmute(*self) }
// Byte index of the first char matching `search`.
2634 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
// A matcher that can only match ASCII is run over the raw bytes.
2635 if search.only_ascii() {
2636 self.bytes().position(|b| search.matches(b as char))
// Otherwise decode chars and report the byte offset of the first match.
2638 for (index, c) in self.char_indices() {
2639 if search.matches(c) { return Some(index); }
// Byte index of the last char matching `search`.
2645 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
// A matcher that can only match ASCII is run over the raw bytes, from the end.
2646 if search.only_ascii() {
2647 self.bytes().rposition(|b| search.matches(b as char))
// Otherwise decode chars in reverse and report the first match encountered.
2649 for (index, c) in self.char_indices_rev() {
2650 if search.matches(c) { return Some(index); }
// Byte index of the first occurrence of `needle`.
2656 fn find_str(&self, needle: &str) -> Option<uint> {
// The empty needle is handled separately (`match_indices` rejects an empty
// separator).
2657 if needle.is_empty() {
2660 self.match_indices(needle)
// Only the start offset of a match is reported.
2662 .map(|(start, _end)| start)
// Concatenates `nn` copies of the string.
2666 fn repeat(&self, nn: uint) -> ~str {
// Reserve the full output size up front.
// NOTE(review): `nn * self.len()` is not checked for overflow.
2667 let mut ret = StrBuf::with_capacity(nn * self.len());
2668 for _ in range(0, nn) {
2669 ret.push_str(*self);
// Splits off the first char, returning it together with the remainder.
2675 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
// Nothing to shift from an empty string: it is returned unchanged.
2676 if self.is_empty() {
2677 return (None, *self);
// Decode the first char; `next` is the byte index where the rest begins.
2679 let CharRange {ch, next} = self.char_range_at(0u);
2680 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2681 return (Some(ch), next_s);
2685 fn lev_distance(&self, t: &str) -> uint {
2686 let slen = self.len();
2689 if slen == 0 { return tlen; }
2690 if tlen == 0 { return slen; }
2692 let mut dcol = Vec::from_fn(tlen + 1, |x| x);
2694 for (i, sc) in self.chars().enumerate() {
2696 let mut current = i;
2697 *dcol.get_mut(0) = current + 1;
2699 for (j, tc) in t.chars().enumerate() {
2701 let next = *dcol.get(j + 1);
2704 *dcol.get_mut(j + 1) = current;
2706 *dcol.get_mut(j + 1) = ::cmp::min(current, next);
2707 *dcol.get_mut(j + 1) = ::cmp::min(*dcol.get(j + 1),
2715 return *dcol.get(tlen);
// Byte offset of `inner` within `self`, derived from the buffer addresses.
2718 fn subslice_offset(&self, inner: &str) -> uint {
// Address ranges [a_start, a_end) of `self` and [b_start, b_end) of `inner`.
2719 let a_start = self.as_ptr() as uint;
2720 let a_end = a_start + self.len();
2721 let b_start = inner.as_ptr() as uint;
2722 let b_end = b_start + inner.len();
// `inner` must lie entirely inside `self`, otherwise the assertions fail.
2724 assert!(a_start <= b_start);
2725 assert!(b_end <= a_end);
// Raw pointer to the string's buffer, per the trait documentation.
2730 fn as_ptr(&self) -> *u8 {
2735 /// Methods for owned strings
2736 pub trait OwnedStr {
2737 /// Consumes the string, returning the underlying byte buffer.
2739 /// The buffer does not have a null terminator.
2740 fn into_bytes(self) -> ~[u8];
2742 /// Pushes the given string onto this string, returning the concatenation of the two strings.
///
/// `self` is consumed by the call.
2743 fn append(self, rhs: &str) -> ~str;
// `OwnedStr` for `~str`.
2746 impl OwnedStr for ~str {
2748 fn into_bytes(self) -> ~[u8] {
// Reinterprets the owned string as its owned byte vector.
2749 unsafe { cast::transmute(self) }
2753 fn append(self, rhs: &str) -> ~str {
// Move `self` into a growable buffer, push `rhs`, and convert back.
2754 let mut new_str = StrBuf::from_owned_str(self);
2755 new_str.push_str(rhs);
2756 new_str.into_owned()
// Cloning an owned string yields another owned string.
2760 impl Clone for ~str {
2762 fn clone(&self) -> ~str {
// Lets an iterator of chars be collected into an owned string.
2767 impl FromIterator<char> for ~str {
2769 fn from_iter<T: Iterator<char>>(iterator: T) -> ~str {
// The lower size hint is used as the initial buffer capacity.
2770 let (lower, _) = iterator.size_hint();
2771 let mut buf = StrBuf::with_capacity(lower);
2772 buf.extend(iterator);
2777 // This works because every lifetime is a sub-lifetime of 'static
// The default string slice is the empty string literal.
2778 impl<'a> Default for &'a str {
2779 fn default() -> &'a str { "" }
// The default owned string is an owned copy of the empty string.
2782 impl Default for ~str {
2783 fn default() -> ~str { "".to_owned() }
2788 use iter::AdditiveIterator;
2789 use default::Default;
2796 assert!((eq(&"".to_owned(), &"".to_owned())));
2797 assert!((eq(&"foo".to_owned(), &"foo".to_owned())));
2798 assert!((!eq(&"foo".to_owned(), &"bar".to_owned())));
2802 fn test_eq_slice() {
2803 assert!((eq_slice("foobar".slice(0, 3), "foo")));
2804 assert!((eq_slice("barfoo".slice(3, 6), "foo")));
2805 assert!((!eq_slice("foo1", "foo2")));
2811 assert!("" <= "foo");
2812 assert!("foo" <= "foo");
2813 assert!("foo" != "bar");
2818 assert_eq!("".len(), 0u);
2819 assert_eq!("hello world".len(), 11u);
2820 assert_eq!("\x63".len(), 1u);
2821 assert_eq!("\xa2".len(), 2u);
2822 assert_eq!("\u03c0".len(), 2u);
2823 assert_eq!("\u2620".len(), 3u);
2824 assert_eq!("\U0001d11e".len(), 4u);
2826 assert_eq!("".char_len(), 0u);
2827 assert_eq!("hello world".char_len(), 11u);
2828 assert_eq!("\x63".char_len(), 1u);
2829 assert_eq!("\xa2".char_len(), 1u);
2830 assert_eq!("\u03c0".char_len(), 1u);
2831 assert_eq!("\u2620".char_len(), 1u);
2832 assert_eq!("\U0001d11e".char_len(), 1u);
2833 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2838 assert_eq!("hello".find('l'), Some(2u));
2839 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2840 assert!("hello".find('x').is_none());
2841 assert!("hello".find(|c:char| c == 'x').is_none());
2842 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2843 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2848 assert_eq!("hello".rfind('l'), Some(3u));
2849 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2850 assert!("hello".rfind('x').is_none());
2851 assert!("hello".rfind(|c:char| c == 'x').is_none());
2852 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2853 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2858 let empty = "".to_owned();
2859 let s: ~str = empty.chars().collect();
2860 assert_eq!(empty, s);
2861 let data = "ประเทศไทย中".to_owned();
2862 let s: ~str = data.chars().collect();
2863 assert_eq!(data, s);
2867 fn test_into_bytes() {
2868 let data = "asdf".to_owned();
2869 let buf = data.into_bytes();
2870 assert_eq!(bytes!("asdf"), buf.as_slice());
2874 fn test_find_str() {
2876 assert_eq!("".find_str(""), Some(0u));
2877 assert!("banana".find_str("apple pie").is_none());
2879 let data = "abcabc";
2880 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2881 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
2882 assert!(data.slice(2u, 4u).find_str("ab").is_none());
2884 let mut data = "ประเทศไทย中华Việt Nam".to_owned();
2886 assert!(data.find_str("ไท华").is_none());
2887 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
2888 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
2890 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
2891 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
2892 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
2893 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
2894 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
2896 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
2897 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
2898 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
2899 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
2900 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
2904 fn test_slice_chars() {
2905 fn t(a: &str, b: &str, start: uint) {
2906 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2909 t("hello", "llo", 2);
2910 t("hello", "el", 1);
2913 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2918 fn t(v: &[~str], s: &str) {
2919 assert_eq!(v.concat(), s.to_str());
2921 t(["you".to_owned(), "know".to_owned(), "I'm".to_owned(),
2922 "no".to_owned(), "good".to_owned()], "youknowI'mnogood");
2923 let v: &[~str] = [];
2925 t(["hi".to_owned()], "hi");
2930 fn t(v: &[~str], sep: &str, s: &str) {
2931 assert_eq!(v.connect(sep), s.to_str());
2933 t(["you".to_owned(), "know".to_owned(), "I'm".to_owned(),
2934 "no".to_owned(), "good".to_owned()],
2935 " ", "you know I'm no good");
2936 let v: &[~str] = [];
2938 t(["hi".to_owned()], " ", "hi");
2942 fn test_concat_slices() {
2943 fn t(v: &[&str], s: &str) {
2944 assert_eq!(v.concat(), s.to_str());
2946 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2947 let v: &[&str] = [];
2953 fn test_connect_slices() {
2954 fn t(v: &[&str], sep: &str, s: &str) {
2955 assert_eq!(v.connect(sep), s.to_str());
2957 t(["you", "know", "I'm", "no", "good"],
2958 " ", "you know I'm no good");
2960 t(["hi"], " ", "hi");
2965 assert_eq!("x".repeat(4), "xxxx".to_owned());
2966 assert_eq!("hi".repeat(4), "hihihihi".to_owned());
2967 assert_eq!("ไท华".repeat(3), "ไท华ไท华ไท华".to_owned());
2968 assert_eq!("".repeat(4), "".to_owned());
2969 assert_eq!("hi".repeat(0), "".to_owned());
2973 fn test_unsafe_slice() {
2974 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
2975 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
2976 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
2977 fn a_million_letter_a() -> ~str {
2979 let mut rs = StrBuf::new();
2981 rs.push_str("aaaaaaaaaa");
2986 fn half_a_million_letter_a() -> ~str {
2988 let mut rs = StrBuf::new();
2990 rs.push_str("aaaaa");
2995 let letters = a_million_letter_a();
2996 assert!(half_a_million_letter_a() ==
2997 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
fn test_starts_with() {
    // The empty prefix matches every string, including the empty one.
    assert!("".starts_with(""));
    assert!("abc".starts_with(""));

    // Genuine prefixes, ASCII and multi-byte.
    assert!("abc".starts_with("a"));
    assert!("ödd".starts_with("öd"));

    // A prefix that is longer than the string, or simply different.
    assert!(!"a".starts_with("abc"));
    assert!(!"".starts_with("abc"));
    assert!(!"ödd".starts_with("-"));
}
fn test_ends_with() {
    // The empty suffix matches every string, including the empty one.
    assert!("".ends_with(""));
    assert!("abc".ends_with(""));

    // Genuine suffixes, ASCII and multi-byte.
    assert!("abc".ends_with("c"));
    assert!("ddö".ends_with("dö"));

    // A suffix that is longer than the string, or simply different.
    assert!(!"a".ends_with("abc"));
    assert!(!"".ends_with("abc"));
    assert!(!"ddö".ends_with("-"));
}
// `is_empty` holds for the empty string and for nothing longer.
fn test_is_empty() {
    let nothing = "";
    let one_char = "a";
    assert!(nothing.is_empty());
    assert!(!one_char.is_empty());
}
3031 assert_eq!("".replace(a, "b"), "".to_owned());
3032 assert_eq!("a".replace(a, "b"), "b".to_owned());
3033 assert_eq!("ab".replace(a, "b"), "bb".to_owned());
3035 assert!(" test test ".replace(test, "toast") ==
3036 " toast toast ".to_owned());
3037 assert_eq!(" test test ".replace(test, ""), " ".to_owned());
3041 fn test_replace_2a() {
3042 let data = "ประเทศไทย中华".to_owned();
3043 let repl = "دولة الكويت".to_owned();
3045 let a = "ประเ".to_owned();
3046 let a2 = "دولة الكويتทศไทย中华".to_owned();
3047 assert_eq!(data.replace(a, repl), a2);
3051 fn test_replace_2b() {
3052 let data = "ประเทศไทย中华".to_owned();
3053 let repl = "دولة الكويت".to_owned();
3055 let b = "ะเ".to_owned();
3056 let b2 = "ปรدولة الكويتทศไทย中华".to_owned();
3057 assert_eq!(data.replace(b, repl), b2);
3061 fn test_replace_2c() {
3062 let data = "ประเทศไทย中华".to_owned();
3063 let repl = "دولة الكويت".to_owned();
3065 let c = "中华".to_owned();
3066 let c2 = "ประเทศไทยدولة الكويت".to_owned();
3067 assert_eq!(data.replace(c, repl), c2);
3071 fn test_replace_2d() {
3072 let data = "ประเทศไทย中华".to_owned();
3073 let repl = "دولة الكويت".to_owned();
3075 let d = "ไท华".to_owned();
3076 assert_eq!(data.replace(d, repl), data);
3081 assert_eq!("ab", "abc".slice(0, 2));
3082 assert_eq!("bc", "abc".slice(1, 3));
3083 assert_eq!("", "abc".slice(1, 1));
3084 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3086 let data = "ประเทศไทย中华";
3087 assert_eq!("ป", data.slice(0, 3));
3088 assert_eq!("ร", data.slice(3, 6));
3089 assert_eq!("", data.slice(3, 3));
3090 assert_eq!("华", data.slice(30, 33));
3092 fn a_million_letter_X() -> ~str {
3094 let mut rs = StrBuf::new();
3096 rs.push_str("华华华华华华华华华华");
3101 fn half_a_million_letter_X() -> ~str {
3103 let mut rs = StrBuf::new();
3105 rs.push_str("华华华华华");
3110 let letters = a_million_letter_X();
3111 assert!(half_a_million_letter_X() ==
3112 letters.slice(0u, 3u * 500000u).to_owned());
3117 let ss = "中华Việt Nam";
3119 assert_eq!("华", ss.slice(3u, 6u));
3120 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3122 assert_eq!("ab", "abc".slice(0u, 2u));
3123 assert_eq!("bc", "abc".slice(1u, 3u));
3124 assert_eq!("", "abc".slice(1u, 1u));
3126 assert_eq!("中", ss.slice(0u, 3u));
3127 assert_eq!("华V", ss.slice(3u, 7u));
3128 assert_eq!("", ss.slice(3u, 3u));
3143 fn test_slice_fail() {
3144 "中华Việt Nam".slice(0u, 2u);
3148 fn test_slice_from() {
3149 assert_eq!("abcd".slice_from(0), "abcd");
3150 assert_eq!("abcd".slice_from(2), "cd");
3151 assert_eq!("abcd".slice_from(4), "");
3154 fn test_slice_to() {
3155 assert_eq!("abcd".slice_to(0), "");
3156 assert_eq!("abcd".slice_to(2), "ab");
3157 assert_eq!("abcd".slice_to(4), "abcd");
3161 fn test_trim_left_chars() {
3162 let v: &[char] = &[];
3163 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3164 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3165 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3166 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3168 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3169 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3170 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
3174 fn test_trim_right_chars() {
3175 let v: &[char] = &[];
3176 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3177 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3178 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3179 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
3181 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3182 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3183 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
3187 fn test_trim_chars() {
3188 let v: &[char] = &[];
3189 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3190 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3191 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3192 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
3194 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3195 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3196 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
3200 fn test_trim_left() {
3201 assert_eq!("".trim_left(), "");
3202 assert_eq!("a".trim_left(), "a");
3203 assert_eq!(" ".trim_left(), "");
3204 assert_eq!(" blah".trim_left(), "blah");
3205 assert_eq!(" \u3000 wut".trim_left(), "wut");
3206 assert_eq!("hey ".trim_left(), "hey ");
3210 fn test_trim_right() {
3211 assert_eq!("".trim_right(), "");
3212 assert_eq!("a".trim_right(), "a");
3213 assert_eq!(" ".trim_right(), "");
3214 assert_eq!("blah ".trim_right(), "blah");
3215 assert_eq!("wut \u3000 ".trim_right(), "wut");
3216 assert_eq!(" hey".trim_right(), " hey");
3221 assert_eq!("".trim(), "");
3222 assert_eq!("a".trim(), "a");
3223 assert_eq!(" ".trim(), "");
3224 assert_eq!(" blah ".trim(), "blah");
3225 assert_eq!("\nwut \u3000 ".trim(), "wut");
3226 assert_eq!(" hey dude ".trim(), "hey dude");
3230 fn test_is_whitespace() {
3231 assert!("".is_whitespace());
3232 assert!(" ".is_whitespace());
3233 assert!("\u2009".is_whitespace()); // Thin space
3234 assert!(" \n\t ".is_whitespace());
3235 assert!(!" _ ".is_whitespace());
3239 fn test_slice_shift_char() {
3240 let data = "ประเทศไทย中";
3241 assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
3245 fn test_slice_shift_char_2() {
3247 assert_eq!(empty.slice_shift_char(), (None, ""));
3252 // deny overlong encodings
3253 assert!(!is_utf8([0xc0, 0x80]));
3254 assert!(!is_utf8([0xc0, 0xae]));
3255 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3256 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3257 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3258 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3259 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3262 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3263 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3265 assert!(is_utf8([0xC2, 0x80]));
3266 assert!(is_utf8([0xDF, 0xBF]));
3267 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3268 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3269 assert!(is_utf8([0xEE, 0x80, 0x80]));
3270 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3271 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3272 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3276 fn test_is_utf16() {
3277 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3285 // surrogate pairs (randomly generated with Python 3's
3286 // .encode('utf-16be'))
3287 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3288 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3289 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3291 // mixtures (also random)
3292 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3293 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3294 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3297 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3300 // surrogate + regular unit
3302 // surrogate + lead surrogate
3304 // unterminated surrogate
3306 // trail surrogate without a lead
3309 // random byte sequences that Python 3's .decode('utf-16be')
3311 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3312 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3313 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3314 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3315 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3316 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3317 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3318 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3319 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3320 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3321 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3322 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3323 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3324 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3325 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3326 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3327 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3328 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3329 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3330 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3331 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3335 fn test_raw_from_c_str() {
3337 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3339 let c = raw::from_c_str(b);
3340 assert_eq!(c, "AAAAAAA".to_owned());
3345 fn test_as_bytes() {
3348 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3349 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3352 assert_eq!("".as_bytes(), &[]);
3353 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3354 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
3359 fn test_as_bytes_fail() {
3360 // Don't double free. (I'm not sure if this exercises the
3361 // original problem code path anymore.)
3362 let s = "".to_owned();
3363 let _bytes = s.as_bytes();
3369 let buf = "hello".as_ptr();
3371 assert_eq!(*buf.offset(0), 'h' as u8);
3372 assert_eq!(*buf.offset(1), 'e' as u8);
3373 assert_eq!(*buf.offset(2), 'l' as u8);
3374 assert_eq!(*buf.offset(3), 'l' as u8);
3375 assert_eq!(*buf.offset(4), 'o' as u8);
3380 fn test_subslice_offset() {
3381 let a = "kernelsprite";
3382 let b = a.slice(7, a.len());
3383 let c = a.slice(0, a.len() - 6);
3384 assert_eq!(a.subslice_offset(b), 7);
3385 assert_eq!(a.subslice_offset(c), 0);
3387 let string = "a\nb\nc";
3388 let lines: ~[&str] = string.lines().collect();
3389 assert_eq!(string.subslice_offset(lines[0]), 0);
3390 assert_eq!(string.subslice_offset(lines[1]), 2);
3391 assert_eq!(string.subslice_offset(lines[2]), 4);
3396 fn test_subslice_offset_2() {
3397 let a = "alchemiter";
3398 let b = "cruxtruder";
3399 a.subslice_offset(b);
3403 fn vec_str_conversions() {
3404 let s1: ~str = "All mimsy were the borogoves".to_owned();
3406 let v: ~[u8] = s1.as_bytes().to_owned();
3407 let s2: ~str = from_utf8(v).unwrap().to_owned();
3408 let mut i: uint = 0u;
3409 let n1: uint = s1.len();
3410 let n2: uint = v.len();
fn test_contains() {
    // Substrings in the middle, at the start and at the end.
    let ascii = "abcde";
    assert!(ascii.contains("bcd"));
    assert!(ascii.contains("abcd"));
    assert!(ascii.contains("bcde"));

    // The empty needle is contained in everything.
    assert!(ascii.contains(""));
    assert!("".contains(""));

    // Needles that do not occur.
    assert!(!ascii.contains("def"));
    assert!(!"".contains("a"));

    // Multi-byte haystack and needles.
    let mixed = "ประเทศไทย中华Việt Nam".to_owned();
    assert!(mixed.contains("ประเ"));
    assert!(mixed.contains("ะเ"));
    assert!(mixed.contains("中华"));
    assert!(!mixed.contains("ไท华"));
}
3440 fn test_contains_char() {
3441 assert!("abc".contains_char('b'));
3442 assert!("a".contains_char('a'));
3443 assert!(!"abc".contains_char('d'));
3444 assert!(!"".contains_char('a'));
3450 [("𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n".to_owned(),
3451 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3452 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3453 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3454 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3456 ("𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n".to_owned(),
3457 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3458 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3459 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3460 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3461 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3464 ("𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n".to_owned(),
3465 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3466 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3467 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3468 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3469 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3470 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3471 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3473 ("𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n".to_owned(),
3474 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3475 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3476 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3477 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3478 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3479 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3480 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3481 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3482 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3483 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3485 // Issue #12318, even-numbered non-BMP planes
3486 ("\U00020000".to_owned(),
3487 ~[0xD840, 0xDC00])];
3489 for p in pairs.iter() {
3490 let (s, u) = (*p).clone();
3491 assert!(is_utf16(u));
3492 assert_eq!(s.to_utf16(), u);
3494 assert_eq!(from_utf16(u).unwrap(), s);
3495 assert_eq!(from_utf16_lossy(u), s);
3497 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3498 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
3503 fn test_utf16_invalid() {
3504 // completely positive cases tested above.
3506 assert_eq!(from_utf16([0xD800]), None);
3508 assert_eq!(from_utf16([0xD800, 0xD800]), None);
3511 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
3514 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
3518 fn test_utf16_lossy() {
3519 // completely positive cases tested above.
3521 assert_eq!(from_utf16_lossy([0xD800]), "\uFFFD".to_owned());
3523 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), "\uFFFD\uFFFD".to_owned());
3526 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), "a\uFFFD".to_owned());
3529 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), "\uFFFD𐒋\uFFFD".to_owned());
3533 fn test_truncate_utf16_at_nul() {
3535 assert_eq!(truncate_utf16_at_nul(v), &[]);
3538 assert_eq!(truncate_utf16_at_nul(v), &[]);
3541 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3544 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3547 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3552 let s = "ศไทย中华Việt Nam".to_owned();
3553 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3555 for ch in v.iter() {
3556 assert!(s.char_at(pos) == *ch);
3557 pos += from_char(*ch).len();
3562 fn test_char_at_reverse() {
3563 let s = "ศไทย中华Việt Nam".to_owned();
3564 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3565 let mut pos = s.len();
3566 for ch in v.rev_iter() {
3567 assert!(s.char_at_reverse(pos) == *ch);
3568 pos -= from_char(*ch).len();
3573 fn test_escape_unicode() {
3574 assert_eq!("abc".escape_unicode(), "\\x61\\x62\\x63".to_owned());
3575 assert_eq!("a c".escape_unicode(), "\\x61\\x20\\x63".to_owned());
3576 assert_eq!("\r\n\t".escape_unicode(), "\\x0d\\x0a\\x09".to_owned());
3577 assert_eq!("'\"\\".escape_unicode(), "\\x27\\x22\\x5c".to_owned());
3578 assert_eq!("\x00\x01\xfe\xff".escape_unicode(), "\\x00\\x01\\xfe\\xff".to_owned());
3579 assert_eq!("\u0100\uffff".escape_unicode(), "\\u0100\\uffff".to_owned());
3580 assert_eq!("\U00010000\U0010ffff".escape_unicode(), "\\U00010000\\U0010ffff".to_owned());
3581 assert_eq!("ab\ufb00".escape_unicode(), "\\x61\\x62\\ufb00".to_owned());
3582 assert_eq!("\U0001d4ea\r".escape_unicode(), "\\U0001d4ea\\x0d".to_owned());
3586 fn test_escape_default() {
3587 assert_eq!("abc".escape_default(), "abc".to_owned());
3588 assert_eq!("a c".escape_default(), "a c".to_owned());
3589 assert_eq!("\r\n\t".escape_default(), "\\r\\n\\t".to_owned());
3590 assert_eq!("'\"\\".escape_default(), "\\'\\\"\\\\".to_owned());
3591 assert_eq!("\u0100\uffff".escape_default(), "\\u0100\\uffff".to_owned());
3592 assert_eq!("\U00010000\U0010ffff".escape_default(), "\\U00010000\\U0010ffff".to_owned());
3593 assert_eq!("ab\ufb00".escape_default(), "ab\\ufb00".to_owned());
3594 assert_eq!("\U0001d4ea\r".escape_default(), "\\U0001d4ea\\r".to_owned());
// Checks the total ordering of string slices: a longer string with a common
// prefix sorts after the shorter one, and otherwise the first differing byte
// decides.
fn test_total_ord() {
    // Every comparison is wrapped in `assert!`. A bare `a.cmp(b) == X;`
    // statement only computes a boolean and throws it away, so the test
    // could never fail.
    assert!("1234".cmp(& &"123") == Greater);
    assert!("123".cmp(& &"1234") == Less);
    assert!("1234".cmp(& &"1234") == Equal);
    // Decided at the fifth byte: '5' < '6'.
    assert!("12345555".cmp(& &"123456") == Less);
    // Decided at the first byte: '2' > '1', regardless of length.
    assert!("22".cmp(& &"1234") == Greater);
}
3607 fn test_char_range_at() {
3608 let data = "b¢€𤭢𤭢€¢b".to_owned();
3609 assert_eq!('b', data.char_range_at(0).ch);
3610 assert_eq!('¢', data.char_range_at(1).ch);
3611 assert_eq!('€', data.char_range_at(3).ch);
3612 assert_eq!('𤭢', data.char_range_at(6).ch);
3613 assert_eq!('𤭢', data.char_range_at(10).ch);
3614 assert_eq!('€', data.char_range_at(14).ch);
3615 assert_eq!('¢', data.char_range_at(17).ch);
3616 assert_eq!('b', data.char_range_at(19).ch);
3620 fn test_char_range_at_reverse_underflow() {
3621 assert_eq!("abc".char_range_at_reverse(0).next, 0);
3626 #![allow(unnecessary_allocation)]
3628 ($s1:expr, $s2:expr, $e:expr) => { {
3632 assert_eq!(s1 + s2, e.to_owned());
3633 assert_eq!(s1.to_owned() + s2, e.to_owned());
3637 t!("foo", "bar", "foobar");
3638 t!("foo", "bar".to_owned(), "foobar");
3639 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
3640 t!("ศไทย中", "华Việt Nam".to_owned(), "ศไทย中华Việt Nam");
3644 fn test_iterator() {
3646 let s = "ศไทย中华Việt Nam".to_owned();
3647 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3650 let mut it = s.chars();
3653 assert_eq!(c, v[pos]);
3656 assert_eq!(pos, v.len());
3660 fn test_rev_iterator() {
3662 let s = "ศไทย中华Việt Nam".to_owned();
3663 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3666 let mut it = s.chars_rev();
3669 assert_eq!(c, v[pos]);
3672 assert_eq!(pos, v.len());
3676 fn test_iterator_clone() {
3677 let s = "ศไทย中华Việt Nam";
3678 let mut it = s.chars();
3680 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
3684 fn test_bytesator() {
3685 let s = "ศไทย中华Việt Nam".to_owned();
3687 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3688 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3693 for b in s.bytes() {
3694 assert_eq!(b, v[pos]);
3700 fn test_bytes_revator() {
3701 let s = "ศไทย中华Việt Nam".to_owned();
3703 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3704 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3707 let mut pos = v.len();
3709 for b in s.bytes_rev() {
3711 assert_eq!(b, v[pos]);
3716 fn test_char_indicesator() {
3718 let s = "ศไทย中华Việt Nam";
3719 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
3720 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3723 let mut it = s.char_indices();
3726 assert_eq!(c, (p[pos], v[pos]));
3729 assert_eq!(pos, v.len());
3730 assert_eq!(pos, p.len());
3734 fn test_char_indices_revator() {
3736 let s = "ศไทย中华Việt Nam";
3737 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
3738 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3741 let mut it = s.char_indices_rev();
3744 assert_eq!(c, (p[pos], v[pos]));
3747 assert_eq!(pos, v.len());
3748 assert_eq!(pos, p.len());
3752 fn test_split_char_iterator() {
3753 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3755 let split: ~[&str] = data.split(' ').collect();
3756 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3758 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
3760 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3762 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
3763 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3765 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
3767 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3770 let split: ~[&str] = data.split('ä').collect();
3771 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3773 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
3775 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3777 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
3778 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3780 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
3782 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3786 fn test_splitn_char_iterator() {
3787 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3789 let split: ~[&str] = data.splitn(' ', 3).collect();
3790 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3792 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
3793 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3796 let split: ~[&str] = data.splitn('ä', 3).collect();
3797 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3799 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
3800 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3804 fn test_rsplitn_char_iterator() {
3805 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3807 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
3809 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3811 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
3813 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3816 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
3818 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3820 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
3822 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3826 fn test_split_char_iterator_no_trailing() {
3827 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3829 let split: ~[&str] = data.split('\n').collect();
3830 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3832 let split: ~[&str] = data.split_terminator('\n').collect();
3833 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3837 fn test_rev_split_char_iterator_no_trailing() {
3838 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3840 let mut split: ~[&str] = data.split('\n').rev().collect();
3842 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3844 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
3846 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3851 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
3852 let words: ~[&str] = data.words().collect();
3853 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
// Collects the output of `nfd_chars` for each input and compares it with
// the expected decomposed form.
fn test_nfd_chars() {
    // (input, expected output)
    let cases = [("abc", "abc"),
                 ("\u1e0b\u01c4", "d\u0307\u01c4"),
                 ("\u2026", "\u2026"),
                 ("\u2126", "\u03a9"),
                 ("\u1e0b\u0323", "d\u0323\u0307"),
                 ("\u1e0d\u0307", "d\u0323\u0307"),
                 ("a\u0301", "a\u0301"),
                 ("\u0301a", "\u0301a"),
                 ("\ud4db", "\u1111\u1171\u11b6"),
                 ("\uac1c", "\u1100\u1162")];
    for &(input, decomposed) in cases.iter() {
        assert_eq!(input.nfd_chars().collect::<~str>(), decomposed.to_owned());
    }
}
3871 fn test_nfkd_chars() {
3872 assert_eq!("abc".nfkd_chars().collect::<~str>(), "abc".to_owned());
3873 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), "d\u0307DZ\u030c".to_owned());
3874 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), "...".to_owned());
3875 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), "\u03a9".to_owned());
3876 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), "d\u0323\u0307".to_owned());
3877 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), "d\u0323\u0307".to_owned());
3878 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), "a\u0301".to_owned());
3879 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), "\u0301a".to_owned());
3880 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), "\u1111\u1171\u11b6".to_owned());
3881 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), "\u1100\u1162".to_owned());
3886 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3887 let lines: ~[&str] = data.lines().collect();
3888 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3890 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3891 let lines: ~[&str] = data.lines().collect();
3892 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3896 fn test_split_strator() {
3897 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3898 let v: ~[&str] = s.split_str(sep).collect();
3901 t("--1233345--", "12345", ~["--1233345--"]);
3902 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3903 t("::hello::there", "::", ~["", "hello", "there"]);
3904 t("hello::there::", "::", ~["hello", "there", ""]);
3905 t("::hello::there::", "::", ~["", "hello", "there", ""]);
3906 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3907 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3908 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3909 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3911 t("zz", "zz", ~["",""]);
3912 t("ok", "z", ~["ok"]);
3913 t("zzz", "zz", ~["","z"]);
3914 t("zzzzz", "zz", ~["","","z"]);
3918 fn test_str_default() {
3919 use default::Default;
3920 fn t<S: Default + Str>() {
3921 let s: S = Default::default();
3922 assert_eq!(s.as_slice(), "");
3930 fn test_str_container() {
3931 fn sum_len<S: Container>(v: &[S]) -> uint {
3932 v.iter().map(|x| x.len()).sum()
3935 let s = "01234".to_owned();
3936 assert_eq!(5, sum_len(["012", "", "34"]));
3937 assert_eq!(5, sum_len(["01".to_owned(), "2".to_owned(), "34".to_owned(), "".to_owned()]));
3938 assert_eq!(5, sum_len([s.as_slice()]));
3942 fn test_str_from_utf8() {
3943 let xs = bytes!("hello");
3944 assert_eq!(from_utf8(xs), Some("hello"));
3946 let xs = bytes!("ศไทย中华Việt Nam");
3947 assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
3949 let xs = bytes!("hello", 0xff);
3950 assert_eq!(from_utf8(xs), None);
// Checks `from_utf8_owned` on owned byte vectors: valid input yields `Some` owned string with
// the same text, invalid input yields `None`.
3954 fn test_str_from_utf8_owned() {
// Plain ASCII.
3955 let xs = bytes!("hello").to_owned();
3956 assert_eq!(from_utf8_owned(xs), Some("hello".to_owned()));
// Valid multi-byte text.
3958 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
3959 assert_eq!(from_utf8_owned(xs), Some("ศไทย中华Việt Nam".to_owned()));
// A trailing 0xff byte makes the whole input invalid.
3961 let xs = bytes!("hello", 0xff).to_owned();
3962 assert_eq!(from_utf8_owned(xs), None);
// Checks `from_utf8_lossy`: valid input comes back as a borrowed `Slice`, while input with
// invalid bytes comes back as an `Owned` string in which the bad bytes are replaced by U+FFFD.
// The expected strings below pin how many replacement characters each bad sequence produces.
3966 fn test_str_from_utf8_lossy() {
// Valid ASCII and valid multi-byte input: returned as `Slice`, text unchanged.
3967 let xs = bytes!("hello");
3968 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
3970 let xs = bytes!("ศไทย中华Việt Nam");
3971 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// A lone 0xC2 and a lone 0xFF: one replacement character each.
3973 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
3974 assert_eq!(from_utf8_lossy(xs), Owned("Hello\uFFFD There\uFFFD Goodbye".to_owned()));
// 0xC0 0x80 yields two replacements; the truncated 0xE6 0x83 yields one.
3976 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
3977 assert_eq!(from_utf8_lossy(xs), Owned("Hello\uFFFD\uFFFD There\uFFFD Goodbye".to_owned()));
// 0xF5 alone yields one replacement; 0xF5 0x80 yields two.
3979 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
3980 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFDfoo\uFFFD\uFFFDbar".to_owned()));
// 0xF1 followed by zero, one or two 0x80 bytes: a single replacement in every case.
3982 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
3983 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFDfoo\uFFFDbar\uFFFDbaz".to_owned()));
// 0xF4 alone and 0xF4 0x80 yield one replacement each; 0xF4 0xBF yields two.
3985 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
3986 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz".to_owned()));
// 0xF0 0x80 0x80 0x80 yields four replacements, whereas 0xF0 0x90 0x80 0x80 is accepted
// and decodes to U+10000.
3988 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
3989 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFD\uFFFD\uFFFD\uFFFD\
3990 foo\U00010000bar".to_owned()));
// 0xED 0xA0 0x80 and 0xED 0xBF 0xBF (the UTF-8 style encodings of the surrogates U+D800 and
// U+DFFF) are rejected: three replacement characters each.
3993 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
3994 assert_eq!(from_utf8_lossy(xs), Owned("\uFFFD\uFFFD\uFFFDfoo\
3995 \uFFFD\uFFFD\uFFFDbar".to_owned()));
// Checks that `from_str` applied to a string slice yields `Some` owned string with the same
// text.
3999 fn test_from_str() {
4000 let owned: Option<~str> = from_str(&"string");
4001 assert_eq!(owned, Some("string".to_owned()));
// Checks the trait behaviour of both variants built here, `Slice` and `Owned`: length,
// slice view, string conversion, formatting, ordering, default value and cross-variant
// comparison.
4005 fn test_maybe_owned_traits() {
// Borrowed variant.
4006 let s = Slice("abcde");
4007 assert_eq!(s.len(), 5);
4008 assert_eq!(s.as_slice(), "abcde");
4009 assert_eq!(s.to_str(), "abcde".to_owned());
4010 assert_eq!(format!("{}", s), "abcde".to_owned());
// Ordering is checked against the other variant.
4011 assert!(s.lt(&Owned("bcdef".to_owned())));
4012 assert_eq!(Slice(""), Default::default());
// Owned variant: the same checks, mirrored.
4014 let o = Owned("abcde".to_owned());
4015 assert_eq!(o.len(), 5);
4016 assert_eq!(o.as_slice(), "abcde");
4017 assert_eq!(o.to_str(), "abcde".to_owned());
4018 assert_eq!(format!("{}", o), "abcde".to_owned());
4019 assert!(o.lt(&Slice("bcdef")));
4020 assert_eq!(Owned("".to_owned()), Default::default());
// A `Slice` and an `Owned` holding the same text compare as equal in both directions.
4022 assert!(s.cmp(&o) == Equal);
4023 assert!(s.equiv(&o));
4025 assert!(o.cmp(&s) == Equal);
4026 assert!(o.equiv(&s));
// Checks that `is_slice` and `is_owned` report the variant each value was built with.
4030 fn test_maybe_owned_methods() {
4031 let s = Slice("abcde");
4032 assert!(s.is_slice());
4033 assert!(!s.is_owned());
4035 let o = Owned("abcde".to_owned());
4036 assert!(!o.is_slice());
4037 assert!(o.is_owned());
// Checks that a clone of either variant compares equal to both a `Slice` and an `Owned`
// holding the same text.
4041 fn test_maybe_owned_clone() {
4042 assert_eq!(Owned("abcde".to_owned()), Slice("abcde").clone());
4043 assert_eq!(Owned("abcde".to_owned()), Owned("abcde".to_owned()).clone());
4044 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4045 assert_eq!(Slice("abcde"), Owned("abcde".to_owned()).clone());
// Checks that `into_owned` on either variant produces an owned string with the same text.
4049 fn test_maybe_owned_into_owned() {
4050 assert_eq!(Slice("abcde").into_owned(), "abcde".to_owned());
4051 assert_eq!(Owned("abcde".to_owned()).into_owned(), "abcde".to_owned());
// Checks `into_maybe_owned` on a string literal and on an owned string: each result is
// asserted equal to both a `Slice` and an `Owned` holding the same text.
4055 fn test_into_maybe_owned() {
4056 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4057 assert_eq!(("abcde".to_owned()).into_maybe_owned(), Slice("abcde"));
4058 assert_eq!("abcde".into_maybe_owned(), Owned("abcde".to_owned()));
4059 assert_eq!(("abcde".to_owned()).into_maybe_owned(), Owned("abcde".to_owned()));
4066 use self::test::Bencher;
// Benchmark: walks the `chars()` iterator over mixed non-ASCII/ASCII text and asserts that the
// number of items equals `char_len()`.
4071 fn char_iterator(b: &mut Bencher) {
4072 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Expected count is computed once, outside the timed closure.
4073 let len = s.char_len();
4075 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: same measurement as the `chars()` count check, but over ASCII-only text. The
// input is a single string literal spanning several source lines.
4079 fn char_iterator_ascii(b: &mut Bencher) {
4080 let s = "Mary had a little lamb, Little lamb
4081 Mary had a little lamb, Little lamb
4082 Mary had a little lamb, Little lamb
4083 Mary had a little lamb, Little lamb
4084 Mary had a little lamb, Little lamb
4085 Mary had a little lamb, Little lamb";
// Expected count is computed once, outside the timed closure.
4086 let len = s.char_len();
4088 b.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: walks the `chars_rev()` iterator over mixed non-ASCII/ASCII text and asserts that
// the number of items equals `char_len()`.
4092 fn char_iterator_rev(b: &mut Bencher) {
4093 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Expected count is computed once, outside the timed closure.
4094 let len = s.char_len();
4096 b.iter(|| assert_eq!(s.chars_rev().len(), len));
// Benchmark: walks the `char_indices()` iterator over mixed non-ASCII/ASCII text and asserts
// that the number of items equals `char_len()`.
4100 fn char_indicesator(b: &mut Bencher) {
4101 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Expected count is computed once, outside the timed closure.
4102 let len = s.char_len();
4104 b.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: walks the `char_indices_rev()` iterator over mixed non-ASCII/ASCII text and
// asserts that the number of items equals `char_len()`.
4108 fn char_indicesator_rev(b: &mut Bencher) {
4109 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Expected count is computed once, outside the timed closure.
4110 let len = s.char_len();
4112 b.iter(|| assert_eq!(s.char_indices_rev().len(), len));
// Benchmark: splits non-ASCII text on the ASCII character 'V' and asserts that three pieces
// come out (the input contains 'V' twice).
4116 fn split_unicode_ascii(b: &mut Bencher) {
4117 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4119 b.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: splits the same non-ASCII text on 'V', but through a custom `CharEq`
// implementation whose `only_ascii` returns false.
4123 fn split_unicode_not_ascii(b: &mut Bencher) {
// Wrapper around a single character, used as the split predicate.
4124 struct NotAscii(char);
4125 impl CharEq for NotAscii {
// NOTE(review): the listing numbers skip 4128-4129, so the expression that `matches` evaluates
// is not visible here — presumably a comparison of `cc` with `c`; confirm against the full file.
4126 fn matches(&self, c: char) -> bool {
4127 let NotAscii(cc) = *self;
// Reports that this matcher is not restricted to ASCII.
4130 fn only_ascii(&self) -> bool { false }
4132 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4134 b.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splits ASCII text on a space character and asserts that the piece count matches
// a count taken once before timing.
4139 fn split_ascii(b: &mut Bencher) {
4140 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count, computed outside the timed closure.
4141 let len = s.split(' ').len();
4143 b.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splits ASCII text on a space, but through a custom `CharEq` implementation whose
// `only_ascii` returns false; the piece count must match a plain `split(' ')`.
4147 fn split_not_ascii(b: &mut Bencher) {
// Wrapper around a single character, used as the split predicate.
4148 struct NotAscii(char);
4149 impl CharEq for NotAscii {
// NOTE(review): the listing numbers skip 4150 and 4153-4154, so the expression that `matches`
// evaluates is not visible here — presumably a comparison of `cc` with `c`; confirm against
// the full file.
4151 fn matches(&self, c: char) -> bool {
4152 let NotAscii(cc) = *self;
// Reports that this matcher is not restricted to ASCII.
4155 fn only_ascii(&self) -> bool { false }
4157 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain character split, computed outside the timed closure.
4158 let len = s.split(' ').len();
4160 b.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splits ASCII text using a named function as the predicate; the piece count must
// match a plain `split(' ')`.
4164 fn split_extern_fn(b: &mut Bencher) {
4165 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain character split, computed outside the timed closure.
4166 let len = s.split(' ').len();
// Predicate: true exactly for the space character.
4167 fn pred(c: char) -> bool { c == ' ' }
4169 b.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splits ASCII text using a closure as the predicate; the piece count must match a
// plain `split(' ')`.
4173 fn split_closure(b: &mut Bencher) {
4174 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain character split, computed outside the timed closure.
4175 let len = s.split(' ').len();
4177 b.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splits ASCII text using a one-element slice of characters as the separator set;
// the piece count must match a plain `split(' ')`.
4181 fn split_slice(b: &mut Bencher) {
4182 let s = "Mary had a little lamb, Little lamb, little-lamb.";
// Reference count from the plain character split, computed outside the timed closure.
4183 let len = s.split(' ').len();
4185 b.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark set-up over a 100-byte ASCII input.
// NOTE(review): the listing numbers stop at 4194 and resume at 4201, so the timed call is not
// visible here — the function name suggests it measures `is_utf8`; confirm against the full file.
4189 fn is_utf8_100_ascii(b: &mut Bencher) {
4191 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4192 Lorem ipsum dolor sit amet, consectetur. ");
// Guards the fixture: the literal must be exactly 100 bytes long.
4194 assert_eq!(100, s.len());
// Benchmark set-up over a 100-byte input made of multi-byte characters.
// NOTE(review): the listing numbers stop at 4203 and resume at 4210, so the timed call is not
// visible here — the function name suggests it measures `is_utf8`; confirm against the full file.
4201 fn is_utf8_100_multibyte(b: &mut Bencher) {
4202 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guards the fixture: the literal must be exactly 100 bytes long.
4203 assert_eq!(100, s.len());
// Benchmark: runs `from_utf8_lossy` over a 100-byte ASCII input and discards the result.
// NOTE(review): listing line 4215 is missing, so the `b.iter` wrapper around the call is not
// visible here — confirm against the full file.
4210 fn from_utf8_lossy_100_ascii(b: &mut Bencher) {
4211 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4212 Lorem ipsum dolor sit amet, consectetur. ");
// Guards the fixture: the literal must be exactly 100 bytes long.
4214 assert_eq!(100, s.len());
4216 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a 100-byte input made of multi-byte characters and
// discards the result.
// NOTE(review): listing line 4224 is missing, so the `b.iter` wrapper around the call is not
// visible here — confirm against the full file.
4221 fn from_utf8_lossy_100_multibyte(b: &mut Bencher) {
4222 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
// Guards the fixture: the literal must be exactly 100 bytes long.
4223 assert_eq!(100, s.len());
4225 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a short input that mixes ASCII text with the byte
// sequences 0xC0 0x80 and 0xE6 0x83, and discards the result.
// NOTE(review): listing line 4232 is missing, so the `b.iter` wrapper around the call is not
// visible here — confirm against the full file.
4230 fn from_utf8_lossy_invalid(b: &mut Bencher) {
4231 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4233 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a vector of 100 copies of the byte 0xF5 and discards
// the result.
// NOTE(review): listing line 4240 is missing, so the `b.iter` wrapper around the call is not
// visible here — confirm against the full file.
4238 fn from_utf8_lossy_100_invalid(b: &mut Bencher) {
4239 let s = Vec::from_elem(100, 0xF5u8);
4241 let _ = from_utf8_lossy(s.as_slice());
// Benchmark: joins ten copies of a mixed non-ASCII/ASCII string with `connect` and asserts the
// length of the result.
// NOTE(review): listing lines 4248 and 4250 are missing, so the binding of `sep` and the
// `b.iter` wrapper are not visible here — confirm against the full file.
4246 fn bench_connect(b: &mut Bencher) {
4247 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4249 let v = [s, s, s, s, s, s, s, s, s, s];
// Ten pieces joined by nine separators: the expected length is 10 * s.len() + 9 * sep.len().
4251 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);