1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = ~"I am an owned string";
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
64 let mut buf = ~"testing";
67 assert_eq!(buf, ~"testing 123");
72 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
73 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
74 encoded UTF-8 sequences. Additionally, strings are not null-terminated
75 and can contain null codepoints.
77 The actual representation of strings has a direct mapping to vectors:
79 * `~str` is the same as `~[u8]`
80 * `&str` is the same as `&[u8]`
89 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
90 use container::{Container, Mutable};
93 use iter::{Iterator, FromIterator, Extendable, range};
94 use iter::{Filter, AdditiveIterator, Map};
95 use iter::{Rev, DoubleEndedIterator, ExactSize};
98 use option::{None, Option, Some};
101 use from_str::FromStr;
103 use slice::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector};
106 use default::Default;
110 Section: Creating a string
113 /// Consumes a vector of bytes to create a new utf-8 string.
114 /// Returns None if the vector contains invalid UTF-8.
115 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
117 Some(unsafe { raw::from_utf8_owned(vv) })
123 /// Converts a vector to a string slice without performing any allocations.
125 /// Once the slice has been validated as utf-8, it is transmuted in-place and
126 /// returned as a '&str' instead of a '&[u8]'
128 /// Returns None if the slice is not utf-8.
129 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
131 Some(unsafe { raw::from_utf8(v) })
135 impl FromStr for ~str {
137 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
140 /// Convert a byte to a UTF-8 string
144 /// Fails if invalid UTF-8
145 pub fn from_byte(b: u8) -> ~str {
147 unsafe { ::cast::transmute(~[b]) }
150 /// Convert a char to a string
151 pub fn from_char(ch: char) -> ~str {
157 /// Convert a vector of chars to a string
158 pub fn from_chars(chs: &[char]) -> ~str {
159 chs.iter().map(|c| *c).collect()
163 pub fn push_str(lhs: &mut ~str, rhs: &str) {
167 /// Methods for vectors of strings
168 pub trait StrVector {
169 /// Concatenate a vector of strings.
170 fn concat(&self) -> ~str;
172 /// Concatenate a vector of strings, placing a given separator between each.
173 fn connect(&self, sep: &str) -> ~str;
176 impl<'a, S: Str> StrVector for &'a [S] {
177 fn concat(&self) -> ~str {
178 if self.is_empty() { return ~""; }
180 // `len` calculation may overflow, but push_str will check boundaries
181 let len = self.iter().map(|s| s.as_slice().len()).sum();
183 let mut result = with_capacity(len);
185 for s in self.iter() {
186 result.push_str(s.as_slice())
191 fn connect(&self, sep: &str) -> ~str {
192 if self.is_empty() { return ~""; }
195 if sep.is_empty() { return self.concat(); }
197 // this is wrong without the guarantee that `self` is non-empty
198 // `len` calculation may overflow, but push_str will check boundaries
199 let len = sep.len() * (self.len() - 1)
200 + self.iter().map(|s| s.as_slice().len()).sum();
201 let mut result = with_capacity(len);
202 let mut first = true;
204 for s in self.iter() {
208 result.push_str(sep);
210 result.push_str(s.as_slice());
216 impl<'a, S: Str> StrVector for Vec<S> {
218 fn concat(&self) -> ~str {
219 self.as_slice().concat()
223 fn connect(&self, sep: &str) -> ~str {
224 self.as_slice().connect(sep)
228 /// Something that can be used to compare against a character
230 /// Determine if the splitter should split at the given character
231 fn matches(&self, char) -> bool;
232 /// Indicate if this is only concerned about ASCII characters,
233 /// which can allow for a faster implementation.
234 fn only_ascii(&self) -> bool;
237 impl CharEq for char {
239 fn matches(&self, c: char) -> bool { *self == c }
241 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
244 impl<'a> CharEq for 'a |char| -> bool {
246 fn matches(&self, c: char) -> bool { (*self)(c) }
248 fn only_ascii(&self) -> bool { false }
251 impl CharEq for extern "Rust" fn(char) -> bool {
253 fn matches(&self, c: char) -> bool { (*self)(c) }
255 fn only_ascii(&self) -> bool { false }
258 impl<'a, C: CharEq> CharEq for &'a [C] {
260 fn matches(&self, c: char) -> bool {
261 self.iter().any(|m| m.matches(c))
264 fn only_ascii(&self) -> bool {
265 self.iter().all(|m| m.only_ascii())
273 /// External iterator for a string's characters.
274 /// Use with the `std::iter` module.
276 pub struct Chars<'a> {
277 /// The slice remaining to be iterated
278 priv string: &'a str,
281 impl<'a> Iterator<char> for Chars<'a> {
283 fn next(&mut self) -> Option<char> {
284 // Decode the next codepoint, then update
285 // the slice to be just the remaining part
286 if self.string.len() != 0 {
287 let CharRange {ch, next} = self.string.char_range_at(0);
289 self.string = raw::slice_unchecked(self.string, next, self.string.len());
298 fn size_hint(&self) -> (uint, Option<uint>) {
299 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
303 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
305 fn next_back(&mut self) -> Option<char> {
306 if self.string.len() != 0 {
307 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
309 self.string = raw::slice_unchecked(self.string, 0, next);
318 /// External iterator for a string's characters and their byte offsets.
319 /// Use with the `std::iter` module.
321 pub struct CharOffsets<'a> {
322 /// The original string to be iterated
323 priv string: &'a str,
324 priv iter: Chars<'a>,
327 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
329 fn next(&mut self) -> Option<(uint, char)> {
330 // Compute the byte offset by using the pointer offset between
331 // the original string slice and the iterator's remaining part
332 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
333 self.iter.next().map(|ch| (offset, ch))
337 fn size_hint(&self) -> (uint, Option<uint>) {
338 self.iter.size_hint()
342 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
344 fn next_back(&mut self) -> Option<(uint, char)> {
345 self.iter.next_back().map(|ch| {
346 let offset = self.iter.string.len() +
347 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
353 /// External iterator for a string's characters in reverse order.
354 /// Use with the `std::iter` module.
355 pub type RevChars<'a> = Rev<Chars<'a>>;
357 /// External iterator for a string's characters and their byte offsets in reverse order.
358 /// Use with the `std::iter` module.
359 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
361 /// External iterator for a string's bytes.
362 /// Use with the `std::iter` module.
364 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
366 /// External iterator for a string's bytes in reverse order.
367 /// Use with the `std::iter` module.
368 pub type RevBytes<'a> = Rev<Bytes<'a>>;
370 /// An iterator over the substrings of a string, separated by `sep`.
372 pub struct CharSplits<'a, Sep> {
373 /// The slice remaining to be iterated
374 priv string: &'a str,
376 /// Whether an empty string at the end is allowed
377 priv allow_trailing_empty: bool,
378 priv only_ascii: bool,
382 /// An iterator over the substrings of a string, separated by `sep`,
383 /// starting from the back of the string.
384 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
386 /// An iterator over the substrings of a string, separated by `sep`,
387 /// splitting at most `count` times.
389 pub struct CharSplitsN<'a, Sep> {
390 priv iter: CharSplits<'a, Sep>,
391 /// The number of splits remaining
396 /// An iterator over the words of a string, separated by a sequence of whitespace
398 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
400 /// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
401 pub type AnyLines<'a> =
402 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
404 impl<'a, Sep> CharSplits<'a, Sep> {
406 fn get_end(&mut self) -> Option<&'a str> {
407 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
408 self.finished = true;
416 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
418 fn next(&mut self) -> Option<&'a str> {
419 if self.finished { return None }
421 let mut next_split = None;
423 for (idx, byte) in self.string.bytes().enumerate() {
424 if self.sep.matches(byte as char) && byte < 128u8 {
425 next_split = Some((idx, idx + 1));
430 for (idx, ch) in self.string.char_indices() {
431 if self.sep.matches(ch) {
432 next_split = Some((idx, self.string.char_range_at(idx).next));
438 Some((a, b)) => unsafe {
439 let elt = raw::slice_unchecked(self.string, 0, a);
440 self.string = raw::slice_unchecked(self.string, b, self.string.len());
443 None => self.get_end(),
448 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
449 for CharSplits<'a, Sep> {
451 fn next_back(&mut self) -> Option<&'a str> {
452 if self.finished { return None }
454 if !self.allow_trailing_empty {
455 self.allow_trailing_empty = true;
456 match self.next_back() {
457 Some(elt) if !elt.is_empty() => return Some(elt),
458 _ => if self.finished { return None }
461 let len = self.string.len();
462 let mut next_split = None;
465 for (idx, byte) in self.string.bytes().enumerate().rev() {
466 if self.sep.matches(byte as char) && byte < 128u8 {
467 next_split = Some((idx, idx + 1));
472 for (idx, ch) in self.string.char_indices_rev() {
473 if self.sep.matches(ch) {
474 next_split = Some((idx, self.string.char_range_at(idx).next));
480 Some((a, b)) => unsafe {
481 let elt = raw::slice_unchecked(self.string, b, len);
482 self.string = raw::slice_unchecked(self.string, 0, a);
485 None => { self.finished = true; Some(self.string) }
490 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
492 fn next(&mut self) -> Option<&'a str> {
495 if self.invert { self.iter.next_back() } else { self.iter.next() }
502 /// An iterator over the start and end indices of the matches of a
503 /// substring within a larger string
505 pub struct MatchIndices<'a> {
506 priv haystack: &'a str,
507 priv needle: &'a str,
511 /// An iterator over the substrings of a string separated by a given
514 pub struct StrSplits<'a> {
515 priv it: MatchIndices<'a>,
520 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
522 fn next(&mut self) -> Option<(uint, uint)> {
523 // See Issue #1932 for why this is a naive search
524 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
525 let mut match_start = 0;
528 while self.position < h_len {
529 if self.haystack[self.position] == self.needle[match_i] {
530 if match_i == 0 { match_start = self.position; }
534 if match_i == n_len {
536 return Some((match_start, self.position));
539 // failed match, backtrack
542 self.position = match_start;
551 impl<'a> Iterator<&'a str> for StrSplits<'a> {
553 fn next(&mut self) -> Option<&'a str> {
554 if self.finished { return None; }
556 match self.it.next() {
557 Some((from, to)) => {
558 let ret = Some(self.it.haystack.slice(self.last_end, from));
563 self.finished = true;
564 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
570 // Helper functions used for Unicode normalization
571 fn canonical_sort(comb: &mut [(char, u8)]) {
575 let len = comb.len();
576 for i in range(0, len) {
577 let mut swapped = false;
578 for j in range(1, len-i) {
579 let class_a = *comb[j-1].ref1();
580 let class_b = *comb[j].ref1();
581 if class_a != 0 && class_b != 0 && class_a > class_b {
586 if !swapped { break; }
591 enum NormalizationForm {
596 /// External iterator for a string's normalization's characters.
597 /// Use with the `std::iter` module.
599 pub struct Normalizations<'a> {
600 priv kind: NormalizationForm,
601 priv iter: Chars<'a>,
602 priv buffer: ~[(char, u8)],
606 impl<'a> Iterator<char> for Normalizations<'a> {
608 fn next(&mut self) -> Option<char> {
609 use unicode::decompose::canonical_combining_class;
611 match self.buffer.head() {
617 Some(&(c, _)) if self.sorted => {
621 _ => self.sorted = false
624 let decomposer = match self.kind {
625 NFD => char::decompose_canonical,
626 NFKD => char::decompose_compatible
630 for ch in self.iter {
631 let buffer = &mut self.buffer;
632 let sorted = &mut self.sorted;
634 let class = canonical_combining_class(d);
635 if class == 0 && !*sorted {
636 canonical_sort(*buffer);
639 buffer.push((d, class));
646 canonical_sort(self.buffer);
650 match self.buffer.shift() {
655 Some((c, _)) => Some(c),
660 fn size_hint(&self) -> (uint, Option<uint>) {
661 let (lower, _) = self.iter.size_hint();
666 /// Replace all occurrences of one string with another
670 /// * s - The string containing substrings to replace
671 /// * from - The string to replace
672 /// * to - The replacement string
676 /// The original string with all occurrences of `from` replaced with `to`
677 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
678 let mut result = ~"";
679 let mut last_end = 0;
680 for (start, end) in s.match_indices(from) {
681 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
685 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
690 Section: Comparing strings
693 // share the implementation of the lang-item vs. non-lang-item
696 fn eq_slice_(a: &str, b: &str) -> bool {
697 a.len() == b.len() && unsafe {
698 libc::memcmp(a.as_ptr() as *libc::c_void,
699 b.as_ptr() as *libc::c_void,
700 a.len() as libc::size_t) == 0
704 /// Bytewise slice equality
708 pub fn eq_slice(a: &str, b: &str) -> bool {
712 /// Bytewise slice equality
715 pub fn eq_slice(a: &str, b: &str) -> bool {
719 /// Bytewise string equality
721 #[lang="uniq_str_eq"]
723 pub fn eq(a: &~str, b: &~str) -> bool {
729 pub fn eq(a: &~str, b: &~str) -> bool {
737 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
738 /// returning `true` in that case, or, if it is invalid, `false` with
739 /// `iter` reset such that it is pointing at the first byte in the
740 /// invalid sequence.
742 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
744 // save the current thing we're pointing at.
747 // restore the iterator we had at the start of this codepoint.
748 macro_rules! err ( () => { {*iter = old; return false} });
749 macro_rules! next ( () => {
752 // we needed data, but there was none: error!
757 let first = match iter.next() {
759 // we're at the end of the iterator and a codepoint
760 // boundary at the same time, so this string is valid.
764 // ASCII characters are always valid, so only large
765 // bytes need more examination.
767 let w = utf8_char_width(first);
768 let second = next!();
769 // 2-byte encoding is for codepoints \u0080 to \u07ff
770 // first C2 80 last DF BF
771 // 3-byte encoding is for codepoints \u0800 to \uffff
772 // first E0 A0 80 last EF BF BF
773 // excluding surrogate codepoints \ud800 to \udfff
774 // ED A0 80 to ED BF BF
775 // 4-byte encoding is for codepoints \u10000 to \u10ffff
776 // first F0 90 80 80 last F4 8F BF BF
778 // Use the UTF-8 syntax from the RFC
780 // https://tools.ietf.org/html/rfc3629
782 // UTF8-2 = %xC2-DF UTF8-tail
783 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
784 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
785 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
786 // %xF4 %x80-8F 2( UTF8-tail )
788 2 => if second & 192 != TAG_CONT_U8 {err!()},
790 match (first, second, next!() & 192) {
791 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
792 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
793 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
794 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
799 match (first, second, next!() & 192, next!() & 192) {
800 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
801 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
802 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
812 /// Determines if a vector of bytes contains valid UTF-8.
813 pub fn is_utf8(v: &[u8]) -> bool {
814 run_utf8_validation_iterator(&mut v.iter())
818 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
819 let mut it = v.iter();
821 let ok = run_utf8_validation_iterator(&mut it);
825 // work out how many valid bytes we've consumed
826 // (run_utf8_validation_iterator resets the iterator to just
827 // after the last good byte), which we can do because the
828 // vector iterator size_hint is exact.
829 let (remaining, _) = it.size_hint();
830 Some(v.len() - remaining)
834 /// Determines if a vector of `u16` contains valid UTF-16
835 pub fn is_utf16(v: &[u16]) -> bool {
836 let mut it = v.iter();
837 macro_rules! next ( ($ret:expr) => {
838 match it.next() { Some(u) => *u, None => return $ret }
844 match char::from_u32(u as u32) {
847 let u2 = next!(false);
848 if u < 0xD7FF || u > 0xDBFF ||
849 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
855 /// An iterator that decodes UTF-16 encoded codepoints from a vector
858 pub struct UTF16Items<'a> {
859 priv iter: slice::Items<'a, u16>
861 /// The possibilities for values decoded from a `u16` stream.
862 #[deriving(Eq, TotalEq, Clone, Show)]
864 /// A valid codepoint.
866 /// An invalid surrogate without its pair.
871 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
872 /// replacement character (U+FFFD).
874 pub fn to_char_lossy(&self) -> char {
877 LoneSurrogate(_) => '\uFFFD'
882 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
883 fn next(&mut self) -> Option<UTF16Item> {
884 let u = match self.iter.next() {
889 if u < 0xD800 || 0xDFFF < u {
891 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
892 } else if u >= 0xDC00 {
893 // a trailing surrogate
894 Some(LoneSurrogate(u))
896 // preserve state for rewinding.
899 let u2 = match self.iter.next() {
902 None => return Some(LoneSurrogate(u))
904 if u2 < 0xDC00 || u2 > 0xDFFF {
905 // not a trailing surrogate so we're not a valid
906 // surrogate pair, so rewind to redecode u2 next time.
908 return Some(LoneSurrogate(u))
911 // all ok, so let's decode it.
912 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
913 Some(ScalarValue(unsafe {cast::transmute(c)}))
918 fn size_hint(&self) -> (uint, Option<uint>) {
919 let (low, high) = self.iter.size_hint();
920 // we could be entirely valid surrogates (2 elements per
921 // char), or entirely non-surrogates (1 element per char)
926 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
927 /// returning invalid surrogates as `LoneSurrogate`s.
933 /// use std::str::{ScalarValue, LoneSurrogate};
935 /// // 𝄞mus<invalid>ic<invalid>
936 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
937 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
940 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
941 /// ~[ScalarValue('𝄞'),
942 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
943 /// LoneSurrogate(0xDD1E),
944 /// ScalarValue('i'), ScalarValue('c'),
945 /// LoneSurrogate(0xD834)]);
947 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
948 UTF16Items { iter : v.iter() }
951 /// Return a slice of `v` ending at (and not including) the first NUL
960 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
961 /// // no NULs so no change
962 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
966 /// assert_eq!(str::truncate_utf16_at_nul(v),
967 /// &['a' as u16, 'b' as u16]);
969 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
970 match v.iter().position(|c| *c == 0) {
971 // don't include the 0
972 Some(i) => v.slice_to(i),
977 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
978 /// if `v` contains any invalid data.
986 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
987 /// 0x0073, 0x0069, 0x0063];
988 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
990 /// // 𝄞mu<invalid>ic
992 /// assert_eq!(str::from_utf16(v), None);
994 pub fn from_utf16(v: &[u16]) -> Option<~str> {
995 let mut s = with_capacity(v.len() / 2);
996 for c in utf16_items(v) {
998 ScalarValue(c) => s.push_char(c),
999 LoneSurrogate(_) => return None
1005 /// Decode a UTF-16 encoded vector `v` into a string, replacing
1006 /// invalid data with the replacement character (U+FFFD).
1012 /// // 𝄞mus<invalid>ic<invalid>
1013 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1014 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1017 /// assert_eq!(str::from_utf16_lossy(v),
1018 /// ~"𝄞mus\uFFFDic\uFFFD");
1020 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1021 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1024 /// Allocates a new string with the specified capacity. The string returned is
1025 /// the empty string, but has capacity for much more.
1027 pub fn with_capacity(capacity: uint) -> ~str {
1029 cast::transmute(slice::with_capacity::<~[u8]>(capacity))
1033 // https://tools.ietf.org/html/rfc3629
1034 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1035 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1036 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1037 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1038 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1039 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1040 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1041 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1042 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1043 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1044 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1045 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1046 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1047 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1048 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1049 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1050 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1053 /// Given a first byte, determine how many bytes are in this UTF-8 character
1055 pub fn utf8_char_width(b: u8) -> uint {
1056 return UTF8_CHAR_WIDTH[b] as uint;
1059 /// Struct that contains a `char` and the index of the first byte of
1060 /// the next `char` in a string. This can be used as a data structure
1061 /// for iterating over the UTF-8 bytes of a string.
1062 pub struct CharRange {
1065 /// Index of the first byte of the next `char`
1069 // Return the initial codepoint accumulator for the first byte.
1070 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1071 // for width 3, and 3 bits for width 4
1072 macro_rules! utf8_first_byte(
1073 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1076 // return the value of $ch updated with continuation byte $byte
1077 macro_rules! utf8_acc_cont_byte(
1078 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
1081 static TAG_CONT_U8: u8 = 128u8;
1083 /// Converts a vector of bytes to a new utf-8 string.
1084 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1089 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1090 /// let output = std::str::from_utf8_lossy(input);
1091 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1093 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1094 let firstbad = match first_non_utf8_index(v) {
1095 None => return Slice(unsafe { cast::transmute(v) }),
1099 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1100 let mut i = firstbad;
1101 let total = v.len();
1102 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1103 unsafe { *xs.unsafe_ref(i) }
1105 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1112 let mut res = with_capacity(total);
1115 unsafe { raw::push_bytes(&mut res, v.slice_to(i)) };
1118 // subseqidx is the index of the first byte of the subsequence we're looking at.
1119 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1121 let mut subseqidx = firstbad;
1125 let byte = unsafe_get(v, i);
1128 macro_rules! error(() => ({
1130 if subseqidx != i_ {
1131 raw::push_bytes(&mut res, v.slice(subseqidx, i_));
1134 raw::push_bytes(&mut res, REPLACEMENT);
1139 // subseqidx handles this
1141 let w = utf8_char_width(byte);
1145 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1152 match (byte, safe_get(v, i, total)) {
1153 (0xE0 , 0xA0 .. 0xBF) => (),
1154 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1155 (0xED , 0x80 .. 0x9F) => (),
1156 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1163 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1170 match (byte, safe_get(v, i, total)) {
1171 (0xF0 , 0x90 .. 0xBF) => (),
1172 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1173 (0xF4 , 0x80 .. 0x8F) => (),
1180 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1185 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1198 if subseqidx < total {
1199 unsafe { raw::push_bytes(&mut res, v.slice(subseqidx, total)) };
1208 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1209 /// This can be useful as an optimization when an allocation is sometimes
1210 /// needed but not always.
1211 pub enum MaybeOwned<'a> {
1212 /// A borrowed string
1218 /// SendStr is a specialization of `MaybeOwned` to be sendable
1219 pub type SendStr = MaybeOwned<'static>;
1221 impl<'a> MaybeOwned<'a> {
1222 /// Returns `true` if this `MaybeOwned` wraps an owned string
1224 pub fn is_owned(&self) -> bool {
1231 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1233 pub fn is_slice(&self) -> bool {
1241 /// Trait for moving into a `MaybeOwned`
1242 pub trait IntoMaybeOwned<'a> {
1243 /// Moves self into a `MaybeOwned`
1244 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1247 impl<'a> IntoMaybeOwned<'a> for ~str {
1249 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1252 impl<'a> IntoMaybeOwned<'a> for &'a str {
1254 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1257 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1259 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1262 impl<'a> Eq for MaybeOwned<'a> {
1264 fn eq(&self, other: &MaybeOwned) -> bool {
1265 self.as_slice() == other.as_slice()
1269 impl<'a> TotalEq for MaybeOwned<'a> {}
1271 impl<'a> Ord for MaybeOwned<'a> {
1273 fn lt(&self, other: &MaybeOwned) -> bool {
1274 self.as_slice().lt(&other.as_slice())
1278 impl<'a> TotalOrd for MaybeOwned<'a> {
1280 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1281 self.as_slice().cmp(&other.as_slice())
1285 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1287 fn equiv(&self, other: &S) -> bool {
1288 self.as_slice() == other.as_slice()
1292 impl<'a> Str for MaybeOwned<'a> {
1294 fn as_slice<'b>(&'b self) -> &'b str {
1297 Owned(ref s) => s.as_slice()
1302 fn into_owned(self) -> ~str {
1304 Slice(s) => s.to_owned(),
1310 impl<'a> Container for MaybeOwned<'a> {
1312 fn len(&self) -> uint { self.as_slice().len() }
1315 impl<'a> Clone for MaybeOwned<'a> {
1317 fn clone(&self) -> MaybeOwned<'a> {
1319 Slice(s) => Slice(s),
1320 Owned(ref s) => Owned(s.to_owned())
1325 impl<'a> Default for MaybeOwned<'a> {
1327 fn default() -> MaybeOwned<'a> { Slice("") }
1330 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1332 fn hash(&self, hasher: &mut H) {
1334 Slice(s) => s.hash(hasher),
1335 Owned(ref s) => s.hash(hasher),
1340 impl<'a> fmt::Show for MaybeOwned<'a> {
1342 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1344 Slice(ref s) => s.fmt(f),
1345 Owned(ref s) => s.fmt(f)
1350 /// Unsafe operations
1353 use container::Container;
1357 use option::{Option, Some, None};
1358 use str::{is_utf8, OwnedStr, StrSlice};
1360 use slice::{MutableVector, ImmutableVector, OwnedVector};
1363 /// Create a Rust string from a *u8 buffer of the given length
1364 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1365 let mut v: ~[u8] = slice::with_capacity(len);
1366 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1369 assert!(is_utf8(v));
1370 ::cast::transmute(v)
1373 #[lang="strdup_uniq"]
1376 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1377 from_buf_len(ptr, len)
1380 /// Create a Rust string from a null-terminated C string
1381 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1386 curr = buf.offset(i);
1388 from_buf_len(buf as *u8, i as uint)
1391 /// Converts a slice of bytes to a string slice without checking
1392 /// that the string contains valid UTF-8.
1393 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1397 /// Converts an owned vector of bytes to a new owned string. This assumes
1398 /// that the utf-8-ness of the vector has already been validated
1400 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1404 /// Converts a byte to a string.
1405 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1407 /// Form a slice from a C string. Unsafe because the caller must ensure the
1408 /// C string has the static lifetime, or else the return value may be
1409 /// invalidated later.
1410 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1414 while *curr != 0u8 {
1416 curr = s.offset(len as int);
1418 let v = Slice { data: s, len: len };
1419 assert!(is_utf8(::cast::transmute(v)));
1420 ::cast::transmute(v)
1423 /// Takes a bytewise (not UTF-8) slice from a string.
1425 /// Returns the substring from [`begin`..`end`).
1429 /// If begin is greater than end.
1430 /// If end is greater than the length of the string.
1432 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1433 assert!(begin <= end);
1434 assert!(end <= s.len());
1435 slice_unchecked(s, begin, end)
1438 /// Takes a bytewise (not UTF-8) slice from a string.
1440 /// Returns the substring from [`begin`..`end`).
1442 /// Caller must check slice boundaries!
1444 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1445 cast::transmute(Slice {
1446 data: s.as_ptr().offset(begin as int),
1451 /// Appends a byte to a string.
1452 /// The caller must preserve the valid UTF-8 property.
1454 pub unsafe fn push_byte(s: &mut ~str, b: u8) {
1455 as_owned_vec(s).push(b)
1458 /// Appends a vector of bytes to a string.
1459 /// The caller must preserve the valid UTF-8 property.
1461 pub unsafe fn push_bytes(s: &mut ~str, bytes: &[u8]) {
1462 slice::bytes::push_bytes(as_owned_vec(s), bytes);
1465 /// Removes the last byte from a string and returns it.
1466 /// Returns None when an empty string is passed.
1467 /// The caller must preserve the valid UTF-8 property.
1468 pub unsafe fn pop_byte(s: &mut ~str) -> Option<u8> {
1473 let b = s[len - 1u];
1479 /// Removes the first byte from a string and returns it.
1480 /// Returns None when an empty string is passed.
1481 /// The caller must preserve the valid UTF-8 property.
1482 pub unsafe fn shift_byte(s: &mut ~str) -> Option<u8> {
1488 *s = s.slice(1, len).to_owned();
1493 /// Access the str in its vector representation.
1494 /// The caller must preserve the valid UTF-8 property when modifying.
1496 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1500 /// Sets the length of a string
1502 /// This will explicitly set the size of the string, without actually
1503 /// modifying its buffers, so it is up to the caller to ensure that
1504 /// the string is actually the specified size.
1506 fn test_from_buf_len() {
1508 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1510 let c = from_buf_len(b, 3u);
1511 assert_eq!(c, ~"AAA");
1517 Section: Trait implementations
1521 #[allow(missing_doc)]
1523 use container::Container;
1524 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1527 use option::{Some, None};
1528 use str::{Str, StrSlice, OwnedStr, eq_slice};
1530 impl<'a> Add<&'a str,~str> for &'a str {
1532 fn add(&self, rhs: & &'a str) -> ~str {
1533 let mut ret = self.to_owned();
// Total ordering for string slices, decided by comparing raw bytes.
1539 impl<'a> TotalOrd for &'a str {
1541 fn cmp(&self, other: & &'a str) -> Ordering {
// Walk the bytes of both strings in lockstep; the first pair that
// differs decides the result.
1542 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1543 match s_b.cmp(&o_b) {
1544 Greater => return Greater,
1545 Less => return Less,
// Reached when no byte pair above returned: the lengths decide, so
// the shorter string orders first.
1550 self.len().cmp(&other.len())
1554 impl TotalOrd for ~str {
1556 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1559 impl<'a> Eq for &'a str {
1561 fn eq(&self, other: & &'a str) -> bool {
1562 eq_slice((*self), (*other))
1565 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1570 fn eq(&self, other: &~str) -> bool {
1571 eq_slice((*self), (*other))
1575 impl<'a> TotalEq for &'a str {}
1577 impl TotalEq for ~str {}
1579 impl<'a> Ord for &'a str {
1581 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1586 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1589 impl<'a, S: Str> Equiv<S> for &'a str {
1591 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1594 impl<'a, S: Str> Equiv<S> for ~str {
1596 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1603 /// Any string that can be represented as a slice
1605 /// Work with `self` as a slice.
1606 fn as_slice<'a>(&'a self) -> &'a str;
1608 /// Convert `self` into a ~str, not making a copy if possible
1609 fn into_owned(self) -> ~str;
1612 impl<'a> Str for &'a str {
1614 fn as_slice<'a>(&'a self) -> &'a str { *self }
1617 fn into_owned(self) -> ~str { self.to_owned() }
1620 impl<'a> Str for ~str {
1622 fn as_slice<'a>(&'a self) -> &'a str {
1623 let s: &'a str = *self; s
1627 fn into_owned(self) -> ~str { self }
1630 impl<'a> Container for &'a str {
1632 fn len(&self) -> uint {
1637 impl Container for ~str {
1639 fn len(&self) -> uint { self.as_slice().len() }
1642 impl Mutable for ~str {
1643 /// Remove all content, make the string empty
1645 fn clear(&mut self) {
1652 /// Methods for string slices
1653 pub trait StrSlice<'a> {
1654 /// Returns true if one string contains another
1658 /// - needle - The string to look for
1659 fn contains<'a>(&self, needle: &'a str) -> bool;
1661 /// Returns true if a string contains a char.
1665 /// - needle - The char to look for
1666 fn contains_char(&self, needle: char) -> bool;
1668 /// An iterator over the characters of `self`. Note, this iterates
1669 /// over unicode code-points, not unicode graphemes.
1674 /// let v: ~[char] = "abc åäö".chars().collect();
1675 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1677 fn chars(&self) -> Chars<'a>;
1679 /// An iterator over the characters of `self`, in reverse order.
1680 fn chars_rev(&self) -> RevChars<'a>;
1682 /// An iterator over the bytes of `self`
1683 fn bytes(&self) -> Bytes<'a>;
1685 /// An iterator over the bytes of `self`, in reverse order
1686 fn bytes_rev(&self) -> RevBytes<'a>;
1688 /// An iterator over the characters of `self` and their byte offsets.
1689 fn char_indices(&self) -> CharOffsets<'a>;
1691 /// An iterator over the characters of `self` and their byte offsets,
1692 /// in reverse order.
1693 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1695 /// An iterator over substrings of `self`, separated by characters
1696 /// matched by `sep`.
1701 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1702 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1704 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1705 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1707 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1708 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1710 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1712 /// An iterator over substrings of `self`, separated by characters
1713 /// matched by `sep`, restricted to splitting at most `count`
1719 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1720 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1722 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1723 /// assert_eq!(v, ~["abc", "def2ghi"]);
1725 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1726 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1728 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1730 /// An iterator over substrings of `self`, separated by characters
1731 /// matched by `sep`.
1733 /// Equivalent to `split`, except that the trailing substring
1734 /// is skipped if empty (terminator semantics).
1739 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1740 /// assert_eq!(v, ~["A", "B"]);
1742 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1743 /// assert_eq!(v, ~["A", "", "B", ""]);
1745 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1747 /// An iterator over substrings of `self`, separated by characters
1748 /// matched by `sep`, in reverse order.
1753 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1754 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1756 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1757 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1759 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1760 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1762 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1764 /// An iterator over substrings of `self`, separated by characters
1765 /// matched by `sep`, starting from the end of the string.
1766 /// Restricted to splitting at most `count` times.
1771 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1772 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1774 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1775 /// assert_eq!(v, ~["ghi", "abc1def"]);
1777 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1778 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1780 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1782 /// An iterator over the start and end indices of the disjoint
1783 /// matches of `sep` within `self`.
1785 /// That is, each returned value `(start, end)` satisfies
1786 /// `self.slice(start, end) == sep`. For matches of `sep` within
1787 /// `self` that overlap, only the indices corresponding to the
1788 /// first match are returned.
1793 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1794 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1796 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1797 /// assert_eq!(v, ~[(1,4), (4,7)]);
1799 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1800 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1802 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1804 /// An iterator over the substrings of `self` separated by `sep`.
1809 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1810 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1812 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1813 /// assert_eq!(v, ~["1", "", "2"]);
1815 fn split_str(&self, &'a str) -> StrSplits<'a>;
1817 /// An iterator over the lines of a string (subsequences separated
1818 /// by `\n`). This does not include the empty string after a
1824 /// let four_lines = "foo\nbar\n\nbaz\n";
1825 /// let v: ~[&str] = four_lines.lines().collect();
1826 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1828 fn lines(&self) -> CharSplits<'a, char>;
1830 /// An iterator over the lines of a string, separated by either
1831 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1832 /// empty trailing line.
1837 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1838 /// let v: ~[&str] = four_lines.lines_any().collect();
1839 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1841 fn lines_any(&self) -> AnyLines<'a>;
1843 /// An iterator over the words of a string (subsequences separated
1844 /// by any sequence of whitespace). Sequences of whitespace are
1845 /// collapsed, so empty "words" are not included.
1850 /// let some_words = " Mary had\ta little \n\t lamb";
1851 /// let v: ~[&str] = some_words.words().collect();
1852 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1854 fn words(&self) -> Words<'a>;
1856 /// An Iterator over the string in Unicode Normalization Form D
1857 /// (canonical decomposition).
1858 fn nfd_chars(&self) -> Normalizations<'a>;
1860 /// An Iterator over the string in Unicode Normalization Form KD
1861 /// (compatibility decomposition).
1862 fn nfkd_chars(&self) -> Normalizations<'a>;
1864 /// Returns true if the string contains only whitespace.
1866 /// Whitespace characters are determined by `char::is_whitespace`.
1871 /// assert!(" \t\n".is_whitespace());
1872 /// assert!("".is_whitespace());
1874 /// assert!( !"abc".is_whitespace());
1876 fn is_whitespace(&self) -> bool;
1878 /// Returns true if the string contains only alphanumeric code
1881 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1886 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1887 /// assert!("".is_alphanumeric());
1889 /// assert!( !" &*~".is_alphanumeric());
1891 fn is_alphanumeric(&self) -> bool;
1893 /// Returns the number of Unicode code points (`char`) that a
1896 /// This does not perform any normalization, and is `O(n)`, since
1897 /// UTF-8 is a variable width encoding of code points.
1899 /// *Warning*: The number of code points in a string does not directly
1900 /// correspond to the number of visible characters or width of the
1901 /// visible text due to composing characters, and double- and
1902 /// zero-width ones.
1904 /// See also `.len()` for the byte length.
1909 /// // composed forms of `ö` and `é`
1910 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1911 /// // decomposed forms of `ö` and `é`
1912 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1914 /// assert_eq!(c.char_len(), 15);
1915 /// assert_eq!(d.char_len(), 17);
1917 /// assert_eq!(c.len(), 21);
1918 /// assert_eq!(d.len(), 23);
1920 /// // the two strings *look* the same
1921 /// println!("{}", c);
1922 /// println!("{}", d);
1924 fn char_len(&self) -> uint;
1926 /// Returns a slice of the given string from the byte range
1927 /// [`begin`..`end`).
1929 /// This operation is `O(1)`.
1931 /// Fails when `begin` or `end` does not lie on a character boundary,
1932 /// or points beyond the last character of the string.
1934 /// See also `slice_to` and `slice_from` for slicing prefixes and
1935 /// suffixes of strings, and `slice_chars` for slicing based on
1936 /// code point counts.
1941 /// let s = "Löwe 老虎 Léopard";
1942 /// assert_eq!(s.slice(0, 1), "L");
1944 /// assert_eq!(s.slice(1, 9), "öwe 老");
1946 /// // these will fail:
1947 /// // byte 2 lies within `ö`:
1948 /// // s.slice(2, 3);
1950 /// // byte 8 lies within `老`
1951 /// // s.slice(1, 8);
1953 /// // byte 100 is outside the string
1954 /// // s.slice(3, 100);
1956 fn slice(&self, begin: uint, end: uint) -> &'a str;
1958 /// Returns a slice of the string from `begin` to its end.
1960 /// Equivalent to `self.slice(begin, self.len())`.
1962 /// Fails when `begin` does not point to a valid character, or is
1965 /// See also `slice`, `slice_to` and `slice_chars`.
1966 fn slice_from(&self, begin: uint) -> &'a str;
1968 /// Returns a slice of the string from the beginning to byte
1971 /// Equivalent to `self.slice(0, end)`.
1973 /// Fails when `end` does not point to a valid character, or is
1976 /// See also `slice`, `slice_from` and `slice_chars`.
1977 fn slice_to(&self, end: uint) -> &'a str;
1979 /// Returns a slice of the string from the character range
1980 /// [`begin`..`end`).
1982 /// That is, start at the `begin`-th code point of the string and
1983 /// continue to the `end`-th code point. This does not detect or
1984 /// handle edge cases such as leaving a combining character as the
1985 /// first code point of the string.
1987 /// Due to the design of UTF-8, this operation is `O(end)`.
1988 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1989 /// variants that use byte indices rather than code point
1992 /// Fails if `begin` > `end` or if either `begin` or `end` is
1993 /// beyond the last character of the string.
1998 /// let s = "Löwe 老虎 Léopard";
1999 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
2000 /// assert_eq!(s.slice_chars(5, 7), "老虎");
2002 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
2004 /// Returns true if `needle` is a prefix of the string.
2005 fn starts_with(&self, needle: &str) -> bool;
2007 /// Returns true if `needle` is a suffix of the string.
2008 fn ends_with(&self, needle: &str) -> bool;
2010 /// Escape each char in `s` with `char::escape_default`.
2011 fn escape_default(&self) -> ~str;
2013 /// Escape each char in `s` with `char::escape_unicode`.
2014 fn escape_unicode(&self) -> ~str;
2016 /// Returns a string with leading and trailing whitespace removed.
2017 fn trim(&self) -> &'a str;
2019 /// Returns a string with leading whitespace removed.
2020 fn trim_left(&self) -> &'a str;
2022 /// Returns a string with trailing whitespace removed.
2023 fn trim_right(&self) -> &'a str;
2025 /// Returns a string with characters that match `to_trim` removed.
2029 /// * to_trim - a character matcher
2034 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
2035 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
2036 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
2038 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2040 /// Returns a string with leading characters that match `to_trim` removed.
2044 /// * to_trim - a character matcher
2049 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2050 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2051 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2053 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2055 /// Returns a string with trailing characters that match `to_trim` removed.
2059 /// * to_trim - a character matcher
2064 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2065 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2066 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2068 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2070 /// Replace all occurrences of one string with another.
2074 /// * `from` - The string to replace
2075 /// * `to` - The replacement string
2079 /// The original string with all occurrences of `from` replaced with `to`.
2084 /// let s = ~"Do you know the muffin man,
2085 /// The muffin man, the muffin man, ...";
2087 /// assert_eq!(s.replace("muffin man", "little lamb"),
2088 /// ~"Do you know the little lamb,
2089 /// The little lamb, the little lamb, ...");
2091 /// // not found, so no change.
2092 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2094 fn replace(&self, from: &str, to: &str) -> ~str;
2096 /// Copy a slice into a new owned str.
2097 fn to_owned(&self) -> ~str;
2099 /// Converts to a vector of `u16` encoded as UTF-16.
2100 fn to_utf16(&self) -> ~[u16];
2102 /// Check that `index`-th byte lies at the start and/or end of a
2103 /// UTF-8 code point sequence.
2105 /// The start and end of the string (when `index == self.len()`)
2106 /// are considered to be boundaries.
2108 /// Fails if `index` is greater than `self.len()`.
2113 /// let s = "Löwe 老虎 Léopard";
2114 /// assert!(s.is_char_boundary(0));
2116 /// assert!(s.is_char_boundary(6));
2117 /// assert!(s.is_char_boundary(s.len()));
2119 /// // second byte of `ö`
2120 /// assert!(!s.is_char_boundary(2));
2122 /// // third byte of `老`
2123 /// assert!(!s.is_char_boundary(8));
2125 fn is_char_boundary(&self, index: uint) -> bool;
2127 /// Pluck a character out of a string and return the index of the next
2130 /// This function can be used to iterate over the unicode characters of a
2135 /// This example manually iterates through the characters of a
2136 /// string; this should normally be done by `.chars()` or
2137 /// `.char_indices()`.
2140 /// use std::str::CharRange;
2142 /// let s = "中华Việt Nam";
2144 /// while i < s.len() {
2145 /// let CharRange {ch, next} = s.char_range_at(i);
2146 /// println!("{}: {}", i, ch);
2168 /// * s - The string
2169 /// * i - The byte offset of the char to extract
2173 /// A record {ch: char, next: uint} containing the char value and the byte
2174 /// index of the next unicode character.
2178 /// If `i` is greater than or equal to the length of the string.
2179 /// If `i` is not the index of the beginning of a valid UTF-8 character.
2180 fn char_range_at(&self, start: uint) -> CharRange;
2182 /// Given a byte position and a str, return the previous char and its position.
2184 /// This function can be used to iterate over a unicode string in reverse.
2186 /// Returns 0 for next index if called on start index 0.
2187 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2189 /// Plucks the character starting at the `i`th byte of a string
2190 fn char_at(&self, i: uint) -> char;
2192 /// Plucks the character ending at the `i`th byte of a string
2193 fn char_at_reverse(&self, i: uint) -> char;
2195 /// Work with the byte buffer of a string as a byte slice.
2196 fn as_bytes(&self) -> &'a [u8];
2198 /// Returns the byte index of the first character of `self` that
2199 /// matches `search`.
2203 /// `Some` containing the byte index of the first matching character
2204 /// or `None` if there is no match
2209 /// let s = "Löwe 老虎 Léopard";
2211 /// assert_eq!(s.find('L'), Some(0));
2212 /// assert_eq!(s.find('é'), Some(14));
2214 /// // the first space
2215 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2217 /// // neither are found
2218 /// assert_eq!(s.find(&['1', '2']), None);
2220 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2222 /// Returns the byte index of the last character of `self` that
2223 /// matches `search`.
2227 /// `Some` containing the byte index of the last matching character
2228 /// or `None` if there is no match.
2233 /// let s = "Löwe 老虎 Léopard";
2235 /// assert_eq!(s.rfind('L'), Some(13));
2236 /// assert_eq!(s.rfind('é'), Some(14));
2238 /// // the second space
2239 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2241 /// // searches for an occurrence of either `1` or `2`, but neither are found
2242 /// assert_eq!(s.rfind(&['1', '2']), None);
2244 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2246 /// Returns the byte index of the first matching substring
2250 /// * `needle` - The string to search for
2254 /// `Some` containing the byte index of the first matching substring
2255 /// or `None` if there is no match.
2260 /// let s = "Löwe 老虎 Léopard";
2262 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2263 /// assert_eq!(s.find_str("muffin man"), None);
2265 fn find_str(&self, &str) -> Option<uint>;
2267 /// Given a string, make a new string with repeated copies of it.
2268 fn repeat(&self, nn: uint) -> ~str;
2270 /// Retrieves the first character from a string slice and returns
2271 /// it. This does not allocate a new string; instead, it returns a
2272 /// slice that points one character beyond the character that was
2273 /// shifted. If the string does not contain any characters,
2274 /// a tuple of None and an empty string is returned instead.
2279 /// let s = "Löwe 老虎 Léopard";
2280 /// let (c, s1) = s.slice_shift_char();
2281 /// assert_eq!(c, Some('L'));
2282 /// assert_eq!(s1, "öwe 老虎 Léopard");
2284 /// let (c, s2) = s1.slice_shift_char();
2285 /// assert_eq!(c, Some('ö'));
2286 /// assert_eq!(s2, "we 老虎 Léopard");
2288 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2290 /// Levenshtein Distance between two strings.
2291 fn lev_distance(&self, t: &str) -> uint;
2293 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2295 /// Fails if `inner` is not a direct slice contained within self.
2300 /// let string = "a\nb\nc";
2301 /// let lines: ~[&str] = string.lines().collect();
2303 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2304 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2305 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2307 fn subslice_offset(&self, inner: &str) -> uint;
2309 /// Return an unsafe pointer to the string's buffer.
2311 /// The caller must ensure that the string outlives this pointer,
2312 /// and that it is not reallocated (e.g. by pushing to the
2314 fn as_ptr(&self) -> *u8;
2317 impl<'a> StrSlice<'a> for &'a str {
2319 fn contains<'a>(&self, needle: &'a str) -> bool {
2320 self.find_str(needle).is_some()
2324 fn contains_char(&self, needle: char) -> bool {
2325 self.find(needle).is_some()
2329 fn chars(&self) -> Chars<'a> {
2330 Chars{string: *self}
2334 fn chars_rev(&self) -> RevChars<'a> {
2339 fn bytes(&self) -> Bytes<'a> {
2340 self.as_bytes().iter().map(|&b| b)
2344 fn bytes_rev(&self) -> RevBytes<'a> {
2349 fn char_indices(&self) -> CharOffsets<'a> {
2350 CharOffsets{string: *self, iter: self.chars()}
2354 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2355 self.char_indices().rev()
2359 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2362 only_ascii: sep.only_ascii(),
2364 allow_trailing_empty: true,
2370 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2371 -> CharSplitsN<'a, Sep> {
2373 iter: self.split(sep),
2380 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2381 -> CharSplits<'a, Sep> {
2383 allow_trailing_empty: false,
2389 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2390 self.split(sep).rev()
2394 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2395 -> CharSplitsN<'a, Sep> {
2397 iter: self.split(sep),
2404 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2405 assert!(!sep.is_empty())
2414 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2416 it: self.match_indices(sep),
2423 fn lines(&self) -> CharSplits<'a, char> {
2424 self.split_terminator('\n')
2427 fn lines_any(&self) -> AnyLines<'a> {
2428 self.lines().map(|line| {
2430 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2436 fn words(&self) -> Words<'a> {
2437 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2441 fn nfd_chars(&self) -> Normalizations<'a> {
2451 fn nfkd_chars(&self) -> Normalizations<'a> {
2461 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2464 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2467 fn char_len(&self) -> uint { self.chars().len() }
2470 fn slice(&self, begin: uint, end: uint) -> &'a str {
2471 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2472 unsafe { raw::slice_bytes(*self, begin, end) }
2476 fn slice_from(&self, begin: uint) -> &'a str {
2477 self.slice(begin, self.len())
2481 fn slice_to(&self, end: uint) -> &'a str {
2482 assert!(self.is_char_boundary(end));
2483 unsafe { raw::slice_bytes(*self, 0, end) }
2486 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2487 assert!(begin <= end);
2489 let mut begin_byte = None;
2490 let mut end_byte = None;
2492 // This could be even more efficient by not decoding,
2493 // only finding the char boundaries
2494 for (idx, _) in self.char_indices() {
2495 if count == begin { begin_byte = Some(idx); }
2496 if count == end { end_byte = Some(idx); break; }
2499 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2500 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2502 match (begin_byte, end_byte) {
2503 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2504 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2505 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2510 fn starts_with<'a>(&self, needle: &'a str) -> bool {
// Bytewise prefix test: compare `needle` against the first `n` bytes.
2511 let n = needle.len();
// The length test comes first and `&&` short-circuits, so `slice_to(n)`
// is only evaluated when `self` holds at least `n` bytes.
2512 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2516 fn ends_with(&self, needle: &str) -> bool {
// Bytewise suffix test: compare `needle` against the last `n` bytes.
2517 let (m, n) = (self.len(), needle.len());
// `m >= n` is tested first and `&&` short-circuits, so the unsigned
// subtraction `m - n` is never evaluated when it would underflow.
2518 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2521 fn escape_default(&self) -> ~str {
2522 let mut out = with_capacity(self.len());
2523 for c in self.chars() {
2524 c.escape_default(|c| out.push_char(c));
2529 fn escape_unicode(&self) -> ~str {
2530 let mut out = with_capacity(self.len());
2531 for c in self.chars() {
2532 c.escape_unicode(|c| out.push_char(c));
2538 fn trim(&self) -> &'a str {
2539 self.trim_left().trim_right()
2543 fn trim_left(&self) -> &'a str {
2544 self.trim_left_chars(&char::is_whitespace)
2548 fn trim_right(&self) -> &'a str {
2549 self.trim_right_chars(&char::is_whitespace)
2553 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2554 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2558 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2559 match self.find(|c: char| !to_trim.matches(c)) {
2561 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2566 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2567 match self.rfind(|c: char| !to_trim.matches(c)) {
2570 let next = self.char_range_at(last).next;
2571 unsafe { raw::slice_bytes(*self, 0u, next) }
2576 fn replace(&self, from: &str, to: &str) -> ~str {
2577 let mut result = ~"";
2578 let mut last_end = 0;
2579 for (start, end) in self.match_indices(from) {
2580 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2581 result.push_str(to);
2584 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2589 fn to_owned(&self) -> ~str {
2590 let len = self.len();
2592 let mut v = slice::with_capacity(len);
2594 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2596 ::cast::transmute(v)
2600 fn to_utf16(&self) -> ~[u16] {
2602 for ch in self.chars() {
2603 // Arithmetic with u32 literals is easier on the eyes than chars.
2604 let mut ch = ch as u32;
2606 if (ch & 0xFFFF_u32) == ch {
2607 // The BMP falls through (assuming non-surrogate, as it
2609 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
2612 // Supplementary planes break into surrogates.
2613 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
2615 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
2616 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2617 u.push_all([w1, w2])
2624 fn is_char_boundary(&self, index: uint) -> bool {
// The end of the string counts as a boundary; handling it first also
// means the byte index below is never applied to `self.len()`.
2625 if index == self.len() { return true; }
2626 let b = self[index];
// Bytes 128..=191 have the bit pattern 10xxxxxx, i.e. UTF-8
// continuation bytes. Any byte outside that range is treated as the
// first byte of a code point.
2627 return b < 128u8 || b >= 192u8;
2631 fn char_range_at(&self, i: uint) -> CharRange {
2632 if self[i] < 128u8 {
2633 return CharRange {ch: self[i] as char, next: i + 1 };
2636 // Multibyte case is a fn to allow char_range_at to inline cleanly
2637 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2638 let mut val = s[i] as u32;
2639 let w = UTF8_CHAR_WIDTH[val] as uint;
2642 val = utf8_first_byte!(val, w);
2643 val = utf8_acc_cont_byte!(val, s[i + 1]);
2644 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2645 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2647 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2650 return multibyte_char_range_at(*self, i);
2654 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2655 let mut prev = start;
2657 prev = prev.saturating_sub(1);
2658 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2660 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2661 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2662 // while there is a previous byte == 10......
2663 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2667 let mut val = s[i] as u32;
2668 let w = UTF8_CHAR_WIDTH[val] as uint;
2671 val = utf8_first_byte!(val, w);
2672 val = utf8_acc_cont_byte!(val, s[i + 1]);
2673 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2674 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2676 return CharRange {ch: unsafe { transmute(val) }, next: i};
2679 return multibyte_char_range_at_reverse(*self, prev);
2683 fn char_at(&self, i: uint) -> char {
2684 self.char_range_at(i).ch
2688 fn char_at_reverse(&self, i: uint) -> char {
2689 self.char_range_at_reverse(i).ch
2693 fn as_bytes(&self) -> &'a [u8] {
2694 unsafe { cast::transmute(*self) }
2697 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
2698 if search.only_ascii() {
2699 self.bytes().position(|b| search.matches(b as char))
2701 for (index, c) in self.char_indices() {
2702 if search.matches(c) { return Some(index); }
2708 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
2709 if search.only_ascii() {
2710 self.bytes().rposition(|b| search.matches(b as char))
2712 for (index, c) in self.char_indices_rev() {
2713 if search.matches(c) { return Some(index); }
2719 fn find_str(&self, needle: &str) -> Option<uint> {
2720 if needle.is_empty() {
2723 self.match_indices(needle)
2725 .map(|(start, _end)| start)
2729 fn repeat(&self, nn: uint) -> ~str {
2730 let mut ret = with_capacity(nn * self.len());
2731 for _ in range(0, nn) {
2732 ret.push_str(*self);
2738 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2739 if self.is_empty() {
2740 return (None, *self);
2742 let CharRange {ch, next} = self.char_range_at(0u);
2743 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2744 return (Some(ch), next_s);
2748 fn lev_distance(&self, t: &str) -> uint {
2749 let slen = self.len();
2752 if slen == 0 { return tlen; }
2753 if tlen == 0 { return slen; }
2755 let mut dcol = slice::from_fn(tlen + 1, |x| x);
2757 for (i, sc) in self.chars().enumerate() {
2759 let mut current = i;
2760 dcol[0] = current + 1;
2762 for (j, tc) in t.chars().enumerate() {
2764 let next = dcol[j + 1];
2767 dcol[j + 1] = current;
2769 dcol[j + 1] = ::cmp::min(current, next);
2770 dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
2780 fn subslice_offset(&self, inner: &str) -> uint {
2781 let a_start = self.as_ptr() as uint;
2782 let a_end = a_start + self.len();
2783 let b_start = inner.as_ptr() as uint;
2784 let b_end = b_start + inner.len();
2786 assert!(a_start <= b_start);
2787 assert!(b_end <= a_end);
2792 fn as_ptr(&self) -> *u8 {
2797 /// Methods for owned strings
2798 pub trait OwnedStr {
2799 /// Appends a string slice to the back of a string, without overallocating.
2800 fn push_str_no_overallocate(&mut self, rhs: &str);
2802 /// Appends a string slice to the back of a string
2803 fn push_str(&mut self, rhs: &str);
2805 /// Appends a character to the back of a string
2806 fn push_char(&mut self, c: char);
2808 /// Remove the final character from a string and return it. Return None
2809 /// when the string is empty.
2810 fn pop_char(&mut self) -> Option<char>;
2812 /// Remove the first character from a string and return it. Return None
2813 /// when the string is empty.
2814 fn shift_char(&mut self) -> Option<char>;
2816 /// Prepend a char to a string
2817 fn unshift_char(&mut self, ch: char);
2819 /// Insert a new sub-string at the given position in a string, in O(n + m) time
2820 /// (with n and m the lengths of the string and the substring.)
2821 /// This fails if `position` is not at a character boundary.
2822 fn insert(&mut self, position: uint, substring: &str);
2824 /// Insert a char at the given position in a string, in O(n + m) time
2825 /// (with n and m the lengths of the string and the substring.)
2826 /// This fails if `position` is not at a character boundary.
2827 fn insert_char(&mut self, position: uint, ch: char);
2829 /// Concatenate two strings together.
2830 fn append(self, rhs: &str) -> ~str;
2832 /// Reserves capacity for exactly `n` bytes in the given string.
2834 /// Assuming single-byte characters, the resulting string will be large
2835 /// enough to hold a string of length `n`.
2837 /// If the capacity for `s` is already equal to or greater than the requested
2838 /// capacity, then no action is taken.
2843 /// * n - The number of bytes to reserve space for
2844 fn reserve_exact(&mut self, n: uint);
2846 /// Reserves capacity for at least `n` bytes in the given string.
2848 /// Assuming single-byte characters, the resulting string will be large
2849 /// enough to hold a string of length `n`.
2851 /// This function will over-allocate in order to amortize the allocation costs
2852 /// in scenarios where the caller may need to repeatedly reserve additional
2855 /// If the capacity for `s` is already equal to or greater than the requested
2856 /// capacity, then no action is taken.
2861 /// * n - The number of bytes to reserve space for
2862 fn reserve(&mut self, n: uint);
2864 /// Returns the number of single-byte characters the string can hold without
2866 fn capacity(&self) -> uint;
2868 /// Shorten a string to the specified length (which must be <= the current length)
2869 fn truncate(&mut self, len: uint);
2871 /// Consumes the string, returning the underlying byte buffer.
2873 /// The buffer does not have a null terminator.
2874 fn into_bytes(self) -> ~[u8];
2876 /// Sets the length of a string
2878 /// This will explicitly set the size of the string, without actually
2879 /// modifying its buffers, so it is up to the caller to ensure that
2880 /// the string is actually the specified size.
2881 unsafe fn set_len(&mut self, new_len: uint);
2884 impl OwnedStr for ~str {
2886 fn push_str_no_overallocate(&mut self, rhs: &str) {
2887 let new_cap = self.len() + rhs.len();
2888 self.reserve_exact(new_cap);
2893 fn push_str(&mut self, rhs: &str) {
2895 raw::push_bytes(self, rhs.as_bytes());
2900 fn push_char(&mut self, c: char) {
2901 let cur_len = self.len();
2902 // may use up to 4 bytes.
2904 let v = raw::as_owned_vec(self);
2905 v.reserve_additional(4);
2907 // Attempt to not use an intermediate buffer by just pushing bytes
2908 // directly onto this string.
2909 let write_ptr = v.as_mut_ptr().offset(cur_len as int);
2910 let used = slice::raw::mut_buf_as_slice(write_ptr, 4, |slc| c.encode_utf8(slc));
2912 v.set_len(cur_len + used);
2917 fn pop_char(&mut self) -> Option<char> {
2918 let end = self.len();
2922 let CharRange {ch, next} = self.char_range_at_reverse(end);
2923 unsafe { self.set_len(next); }
2929 fn shift_char(&mut self) -> Option<char> {
2930 if self.is_empty() {
2933 let CharRange {ch, next} = self.char_range_at(0u);
2934 *self = self.slice(next, self.len()).to_owned();
2940 fn unshift_char(&mut self, ch: char) {
2941 // This could be more efficient.
2942 let mut new_str = ~"";
2943 new_str.push_char(ch);
2944 new_str.push_str(*self);
2949 fn insert(&mut self, position: uint, substring: &str) {
2950 // This could be more efficient.
2951 let mut new_str = self.slice_to(position).to_owned();
2952 new_str.push_str(substring);
2953 new_str.push_str(self.slice_from(position));
2958 fn insert_char(&mut self, position: uint, ch: char) {
2959 // This could be more efficient.
2960 let mut new_str = self.slice_to(position).to_owned();
2961 new_str.push_char(ch);
2962 new_str.push_str(self.slice_from(position));
2967 fn append(self, rhs: &str) -> ~str {
2968 let mut new_str = self;
2969 new_str.push_str_no_overallocate(rhs);
2974 fn reserve_exact(&mut self, n: uint) {
2976 raw::as_owned_vec(self).reserve_exact(n)
2981 fn reserve(&mut self, n: uint) {
2983 raw::as_owned_vec(self).reserve(n)
2988 fn capacity(&self) -> uint {
2990 let buf: &~[u8] = cast::transmute(self);
2996 fn truncate(&mut self, len: uint) {
2997 assert!(len <= self.len());
2998 assert!(self.is_char_boundary(len));
2999 unsafe { self.set_len(len); }
3003 fn into_bytes(self) -> ~[u8] {
3004 unsafe { cast::transmute(self) }
3008 unsafe fn set_len(&mut self, new_len: uint) {
3009 raw::as_owned_vec(self).set_len(new_len)
3013 impl Clone for ~str {
3015 fn clone(&self) -> ~str {
3020 impl FromIterator<char> for ~str {
3022 fn from_iterator<T: Iterator<char>>(iterator: &mut T) -> ~str {
3023 let (lower, _) = iterator.size_hint();
3024 let mut buf = with_capacity(lower);
3025 buf.extend(iterator);
3030 impl Extendable<char> for ~str {
3032 fn extend<T: Iterator<char>>(&mut self, iterator: &mut T) {
3033 let (lower, _) = iterator.size_hint();
3034 let reserve = lower + self.len();
3035 self.reserve(reserve);
3036 for ch in *iterator {
3042 // This works because every lifetime is a sub-lifetime of 'static
3043 impl<'a> Default for &'a str {
3044 fn default() -> &'a str { "" }
3047 impl Default for ~str {
3048 fn default() -> ~str { ~"" }
3053 use iter::AdditiveIterator;
3054 use default::Default;
3060 assert!((eq(&~"", &~"")));
3061 assert!((eq(&~"foo", &~"foo")));
3062 assert!((!eq(&~"foo", &~"bar")));
3066 fn test_eq_slice() {
3067 assert!((eq_slice("foobar".slice(0, 3), "foo")));
3068 assert!((eq_slice("barfoo".slice(3, 6), "foo")));
3069 assert!((!eq_slice("foo1", "foo2")));
3075 assert!("" <= "foo");
3076 assert!("foo" <= "foo");
3077 assert!("foo" != "bar");
3082 assert_eq!("".len(), 0u);
3083 assert_eq!("hello world".len(), 11u);
3084 assert_eq!("\x63".len(), 1u);
3085 assert_eq!("\xa2".len(), 2u);
3086 assert_eq!("\u03c0".len(), 2u);
3087 assert_eq!("\u2620".len(), 3u);
3088 assert_eq!("\U0001d11e".len(), 4u);
3090 assert_eq!("".char_len(), 0u);
3091 assert_eq!("hello world".char_len(), 11u);
3092 assert_eq!("\x63".char_len(), 1u);
3093 assert_eq!("\xa2".char_len(), 1u);
3094 assert_eq!("\u03c0".char_len(), 1u);
3095 assert_eq!("\u2620".char_len(), 1u);
3096 assert_eq!("\U0001d11e".char_len(), 1u);
3097 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
3102 assert_eq!("hello".find('l'), Some(2u));
3103 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
3104 assert!("hello".find('x').is_none());
3105 assert!("hello".find(|c:char| c == 'x').is_none());
3106 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
3107 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
3112 assert_eq!("hello".rfind('l'), Some(3u));
3113 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
3114 assert!("hello".rfind('x').is_none());
3115 assert!("hello".rfind(|c:char| c == 'x').is_none());
3116 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
3117 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
3121 fn test_push_str() {
3124 assert_eq!(s.slice_from(0), "");
3126 assert_eq!(s.slice_from(0), "abc");
3127 s.push_str("ประเทศไทย中华Việt Nam");
3128 assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
3135 assert_eq!(s.slice_from(0), "");
3136 s = s.append("abc");
3137 assert_eq!(s.slice_from(0), "abc");
3138 s = s.append("ประเทศไทย中华Việt Nam");
3139 assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
3143 fn test_pop_char() {
3144 let mut data = ~"ประเทศไทย中华";
3145 let cc = data.pop_char();
3146 assert_eq!(~"ประเทศไทย中", data);
3147 assert_eq!(Some('华'), cc);
3151 fn test_pop_char_2() {
3152 let mut data2 = ~"华";
3153 let cc2 = data2.pop_char();
3154 assert_eq!(~"", data2);
3155 assert_eq!(Some('华'), cc2);
3159 fn test_pop_char_empty() {
3161 let cc3 = data.pop_char();
3162 assert_eq!(~"", data);
3163 assert_eq!(None, cc3);
3167 fn test_push_char() {
3168 let mut data = ~"ประเทศไทย中";
3169 data.push_char('华');
3170 data.push_char('b'); // 1 byte
3171 data.push_char('¢'); // 2 byte
3172 data.push_char('€'); // 3 byte
3173 data.push_char('𤭢'); // 4 byte
3174 assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
3178 fn test_shift_char() {
3179 let mut data = ~"ประเทศไทย中";
3180 let cc = data.shift_char();
3181 assert_eq!(~"ระเทศไทย中", data);
3182 assert_eq!(Some('ป'), cc);
3186 fn test_unshift_char() {
3187 let mut data = ~"ประเทศไทย中";
3188 data.unshift_char('华');
3189 assert_eq!(~"华ประเทศไทย中", data);
3193 fn test_insert_char() {
3194 let mut data = ~"ประเทศไทย中";
3195 data.insert_char(15, '华');
3196 assert_eq!(~"ประเท华ศไทย中", data);
3201 let mut data = ~"ประเทศไทย中";
3202 data.insert(15, "华中");
3203 assert_eq!(~"ประเท华中ศไทย中", data);
3209 let s: ~str = empty.chars().collect();
3210 assert_eq!(empty, s);
3211 let data = ~"ประเทศไทย中";
3212 let s: ~str = data.chars().collect();
3213 assert_eq!(data, s);
3218 let data = ~"ประเทศไทย中";
3219 let mut cpy = data.clone();
3221 let mut it = other.chars();
3222 cpy.extend(&mut it);
3223 assert_eq!(cpy, data + other);
3228 let mut empty = ~"";
3230 assert_eq!("", empty.as_slice());
3231 let mut data = ~"ประเทศไทย中";
3233 assert_eq!("", data.as_slice());
3234 data.push_char('华');
3235 assert_eq!("华", data.as_slice());
3239 fn test_into_bytes() {
3241 let buf = data.into_bytes();
3242 assert_eq!(bytes!("asdf"), buf.as_slice());
3246 fn test_find_str() {
3248 assert_eq!("".find_str(""), Some(0u));
3249 assert!("banana".find_str("apple pie").is_none());
3251 let data = "abcabc";
3252 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
3253 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
3254 assert!(data.slice(2u, 4u).find_str("ab").is_none());
3256 let mut data = ~"ประเทศไทย中华Việt Nam";
3258 assert!(data.find_str("ไท华").is_none());
3259 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
3260 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
3262 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
3263 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
3264 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
3265 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
3266 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
3268 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
3269 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
3270 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
3271 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
3272 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
3276 fn test_slice_chars() {
3277 fn t(a: &str, b: &str, start: uint) {
3278 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
3281 t("hello", "llo", 2);
3282 t("hello", "el", 1);
3285 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
3290 fn t(v: &[~str], s: &str) {
3291 assert_eq!(v.concat(), s.to_str());
3293 t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
3294 let v: &[~str] = [];
3301 fn t(v: &[~str], sep: &str, s: &str) {
3302 assert_eq!(v.connect(sep), s.to_str());
3304 t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
3305 " ", "you know I'm no good");
3306 let v: &[~str] = [];
3308 t([~"hi"], " ", "hi");
3312 fn test_concat_slices() {
3313 fn t(v: &[&str], s: &str) {
3314 assert_eq!(v.concat(), s.to_str());
3316 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
3317 let v: &[&str] = [];
3323 fn test_connect_slices() {
3324 fn t(v: &[&str], sep: &str, s: &str) {
3325 assert_eq!(v.connect(sep), s.to_str());
3327 t(["you", "know", "I'm", "no", "good"],
3328 " ", "you know I'm no good");
3330 t(["hi"], " ", "hi");
3335 assert_eq!("x".repeat(4), ~"xxxx");
3336 assert_eq!("hi".repeat(4), ~"hihihihi");
3337 assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
3338 assert_eq!("".repeat(4), ~"");
3339 assert_eq!("hi".repeat(0), ~"");
3343 fn test_unsafe_slice() {
3344 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
3345 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
3346 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
3347 fn a_million_letter_a() -> ~str {
3350 while i < 100000 { rs.push_str("aaaaaaaaaa"); i += 1; }
3353 fn half_a_million_letter_a() -> ~str {
3356 while i < 100000 { rs.push_str("aaaaa"); i += 1; }
3359 let letters = a_million_letter_a();
3360 assert!(half_a_million_letter_a() ==
3361 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
// Prefix checks, including empty operands and a multi-byte first character.
fn test_starts_with() {
    let matching = [("", ""), ("abc", ""), ("abc", "a"), ("ödd", "öd")];
    for &(text, prefix) in matching.iter() {
        assert!(text.starts_with(prefix));
    }

    let failing = [("a", "abc"), ("", "abc"), ("ödd", "-")];
    for &(text, prefix) in failing.iter() {
        assert!(!text.starts_with(prefix));
    }
}
// Suffix checks, including empty operands and a multi-byte last character.
fn test_ends_with() {
    let matching = [("", ""), ("abc", ""), ("abc", "c"), ("ddö", "dö")];
    for &(text, suffix) in matching.iter() {
        assert!(text.ends_with(suffix));
    }

    let failing = [("a", "abc"), ("", "abc"), ("ddö", "-")];
    for &(text, suffix) in failing.iter() {
        assert!(!text.ends_with(suffix));
    }
}
// Only the empty string reports itself as empty.
fn test_is_empty() {
    let (blank, letter) = ("", "a");
    assert!(blank.is_empty());
    assert!(!letter.is_empty());
}
3395 assert_eq!("".replace(a, "b"), ~"");
3396 assert_eq!("a".replace(a, "b"), ~"b");
3397 assert_eq!("ab".replace(a, "b"), ~"bb");
3399 assert!(" test test ".replace(test, "toast") ==
3401 assert_eq!(" test test ".replace(test, ""), ~" ");
3405 fn test_replace_2a() {
3406 let data = ~"ประเทศไทย中华";
3407 let repl = ~"دولة الكويت";
3410 let a2 = ~"دولة الكويتทศไทย中华";
3411 assert_eq!(data.replace(a, repl), a2);
3415 fn test_replace_2b() {
3416 let data = ~"ประเทศไทย中华";
3417 let repl = ~"دولة الكويت";
3420 let b2 = ~"ปรدولة الكويتทศไทย中华";
3421 assert_eq!(data.replace(b, repl), b2);
3425 fn test_replace_2c() {
3426 let data = ~"ประเทศไทย中华";
3427 let repl = ~"دولة الكويت";
3430 let c2 = ~"ประเทศไทยدولة الكويت";
3431 assert_eq!(data.replace(c, repl), c2);
3435 fn test_replace_2d() {
3436 let data = ~"ประเทศไทย中华";
3437 let repl = ~"دولة الكويت";
3440 assert_eq!(data.replace(d, repl), data);
3445 assert_eq!("ab", "abc".slice(0, 2));
3446 assert_eq!("bc", "abc".slice(1, 3));
3447 assert_eq!("", "abc".slice(1, 1));
3448 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3450 let data = "ประเทศไทย中华";
3451 assert_eq!("ป", data.slice(0, 3));
3452 assert_eq!("ร", data.slice(3, 6));
3453 assert_eq!("", data.slice(3, 3));
3454 assert_eq!("华", data.slice(30, 33));
3456 fn a_million_letter_X() -> ~str {
3460 push_str(&mut rs, "华华华华华华华华华华");
3465 fn half_a_million_letter_X() -> ~str {
3468 while i < 100000 { push_str(&mut rs, "华华华华华"); i += 1; }
3471 let letters = a_million_letter_X();
3472 assert!(half_a_million_letter_X() ==
3473 letters.slice(0u, 3u * 500000u).to_owned());
3478 let ss = "中华Việt Nam";
3480 assert_eq!("华", ss.slice(3u, 6u));
3481 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3483 assert_eq!("ab", "abc".slice(0u, 2u));
3484 assert_eq!("bc", "abc".slice(1u, 3u));
3485 assert_eq!("", "abc".slice(1u, 1u));
3487 assert_eq!("中", ss.slice(0u, 3u));
3488 assert_eq!("华V", ss.slice(3u, 7u));
3489 assert_eq!("", ss.slice(3u, 3u));
3504 fn test_slice_fail() {
3505 "中华Việt Nam".slice(0u, 2u);
3509 fn test_slice_from() {
3510 assert_eq!("abcd".slice_from(0), "abcd");
3511 assert_eq!("abcd".slice_from(2), "cd");
3512 assert_eq!("abcd".slice_from(4), "");
3515 fn test_slice_to() {
3516 assert_eq!("abcd".slice_to(0), "");
3517 assert_eq!("abcd".slice_to(2), "ab");
3518 assert_eq!("abcd".slice_to(4), "abcd");
3522 fn test_trim_left_chars() {
3523 let v: &[char] = &[];
3524 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3525 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3526 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3527 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3529 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3530 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3531 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
3535 fn test_trim_right_chars() {
3536 let v: &[char] = &[];
3537 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3538 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3539 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3540 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
3542 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3543 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3544 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
3548 fn test_trim_chars() {
3549 let v: &[char] = &[];
3550 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3551 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3552 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3553 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
3555 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3556 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3557 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
3561 fn test_trim_left() {
3562 assert_eq!("".trim_left(), "");
3563 assert_eq!("a".trim_left(), "a");
3564 assert_eq!(" ".trim_left(), "");
3565 assert_eq!(" blah".trim_left(), "blah");
3566 assert_eq!(" \u3000 wut".trim_left(), "wut");
3567 assert_eq!("hey ".trim_left(), "hey ");
3571 fn test_trim_right() {
3572 assert_eq!("".trim_right(), "");
3573 assert_eq!("a".trim_right(), "a");
3574 assert_eq!(" ".trim_right(), "");
3575 assert_eq!("blah ".trim_right(), "blah");
3576 assert_eq!("wut \u3000 ".trim_right(), "wut");
3577 assert_eq!(" hey".trim_right(), " hey");
3582 assert_eq!("".trim(), "");
3583 assert_eq!("a".trim(), "a");
3584 assert_eq!(" ".trim(), "");
3585 assert_eq!(" blah ".trim(), "blah");
3586 assert_eq!("\nwut \u3000 ".trim(), "wut");
3587 assert_eq!(" hey dude ".trim(), "hey dude");
3591 fn test_is_whitespace() {
3592 assert!("".is_whitespace());
3593 assert!(" ".is_whitespace());
3594 assert!("\u2009".is_whitespace()); // Thin space
3595 assert!(" \n\t ".is_whitespace());
3596 assert!(!" _ ".is_whitespace());
3600 fn test_slice_shift_char() {
3601 let data = "ประเทศไทย中";
3602 assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
3606 fn test_slice_shift_char_2() {
3608 assert_eq!(empty.slice_shift_char(), (None, ""));
3612 fn test_push_byte() {
3614 unsafe{raw::push_byte(&mut s, 'D' as u8)};
3615 assert_eq!(s, ~"ABCD");
3619 fn test_shift_byte() {
3621 let b = unsafe{raw::shift_byte(&mut s)};
3622 assert_eq!(s, ~"BC");
3623 assert_eq!(b, Some(65u8));
3627 fn test_pop_byte() {
3629 let b = unsafe{raw::pop_byte(&mut s)};
3630 assert_eq!(s, ~"AB");
3631 assert_eq!(b, Some(67u8));
3636 // deny overlong encodings
3637 assert!(!is_utf8([0xc0, 0x80]));
3638 assert!(!is_utf8([0xc0, 0xae]));
3639 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3640 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3641 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3642 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3643 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3646 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3647 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3649 assert!(is_utf8([0xC2, 0x80]));
3650 assert!(is_utf8([0xDF, 0xBF]));
3651 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3652 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3653 assert!(is_utf8([0xEE, 0x80, 0x80]));
3654 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3655 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3656 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3660 fn test_is_utf16() {
3661 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3669 // surrogate pairs (randomly generated with Python 3's
3670 // .encode('utf-16be'))
3671 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3672 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3673 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3675 // mixtures (also random)
3676 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3677 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3678 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3681 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3684 // surrogate + regular unit
3686 // surrogate + lead surrogate
3688 // unterminated surrogate
3690 // trail surrogate without a lead
3693 // random byte sequences that Python 3's .decode('utf-16be')
3695 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3696 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3697 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3698 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3699 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3700 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3701 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3702 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3703 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3704 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3705 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3706 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3707 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3708 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3709 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3710 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3711 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3712 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3713 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3714 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3715 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
3719 fn test_raw_from_c_str() {
3721 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3723 let c = raw::from_c_str(b);
3724 assert_eq!(c, ~"AAAAAAA");
// `as_bytes` exposes the UTF-8 encoding without a trailing NUL.
fn test_as_bytes() {
    assert_eq!("".as_bytes(), &[]);
    assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);

    // One row per character of the text checked below.
    let expected = [
        224, 184, 168,
        224, 185, 132,
        224, 184, 151,
        224, 184, 162,
        228, 184, 173,
        229, 141, 142,
        86, 105,
        225, 187, 135,
        116, 32, 78, 97, 109
    ];
    assert_eq!("ศไทย中华Việt Nam".as_bytes(), expected.as_slice());
}
3743 fn test_as_bytes_fail() {
3744 // Don't double free. (I'm not sure if this exercises the
3745 // original problem code path anymore.)
3747 let _bytes = s.as_bytes();
3753 let buf = "hello".as_ptr();
3755 assert_eq!(*buf.offset(0), 'h' as u8);
3756 assert_eq!(*buf.offset(1), 'e' as u8);
3757 assert_eq!(*buf.offset(2), 'l' as u8);
3758 assert_eq!(*buf.offset(3), 'l' as u8);
3759 assert_eq!(*buf.offset(4), 'o' as u8);
3764 fn test_subslice_offset() {
3765 let a = "kernelsprite";
3766 let b = a.slice(7, a.len());
3767 let c = a.slice(0, a.len() - 6);
3768 assert_eq!(a.subslice_offset(b), 7);
3769 assert_eq!(a.subslice_offset(c), 0);
3771 let string = "a\nb\nc";
3772 let mut lines = ~[];
3773 for line in string.lines() { lines.push(line) }
3774 assert_eq!(string.subslice_offset(lines[0]), 0);
3775 assert_eq!(string.subslice_offset(lines[1]), 2);
3776 assert_eq!(string.subslice_offset(lines[2]), 4);
3781 fn test_subslice_offset_2() {
3782 let a = "alchemiter";
3783 let b = "cruxtruder";
3784 a.subslice_offset(b);
3788 fn vec_str_conversions() {
3789 let s1: ~str = ~"All mimsy were the borogoves";
3791 let v: ~[u8] = s1.as_bytes().to_owned();
3792 let s2: ~str = from_utf8(v).unwrap().to_owned();
3793 let mut i: uint = 0u;
3794 let n1: uint = s1.len();
3795 let n2: uint = v.len();
3808 fn test_contains() {
3809 assert!("abcde".contains("bcd"));
3810 assert!("abcde".contains("abcd"));
3811 assert!("abcde".contains("bcde"));
3812 assert!("abcde".contains(""));
3813 assert!("".contains(""));
3814 assert!(!"abcde".contains("def"));
3815 assert!(!"".contains("a"));
3817 let data = ~"ประเทศไทย中华Việt Nam";
3818 assert!(data.contains("ประเ"));
3819 assert!(data.contains("ะเ"));
3820 assert!(data.contains("中华"));
3821 assert!(!data.contains("ไท华"));
3825 fn test_contains_char() {
3826 assert!("abc".contains_char('b'));
3827 assert!("a".contains_char('a'));
3828 assert!(!"abc".contains_char('d'));
3829 assert!(!"".contains_char('a'));
3836 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3837 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3838 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3839 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3842 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3843 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3844 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3845 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3846 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3849 (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3850 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3851 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3852 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3853 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3854 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3855 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3856 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3858 (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3859 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3860 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3861 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3862 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3863 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3864 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3865 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3866 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3867 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3868 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3870 // Issue #12318, even-numbered non-BMP planes
3872 ~[0xD840, 0xDC00])];
3874 for p in pairs.iter() {
3875 let (s, u) = (*p).clone();
3876 assert!(is_utf16(u));
3877 assert_eq!(s.to_utf16(), u);
3879 assert_eq!(from_utf16(u).unwrap(), s);
3880 assert_eq!(from_utf16_lossy(u), s);
3882 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3883 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
3888 fn test_utf16_invalid() {
3889 // completely positive cases tested above.
3891 assert_eq!(from_utf16([0xD800]), None);
3893 assert_eq!(from_utf16([0xD800, 0xD800]), None);
3896 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
3899 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
3903 fn test_utf16_lossy() {
3904 // completely positive cases tested above.
3906 assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
3908 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
3911 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
3914 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
3918 fn test_truncate_utf16_at_nul() {
3920 assert_eq!(truncate_utf16_at_nul(v), &[]);
3923 assert_eq!(truncate_utf16_at_nul(v), &[]);
3926 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3929 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3932 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3937 let s = ~"ศไทย中华Việt Nam";
3938 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3940 for ch in v.iter() {
3941 assert!(s.char_at(pos) == *ch);
3942 pos += from_char(*ch).len();
3947 fn test_char_at_reverse() {
3948 let s = ~"ศไทย中华Việt Nam";
3949 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3950 let mut pos = s.len();
3951 for ch in v.rev_iter() {
3952 assert!(s.char_at_reverse(pos) == *ch);
3953 pos -= from_char(*ch).len();
3958 fn test_escape_unicode() {
3959 assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3960 assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3961 assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3962 assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3963 assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3964 assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3965 assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3966 assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3967 assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
3971 fn test_escape_default() {
3972 assert_eq!("abc".escape_default(), ~"abc");
3973 assert_eq!("a c".escape_default(), ~"a c");
3974 assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3975 assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3976 assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
3977 assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
3978 assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
3979 assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
3983 fn test_total_ord() {
3984 "1234".cmp(& &"123") == Greater;
3985 "123".cmp(& &"1234") == Less;
3986 "1234".cmp(& &"1234") == Equal;
3987 "12345555".cmp(& &"123456") == Less;
3988 "22".cmp(& &"1234") == Greater;
3992 fn test_char_range_at() {
3993 let data = ~"b¢€𤭢𤭢€¢b";
3994 assert_eq!('b', data.char_range_at(0).ch);
3995 assert_eq!('¢', data.char_range_at(1).ch);
3996 assert_eq!('€', data.char_range_at(3).ch);
3997 assert_eq!('𤭢', data.char_range_at(6).ch);
3998 assert_eq!('𤭢', data.char_range_at(10).ch);
3999 assert_eq!('€', data.char_range_at(14).ch);
4000 assert_eq!('¢', data.char_range_at(17).ch);
4001 assert_eq!('b', data.char_range_at(19).ch);
4005 fn test_char_range_at_reverse_underflow() {
4006 assert_eq!("abc".char_range_at_reverse(0).next, 0);
4011 #[allow(unnecessary_allocation)];
4013 ($s1:expr, $s2:expr, $e:expr) => { {
4017 assert_eq!(s1 + s2, e.to_owned());
4018 assert_eq!(s1.to_owned() + s2, e.to_owned());
4022 t!("foo", "bar", "foobar");
4023 t!("foo", ~"bar", "foobar");
4024 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
4025 t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
4029 fn test_iterator() {
4031 let s = ~"ศไทย中华Việt Nam";
4032 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
4035 let mut it = s.chars();
4038 assert_eq!(c, v[pos]);
4041 assert_eq!(pos, v.len());
4045 fn test_rev_iterator() {
4047 let s = ~"ศไทย中华Việt Nam";
4048 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
4051 let mut it = s.chars_rev();
4054 assert_eq!(c, v[pos]);
4057 assert_eq!(pos, v.len());
// A clone taken after the iterator has advanced continues from the same
// position as the original.
fn test_iterator_clone() {
    let text = "ศไทย中华Việt Nam";
    let mut original = text.chars();
    original.next();
    let copy = original.clone();
    assert!(original.zip(copy).all(|(a, b)| a == b));
}
4069 fn test_bytesator() {
4070 let s = ~"ศไทย中华Việt Nam";
4072 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
4073 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
4078 for b in s.bytes() {
4079 assert_eq!(b, v[pos]);
4085 fn test_bytes_revator() {
4086 let s = ~"ศไทย中华Việt Nam";
4088 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
4089 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
4092 let mut pos = v.len();
4094 for b in s.bytes_rev() {
4096 assert_eq!(b, v[pos]);
// `char_indices` pairs each character with the byte offset it starts at.
fn test_char_indicesator() {
    let text = "ศไทย中华Việt Nam";
    let offsets = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
    let chars = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];

    let mut seen = 0;
    for (n, pair) in text.char_indices().enumerate() {
        assert_eq!(pair, (offsets[n], chars[n]));
        seen = n + 1;
    }
    assert_eq!(seen, chars.len());
    assert_eq!(seen, offsets.len());
}
// `char_indices_rev()` must yield `(byte_offset, char)` pairs from the end of
// the string towards the start. `p` and `v` list the expected offsets and
// chars in that reversed order; both tables must be fully consumed.
4119 fn test_char_indices_revator() {
4121 let s = "ศไทย中华Việt Nam";
4122 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
4123 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
4126 let mut it = s.char_indices_rev();
4129 assert_eq!(c, (p[pos], v[pos]));
4132 assert_eq!(pos, v.len());
4133 assert_eq!(pos, p.len());
// `split`/`rsplit` with a single-char separator, given both as a `char` and
// as an equivalent closure. Each `rsplit` vector is expected to equal the
// same forward-ordered list of pieces as the matching `split`.
4137 fn test_split_char_iterator() {
4138 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
// ASCII separator
4140 let split: ~[&str] = data.split(' ').collect();
4141 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4143 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
4145 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4147 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
4148 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4150 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
4152 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
// multi-byte separator
4155 let split: ~[&str] = data.split('ä').collect();
4156 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4158 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
4160 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4162 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
4163 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4165 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
4167 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
// `splitn(sep, 3)` performs at most three splits from the front: the expected
// vectors have four elements, the last being the unsplit remainder of `data`.
// Checked for `char` and closure separators, ASCII and multi-byte.
4171 fn test_splitn_char_iterator() {
4172 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4174 let split: ~[&str] = data.splitn(' ', 3).collect();
4175 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
4177 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
4178 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
// multi-byte separator
4181 let split: ~[&str] = data.splitn('ä', 3).collect();
4182 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
4184 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
4185 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
// `rsplitn(sep, 3)` performs at most three splits counted from the end: in
// the expected (forward-ordered) vectors the unsplit remainder is the *first*
// element. Checked for `char` and closure separators, ASCII and multi-byte.
4189 fn test_rsplitn_char_iterator() {
4190 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4192 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
4194 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
4196 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
4198 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
// multi-byte separator
4201 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
4203 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
4205 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
4207 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
// Contrast between `split` and `split_terminator` on input that ends with the
// separator: `split('\n')` keeps the final empty piece, `split_terminator`
// omits it. The leading empty piece is kept by both.
4211 fn test_split_char_iterator_no_trailing() {
4212 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4214 let split: ~[&str] = data.split('\n').collect();
4215 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
4217 let split: ~[&str] = data.split_terminator('\n').collect();
4218 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
// `split` and `split_terminator` driven through `.rev()`: the expected
// vectors are written in forward order, with `split` keeping the trailing
// empty piece and `split_terminator` omitting it.
4222 fn test_rev_split_char_iterator_no_trailing() {
4223 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4225 let mut split: ~[&str] = data.split('\n').rev().collect();
4227 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
4229 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
4231 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
// `words()` is expected to split on runs of whitespace (space, tab and
// newline all appear in `data`) and to produce no empty pieces for the
// leading or trailing whitespace.
4236 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
4237 let words: ~[&str] = data.words().collect();
4238 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
// Expectations for `nfd_chars()` (canonical decomposition), collected into an
// owned string:
//   * ASCII, `\u2026` and `\u01c4` come back unchanged;
//   * `\u1e0b` expands to `d` + `\u0307`, `\u2126` maps to `\u03a9`;
//   * combining marks after `d` come out as `\u0323\u0307` whichever one was
//     precomposed in the input;
//   * an already-decomposed `a\u0301` and a leading `\u0301` are left alone;
//   * the Hangul syllables `\ud4db` / `\uac1c` expand into `\u11xx` jamo.
4242 fn test_nfd_chars() {
4243 assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
4244 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
4245 assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
4246 assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
4247 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
4248 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
4249 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
4250 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
4251 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
4252 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
// Expectations for `nfkd_chars()` (compatibility decomposition), collected
// into an owned string:
//   * ASCII comes back unchanged;
//   * `\u01c4` expands to `DZ\u030c` and `\u2026` to three ASCII dots;
//   * `\u1e0b` expands to `d` + `\u0307`, `\u2126` maps to `\u03a9`;
//   * combining marks after `d` come out as `\u0323\u0307` whichever one was
//     precomposed in the input;
//   * an already-decomposed `a\u0301` and a leading `\u0301` are left alone;
//   * the Hangul syllables `\ud4db` / `\uac1c` expand into `\u11xx` jamo.
4256 fn test_nfkd_chars() {
4257 assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
4258 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
4259 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
4260 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
4261 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
4262 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
4263 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
4264 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
4265 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
4266 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
// `lines()` expectations: empty lines (leading and interior) are kept, a
// single trailing `\n` does not add a final empty element, and the same input
// without the trailing `\n` gives the identical result.
4271 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
4272 let lines: ~[&str] = data.lines().collect();
4273 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
4275 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
4276 let lines: ~[&str] = data.lines().collect();
4277 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
// `split_str` with a string separator. The helper `t` collects the pieces of
// `s` split by `sep`; `u` carries the expected pieces for that call.
4281 fn test_split_strator() {
4282 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
4283 let v: ~[&str] = s.split_str(sep).collect();
// separator absent: the whole input is the only piece
4286 t("--1233345--", "12345", ~["--1233345--"]);
// leading / trailing separators produce empty pieces
4287 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
4288 t("::hello::there", "::", ~["", "hello", "there"]);
4289 t("hello::there::", "::", ~["hello", "there", ""]);
4290 t("::hello::there::", "::", ~["", "hello", "there", ""]);
4291 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
4292 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
4293 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
4294 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
// input made only of separator chars: matches are taken left to right and
// do not overlap, so an odd `z` is left over as the last piece
4296 t("zz", "zz", ~["",""]);
4297 t("ok", "z", ~["ok"]);
4298 t("zzz", "zz", ~["","z"]);
4299 t("zzzzz", "zz", ~["","","z"]);
// The generic helper `t` builds an `S` with `Default::default()` and requires
// its `as_slice()` view to be the empty string.
4303 fn test_str_default() {
4304 use default::Default;
4305 fn t<S: Default + Str>() {
4306 let s: S = Default::default();
4307 assert_eq!(s.as_slice(), "");
// String types used through the `Container` trait: `sum_len` adds up `len()`
// over a slice of any `Container`, and is applied to borrowed strings, owned
// strings and a single `as_slice()` view — each case totalling 5.
4315 fn test_str_container() {
4316 fn sum_len<S: Container>(v: &[S]) -> uint {
4317 v.iter().map(|x| x.len()).sum()
4321 assert_eq!(5, sum_len(["012", "", "34"]));
4322 assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
4323 assert_eq!(5, sum_len([s.as_slice()]));
// Truncation of an owned string: the contents are expected to read "12345",
// then "123", then "" at successive checkpoints. A second string has its
// `as_ptr()` captured in `p_` — NOTE(review): presumably to compare the
// buffer address before and after truncation; confirm.
4327 fn test_str_truncate() {
4328 let mut s = ~"12345";
4330 assert_eq!(s.as_slice(), "12345");
4332 assert_eq!(s.as_slice(), "123");
4334 assert_eq!(s.as_slice(), "");
4336 let mut s = ~"12345";
4340 let p_ = s.as_ptr();
// NOTE(review): presumably checks that truncating the five-byte string to a
// length beyond its end is rejected — confirm.
4346 fn test_str_truncate_invalid_len() {
4347 let mut s = ~"12345";
// NOTE(review): presumably checks that truncating in the middle of the
// two-byte encoding of `ü` is rejected — confirm.
4353 fn test_str_truncate_split_codepoint() {
4354 let mut s = ~"\u00FC"; // ü
// `from_utf8` on borrowed bytes: valid ASCII and valid multi-byte input give
// `Some` of the same text; input containing the byte 0xff gives `None`.
4359 fn test_str_from_utf8() {
4360 let xs = bytes!("hello");
4361 assert_eq!(from_utf8(xs), Some("hello"));
4363 let xs = bytes!("ศไทย中华Việt Nam");
4364 assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
4366 let xs = bytes!("hello", 0xff);
4367 assert_eq!(from_utf8(xs), None);
// `from_utf8_owned` on owned byte vectors: valid ASCII and valid multi-byte
// input give `Some` owned string; input containing the byte 0xff gives `None`.
4371 fn test_str_from_utf8_owned() {
4372 let xs = bytes!("hello").to_owned();
4373 assert_eq!(from_utf8_owned(xs), Some(~"hello"));
4375 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
4376 assert_eq!(from_utf8_owned(xs), Some(~"ศไทย中华Việt Nam"));
4378 let xs = bytes!("hello", 0xff).to_owned();
4379 assert_eq!(from_utf8_owned(xs), None);
// `from_utf8_lossy`: fully valid input is expected back as `Slice`, while
// input with invalid sequences is expected back as `Owned` with `\uFFFD`
// substituted. The cases pin how many replacement chars each kind of
// malformed sequence produces.
4383 fn test_str_from_utf8_lossy() {
4384 let xs = bytes!("hello");
4385 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
4387 let xs = bytes!("ศไทย中华Việt Nam");
4388 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// lone lead byte / lone invalid byte: one replacement each
4390 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
4391 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
// 0xC0 0x80: two replacements; truncated 0xE6 0x83: one
4393 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4394 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
// 0xF5 is replaced on its own, and so is the continuation byte after it
4396 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
4397 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
// truncated four-byte sequences collapse to a single replacement
4399 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
4400 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
// 0xF4 0xBF: the second byte is out of range, so two replacements
4402 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
4403 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
// 0xF0 0x80 0x80 0x80 gives four replacements; 0xF0 0x90 0x80 0x80 decodes
// to U+10000
4405 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
4406 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
// encoded surrogates (0xED 0xA0.. / 0xED 0xBF..): three replacements each
4409 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4410 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
// `from_str` targeting `Option<~str>` is expected to return `Some` with an
// owned copy of the input text.
4414 fn test_from_str() {
4415 let owned: Option<~str> = from_str(&"string");
4416 assert_eq!(owned, Some(~"string"));
// Trait behaviour of both `Slice` and `Owned` values holding "abcde": `len`,
// `as_slice`, `to_str`, `format!`, `lt` against the other variant, and
// equality of the empty value with `Default::default()`. The two values must
// also `cmp` as `Equal` and be `equiv` to each other in both directions.
4420 fn test_maybe_owned_traits() {
4421 let s = Slice("abcde");
4422 assert_eq!(s.len(), 5);
4423 assert_eq!(s.as_slice(), "abcde");
4424 assert_eq!(s.to_str(), ~"abcde");
4425 assert_eq!(format!("{}", s), ~"abcde");
4426 assert!(s.lt(&Owned(~"bcdef")));
4427 assert_eq!(Slice(""), Default::default());
4429 let o = Owned(~"abcde");
4430 assert_eq!(o.len(), 5);
4431 assert_eq!(o.as_slice(), "abcde");
4432 assert_eq!(o.to_str(), ~"abcde");
4433 assert_eq!(format!("{}", o), ~"abcde");
4434 assert!(o.lt(&Slice("bcdef")));
4435 assert_eq!(Owned(~""), Default::default());
4437 assert!(s.cmp(&o) == Equal);
4438 assert!(s.equiv(&o));
4440 assert!(o.cmp(&s) == Equal);
4441 assert!(o.equiv(&s));
// `is_slice()` / `is_owned()` must report the variant: true/false for a
// `Slice`, false/true for an `Owned`.
4445 fn test_maybe_owned_methods() {
4446 let s = Slice("abcde");
4447 assert!(s.is_slice());
4448 assert!(!s.is_owned());
4450 let o = Owned(~"abcde");
4451 assert!(!o.is_slice());
4452 assert!(o.is_owned());
// A clone must compare equal to a value with the same text in every
// `Slice`/`Owned` pairing, i.e. equality here depends on content rather than
// on which variant holds it.
4456 fn test_maybe_owned_clone() {
4457 assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4458 assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4459 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4460 assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
// `into_owned()` must produce the owned string "abcde" from either variant.
4464 fn test_maybe_owned_into_owned() {
4465 assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4466 assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
// `into_maybe_owned()` on a borrowed and on an owned string. Each result is
// compared against both a `Slice` and an `Owned` with the same text, so these
// asserts check content only and do not pin which variant is produced.
4470 fn test_into_maybe_owned() {
4471 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4472 assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4473 assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4474 assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4481 use self::test::BenchHarness;
// Benchmark: takes `.len()` of the `chars()` iterator over mixed multi-byte
// and ASCII text on every iteration, checking it against `char_len()`
// computed once outside the timed closure.
4486 fn char_iterator(bh: &mut BenchHarness) {
4487 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4488 let len = s.char_len();
4490 bh.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: takes `.len()` of the `chars()` iterator over a multi-line,
// ASCII-only literal on every iteration, checking it against `char_len()`
// computed once outside the timed closure.
4494 fn char_iterator_ascii(bh: &mut BenchHarness) {
4495 let s = "Mary had a little lamb, Little lamb
4496 Mary had a little lamb, Little lamb
4497 Mary had a little lamb, Little lamb
4498 Mary had a little lamb, Little lamb
4499 Mary had a little lamb, Little lamb
4500 Mary had a little lamb, Little lamb";
4501 let len = s.char_len();
4503 bh.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: takes `.len()` of the `chars_rev()` iterator on every iteration,
// checking it against `char_len()` computed once outside the timed closure.
4507 fn char_iterator_rev(bh: &mut BenchHarness) {
4508 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4509 let len = s.char_len();
4511 bh.iter(|| assert_eq!(s.chars_rev().len(), len));
// Benchmark: takes `.len()` of the `char_indices()` iterator on every
// iteration, checking it against `char_len()` computed once up front.
4515 fn char_indicesator(bh: &mut BenchHarness) {
4516 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4517 let len = s.char_len();
4519 bh.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: takes `.len()` of the `char_indices_rev()` iterator on every
// iteration, checking it against `char_len()` computed once up front.
4523 fn char_indicesator_rev(bh: &mut BenchHarness) {
4524 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4525 let len = s.char_len();
4527 bh.iter(|| assert_eq!(s.char_indices_rev().len(), len));
// Benchmark: splits mostly non-ASCII text on the ASCII char 'V'. 'V' occurs
// twice in `s`, hence the three expected pieces.
4531 fn split_unicode_ascii(bh: &mut BenchHarness) {
4532 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4534 bh.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: splits mostly non-ASCII text on 'V' wrapped in `NotAscii`, a
// local `CharEq` implementation whose `only_ascii()` returns `false`.
// NOTE(review): presumably this steers `split` away from an ASCII-only fast
// path so the general path is measured — confirm against `split`.
4538 fn split_unicode_not_ascii(bh: &mut BenchHarness) {
4539 struct NotAscii(char);
4540 impl CharEq for NotAscii {
4541 fn matches(&self, c: char) -> bool {
4542 let NotAscii(cc) = *self;
4545 fn only_ascii(&self) -> bool { false }
4547 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4549 bh.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splits ASCII text on ' ' given as a plain `char`; the expected
// piece count is computed once outside the timed closure.
4554 fn split_ascii(bh: &mut BenchHarness) {
4555 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4556 let len = s.split(' ').len();
4558 bh.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splits ASCII text on ' ' wrapped in `NotAscii`, a local `CharEq`
// implementation whose `only_ascii()` returns `false`; the expected piece
// count comes from a plain `split(' ')` done up front.
// NOTE(review): presumably this steers `split` away from an ASCII-only fast
// path so the general path is measured — confirm against `split`.
4562 fn split_not_ascii(bh: &mut BenchHarness) {
4563 struct NotAscii(char);
4564 impl CharEq for NotAscii {
4566 fn matches(&self, c: char) -> bool {
4567 let NotAscii(cc) = *self;
4570 fn only_ascii(&self) -> bool { false }
4572 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4573 let len = s.split(' ').len();
4575 bh.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splits ASCII text using the named function `pred` as the
// separator predicate; the expected piece count comes from a plain
// `split(' ')` done up front.
4579 fn split_extern_fn(bh: &mut BenchHarness) {
4580 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4581 let len = s.split(' ').len();
4582 fn pred(c: char) -> bool { c == ' ' }
4584 bh.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splits ASCII text using a closure as the separator predicate;
// the expected piece count comes from a plain `split(' ')` done up front.
4588 fn split_closure(bh: &mut BenchHarness) {
4589 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4590 let len = s.split(' ').len();
4592 bh.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splits ASCII text using a one-element char slice `&[' ']` as the
// separator; the expected piece count comes from a plain `split(' ')` done up
// front.
4596 fn split_slice(bh: &mut BenchHarness) {
4597 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4598 let len = s.split(' ').len();
4600 bh.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark input: an ASCII byte literal whose length is asserted to be
// exactly 100 bytes before timing.
// NOTE(review): presumably the timed operation is `is_utf8(s)` — confirm.
4604 fn is_utf8_100_ascii(bh: &mut BenchHarness) {
4606 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4607 Lorem ipsum dolor sit amet, consectetur. ");
4609 assert_eq!(100, s.len());
// Benchmark input: a byte literal of multi-byte text whose encoded length is
// asserted to be exactly 100 bytes before timing.
// NOTE(review): presumably the timed operation is `is_utf8(s)` — confirm.
4616 fn is_utf8_100_multibyte(bh: &mut BenchHarness) {
4617 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4618 assert_eq!(100, s.len());
// Benchmark: `from_utf8_lossy` on a 100-byte ASCII literal (length asserted
// up front); the conversion result is discarded.
4625 fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
4626 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4627 Lorem ipsum dolor sit amet, consectetur. ");
4629 assert_eq!(100, s.len());
4631 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on 100 bytes of multi-byte text (length
// asserted up front); the conversion result is discarded.
4636 fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
4637 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4638 assert_eq!(100, s.len());
4640 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on mostly-ASCII input that contains two
// malformed sequences (0xC0 0x80 and a truncated 0xE6 0x83); the conversion
// result is discarded.
4645 fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
4646 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4648 let _ = from_utf8_lossy(s);
// Benchmark: `from_utf8_lossy` on 100 copies of the byte 0xF5, which is never
// valid in UTF-8, so every input byte is malformed; the result is discarded.
4653 fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
4654 let s = ::slice::from_elem(100, 0xF5u8);
4656 let _ = from_utf8_lossy(s);
// NOTE(review): presumably measures allocating a string via `with_capacity`
// — confirm against the body.
4661 fn bench_with_capacity(bh: &mut BenchHarness) {
// NOTE(review): presumably measures appending `s` (mixed multi-byte and ASCII
// text) with `push_str` — confirm against the body.
4668 fn bench_push_str(bh: &mut BenchHarness) {
4669 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Benchmark: joins ten copies of `s` with `sep` via `connect` and checks the
// joined length is ten times `s.len()` plus nine separators.
4677 fn bench_connect(bh: &mut BenchHarness) {
4678 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4680 let v = [s, s, s, s, s, s, s, s, s, s];
4682 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);