1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
13 Unicode string manipulation (`str` type)
17 Rust's string type is one of the core primitive types of the language. While
18 represented by the name `str`, the name `str` is not actually a valid type in
19 Rust. Each string must also be decorated with its ownership. This means that
20 there are two common kinds of strings in Rust:
22 * `~str` - This is an owned string. This type obeys all of the normal semantics
23 of the `~T` types, meaning that it has one, and only one, owner. This
24 type cannot be implicitly copied, and is moved out of when passed to
27 * `&str` - This is the borrowed string type. This type of string can only be
28 created from the other kind of string. As the name "borrowed"
29 implies, this type of string is owned elsewhere, and this string
30 cannot be moved out of.
32 As an example, here are a few different kinds of strings.
36 let owned_string = ~"I am an owned string";
37 let borrowed_string1 = "This string is borrowed with the 'static lifetime";
38 let borrowed_string2: &str = owned_string; // owned strings can be borrowed
42 From the example above, you can see that Rust has 2 different kinds of string
43 literals. The owned literals correspond to the owned string types, but the
44 "borrowed literal" is actually more akin to C's concept of a static string.
46 When a string is declared without a `~` sigil, then the string is allocated
47 statically in the rodata of the executable/library. The string then has the
48 type `&'static str` meaning that the string is valid for the `'static`
49 lifetime, otherwise known as the lifetime of the entire program. As can be
50 inferred from the type, these static strings are not mutable.
54 Many languages have immutable strings by default, and Rust has a particular
55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
56 default. If a string is declared as `mut`, however, it may be mutated. This
57 works the same way as the rest of Rust's type system in the sense that if
58 there's a mutable reference to a string, there may only be one mutable reference
59 to that string. With these guarantees, strings can easily transition between
60 being mutable/immutable with the same benefits of having mutable strings in
64 let mut buf = ~"testing";
67 assert_eq!(buf, ~"testing 123");
72 Rust's string type, `str`, is a sequence of Unicode codepoints encoded as a
73 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
74 encoded UTF-8 sequences. Additionally, strings are not null-terminated
75 and can contain null codepoints.
77 The actual representations of strings have direct mappings to vectors:
79 * `~str` is the same as `~[u8]`
80 * `&str` is the same as `&[u8]`
89 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
90 use container::{Container, Mutable};
93 use iter::{Iterator, FromIterator, Extendable, range};
94 use iter::{Filter, AdditiveIterator, Map};
95 use iter::{Rev, DoubleEndedIterator, ExactSize};
98 use option::{None, Option, Some};
101 use from_str::FromStr;
103 use slice::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector};
106 use default::Default;
110 Section: Creating a string
113 /// Consumes a vector of bytes to create a new utf-8 string.
114 /// Returns None if the vector contains invalid UTF-8.
115 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
117 Some(unsafe { raw::from_utf8_owned(vv) })
123 /// Converts a vector to a string slice without performing any allocations.
125 /// Once the slice has been validated as utf-8, it is transmuted in-place and
126 /// returned as a '&str' instead of a '&[u8]'
128 /// Returns None if the slice is not utf-8.
129 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
131 Some(unsafe { raw::from_utf8(v) })
135 impl FromStr for ~str {
// Never fails: the result is always `Some`, holding an owned copy of `s`.
137 fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
140 /// Convert a byte to a UTF-8 string
144 /// Fails if invalid UTF-8
145 pub fn from_byte(b: u8) -> ~str {
147 unsafe { ::cast::transmute(~[b]) }
150 /// Convert a char to a string
151 pub fn from_char(ch: char) -> ~str {
157 /// Convert a vector of chars to a string
158 pub fn from_chars(chs: &[char]) -> ~str {
159 chs.iter().map(|c| *c).collect()
163 pub fn push_str(lhs: &mut ~str, rhs: &str) {
167 /// Methods for vectors of strings
168 pub trait StrVector {
169 /// Concatenate a vector of strings.
170 fn concat(&self) -> ~str;
172 /// Concatenate a vector of strings, placing a given separator between each.
173 fn connect(&self, sep: &str) -> ~str;
176 impl<'a, S: Str> StrVector for &'a [S] {
177 fn concat(&self) -> ~str {
178 if self.is_empty() { return ~""; }
180 // `len` calculation may overflow but push_str will check boundaries
181 let len = self.iter().map(|s| s.as_slice().len()).sum();
183 let mut result = with_capacity(len);
185 for s in self.iter() {
186 result.push_str(s.as_slice())
191 fn connect(&self, sep: &str) -> ~str {
192 if self.is_empty() { return ~""; }
195 if sep.is_empty() { return self.concat(); }
197 // this is wrong without the guarantee that `self` is non-empty
198 // `len` calculation may overflow but push_str will check boundaries
199 let len = sep.len() * (self.len() - 1)
200 + self.iter().map(|s| s.as_slice().len()).sum();
201 let mut result = with_capacity(len);
202 let mut first = true;
204 for s in self.iter() {
208 result.push_str(sep);
210 result.push_str(s.as_slice());
216 impl<'a, S: Str> StrVector for Vec<S> {
218 fn concat(&self) -> ~str {
219 self.as_slice().concat()
223 fn connect(&self, sep: &str) -> ~str {
224 self.as_slice().connect(sep)
228 /// Something that can be used to compare against a character
230 /// Determine if the splitter should split at the given character
231 fn matches(&self, char) -> bool;
232 /// Indicate if this is only concerned about ASCII characters,
233 /// which can allow for a faster implementation.
234 fn only_ascii(&self) -> bool;
237 impl CharEq for char {
239 fn matches(&self, c: char) -> bool { *self == c }
// A lone `char` is ASCII-only exactly when its code point is below 128.
241 fn only_ascii(&self) -> bool { (*self as uint) < 128 }
244 impl<'a> CharEq for 'a |char| -> bool {
246 fn matches(&self, c: char) -> bool { (*self)(c) }
248 fn only_ascii(&self) -> bool { false }
251 impl CharEq for extern "Rust" fn(char) -> bool {
253 fn matches(&self, c: char) -> bool { (*self)(c) }
255 fn only_ascii(&self) -> bool { false }
258 impl<'a, C: CharEq> CharEq for &'a [C] {
260 fn matches(&self, c: char) -> bool {
261 self.iter().any(|m| m.matches(c))
264 fn only_ascii(&self) -> bool {
265 self.iter().all(|m| m.only_ascii())
273 /// External iterator for a string's characters.
274 /// Use with the `std::iter` module.
276 pub struct Chars<'a> {
277 /// The slice remaining to be iterated
278 priv string: &'a str,
281 impl<'a> Iterator<char> for Chars<'a> {
283 fn next(&mut self) -> Option<char> {
284 // Decode the next codepoint, then update
285 // the slice to be just the remaining part
286 if self.string.len() != 0 {
287 let CharRange {ch, next} = self.string.char_range_at(0);
289 self.string = raw::slice_unchecked(self.string, next, self.string.len());
298 fn size_hint(&self) -> (uint, Option<uint>) {
299 (self.string.len().saturating_add(3)/4, Some(self.string.len()))
303 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
305 fn next_back(&mut self) -> Option<char> {
306 if self.string.len() != 0 {
307 let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
309 self.string = raw::slice_unchecked(self.string, 0, next);
318 /// External iterator for a string's characters and their byte offsets.
319 /// Use with the `std::iter` module.
321 pub struct CharOffsets<'a> {
322 /// The original string to be iterated
323 priv string: &'a str,
324 priv iter: Chars<'a>,
327 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
329 fn next(&mut self) -> Option<(uint, char)> {
330 // Compute the byte offset by using the pointer offset between
331 // the original string slice and the iterator's remaining part
332 let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
333 self.iter.next().map(|ch| (offset, ch))
337 fn size_hint(&self) -> (uint, Option<uint>) {
338 self.iter.size_hint()
342 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
344 fn next_back(&mut self) -> Option<(uint, char)> {
345 self.iter.next_back().map(|ch| {
346 let offset = self.iter.string.len() +
347 self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
353 /// External iterator for a string's characters in reverse order.
354 /// Use with the `std::iter` module.
355 pub type RevChars<'a> = Rev<Chars<'a>>;
357 /// External iterator for a string's characters and their byte offsets in reverse order.
358 /// Use with the `std::iter` module.
359 pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;
361 /// External iterator for a string's bytes.
362 /// Use with the `std::iter` module.
364 Map<'a, &'a u8, u8, slice::Items<'a, u8>>;
366 /// External iterator for a string's bytes in reverse order.
367 /// Use with the `std::iter` module.
368 pub type RevBytes<'a> = Rev<Bytes<'a>>;
370 /// An iterator over the substrings of a string, separated by `sep`.
372 pub struct CharSplits<'a, Sep> {
373 /// The slice remaining to be iterated
374 priv string: &'a str,
376 /// Whether an empty string at the end is allowed
377 priv allow_trailing_empty: bool,
378 priv only_ascii: bool,
382 /// An iterator over the substrings of a string, separated by `sep`,
383 /// starting from the back of the string.
384 pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
386 /// An iterator over the substrings of a string, separated by `sep`,
387 /// splitting at most `count` times.
389 pub struct CharSplitsN<'a, Sep> {
390 priv iter: CharSplits<'a, Sep>,
391 /// The number of splits remaining
396 /// An iterator over the words of a string, separated by a sequence of whitespace
398 Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;
400 /// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
401 pub type AnyLines<'a> =
402 Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
404 impl<'a, Sep> CharSplits<'a, Sep> {
406 fn get_end(&mut self) -> Option<&'a str> {
407 if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
408 self.finished = true;
416 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
418 fn next(&mut self) -> Option<&'a str> {
419 if self.finished { return None }
421 let mut next_split = None;
423 for (idx, byte) in self.string.bytes().enumerate() {
424 if self.sep.matches(byte as char) && byte < 128u8 {
425 next_split = Some((idx, idx + 1));
430 for (idx, ch) in self.string.char_indices() {
431 if self.sep.matches(ch) {
432 next_split = Some((idx, self.string.char_range_at(idx).next));
438 Some((a, b)) => unsafe {
439 let elt = raw::slice_unchecked(self.string, 0, a);
440 self.string = raw::slice_unchecked(self.string, b, self.string.len());
443 None => self.get_end(),
448 impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
449 for CharSplits<'a, Sep> {
451 fn next_back(&mut self) -> Option<&'a str> {
452 if self.finished { return None }
454 if !self.allow_trailing_empty {
455 self.allow_trailing_empty = true;
456 match self.next_back() {
457 Some(elt) if !elt.is_empty() => return Some(elt),
458 _ => if self.finished { return None }
461 let len = self.string.len();
462 let mut next_split = None;
465 for (idx, byte) in self.string.bytes().enumerate().rev() {
466 if self.sep.matches(byte as char) && byte < 128u8 {
467 next_split = Some((idx, idx + 1));
472 for (idx, ch) in self.string.char_indices_rev() {
473 if self.sep.matches(ch) {
474 next_split = Some((idx, self.string.char_range_at(idx).next));
480 Some((a, b)) => unsafe {
481 let elt = raw::slice_unchecked(self.string, b, len);
482 self.string = raw::slice_unchecked(self.string, 0, a);
485 None => { self.finished = true; Some(self.string) }
490 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
492 fn next(&mut self) -> Option<&'a str> {
495 if self.invert { self.iter.next_back() } else { self.iter.next() }
502 /// An iterator over the start and end indices of the matches of a
503 /// substring within a larger string
505 pub struct MatchIndices<'a> {
506 priv haystack: &'a str,
507 priv needle: &'a str,
511 /// An iterator over the substrings of a string separated by a given
514 pub struct StrSplits<'a> {
515 priv it: MatchIndices<'a>,
520 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
522 fn next(&mut self) -> Option<(uint, uint)> {
523 // See Issue #1932 for why this is a naive search
524 let (h_len, n_len) = (self.haystack.len(), self.needle.len());
525 let mut match_start = 0;
528 while self.position < h_len {
529 if self.haystack[self.position] == self.needle[match_i] {
530 if match_i == 0 { match_start = self.position; }
534 if match_i == n_len {
536 return Some((match_start, self.position));
539 // failed match, backtrack
542 self.position = match_start;
551 impl<'a> Iterator<&'a str> for StrSplits<'a> {
553 fn next(&mut self) -> Option<&'a str> {
554 if self.finished { return None; }
556 match self.it.next() {
557 Some((from, to)) => {
558 let ret = Some(self.it.haystack.slice(self.last_end, from));
563 self.finished = true;
564 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
570 // Helper functions used for Unicode normalization
571 fn canonical_sort(comb: &mut [(char, u8)]) {
575 let len = comb.len();
576 for i in range(0, len) {
577 let mut swapped = false;
578 for j in range(1, len-i) {
579 let class_a = *comb[j-1].ref1();
580 let class_b = *comb[j].ref1();
581 if class_a != 0 && class_b != 0 && class_a > class_b {
586 if !swapped { break; }
591 enum NormalizationForm {
596 /// External iterator for a string's normalization's characters.
597 /// Use with the `std::iter` module.
599 pub struct Normalizations<'a> {
600 priv kind: NormalizationForm,
601 priv iter: Chars<'a>,
602 priv buffer: ~[(char, u8)],
606 impl<'a> Iterator<char> for Normalizations<'a> {
608 fn next(&mut self) -> Option<char> {
609 use unicode::decompose::canonical_combining_class;
611 match self.buffer.head() {
617 Some(&(c, _)) if self.sorted => {
621 _ => self.sorted = false
624 let decomposer = match self.kind {
625 NFD => char::decompose_canonical,
626 NFKD => char::decompose_compatible
630 for ch in self.iter {
631 let buffer = &mut self.buffer;
632 let sorted = &mut self.sorted;
634 let class = canonical_combining_class(d);
635 if class == 0 && !*sorted {
636 canonical_sort(*buffer);
639 buffer.push((d, class));
646 canonical_sort(self.buffer);
650 match self.buffer.shift() {
655 Some((c, _)) => Some(c),
660 fn size_hint(&self) -> (uint, Option<uint>) {
661 let (lower, _) = self.iter.size_hint();
666 /// Replace all occurrences of one string with another
670 /// * s - The string containing substrings to replace
671 /// * from - The string to replace
672 /// * to - The replacement string
676 /// The original string with all occurrences of `from` replaced with `to`
677 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
678 let mut result = ~"";
679 let mut last_end = 0;
680 for (start, end) in s.match_indices(from) {
681 result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
685 result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
690 Section: Comparing strings
693 // share the implementation of the lang-item vs. non-lang-item
696 fn eq_slice_(a: &str, b: &str) -> bool {
697 a.len() == b.len() && unsafe {
698 libc::memcmp(a.as_ptr() as *libc::c_void,
699 b.as_ptr() as *libc::c_void,
700 a.len() as libc::size_t) == 0
704 /// Bytewise slice equality
708 pub fn eq_slice(a: &str, b: &str) -> bool {
712 /// Bytewise slice equality
715 pub fn eq_slice(a: &str, b: &str) -> bool {
719 /// Bytewise string equality
721 #[lang="uniq_str_eq"]
723 pub fn eq(a: &~str, b: &~str) -> bool {
729 pub fn eq(a: &~str, b: &~str) -> bool {
737 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
738 /// returning `true` in that case, or, if it is invalid, `false` with
739 /// `iter` reset such that it is pointing at the first byte in the
740 /// invalid sequence.
742 fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
744 // save the current thing we're pointing at.
747 // restore the iterator we had at the start of this codepoint.
748 macro_rules! err ( () => { {*iter = old; return false} });
749 macro_rules! next ( () => {
752 // we needed data, but there was none: error!
757 let first = match iter.next() {
759 // we're at the end of the iterator and a codepoint
760 // boundary at the same time, so this string is valid.
764 // ASCII characters are always valid, so only large
765 // bytes need more examination.
767 let w = utf8_char_width(first);
768 let second = next!();
769 // 2-byte encoding is for codepoints \u0080 to \u07ff
770 // first C2 80 last DF BF
771 // 3-byte encoding is for codepoints \u0800 to \uffff
772 // first E0 A0 80 last EF BF BF
773 // excluding surrogates codepoints \ud800 to \udfff
774 // ED A0 80 to ED BF BF
775 // 4-byte encoding is for codepoints \u10000 to \u10ffff
776 // first F0 90 80 80 last F4 8F BF BF
778 // Use the UTF-8 syntax from the RFC
780 // https://tools.ietf.org/html/rfc3629
782 // UTF8-2 = %xC2-DF UTF8-tail
783 // UTF8-3 = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
784 // %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
785 // UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
786 // %xF4 %x80-8F 2( UTF8-tail )
788 2 => if second & 192 != TAG_CONT_U8 {err!()},
790 match (first, second, next!() & 192) {
791 (0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
792 (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
793 (0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
794 (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
799 match (first, second, next!() & 192, next!() & 192) {
800 (0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
801 (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
802 (0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
812 /// Determines if a vector of bytes contains valid UTF-8.
813 pub fn is_utf8(v: &[u8]) -> bool {
814 run_utf8_validation_iterator(&mut v.iter())
818 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
819 let mut it = v.iter();
821 let ok = run_utf8_validation_iterator(&mut it);
825 // work out how many valid bytes we've consumed
826 // (run_utf8_validation_iterator resets the iterator to just
827 // after the last good byte), which we can do because the
828 // vector iterator size_hint is exact.
829 let (remaining, _) = it.size_hint();
830 Some(v.len() - remaining)
834 /// Determines if a vector of `u16` contains valid UTF-16
835 pub fn is_utf16(v: &[u16]) -> bool {
836 let mut it = v.iter();
837 macro_rules! next ( ($ret:expr) => {
838 match it.next() { Some(u) => *u, None => return $ret }
844 match char::from_u32(u as u32) {
847 let u2 = next!(false);
848 if u < 0xD7FF || u > 0xDBFF ||
849 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
855 /// An iterator that decodes UTF-16 encoded codepoints from a vector
858 pub struct UTF16Items<'a> {
859 priv iter: slice::Items<'a, u16>
861 /// The possibilities for values decoded from a `u16` stream.
862 #[deriving(Eq, TotalEq, Clone, Show)]
864 /// A valid codepoint.
866 /// An invalid surrogate without its pair.
871 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
872 /// replacement character (U+FFFD).
874 pub fn to_char_lossy(&self) -> char {
877 LoneSurrogate(_) => '\uFFFD'
882 impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
883 fn next(&mut self) -> Option<UTF16Item> {
884 let u = match self.iter.next() {
889 if u < 0xD800 || 0xDFFF < u {
891 Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
892 } else if u >= 0xDC00 {
893 // a trailing surrogate
894 Some(LoneSurrogate(u))
896 // preserve state for rewinding.
899 let u2 = match self.iter.next() {
902 None => return Some(LoneSurrogate(u))
904 if u2 < 0xDC00 || u2 > 0xDFFF {
905 // not a trailing surrogate so we're not a valid
906 // surrogate pair, so rewind to redecode u2 next time.
908 return Some(LoneSurrogate(u))
911 // all ok, so let's decode it.
912 let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
913 Some(ScalarValue(unsafe {cast::transmute(c)}))
918 fn size_hint(&self) -> (uint, Option<uint>) {
919 let (low, high) = self.iter.size_hint();
920 // we could be entirely valid surrogates (2 elements per
921 // char), or entirely non-surrogates (1 element per char)
926 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
927 /// returning invalid surrogates as `LoneSurrogate`s.
933 /// use std::str::{ScalarValue, LoneSurrogate};
935 /// // 𝄞mus<invalid>ic<invalid>
936 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
937 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
940 /// assert_eq!(str::utf16_items(v).to_owned_vec(),
941 /// ~[ScalarValue('𝄞'),
942 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
943 /// LoneSurrogate(0xDD1E),
944 /// ScalarValue('i'), ScalarValue('c'),
945 /// LoneSurrogate(0xD834)]);
947 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
948 UTF16Items { iter : v.iter() }
951 /// Return a slice of `v` ending at (and not including) the first NUL
960 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
961 /// // no NULs so no change
962 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
966 /// assert_eq!(str::truncate_utf16_at_nul(v),
967 /// &['a' as u16, 'b' as u16]);
969 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
970 match v.iter().position(|c| *c == 0) {
971 // don't include the 0
972 Some(i) => v.slice_to(i),
977 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
978 /// if `v` contains any invalid data.
986 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
987 /// 0x0073, 0x0069, 0x0063];
988 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
990 /// // 𝄞mu<invalid>ic
992 /// assert_eq!(str::from_utf16(v), None);
994 pub fn from_utf16(v: &[u16]) -> Option<~str> {
995 let mut s = with_capacity(v.len() / 2);
996 for c in utf16_items(v) {
998 ScalarValue(c) => s.push_char(c),
999 LoneSurrogate(_) => return None
1005 /// Decode a UTF-16 encoded vector `v` into a string, replacing
1006 /// invalid data with the replacement character (U+FFFD).
1012 /// // 𝄞mus<invalid>ic<invalid>
1013 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1014 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
1017 /// assert_eq!(str::from_utf16_lossy(v),
1018 /// ~"𝄞mus\uFFFDic\uFFFD");
1020 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1021 utf16_items(v).map(|c| c.to_char_lossy()).collect()
1024 /// Allocates a new string with the specified capacity. The string returned is
1025 /// the empty string, but has capacity for much more.
1027 pub fn with_capacity(capacity: uint) -> ~str {
1029 cast::transmute(slice::with_capacity::<~[u8]>(capacity))
1033 // https://tools.ietf.org/html/rfc3629
1034 static UTF8_CHAR_WIDTH: [u8, ..256] = [
1035 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1036 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1037 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1038 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1039 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1040 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1041 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1042 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1043 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1044 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1045 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1046 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1047 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1048 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1049 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1050 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1053 /// Given a first byte, determine how many bytes are in this UTF-8 character
1055 pub fn utf8_char_width(b: u8) -> uint {
1056 return UTF8_CHAR_WIDTH[b] as uint;
1059 /// Struct that contains a `char` and the index of the first byte of
1060 /// the next `char` in a string. This can be used as a data structure
1061 /// for iterating over the UTF-8 bytes of a string.
1062 pub struct CharRange {
1065 /// Index of the first byte of the next `char`
1069 // Return the initial codepoint accumulator for the first byte.
1070 // The first byte is special, only want bottom 5 bits for width 2, 4 bits
1071 // for width 3, and 3 bits for width 4
1072 macro_rules! utf8_first_byte(
1073 ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
1076 // return the value of $ch updated with continuation byte $byte
1077 macro_rules! utf8_acc_cont_byte(
1078 ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
// Bit pattern of a UTF-8 continuation byte (`10xxxxxx`) once its low six
// bits are masked off; code in this file tests `byte & 192` against it.
1081 static TAG_CONT_U8: u8 = 128u8;
1083 /// Converts a vector of bytes to a new utf-8 string.
1084 /// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
1089 /// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
1090 /// let output = std::str::from_utf8_lossy(input);
1091 /// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
1093 pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
1094 let firstbad = match first_non_utf8_index(v) {
1095 None => return Slice(unsafe { cast::transmute(v) }),
1099 static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
1100 let mut i = firstbad;
1101 let total = v.len();
1102 fn unsafe_get(xs: &[u8], i: uint) -> u8 {
1103 unsafe { *xs.unsafe_ref(i) }
1105 fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
1112 let mut res = with_capacity(total);
1115 unsafe { raw::push_bytes(&mut res, v.slice_to(i)) };
1118 // subseqidx is the index of the first byte of the subsequence we're looking at.
1119 // It's used to copy a bunch of contiguous good codepoints at once instead of copying
1121 let mut subseqidx = firstbad;
1125 let byte = unsafe_get(v, i);
1128 macro_rules! error(() => ({
1130 if subseqidx != i_ {
1131 raw::push_bytes(&mut res, v.slice(subseqidx, i_));
1134 raw::push_bytes(&mut res, REPLACEMENT);
1139 // subseqidx handles this
1141 let w = utf8_char_width(byte);
1145 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1152 match (byte, safe_get(v, i, total)) {
1153 (0xE0 , 0xA0 .. 0xBF) => (),
1154 (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
1155 (0xED , 0x80 .. 0x9F) => (),
1156 (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
1163 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1170 match (byte, safe_get(v, i, total)) {
1171 (0xF0 , 0x90 .. 0xBF) => (),
1172 (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
1173 (0xF4 , 0x80 .. 0x8F) => (),
1180 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1185 if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
1198 if subseqidx < total {
1199 unsafe { raw::push_bytes(&mut res, v.slice(subseqidx, total)) };
1208 /// A MaybeOwned is a string that can hold either a ~str or a &str.
1209 /// This can be useful as an optimization when an allocation is sometimes
1210 /// needed but not always.
1211 pub enum MaybeOwned<'a> {
1212 /// A borrowed string
1218 /// SendStr is a specialization of `MaybeOwned` to be sendable
1219 pub type SendStr = MaybeOwned<'static>;
1221 impl<'a> MaybeOwned<'a> {
1222 /// Returns `true` if this `MaybeOwned` wraps an owned string
1224 pub fn is_owned(&self) -> bool {
1231 /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1233 pub fn is_slice(&self) -> bool {
1241 /// Trait for moving into a `MaybeOwned`
1242 pub trait IntoMaybeOwned<'a> {
1243 /// Moves self into a `MaybeOwned`
1244 fn into_maybe_owned(self) -> MaybeOwned<'a>;
1247 impl<'a> IntoMaybeOwned<'a> for ~str {
1249 fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1252 impl<'a> IntoMaybeOwned<'a> for &'a str {
1254 fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1257 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1259 fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1262 impl<'a> Eq for MaybeOwned<'a> {
1264 fn eq(&self, other: &MaybeOwned) -> bool {
1265 self.as_slice().equals(&other.as_slice())
1269 impl<'a> TotalEq for MaybeOwned<'a> {
1271 fn equals(&self, other: &MaybeOwned) -> bool {
1272 self.as_slice().equals(&other.as_slice())
1276 impl<'a> Ord for MaybeOwned<'a> {
1278 fn lt(&self, other: &MaybeOwned) -> bool {
1279 self.as_slice().lt(&other.as_slice())
1283 impl<'a> TotalOrd for MaybeOwned<'a> {
1285 fn cmp(&self, other: &MaybeOwned) -> Ordering {
1286 self.as_slice().cmp(&other.as_slice())
1290 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1292 fn equiv(&self, other: &S) -> bool {
1293 self.as_slice().equals(&other.as_slice())
1297 impl<'a> Str for MaybeOwned<'a> {
1299 fn as_slice<'b>(&'b self) -> &'b str {
1302 Owned(ref s) => s.as_slice()
1307 fn into_owned(self) -> ~str {
1309 Slice(s) => s.to_owned(),
1315 impl<'a> Container for MaybeOwned<'a> {
1317 fn len(&self) -> uint { self.as_slice().len() }
1320 impl<'a> Clone for MaybeOwned<'a> {
1322 fn clone(&self) -> MaybeOwned<'a> {
1324 Slice(s) => Slice(s),
1325 Owned(ref s) => Owned(s.to_owned())
1330 impl<'a> Default for MaybeOwned<'a> {
// The default value is a `Slice` wrapping the empty string literal.
1332 fn default() -> MaybeOwned<'a> { Slice("") }
1335 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1337 fn hash(&self, hasher: &mut H) {
1339 Slice(s) => s.hash(hasher),
1340 Owned(ref s) => s.hash(hasher),
1345 impl<'a> fmt::Show for MaybeOwned<'a> {
1347 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1349 Slice(ref s) => s.fmt(f),
1350 Owned(ref s) => s.fmt(f)
1355 /// Unsafe operations
1358 use container::Container;
1362 use option::{Option, Some, None};
1363 use str::{is_utf8, OwnedStr, StrSlice};
1365 use slice::{MutableVector, ImmutableVector, OwnedVector};
1368 /// Create a Rust string from a *u8 buffer of the given length
1369 pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
1370 let mut v: ~[u8] = slice::with_capacity(len);
1371 ptr::copy_memory(v.as_mut_ptr(), buf, len);
1374 assert!(is_utf8(v));
1375 ::cast::transmute(v)
1378 #[lang="strdup_uniq"]
1381 unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
1382 from_buf_len(ptr, len)
1385 /// Create a Rust string from a null-terminated C string
1386 pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
1391 curr = buf.offset(i);
1393 from_buf_len(buf as *u8, i as uint)
1396 /// Converts a slice of bytes to a string slice without checking
1397 /// that the string contains valid UTF-8.
1398 pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
1402 /// Converts an owned vector of bytes to a new owned string. This assumes
1403 /// that the utf-8-ness of the vector has already been validated
1405 pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
1409 /// Converts a byte to a string.
/// No UTF-8 validation is performed here: the byte is wrapped in a
/// one-element vector and handed to `from_utf8_owned`, which is documented
/// as assuming already-validated input. The caller is responsible for the
/// result being valid UTF-8.
1410 pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }
1412 /// Form a slice from a C string. Unsafe because the caller must ensure the
1413 /// C string has the static lifetime, or else the return value may be
1414 /// invalidated later.
1415 pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
1419 while *curr != 0u8 {
1421 curr = s.offset(len as int);
1423 let v = Slice { data: s, len: len };
1424 assert!(is_utf8(::cast::transmute(v)));
1425 ::cast::transmute(v)
1428 /// Takes a bytewise (not UTF-8) slice from a string.
1430 /// Returns the substring from [`begin`..`end`).
1434 /// If begin is greater than end.
1435 /// If end is greater than the length of the string.
1437 pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1438 assert!(begin <= end);
1439 assert!(end <= s.len());
1440 slice_unchecked(s, begin, end)
1443 /// Takes a bytewise (not UTF-8) slice from a string.
1445 /// Returns the substring from [`begin`..`end`).
1447 /// Caller must check slice boundaries!
1449 pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
1450 cast::transmute(Slice {
1451 data: s.as_ptr().offset(begin as int),
1456 /// Appends a byte to a string.
1457 /// The caller must preserve the valid UTF-8 property.
1459 pub unsafe fn push_byte(s: &mut ~str, b: u8) {
1460 as_owned_vec(s).push(b)
1463 /// Appends a vector of bytes to a string.
1464 /// The caller must preserve the valid UTF-8 property.
1466 pub unsafe fn push_bytes(s: &mut ~str, bytes: &[u8]) {
1467 slice::bytes::push_bytes(as_owned_vec(s), bytes);
1470 /// Removes the last byte from a string and returns it.
1471 /// Returns None when an empty string is passed.
1472 /// The caller must preserve the valid UTF-8 property.
1473 pub unsafe fn pop_byte(s: &mut ~str) -> Option<u8> {
1478 let b = s[len - 1u];
1484 /// Removes the first byte from a string and returns it.
1485 /// Returns None when an empty string is passed.
1486 /// The caller must preserve the valid UTF-8 property.
1487 pub unsafe fn shift_byte(s: &mut ~str) -> Option<u8> {
// The remaining bytes are copied into a new owned string via `to_owned()`,
// which then replaces `*s`; the original buffer is not shifted in place.
// NOTE(review): `slice` asserts that both indices are char boundaries, so
// this looks like it fails when the first character is multi-byte — confirm.
1493 *s = s.slice(1, len).to_owned();
1498 /// Access the str in its vector representation.
1499 /// The caller must preserve the valid UTF-8 property when modifying.
1501 pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
1505 /// Sets the length of a string
1507 /// This will explicitly set the size of the string, without actually
1508 /// modifying its buffers, so it is up to the caller to ensure that
1509 /// the string is actually the specified size.
1511 fn test_from_buf_len() {
1513 let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
1515 let c = from_buf_len(b, 3u);
1516 assert_eq!(c, ~"AAA");
1522 Section: Trait implementations
1526 #[allow(missing_doc)]
1528 use container::Container;
1529 use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
1532 use option::{Some, None};
1533 use str::{Str, StrSlice, OwnedStr, eq_slice};
1535 impl<'a> Add<&'a str,~str> for &'a str {
1537 fn add(&self, rhs: & &'a str) -> ~str {
1538 let mut ret = self.to_owned();
// Total ordering for string slices: byte-wise lexicographic comparison.
1544 impl<'a> TotalOrd for &'a str {
1546 fn cmp(&self, other: & &'a str) -> Ordering {
// Walk both strings in lock step; `zip` stops at the shorter one.
1547 for (s_b, o_b) in self.bytes().zip(other.bytes()) {
// The first differing byte pair decides the result.
1548 match s_b.cmp(&o_b) {
1549 Greater => return Greater,
1550 Less => return Less,
// No difference within the common length: the byte lengths decide.
1555 self.len().cmp(&other.len())
1559 impl TotalOrd for ~str {
1561 fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
1564 impl<'a> Eq for &'a str {
1566 fn eq(&self, other: & &'a str) -> bool {
1567 eq_slice((*self), (*other))
1570 fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
1575 fn eq(&self, other: &~str) -> bool {
1576 eq_slice((*self), (*other))
1580 impl<'a> TotalEq for &'a str {
1582 fn equals(&self, other: & &'a str) -> bool {
1583 eq_slice((*self), (*other))
1587 impl TotalEq for ~str {
1589 fn equals(&self, other: &~str) -> bool {
1590 eq_slice((*self), (*other))
1594 impl<'a> Ord for &'a str {
1596 fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
1601 fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1604 impl<'a, S: Str> Equiv<S> for &'a str {
1606 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1609 impl<'a, S: Str> Equiv<S> for ~str {
1611 fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1618 /// Any string that can be represented as a slice
1620 /// Work with `self` as a slice.
1621 fn as_slice<'a>(&'a self) -> &'a str;
1623 /// Convert `self` into a ~str, not making a copy if possible
1624 fn into_owned(self) -> ~str;
1627 impl<'a> Str for &'a str {
1629 fn as_slice<'a>(&'a self) -> &'a str { *self }
1632 fn into_owned(self) -> ~str { self.to_owned() }
1635 impl<'a> Str for ~str {
1637 fn as_slice<'a>(&'a self) -> &'a str {
1638 let s: &'a str = *self; s
1642 fn into_owned(self) -> ~str { self }
1645 impl<'a> Container for &'a str {
1647 fn len(&self) -> uint {
1652 impl Container for ~str {
1654 fn len(&self) -> uint { self.as_slice().len() }
1657 impl Mutable for ~str {
1658 /// Remove all content, make the string empty
1660 fn clear(&mut self) {
1667 /// Methods for string slices
1668 pub trait StrSlice<'a> {
1669 /// Returns true if one string contains another
1673 /// - needle - The string to look for
1674 fn contains<'a>(&self, needle: &'a str) -> bool;
1676 /// Returns true if a string contains a char.
1680 /// - needle - The char to look for
1681 fn contains_char(&self, needle: char) -> bool;
1683 /// An iterator over the characters of `self`. Note, this iterates
1684 /// over unicode code-points, not unicode graphemes.
1689 /// let v: ~[char] = "abc åäö".chars().collect();
1690 /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1692 fn chars(&self) -> Chars<'a>;
1694 /// An iterator over the characters of `self`, in reverse order.
1695 fn chars_rev(&self) -> RevChars<'a>;
1697 /// An iterator over the bytes of `self`
1698 fn bytes(&self) -> Bytes<'a>;
1700 /// An iterator over the bytes of `self`, in reverse order
1701 fn bytes_rev(&self) -> RevBytes<'a>;
1703 /// An iterator over the characters of `self` and their byte offsets.
1704 fn char_indices(&self) -> CharOffsets<'a>;
1706 /// An iterator over the characters of `self` and their byte offsets,
1707 /// in reverse order.
1708 fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1710 /// An iterator over substrings of `self`, separated by characters
1711 /// matched by `sep`.
1716 /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1717 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1719 /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1720 /// assert_eq!(v, ~["abc", "def", "ghi"]);
1722 /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1723 /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1725 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1727 /// An iterator over substrings of `self`, separated by characters
1728 /// matched by `sep`, restricted to splitting at most `count`
1734 /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1735 /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1737 /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1738 /// assert_eq!(v, ~["abc", "def2ghi"]);
1740 /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1741 /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1743 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1745 /// An iterator over substrings of `self`, separated by characters
1746 /// matched by `sep`.
1748 /// Equivalent to `split`, except that the trailing substring
1749 /// is skipped if empty (terminator semantics).
1754 /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1755 /// assert_eq!(v, ~["A", "B"]);
1757 /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1758 /// assert_eq!(v, ~["A", "", "B", ""]);
1760 fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1762 /// An iterator over substrings of `self`, separated by characters
1763 /// matched by `sep`, in reverse order.
1768 /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1769 /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1771 /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1772 /// assert_eq!(v, ~["ghi", "def", "abc"]);
1774 /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1775 /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1777 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1779 /// An iterator over substrings of `self`, separated by characters
1780 /// matched by `sep`, starting from the end of the string.
1781 /// Restricted to splitting at most `count` times.
1786 /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1787 /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1789 /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1790 /// assert_eq!(v, ~["ghi", "abc1def"]);
1792 /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1793 /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1795 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1797 /// An iterator over the start and end indices of the disjoint
1798 /// matches of `sep` within `self`.
1800 /// That is, each returned value `(start, end)` satisfies
1801 /// `self.slice(start, end) == sep`. For matches of `sep` within
1802 /// `self` that overlap, only the indices corresponding to the
1803 /// first match are returned.
1808 /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1809 /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1811 /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1812 /// assert_eq!(v, ~[(1,4), (4,7)]);
1814 /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1815 /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1817 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1819 /// An iterator over the substrings of `self` separated by `sep`.
1824 /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1825 /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1827 /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1828 /// assert_eq!(v, ~["1", "", "2"]);
1830 fn split_str(&self, &'a str) -> StrSplits<'a>;
1832 /// An iterator over the lines of a string (subsequences separated
1833 /// by `\n`). This does not include the empty string after a
1839 /// let four_lines = "foo\nbar\n\nbaz\n";
1840 /// let v: ~[&str] = four_lines.lines().collect();
1841 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1843 fn lines(&self) -> CharSplits<'a, char>;
1845 /// An iterator over the lines of a string, separated by either
1846 /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1847 /// empty trailing line.
1852 /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1853 /// let v: ~[&str] = four_lines.lines_any().collect();
1854 /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1856 fn lines_any(&self) -> AnyLines<'a>;
1858 /// An iterator over the words of a string (subsequences separated
1859 /// by any sequence of whitespace). Sequences of whitespace are
1860 /// collapsed, so empty "words" are not included.
1865 /// let some_words = " Mary had\ta little \n\t lamb";
1866 /// let v: ~[&str] = some_words.words().collect();
1867 /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1869 fn words(&self) -> Words<'a>;
1871 /// An Iterator over the string in Unicode Normalization Form D
1872 /// (canonical decomposition).
1873 fn nfd_chars(&self) -> Normalizations<'a>;
1875 /// An Iterator over the string in Unicode Normalization Form KD
1876 /// (compatibility decomposition).
1877 fn nfkd_chars(&self) -> Normalizations<'a>;
1879 /// Returns true if the string contains only whitespace.
1881 /// Whitespace characters are determined by `char::is_whitespace`.
1886 /// assert!(" \t\n".is_whitespace());
1887 /// assert!("".is_whitespace());
1889 /// assert!( !"abc".is_whitespace());
1891 fn is_whitespace(&self) -> bool;
1893 /// Returns true if the string contains only alphanumeric code
1896 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1901 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1902 /// assert!("".is_alphanumeric());
1904 /// assert!( !" &*~".is_alphanumeric());
1906 fn is_alphanumeric(&self) -> bool;
1908 /// Returns the number of Unicode code points (`char`) that a
1911 /// This does not perform any normalization, and is `O(n)`, since
1912 /// UTF-8 is a variable width encoding of code points.
1914 /// *Warning*: The number of code points in a string does not directly
1915 /// correspond to the number of visible characters or width of the
1916 /// visible text due to composing characters, and double- and
1917 /// zero-width ones.
1919 /// See also `.len()` for the byte length.
1924 /// // composed forms of `ö` and `é`
1925 /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1926 /// // decomposed forms of `ö` and `é`
1927 /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1929 /// assert_eq!(c.char_len(), 15);
1930 /// assert_eq!(d.char_len(), 17);
1932 /// assert_eq!(c.len(), 21);
1933 /// assert_eq!(d.len(), 23);
1935 /// // the two strings *look* the same
1936 /// println!("{}", c);
1937 /// println!("{}", d);
1939 fn char_len(&self) -> uint;
1941 /// Returns a slice of the given string from the byte range
1942 /// [`begin`..`end`).
1944 /// This operation is `O(1)`.
1946 /// Fails when `begin` and `end` do not point to valid characters
1947 /// or point beyond the last character of the string.
1949 /// See also `slice_to` and `slice_from` for slicing prefixes and
1950 /// suffixes of strings, and `slice_chars` for slicing based on
1951 /// code point counts.
1956 /// let s = "Löwe 老虎 Léopard";
1957 /// assert_eq!(s.slice(0, 1), "L");
1959 /// assert_eq!(s.slice(1, 9), "öwe 老");
1961 /// // these will fail:
1962 /// // byte 2 lies within `ö`:
1963 /// // s.slice(2, 3);
1965 /// // byte 8 lies within `老`
1966 /// // s.slice(1, 8);
1968 /// // byte 100 is outside the string
1969 /// // s.slice(3, 100);
1971 fn slice(&self, begin: uint, end: uint) -> &'a str;
1973 /// Returns a slice of the string from `begin` to its end.
1975 /// Equivalent to `self.slice(begin, self.len())`.
1977 /// Fails when `begin` does not point to a valid character, or is
1980 /// See also `slice`, `slice_to` and `slice_chars`.
1981 fn slice_from(&self, begin: uint) -> &'a str;
1983 /// Returns a slice of the string from the beginning to byte
1986 /// Equivalent to `self.slice(0, end)`.
1988 /// Fails when `end` does not point to a valid character, or is
1991 /// See also `slice`, `slice_from` and `slice_chars`.
1992 fn slice_to(&self, end: uint) -> &'a str;
1994 /// Returns a slice of the string from the character range
1995 /// [`begin`..`end`).
1997 /// That is, start at the `begin`-th code point of the string and
1998 /// continue to the `end`-th code point. This does not detect or
1999 /// handle edge cases such as leaving a combining character as the
2000 /// first code point of the string.
2002 /// Due to the design of UTF-8, this operation is `O(end)`.
2003 /// See `slice`, `slice_to` and `slice_from` for `O(1)`
2004 /// variants that use byte indices rather than code point
2007 /// Fails if `begin` > `end` or if either `begin` or `end` is
2008 /// beyond the last character of the string.
2013 /// let s = "Löwe 老虎 Léopard";
2014 /// assert_eq!(s.slice_chars(0, 4), "Löwe");
2015 /// assert_eq!(s.slice_chars(5, 7), "老虎");
2017 fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
2019 /// Returns true if `needle` is a prefix of the string.
2020 fn starts_with(&self, needle: &str) -> bool;
2022 /// Returns true if `needle` is a suffix of the string.
2023 fn ends_with(&self, needle: &str) -> bool;
2025 /// Escape each char in `s` with `char::escape_default`.
2026 fn escape_default(&self) -> ~str;
2028 /// Escape each char in `s` with `char::escape_unicode`.
2029 fn escape_unicode(&self) -> ~str;
2031 /// Returns a string with leading and trailing whitespace removed.
2032 fn trim(&self) -> &'a str;
2034 /// Returns a string with leading whitespace removed.
2035 fn trim_left(&self) -> &'a str;
2037 /// Returns a string with trailing whitespace removed.
2038 fn trim_right(&self) -> &'a str;
2040 /// Returns a string with characters that match `to_trim` removed.
2044 /// * to_trim - a character matcher
2049 /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
2050 /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
2051 /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
2053 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2055 /// Returns a string with leading characters that match `to_trim` removed.
2059 /// * to_trim - a character matcher
2064 /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2065 /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2066 /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2068 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2070 /// Returns a string with trailing characters that match `to_trim` removed.
2074 /// * to_trim - a character matcher
2079 /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2080 /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2081 /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2083 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2085 /// Replace all occurrences of one string with another.
2089 /// * `from` - The string to replace
2090 /// * `to` - The replacement string
2094 /// The original string with all occurrences of `from` replaced with `to`.
2099 /// let s = ~"Do you know the muffin man,
2100 /// The muffin man, the muffin man, ...";
2102 /// assert_eq!(s.replace("muffin man", "little lamb"),
2103 /// ~"Do you know the little lamb,
2104 /// The little lamb, the little lamb, ...");
2106 /// // not found, so no change.
2107 /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2109 fn replace(&self, from: &str, to: &str) -> ~str;
2111 /// Copy a slice into a new owned str.
2112 fn to_owned(&self) -> ~str;
2114 /// Converts to a vector of `u16` encoded as UTF-16.
2115 fn to_utf16(&self) -> ~[u16];
2117 /// Check that `index`-th byte lies at the start and/or end of a
2118 /// UTF-8 code point sequence.
2120 /// The start and end of the string (when `index == self.len()`)
2121 /// are considered to be boundaries.
2123 /// Fails if `index` is greater than `self.len()`.
2128 /// let s = "Löwe 老虎 Léopard";
2129 /// assert!(s.is_char_boundary(0));
2131 /// assert!(s.is_char_boundary(6));
2132 /// assert!(s.is_char_boundary(s.len()));
2134 /// // second byte of `ö`
2135 /// assert!(!s.is_char_boundary(2));
2137 /// // third byte of `老`
2138 /// assert!(!s.is_char_boundary(8));
2140 fn is_char_boundary(&self, index: uint) -> bool;
2142 /// Pluck a character out of a string and return the index of the next
2145 /// This function can be used to iterate over the unicode characters of a
2150 /// This example manually iterates through the characters of a
2151 /// string; this should normally be done by `.chars()` or
2152 /// `.char_indices`.
2155 /// use std::str::CharRange;
2157 /// let s = "中华Việt Nam";
2159 /// while i < s.len() {
2160 /// let CharRange {ch, next} = s.char_range_at(i);
2161 /// println!("{}: {}", i, ch);
2183 /// * s - The string
2184 /// * i - The byte offset of the char to extract
2188 /// A record {ch: char, next: uint} containing the char value and the byte
2189 /// index of the next unicode character.
2193 /// If `i` is greater than or equal to the length of the string.
2194 /// If `i` is not the index of the beginning of a valid UTF-8 character.
2195 fn char_range_at(&self, start: uint) -> CharRange;
2197 /// Given a byte position and a str, return the previous char and its position.
2199 /// This function can be used to iterate over a unicode string in reverse.
2201 /// Returns 0 for next index if called on start index 0.
2202 fn char_range_at_reverse(&self, start: uint) -> CharRange;
2204 /// Plucks the character starting at the `i`th byte of a string
2205 fn char_at(&self, i: uint) -> char;
2207 /// Plucks the character ending at the `i`th byte of a string
2208 fn char_at_reverse(&self, i: uint) -> char;
2210 /// Work with the byte buffer of a string as a byte slice.
2211 fn as_bytes(&self) -> &'a [u8];
2213 /// Returns the byte index of the first character of `self` that
2214 /// matches `search`.
2218 /// `Some` containing the byte index of the first matching character
2219 /// or `None` if there is no match
2224 /// let s = "Löwe 老虎 Léopard";
2226 /// assert_eq!(s.find('L'), Some(0));
2227 /// assert_eq!(s.find('é'), Some(14));
2229 /// // the first space
2230 /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2232 /// // neither are found
2233 /// assert_eq!(s.find(&['1', '2']), None);
2235 fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2237 /// Returns the byte index of the last character of `self` that
2238 /// matches `search`.
2242 /// `Some` containing the byte index of the last matching character
2243 /// or `None` if there is no match.
2248 /// let s = "Löwe 老虎 Léopard";
2250 /// assert_eq!(s.rfind('L'), Some(13));
2251 /// assert_eq!(s.rfind('é'), Some(14));
2253 /// // the second space
2254 /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2256 /// // searches for an occurrence of either `1` or `2`, but neither are found
2257 /// assert_eq!(s.rfind(&['1', '2']), None);
2259 fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2261 /// Returns the byte index of the first matching substring
2265 /// * `needle` - The string to search for
2269 /// `Some` containing the byte index of the first matching substring
2270 /// or `None` if there is no match.
2275 /// let s = "Löwe 老虎 Léopard";
2277 /// assert_eq!(s.find_str("老虎 L"), Some(6));
2278 /// assert_eq!(s.find_str("muffin man"), None);
2280 fn find_str(&self, &str) -> Option<uint>;
2282 /// Given a string, make a new string with repeated copies of it.
2283 fn repeat(&self, nn: uint) -> ~str;
2285 /// Retrieves the first character from a string slice and returns
2286 /// it. This does not allocate a new string; instead, it returns a
2287 /// slice that points one character beyond the character that was
2288 /// shifted. If the string does not contain any characters,
2289 /// a tuple of None and an empty string is returned instead.
2294 /// let s = "Löwe 老虎 Léopard";
2295 /// let (c, s1) = s.slice_shift_char();
2296 /// assert_eq!(c, Some('L'));
2297 /// assert_eq!(s1, "öwe 老虎 Léopard");
2299 /// let (c, s2) = s1.slice_shift_char();
2300 /// assert_eq!(c, Some('ö'));
2301 /// assert_eq!(s2, "we 老虎 Léopard");
2303 fn slice_shift_char(&self) -> (Option<char>, &'a str);
2305 /// Levenshtein Distance between two strings.
2306 fn lev_distance(&self, t: &str) -> uint;
2308 /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2310 /// Fails if `inner` is not a direct slice contained within self.
2315 /// let string = "a\nb\nc";
2316 /// let lines: ~[&str] = string.lines().collect();
2318 /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2319 /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2320 /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2322 fn subslice_offset(&self, inner: &str) -> uint;
2324 /// Return an unsafe pointer to the string's buffer.
2326 /// The caller must ensure that the string outlives this pointer,
2327 /// and that it is not reallocated (e.g. by pushing to the
2329 fn as_ptr(&self) -> *u8;
2332 impl<'a> StrSlice<'a> for &'a str {
2334 fn contains<'a>(&self, needle: &'a str) -> bool {
2335 self.find_str(needle).is_some()
2339 fn contains_char(&self, needle: char) -> bool {
2340 self.find(needle).is_some()
2344 fn chars(&self) -> Chars<'a> {
2345 Chars{string: *self}
2349 fn chars_rev(&self) -> RevChars<'a> {
2354 fn bytes(&self) -> Bytes<'a> {
2355 self.as_bytes().iter().map(|&b| b)
2359 fn bytes_rev(&self) -> RevBytes<'a> {
2364 fn char_indices(&self) -> CharOffsets<'a> {
2365 CharOffsets{string: *self, iter: self.chars()}
2369 fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2370 self.char_indices().rev()
2374 fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2377 only_ascii: sep.only_ascii(),
2379 allow_trailing_empty: true,
2385 fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2386 -> CharSplitsN<'a, Sep> {
2388 iter: self.split(sep),
2395 fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2396 -> CharSplits<'a, Sep> {
2398 allow_trailing_empty: false,
2404 fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2405 self.split(sep).rev()
2409 fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2410 -> CharSplitsN<'a, Sep> {
2412 iter: self.split(sep),
2419 fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2420 assert!(!sep.is_empty())
2429 fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2431 it: self.match_indices(sep),
2438 fn lines(&self) -> CharSplits<'a, char> {
2439 self.split_terminator('\n')
// Splits on `\n` via `lines()`, then strips a single trailing `\r` from each
// line so that `\r\n` terminators are handled as well.
2442 fn lines_any(&self) -> AnyLines<'a> {
2443 self.lines().map(|line| {
// Only a carriage return in the last byte position is removed.
2445 if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2451 fn words(&self) -> Words<'a> {
2452 self.split(char::is_whitespace).filter(|s| !s.is_empty())
2456 fn nfd_chars(&self) -> Normalizations<'a> {
2466 fn nfkd_chars(&self) -> Normalizations<'a> {
2476 fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2479 fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2482 fn char_len(&self) -> uint { self.chars().len() }
2485 fn slice(&self, begin: uint, end: uint) -> &'a str {
2486 assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2487 unsafe { raw::slice_bytes(*self, begin, end) }
2491 fn slice_from(&self, begin: uint) -> &'a str {
2492 self.slice(begin, self.len())
2496 fn slice_to(&self, end: uint) -> &'a str {
2497 assert!(self.is_char_boundary(end));
2498 unsafe { raw::slice_bytes(*self, 0, end) }
// Slices by code point indices: the `begin`-th and `end`-th code points are
// translated to byte offsets, then the byte range is sliced.
2501 fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2502 assert!(begin <= end);
// Byte offsets for `begin` and `end`; they stay `None` until found.
2504 let mut begin_byte = None;
2505 let mut end_byte = None;
2507 // This could be even more efficient by not decoding,
2508 // only finding the char boundaries
2509 for (idx, _) in self.char_indices() {
2510 if count == begin { begin_byte = Some(idx); }
// Once the end offset is known there is nothing left to scan for.
2511 if count == end { end_byte = Some(idx); break; }
// If the scan ended without recording an offset but `count` equals the
// requested index, that index maps to `self.len()` (one past the last byte).
2514 if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2515 if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
// Any offset still missing means the index lies beyond the string.
2517 match (begin_byte, end_byte) {
2518 (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2519 (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2520 (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
// Byte-wise prefix test.
2525 fn starts_with<'a>(&self, needle: &'a str) -> bool {
2526 let n = needle.len();
// The length check comes first, so `slice_to(n)` is only evaluated when
// `self` has at least `n` bytes.
2527 self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
// Byte-wise suffix test.
2531 fn ends_with(&self, needle: &str) -> bool {
2532 let (m, n) = (self.len(), needle.len());
// `m >= n` is tested first, so `m - n` is only computed when it cannot
// underflow.
2533 m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2536 fn escape_default(&self) -> ~str {
2537 let mut out = with_capacity(self.len());
2538 for c in self.chars() {
2539 c.escape_default(|c| out.push_char(c));
2544 fn escape_unicode(&self) -> ~str {
2545 let mut out = with_capacity(self.len());
2546 for c in self.chars() {
2547 c.escape_unicode(|c| out.push_char(c));
2553 fn trim(&self) -> &'a str {
2554 self.trim_left().trim_right()
2558 fn trim_left(&self) -> &'a str {
2559 self.trim_left_chars(&char::is_whitespace)
2563 fn trim_right(&self) -> &'a str {
2564 self.trim_right_chars(&char::is_whitespace)
2568 fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2569 self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2573 fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2574 match self.find(|c: char| !to_trim.matches(c)) {
2576 Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2581 fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2582 match self.rfind(|c: char| !to_trim.matches(c)) {
2585 let next = self.char_range_at(last).next;
2586 unsafe { raw::slice_bytes(*self, 0u, next) }
// Builds a new string in which every match of `from` reported by
// `match_indices` is replaced with `to`.
// NOTE: `match_indices` asserts that its separator is non-empty, so an empty
// `from` fails.
2591 fn replace(&self, from: &str, to: &str) -> ~str {
2592 let mut result = ~"";
// Byte offset just past the previously handled match.
2593 let mut last_end = 0;
2594 for (start, end) in self.match_indices(from) {
// Copy the unmatched text preceding this match, then the replacement.
2595 result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2596 result.push_str(to);
// Copy whatever follows the final match; when nothing matched, `last_end`
// is still 0 and this copies the whole string.
2599 result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
// Copies the slice into a new owned string.
2604 fn to_owned(&self) -> ~str {
2605 let len = self.len();
// Allocate a vector with capacity for `len` elements.
2607 let mut v = slice::with_capacity(len);
// Raw copy of the string's bytes into the vector's buffer.
2609 ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
// Reinterpret the byte vector as a `~str`.
2611 ::cast::transmute(v)
// Encodes the string as UTF-16, one code point at a time.
2615 fn to_utf16(&self) -> ~[u16] {
2617 for ch in self.chars() {
2618 // Arithmetic with u32 literals is easier on the eyes than chars.
2619 let mut ch = ch as u32;
// True exactly when `ch` fits in 16 bits.
2621 if (ch & 0xFFFF_u32) == ch {
2622 // The BMP falls through (assuming non-surrogate, as it
// Code points in the surrogate range 0xD800..=0xDFFF are rejected.
2624 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
2627 // Supplementary planes break into surrogates.
2628 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
// `w1` is tagged 0xD800 and takes the bits above the low ten;
// `w2` is tagged 0xDC00 and takes the low ten bits.
2630 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
2631 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2632 u.push_all([w1, w2])
// Reports whether byte offset `index` is the start of a code point or the end
// of the string.
2639 fn is_char_boundary(&self, index: uint) -> bool {
// The end of the string counts as a boundary.
2640 if index == self.len() { return true; }
// Indexing fails for `index > self.len()`.
2641 let b = self[index];
// Bytes in 128..192 (bit pattern 10xxxxxx) are UTF-8 continuation bytes;
// any other byte begins a code point.
2642 return b < 128u8 || b >= 192u8;
// Decodes the character starting at byte offset `i` and returns it together
// with the offset of the byte that follows it.
2646 fn char_range_at(&self, i: uint) -> CharRange {
// Fast path: a byte below 128 is a complete one-byte character.
2647 if self[i] < 128u8 {
2648 return CharRange {ch: self[i] as char, next: i + 1 };
2651 // Multibyte case is a fn to allow char_range_at to inline cleanly
2652 fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2653 let mut val = s[i] as u32;
// Encoded width, looked up from the leading byte.
2654 let w = UTF8_CHAR_WIDTH[val] as uint;
// Start from the leading byte, then fold in the continuation bytes: the
// second byte is always read, the third and fourth only for wider sequences.
2657 val = utf8_first_byte!(val, w);
2658 val = utf8_acc_cont_byte!(val, s[i + 1]);
2659 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2660 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
// `next` is the offset just past this character's `w` bytes.
2662 return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2665 return multibyte_char_range_at(*self, i);
// Decodes the character that ends just before byte offset `start`; the
// returned `next` is the offset at which that character begins.
2669 fn char_range_at_reverse(&self, start: uint) -> CharRange {
2670 let mut prev = start;
// Step back one byte; saturating so that `start == 0` does not underflow.
2672 prev = prev.saturating_sub(1);
// Fast path: a byte below 128 is a complete one-byte character.
2673 if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2675 // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2676 fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2677 // while there is a previous byte == 10......
2678 while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
// `s[i]` is now treated as the leading byte of the sequence.
2682 let mut val = s[i] as u32;
2683 let w = UTF8_CHAR_WIDTH[val] as uint;
// Same forward decode as in `char_range_at`.
2686 val = utf8_first_byte!(val, w);
2687 val = utf8_acc_cont_byte!(val, s[i + 1]);
2688 if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2689 if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2691 return CharRange {ch: unsafe { transmute(val) }, next: i};
2694 return multibyte_char_range_at_reverse(*self, prev);
2698 fn char_at(&self, i: uint) -> char {
2699 self.char_range_at(i).ch
2703 fn char_at_reverse(&self, i: uint) -> char {
2704 self.char_range_at_reverse(i).ch
2708 fn as_bytes(&self) -> &'a [u8] {
2709 unsafe { cast::transmute(*self) }
// Byte index of the first character accepted by `search`.
2712 fn find<C: CharEq>(&self, search: C) -> Option<uint> {
// When the matcher reports that it only matches ASCII, scan the raw bytes
// instead of decoding characters; the byte position is the result.
2713 if search.only_ascii() {
2714 self.bytes().position(|b| search.matches(b as char))
// General case: decode characters and report the byte offset of the first
// match.
2716 for (index, c) in self.char_indices() {
2717 if search.matches(c) { return Some(index); }
// Byte index of the last character accepted by `search`.
2723 fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
// When the matcher reports that it only matches ASCII, scan the raw bytes
// from the back instead of decoding characters.
2724 if search.only_ascii() {
2725 self.bytes().rposition(|b| search.matches(b as char))
// General case: walk the characters in reverse and report the byte offset of
// the first one that matches.
2727 for (index, c) in self.char_indices_rev() {
2728 if search.matches(c) { return Some(index); }
2734 fn find_str(&self, needle: &str) -> Option<uint> {
2735 if needle.is_empty() {
2738 self.match_indices(needle)
2740 .map(|(start, _end)| start)
// Concatenates `nn` copies of the string.
2744 fn repeat(&self, nn: uint) -> ~str {
// Request capacity for the full output size up front.
2745 let mut ret = with_capacity(nn * self.len());
2746 for _ in range(0, nn) {
2747 ret.push_str(*self);
2753 fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2754 if self.is_empty() {
2755 return (None, *self);
2757 let CharRange {ch, next} = self.char_range_at(0u);
2758 let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2759 return (Some(ch), next_s);
// Levenshtein distance to `t`, computed with a single column `dcol` that is
// updated in place for each character of `self`.
// NOTE(review): `slen` is a byte length while the loops below iterate over
// chars, so for non-ASCII input the early returns look like they report byte
// counts rather than char counts — confirm whether that is intended.
2763 fn lev_distance(&self, t: &str) -> uint {
2764 let slen = self.len();
// An empty side short-circuits to the other side's length.
2767 if slen == 0 { return tlen; }
2768 if tlen == 0 { return slen; }
// `dcol` starts out as 0, 1, ..., tlen.
2770 let mut dcol = slice::from_fn(tlen + 1, |x| x);
2772 for (i, sc) in self.chars().enumerate() {
2774 let mut current = i;
2775 dcol[0] = current + 1;
2777 for (j, tc) in t.chars().enumerate() {
// Save the old cell before it is overwritten below.
2779 let next = dcol[j + 1];
2782 dcol[j + 1] = current;
// Smallest of `current`, the old cell and the left-hand cell, plus one.
2784 dcol[j + 1] = ::cmp::min(current, next);
2785 dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
// Locates `inner` inside `self` by comparing raw addresses. Both asserts
// fail unless the byte range of `inner` lies entirely within that of `self`.
// The returned value is computed on a line not visible in this excerpt; the
// tests expect the byte offset of `inner` from the start of `self`.
2795 fn subslice_offset(&self, inner: &str) -> uint {
// [a_start, a_end) is the address range of `self`.
2796 let a_start = self.as_ptr() as uint;
2797 let a_end = a_start + self.len();
// [b_start, b_end) is the address range of `inner`.
2798 let b_start = inner.as_ptr() as uint;
2799 let b_end = b_start + inner.len();
2801 assert!(a_start <= b_start);
2802 assert!(b_end <= a_end);
// Returns a raw `*u8` pointer for the string. The body is not visible in
// this excerpt; the tests read the string's bytes through it via `offset`.
2807 fn as_ptr(&self) -> *u8 {
2812 /// Methods for owned strings
2813 pub trait OwnedStr {
2814 /// Appends a string slice to the back of a string, without overallocating.
2815 fn push_str_no_overallocate(&mut self, rhs: &str);
2817 /// Appends a string slice to the back of a string
2818 fn push_str(&mut self, rhs: &str);
2820 /// Appends a character to the back of a string
2821 fn push_char(&mut self, c: char);
2823 /// Remove the final character from a string and return it. Return None
2824 /// when the string is empty.
2825 fn pop_char(&mut self) -> Option<char>;
2827 /// Remove the first character from a string and return it. Return None
2828 /// when the string is empty.
2829 fn shift_char(&mut self) -> Option<char>;
2831 /// Prepend a char to a string
2832 fn unshift_char(&mut self, ch: char);
2834 /// Insert a new sub-string at the given position in a string, in O(n + m) time
2835 /// (with n and m the lengths of the string and the substring.)
2836 /// This fails if `position` is not at a character boundary.
///
/// `position` is a byte offset into the string.
2837 fn insert(&mut self, position: uint, substring: &str);
2839 /// Insert a char at the given position in a string, in O(n) time
2840 /// (with n the length of the string; a char encodes to at most 4 bytes.)
2841 /// This fails if `position` is not at a character boundary.
///
/// `position` is a byte offset into the string.
2842 fn insert_char(&mut self, position: uint, ch: char);
2844 /// Concatenate two strings together.
2845 fn append(self, rhs: &str) -> ~str;
2847 /// Reserves capacity for exactly `n` bytes in the given string.
///
2849 /// Assuming single-byte characters, the resulting string will be large
2850 /// enough to hold a string of length `n`.
///
2852 /// If the capacity of the string is already equal to or greater than the requested
2853 /// capacity, then no action is taken.
///
/// # Arguments
///
2858 /// * n - The number of bytes to reserve space for
2859 fn reserve_exact(&mut self, n: uint);
2861 /// Reserves capacity for at least `n` bytes in the given string.
///
2863 /// Assuming single-byte characters, the resulting string will be large
2864 /// enough to hold a string of length `n`.
///
2866 /// This function will over-allocate in order to amortize the allocation costs
2867 /// in scenarios where the caller may need to repeatedly reserve additional
/// space.
///
2870 /// If the capacity of the string is already equal to or greater than the requested
2871 /// capacity, then no action is taken.
///
/// # Arguments
///
2876 /// * n - The number of bytes to reserve space for
2877 fn reserve(&mut self, n: uint);
2879 /// Returns the number of single-byte characters the string can hold without
/// reallocating.
2881 fn capacity(&self) -> uint;
2883 /// Shorten a string to the specified length (which must be <= the current length)
/// and must fall on a character boundary.
2884 fn truncate(&mut self, len: uint);
2886 /// Consumes the string, returning the underlying byte buffer.
///
2888 /// The buffer does not have a null terminator.
2889 fn into_bytes(self) -> ~[u8];
2891 /// Sets the length of a string
///
2893 /// This will explicitly set the size of the string, without actually
2894 /// modifying its buffers, so it is up to the caller to ensure that
2895 /// the string is actually the specified size.
2896 unsafe fn set_len(&mut self, new_len: uint);
// `OwnedStr` for the owned string type. Several methods operate on the
// underlying byte vector, reached through `raw::as_owned_vec` or `transmute`.
2899 impl OwnedStr for ~str {
// Reserves exactly the combined length before appending, so no spare
// capacity is requested. The append itself is on a line not visible here.
2901 fn push_str_no_overallocate(&mut self, rhs: &str) {
2902 let new_cap = self.len() + rhs.len();
2903 self.reserve_exact(new_cap);
// Appends the bytes of `rhs` to the end of the string.
2908 fn push_str(&mut self, rhs: &str) {
2910 raw::push_bytes(self, rhs.as_bytes());
// Encodes `c` as UTF-8 directly into the tail of the string's buffer.
2915 fn push_char(&mut self, c: char) {
2916 let cur_len = self.len();
2917 // may use up to 4 bytes.
2919 let v = raw::as_owned_vec(self);
2920 v.reserve_additional(4);
2922 // Attempt to not use an intermediate buffer by just pushing bytes
2923 // directly onto this string.
2924 let write_ptr = v.as_mut_ptr().offset(cur_len as int);
// `used` is the value returned by `encode_utf8` for the 4-byte window.
2925 let used = slice::raw::mut_buf_as_slice(write_ptr, 4, |slc| c.encode_utf8(slc));
// Extend the length by `used` so the newly written bytes become part of the string.
2927 v.set_len(cur_len + used);
// Removes and returns the last character by shrinking the length.
// The empty-string case is handled on lines not visible in this excerpt.
2932 fn pop_char(&mut self) -> Option<char> {
2933 let end = self.len();
// `next` is the byte offset where the last character starts; it becomes
// the new length.
2937 let CharRange {ch, next} = self.char_range_at_reverse(end);
2938 unsafe { self.set_len(next); }
// Removes and returns the first character.
2944 fn shift_char(&mut self) -> Option<char> {
2945 if self.is_empty() {
2948 let CharRange {ch, next} = self.char_range_at(0u);
// Replace the string with an owned copy of everything after the first character.
2949 *self = self.slice(next, self.len()).to_owned();
// Prepends `ch` by building a fresh string: the char first, then the old contents.
2955 fn unshift_char(&mut self, ch: char) {
2956 // This could be more efficient.
2957 let mut new_str = ~"";
2958 new_str.push_char(ch);
2959 new_str.push_str(*self);
// Inserts `substring` at byte offset `position` by rebuilding the string as
// prefix + substring + suffix.
2964 fn insert(&mut self, position: uint, substring: &str) {
2965 // This could be more efficient.
2966 let mut new_str = self.slice_to(position).to_owned();
2967 new_str.push_str(substring);
2968 new_str.push_str(self.slice_from(position));
// Inserts `ch` at byte offset `position` by rebuilding the string as
// prefix + char + suffix.
2973 fn insert_char(&mut self, position: uint, ch: char) {
2974 // This could be more efficient.
2975 let mut new_str = self.slice_to(position).to_owned();
2976 new_str.push_char(ch);
2977 new_str.push_str(self.slice_from(position));
// Consumes `self` and appends `rhs` using the exact-reservation variant.
2982 fn append(self, rhs: &str) -> ~str {
2983 let mut new_str = self;
2984 new_str.push_str_no_overallocate(rhs);
// Delegates to the byte vector's `reserve_exact`.
2989 fn reserve_exact(&mut self, n: uint) {
2991 raw::as_owned_vec(self).reserve_exact(n)
// Delegates to the byte vector's `reserve`.
2996 fn reserve(&mut self, n: uint) {
2998 raw::as_owned_vec(self).reserve(n)
// Reinterprets the string as a byte vector; the capacity query itself is on
// a line not visible in this excerpt.
3003 fn capacity(&self) -> uint {
3005 let buf: &~[u8] = cast::transmute(self);
// Shortens the string to `len` bytes. Fails if `len` exceeds the current
// length or does not fall on a character boundary.
3011 fn truncate(&mut self, len: uint) {
3012 assert!(len <= self.len());
3013 assert!(self.is_char_boundary(len));
3014 unsafe { self.set_len(len); }
// Reinterprets the owned string as its owned byte vector without copying.
3018 fn into_bytes(self) -> ~[u8] {
3019 unsafe { cast::transmute(self) }
// Sets the length of the underlying byte vector; nothing is validated here.
3023 unsafe fn set_len(&mut self, new_len: uint) {
3024 raw::as_owned_vec(self).set_len(new_len)
// `Clone` for owned strings. The body of `clone` is not visible in this excerpt.
3028 impl Clone for ~str {
3030 fn clone(&self) -> ~str {
// Collects an iterator of chars into an owned string.
3035 impl FromIterator<char> for ~str {
3037 fn from_iterator<T: Iterator<char>>(iterator: &mut T) -> ~str {
// Use the iterator's lower size bound as the initial capacity, then append
// the characters via `extend`.
3038 let (lower, _) = iterator.size_hint();
3039 let mut buf = with_capacity(lower);
3040 buf.extend(iterator);
// Appends the chars yielded by an iterator to an owned string.
3045 impl Extendable<char> for ~str {
3047 fn extend<T: Iterator<char>>(&mut self, iterator: &mut T) {
// Reserve for the current length plus the iterator's lower size bound
// before consuming the iterator.
3048 let (lower, _) = iterator.size_hint();
3049 let reserve = lower + self.len();
3050 self.reserve(reserve);
3051 for ch in *iterator {
// The default borrowed string is the empty literal `""`.
3057 // This works because every lifetime is a sub-lifetime of 'static
3058 impl<'a> Default for &'a str {
3059 fn default() -> &'a str { "" }
// The default owned string is a freshly allocated empty string.
3062 impl Default for ~str {
3063 fn default() -> ~str { ~"" }
3068 use iter::AdditiveIterator;
3069 use default::Default;
3075 assert!((eq(&~"", &~"")));
3076 assert!((eq(&~"foo", &~"foo")));
3077 assert!((!eq(&~"foo", &~"bar")));
// Checks `eq_slice` on equal and unequal inputs, including sub-slices taken
// from the front and the back of longer literals.
3081 fn test_eq_slice() {
3082 assert!((eq_slice("foobar".slice(0, 3), "foo")));
3083 assert!((eq_slice("barfoo".slice(3, 6), "foo")));
3084 assert!((!eq_slice("foo1", "foo2")));
3090 assert!("" <= "foo");
3091 assert!("foo" <= "foo");
3092 assert!("foo" != "bar");
3097 assert_eq!("".len(), 0u);
3098 assert_eq!("hello world".len(), 11u);
3099 assert_eq!("\x63".len(), 1u);
3100 assert_eq!("\xa2".len(), 2u);
3101 assert_eq!("\u03c0".len(), 2u);
3102 assert_eq!("\u2620".len(), 3u);
3103 assert_eq!("\U0001d11e".len(), 4u);
3105 assert_eq!("".char_len(), 0u);
3106 assert_eq!("hello world".char_len(), 11u);
3107 assert_eq!("\x63".char_len(), 1u);
3108 assert_eq!("\xa2".char_len(), 1u);
3109 assert_eq!("\u03c0".char_len(), 1u);
3110 assert_eq!("\u2620".char_len(), 1u);
3111 assert_eq!("\U0001d11e".char_len(), 1u);
3112 assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
3117 assert_eq!("hello".find('l'), Some(2u));
3118 assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
3119 assert!("hello".find('x').is_none());
3120 assert!("hello".find(|c:char| c == 'x').is_none());
3121 assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
3122 assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
3127 assert_eq!("hello".rfind('l'), Some(3u));
3128 assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
3129 assert!("hello".rfind('x').is_none());
3130 assert!("hello".rfind(|c:char| c == 'x').is_none());
3131 assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
3132 assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
// Checks that `push_str` appends ASCII and multi-byte text. The setup of `s`
// and some intermediate pushes are on lines not visible in this excerpt.
3136 fn test_push_str() {
3139 assert_eq!(s.slice_from(0), "");
3141 assert_eq!(s.slice_from(0), "abc");
3142 s.push_str("ประเทศไทย中华Việt Nam");
3143 assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
3150 assert_eq!(s.slice_from(0), "");
3151 s = s.append("abc");
3152 assert_eq!(s.slice_from(0), "abc");
3153 s = s.append("ประเทศไทย中华Việt Nam");
3154 assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
// `pop_char` removes and returns the final multi-byte character.
3158 fn test_pop_char() {
3159 let mut data = ~"ประเทศไทย中华";
3160 let cc = data.pop_char();
3161 assert_eq!(~"ประเทศไทย中", data);
3162 assert_eq!(Some('华'), cc);
// `pop_char` on a single-character string leaves the string empty.
3166 fn test_pop_char_2() {
3167 let mut data2 = ~"华";
3168 let cc2 = data2.pop_char();
3169 assert_eq!(~"", data2);
3170 assert_eq!(Some('华'), cc2);
// `pop_char` yields `None` and leaves the string unchanged. The declaration
// of `data` is not visible here; presumably it starts out empty.
3174 fn test_pop_char_empty() {
3176 let cc3 = data.pop_char();
3177 assert_eq!(~"", data);
3178 assert_eq!(None, cc3);
// `push_char` must handle characters of every UTF-8 encoded width (1-4 bytes).
3182 fn test_push_char() {
3183 let mut data = ~"ประเทศไทย中";
3184 data.push_char('华');
3185 data.push_char('b'); // 1 byte
3186 data.push_char('¢'); // 2 byte
3187 data.push_char('€'); // 3 byte
3188 data.push_char('𤭢'); // 4 byte
3189 assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
// `shift_char` removes and returns the first multi-byte character.
3193 fn test_shift_char() {
3194 let mut data = ~"ประเทศไทย中";
3195 let cc = data.shift_char();
3196 assert_eq!(~"ระเทศไทย中", data);
3197 assert_eq!(Some('ป'), cc);
// `unshift_char` places the new character in front of the existing contents.
3201 fn test_unshift_char() {
3202 let mut data = ~"ประเทศไทย中";
3203 data.unshift_char('华');
3204 assert_eq!(~"华ประเทศไทย中", data);
// `insert_char` takes a byte offset: 15 is the boundary after five of the
// 3-byte characters at the start of `data`.
3208 fn test_insert_char() {
3209 let mut data = ~"ประเทศไทย中";
3210 data.insert_char(15, '华');
3211 assert_eq!(~"ประเท华ศไทย中", data);
3216 let mut data = ~"ประเทศไทย中";
3217 data.insert(15, "华中");
3218 assert_eq!(~"ประเท华中ศไทย中", data);
3224 let s: ~str = empty.chars().collect();
3225 assert_eq!(empty, s);
3226 let data = ~"ประเทศไทย中";
3227 let s: ~str = data.chars().collect();
3228 assert_eq!(data, s);
3233 let data = ~"ประเทศไทย中";
3234 let mut cpy = data.clone();
3236 let mut it = other.chars();
3237 cpy.extend(&mut it);
3238 assert_eq!(cpy, data + other);
3243 let mut empty = ~"";
3245 assert_eq!("", empty.as_slice());
3246 let mut data = ~"ประเทศไทย中";
3248 assert_eq!("", data.as_slice());
3249 data.push_char('华');
3250 assert_eq!("华", data.as_slice());
// `into_bytes` yields the string's bytes; the expected value is the byte
// literal for "asdf". The declaration of `data` is not visible here.
3254 fn test_into_bytes() {
3256 let buf = data.into_bytes();
3257 assert_eq!(bytes!("asdf"), buf.as_slice());
// Exercises `find_str` on ASCII and multi-byte data, on whole strings and on
// sub-slices, with results expressed as byte offsets relative to the slice.
3261 fn test_find_str() {
3263 assert_eq!("".find_str(""), Some(0u));
3264 assert!("banana".find_str("apple pie").is_none());
3266 let data = "abcabc";
3267 assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
3268 assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
3269 assert!(data.slice(2u, 4u).find_str("ab").is_none());
// The literal below is 43 bytes long.
// NOTE(review): the 43..86 slices further down only make sense if `data` was
// doubled on a line not visible in this excerpt — confirm.
3271 let mut data = ~"ประเทศไทย中华Việt Nam";
3273 assert!(data.find_str("ไท华").is_none());
3274 assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
3275 assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
3277 assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
3278 assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
3279 assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
3280 assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
3281 assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
// Same needles searched in the second half; offsets are relative to byte 43.
3283 assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
3284 assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
3285 assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
3286 assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
3287 assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
// `slice_chars` takes character positions rather than byte offsets.
3291 fn test_slice_chars() {
// Helper: `b` must equal the `b.char_len()` characters of `a` starting at
// character index `start`.
3292 fn t(a: &str, b: &str, start: uint) {
3293 assert_eq!(a.slice_chars(start, start + b.char_len()), b);
3296 t("hello", "llo", 2);
3297 t("hello", "el", 1);
3300 assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
3305 fn t(v: &[~str], s: &str) {
3306 assert_eq!(v.concat(), s.to_str());
3308 t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
3309 let v: &[~str] = [];
3316 fn t(v: &[~str], sep: &str, s: &str) {
3317 assert_eq!(v.connect(sep), s.to_str());
3319 t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
3320 " ", "you know I'm no good");
3321 let v: &[~str] = [];
3323 t([~"hi"], " ", "hi");
// `concat` on a vector of borrowed slices joins them with no separator.
3327 fn test_concat_slices() {
3328 fn t(v: &[&str], s: &str) {
3329 assert_eq!(v.concat(), s.to_str());
3331 t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
// Empty input vector; the call that uses it is not visible in this excerpt.
3332 let v: &[&str] = [];
// `connect` on a vector of borrowed slices joins them with `sep` between
// elements; a single element is returned without any separator.
3338 fn test_connect_slices() {
3339 fn t(v: &[&str], sep: &str, s: &str) {
3340 assert_eq!(v.connect(sep), s.to_str());
3342 t(["you", "know", "I'm", "no", "good"],
3343 " ", "you know I'm no good");
3345 t(["hi"], " ", "hi");
3350 assert_eq!("x".repeat(4), ~"xxxx");
3351 assert_eq!("hi".repeat(4), ~"hihihihi");
3352 assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
3353 assert_eq!("".repeat(4), ~"");
3354 assert_eq!("hi".repeat(0), ~"");
// Checks the unchecked byte slicer `raw::slice_bytes` on small literals and
// on a large generated string.
3358 fn test_unsafe_slice() {
3359 assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
3360 assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
3361 assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
// 100000 pushes of ten 'a's each.
3362 fn a_million_letter_a() -> ~str {
3365 while i < 100000 { rs.push_str("aaaaaaaaaa"); i += 1; }
// 100000 pushes of five 'a's each.
3368 fn half_a_million_letter_a() -> ~str {
3371 while i < 100000 { rs.push_str("aaaaa"); i += 1; }
// The first 500000 bytes of the long string must equal the short one.
3374 let letters = a_million_letter_a();
3375 assert!(half_a_million_letter_a() ==
3376 unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
// `starts_with`: the empty prefix always matches, an over-long prefix never
// does, and multi-byte prefixes are compared correctly.
3380 fn test_starts_with() {
3381 assert!(("".starts_with("")));
3382 assert!(("abc".starts_with("")));
3383 assert!(("abc".starts_with("a")));
3384 assert!((!"a".starts_with("abc")));
3385 assert!((!"".starts_with("abc")));
3386 assert!((!"ödd".starts_with("-")));
3387 assert!(("ödd".starts_with("öd")));
// `ends_with`: the empty suffix always matches, an over-long suffix never
// does, and multi-byte suffixes are compared correctly.
3391 fn test_ends_with() {
3392 assert!(("".ends_with("")));
3393 assert!(("abc".ends_with("")));
3394 assert!(("abc".ends_with("c")));
3395 assert!((!"a".ends_with("abc")));
3396 assert!((!"".ends_with("abc")));
3397 assert!((!"ddö".ends_with("-")));
3398 assert!(("ddö".ends_with("dö")));
// `is_empty` is true only for the zero-length string.
3402 fn test_is_empty() {
3403 assert!("".is_empty());
3404 assert!(!"a".is_empty());
3410 assert_eq!("".replace(a, "b"), ~"");
3411 assert_eq!("a".replace(a, "b"), ~"b");
3412 assert_eq!("ab".replace(a, "b"), ~"bb");
3414 assert!(" test test ".replace(test, "toast") ==
3416 assert_eq!(" test test ".replace(test, ""), ~" ");
// `replace` with multi-byte data: the expected result has the replacement at
// the very start of the string. The pattern `a` is defined on a line not
// visible in this excerpt.
3420 fn test_replace_2a() {
3421 let data = ~"ประเทศไทย中华";
3422 let repl = ~"دولة الكويت";
3425 let a2 = ~"دولة الكويتทศไทย中华";
3426 assert_eq!(data.replace(a, repl), a2);
// `replace` with multi-byte data: the expected result has the replacement in
// the middle of the string. The pattern `b` is defined on a line not visible
// in this excerpt.
3430 fn test_replace_2b() {
3431 let data = ~"ประเทศไทย中华";
3432 let repl = ~"دولة الكويت";
3435 let b2 = ~"ปรدولة الكويتทศไทย中华";
3436 assert_eq!(data.replace(b, repl), b2);
// `replace` with multi-byte data: the expected result has the replacement at
// the very end of the string. The pattern `c` is defined on a line not
// visible in this excerpt.
3440 fn test_replace_2c() {
3441 let data = ~"ประเทศไทย中华";
3442 let repl = ~"دولة الكويت";
3445 let c2 = ~"ประเทศไทยدولة الكويت";
3446 assert_eq!(data.replace(c, repl), c2);
// `replace` must return the input unchanged here. The pattern `d` is defined
// on a line not visible in this excerpt; presumably it does not occur in `data`.
3450 fn test_replace_2d() {
3451 let data = ~"ประเทศไทย中华";
3452 let repl = ~"دولة الكويت";
3455 assert_eq!(data.replace(d, repl), data);
3460 assert_eq!("ab", "abc".slice(0, 2));
3461 assert_eq!("bc", "abc".slice(1, 3));
3462 assert_eq!("", "abc".slice(1, 1));
3463 assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3465 let data = "ประเทศไทย中华";
3466 assert_eq!("ป", data.slice(0, 3));
3467 assert_eq!("ร", data.slice(3, 6));
3468 assert_eq!("", data.slice(3, 3));
3469 assert_eq!("华", data.slice(30, 33));
3471 fn a_million_letter_X() -> ~str {
3475 push_str(&mut rs, "华华华华华华华华华华");
3480 fn half_a_million_letter_X() -> ~str {
3483 while i < 100000 { push_str(&mut rs, "华华华华华"); i += 1; }
3486 let letters = a_million_letter_X();
3487 assert!(half_a_million_letter_X() ==
3488 letters.slice(0u, 3u * 500000u).to_owned());
3493 let ss = "中华Việt Nam";
3495 assert_eq!("华", ss.slice(3u, 6u));
3496 assert_eq!("Việt Nam", ss.slice(6u, 16u));
3498 assert_eq!("ab", "abc".slice(0u, 2u));
3499 assert_eq!("bc", "abc".slice(1u, 3u));
3500 assert_eq!("", "abc".slice(1u, 1u));
3502 assert_eq!("中", ss.slice(0u, 3u));
3503 assert_eq!("华V", ss.slice(3u, 7u));
3504 assert_eq!("", ss.slice(3u, 3u));
// Slices at byte 2, which falls inside the 3-byte encoding of the first
// character. Presumably marked `#[should_fail]` on a line not visible here.
3519 fn test_slice_fail() {
3520 "中华Việt Nam".slice(0u, 2u);
// `slice_from(n)` keeps everything from byte `n` on, including the empty
// tail at `n == len`.
3524 fn test_slice_from() {
3525 assert_eq!("abcd".slice_from(0), "abcd");
3526 assert_eq!("abcd".slice_from(2), "cd");
3527 assert_eq!("abcd".slice_from(4), "");
// `slice_to(n)` keeps the first `n` bytes, including the empty prefix at `n == 0`.
3530 fn test_slice_to() {
3531 assert_eq!("abcd".slice_to(0), "");
3532 assert_eq!("abcd".slice_to(2), "ab");
3533 assert_eq!("abcd".slice_to(4), "abcd");
// `trim_left_chars` strips matching characters from the front only. The set
// to strip is given as a char slice, a single char, and a closure.
3537 fn test_trim_left_chars() {
// An empty set strips nothing.
3538 let v: &[char] = &[];
3539 assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
3540 assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3541 assert_eq!(" *** *** ".trim_left_chars(& &['*', ' ']), "");
3542 assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
3544 assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
3545 assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
3546 assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
// `trim_right_chars` strips matching characters from the back only. The set
// to strip is given as a char slice, a single char, and a closure.
3550 fn test_trim_right_chars() {
// An empty set strips nothing.
3551 let v: &[char] = &[];
3552 assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
3553 assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
3554 assert_eq!(" *** *** ".trim_right_chars(& &['*', ' ']), "");
3555 assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
3557 assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
3558 assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
3559 assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
// `trim_chars` strips matching characters from both ends and leaves interior
// matches alone.
3563 fn test_trim_chars() {
// An empty set strips nothing.
3564 let v: &[char] = &[];
3565 assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
3566 assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
3567 assert_eq!(" *** *** ".trim_chars(& &['*', ' ']), "");
3568 assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
3570 assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
3571 assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
3572 assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
// `trim_left` removes leading whitespace, including U+3000, and keeps
// trailing whitespace.
3576 fn test_trim_left() {
3577 assert_eq!("".trim_left(), "");
3578 assert_eq!("a".trim_left(), "a");
3579 assert_eq!(" ".trim_left(), "");
3580 assert_eq!(" blah".trim_left(), "blah");
3581 assert_eq!(" \u3000 wut".trim_left(), "wut");
3582 assert_eq!("hey ".trim_left(), "hey ");
// `trim_right` removes trailing whitespace, including U+3000, and keeps
// leading whitespace.
3586 fn test_trim_right() {
3587 assert_eq!("".trim_right(), "");
3588 assert_eq!("a".trim_right(), "a");
3589 assert_eq!(" ".trim_right(), "");
3590 assert_eq!("blah ".trim_right(), "blah");
3591 assert_eq!("wut \u3000 ".trim_right(), "wut");
3592 assert_eq!(" hey".trim_right(), " hey");
3597 assert_eq!("".trim(), "");
3598 assert_eq!("a".trim(), "a");
3599 assert_eq!(" ".trim(), "");
3600 assert_eq!(" blah ".trim(), "blah");
3601 assert_eq!("\nwut \u3000 ".trim(), "wut");
3602 assert_eq!(" hey dude ".trim(), "hey dude");
// `is_whitespace` is true for the empty string and for strings made only of
// whitespace characters (ASCII or not); one other character makes it false.
3606 fn test_is_whitespace() {
3607 assert!("".is_whitespace());
3608 assert!(" ".is_whitespace());
3609 assert!("\u2009".is_whitespace()); // Thin space
3610 assert!(" \n\t ".is_whitespace());
3611 assert!(!" _ ".is_whitespace());
// `slice_shift_char` returns the first character and the remaining slice.
3615 fn test_slice_shift_char() {
3616 let data = "ประเทศไทย中";
3617 assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
// `slice_shift_char` yields `(None, "")` here. The declaration of `empty` is
// not visible in this excerpt; presumably it is the empty string.
3621 fn test_slice_shift_char_2() {
3623 assert_eq!(empty.slice_shift_char(), (None, ""));
// `raw::push_byte` appends one raw byte. The initial value of `s` is set on
// a line not visible in this excerpt.
3627 fn test_push_byte() {
3629 unsafe{raw::push_byte(&mut s, 'D' as u8)};
3630 assert_eq!(s, ~"ABCD");
// `raw::shift_byte` removes and returns the first byte (65 is ASCII 'A').
// The initial value of `s` is set on a line not visible in this excerpt.
3634 fn test_shift_byte() {
3636 let b = unsafe{raw::shift_byte(&mut s)};
3637 assert_eq!(s, ~"BC");
3638 assert_eq!(b, Some(65u8));
// `raw::pop_byte` removes and returns the last byte (67 is ASCII 'C').
// The initial value of `s` is set on a line not visible in this excerpt.
3642 fn test_pop_byte() {
3644 let b = unsafe{raw::pop_byte(&mut s)};
3645 assert_eq!(s, ~"AB");
3646 assert_eq!(b, Some(67u8));
3651 // deny overlong encodings
3652 assert!(!is_utf8([0xc0, 0x80]));
3653 assert!(!is_utf8([0xc0, 0xae]));
3654 assert!(!is_utf8([0xe0, 0x80, 0x80]));
3655 assert!(!is_utf8([0xe0, 0x80, 0xaf]));
3656 assert!(!is_utf8([0xe0, 0x81, 0x81]));
3657 assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
3658 assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));
3661 assert!(!is_utf8([0xED, 0xA0, 0x80]));
3662 assert!(!is_utf8([0xED, 0xBF, 0xBF]));
3664 assert!(is_utf8([0xC2, 0x80]));
3665 assert!(is_utf8([0xDF, 0xBF]));
3666 assert!(is_utf8([0xE0, 0xA0, 0x80]));
3667 assert!(is_utf8([0xED, 0x9F, 0xBF]));
3668 assert!(is_utf8([0xEE, 0x80, 0x80]));
3669 assert!(is_utf8([0xEF, 0xBF, 0xBF]));
3670 assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
3671 assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
3675 fn test_is_utf16() {
3676 macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });
3684 // surrogate pairs (randomly generated with Python 3's
3685 // .encode('utf-16be'))
3686 pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
3687 [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
3688 [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);
3690 // mixtures (also random)
3691 pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
3692 [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
3693 [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);
3696 macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });
3699 // surrogate + regular unit
3701 // surrogate + lead surrogate
3703 // unterminated surrogate
3705 // trail surrogate without a lead
3708 // random byte sequences that Python 3's .decode('utf-16be')
3710 neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
3711 [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
3712 [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
3713 [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
3714 [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
3715 [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
3716 [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
3717 [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
3718 [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
3719 [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
3720 [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
3721 [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
3722 [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
3723 [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
3724 [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
3725 [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
3726 [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
3727 [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
3728 [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
3729 [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
3730 [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
// `raw::from_c_str` builds an owned string from a NUL-terminated buffer.
// How `b` is derived from `a` is on a line not visible in this excerpt.
3734 fn test_raw_from_c_str() {
// Seven 'A' bytes (65) followed by the terminating 0.
3736 let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
3738 let c = raw::from_c_str(b);
3739 assert_eq!(c, ~"AAAAAAA");
// `as_bytes` exposes the UTF-8 encoding of the string. The numbers below are
// part of the expected-bytes table `v`, whose first and last lines are not
// visible in this excerpt.
3744 fn test_as_bytes() {
3747 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3748 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3751 assert_eq!("".as_bytes(), &[]);
3752 assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3753 assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
// Regression test around `as_bytes` and freeing. The declaration of `s` and
// any failure trigger are on lines not visible in this excerpt.
3758 fn test_as_bytes_fail() {
3759 // Don't double free. (I'm not sure if this exercises the
3760 // original problem code path anymore.)
3762 let _bytes = s.as_bytes();
3768 let buf = "hello".as_ptr();
3770 assert_eq!(*buf.offset(0), 'h' as u8);
3771 assert_eq!(*buf.offset(1), 'e' as u8);
3772 assert_eq!(*buf.offset(2), 'l' as u8);
3773 assert_eq!(*buf.offset(3), 'l' as u8);
3774 assert_eq!(*buf.offset(4), 'o' as u8);
// `subslice_offset` reports the byte offset of a sub-slice within its parent.
3779 fn test_subslice_offset() {
3780 let a = "kernelsprite";
3781 let b = a.slice(7, a.len());
3782 let c = a.slice(0, a.len() - 6);
3783 assert_eq!(a.subslice_offset(b), 7);
3784 assert_eq!(a.subslice_offset(c), 0);
// Slices produced by `lines()` point into the original string, so their
// offsets are the byte positions of each line start.
3786 let string = "a\nb\nc";
3787 let mut lines = ~[];
3788 for line in string.lines() { lines.push(line) }
3789 assert_eq!(string.subslice_offset(lines[0]), 0);
3790 assert_eq!(string.subslice_offset(lines[1]), 2);
3791 assert_eq!(string.subslice_offset(lines[2]), 4);
// Calls `subslice_offset` with a slice taken from a different literal; the
// asserts inside `subslice_offset` are expected to reject it. Presumably
// marked `#[should_fail]` on a line not visible in this excerpt.
3796 fn test_subslice_offset_2() {
3797 let a = "alchemiter";
3798 let b = "cruxtruder";
3799 a.subslice_offset(b);
// Round-trips a string through a byte vector and `from_utf8`. The comparison
// loop that uses `i`, `n1` and `n2` is on lines not visible in this excerpt.
3803 fn vec_str_conversions() {
3804 let s1: ~str = ~"All mimsy were the borogoves";
3806 let v: ~[u8] = s1.as_bytes().to_owned();
3807 let s2: ~str = from_utf8(v).unwrap().to_owned();
3808 let mut i: uint = 0u;
3809 let n1: uint = s1.len();
3810 let n2: uint = v.len();
// `contains`: substrings at the start, middle and end, the empty needle, and
// needles that are absent, for both ASCII and multi-byte data.
3823 fn test_contains() {
3824 assert!("abcde".contains("bcd"));
3825 assert!("abcde".contains("abcd"));
3826 assert!("abcde".contains("bcde"));
3827 assert!("abcde".contains(""));
3828 assert!("".contains(""));
3829 assert!(!"abcde".contains("def"));
3830 assert!(!"".contains("a"));
3832 let data = ~"ประเทศไทย中华Việt Nam";
3833 assert!(data.contains("ประเ"));
3834 assert!(data.contains("ะเ"));
3835 assert!(data.contains("中华"));
// Each character occurs in `data`, but not as this contiguous sequence.
3836 assert!(!data.contains("ไท华"));
// `contains_char`: present and absent characters, and the empty string.
3840 fn test_contains_char() {
3841 assert!("abc".contains_char('b'));
3842 assert!("a".contains_char('a'));
3843 assert!(!"abc".contains_char('d'));
3844 assert!(!"".contains_char('a'));
3851 ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3852 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3853 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3854 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3857 ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3858 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3859 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3860 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3861 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3864 (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3865 ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3866 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3867 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3868 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3869 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3870 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3871 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3873 (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3874 ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3875 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3876 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3877 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3878 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3879 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3880 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3881 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3882 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3883 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3885 // Issue #12318, even-numbered non-BMP planes
3887 ~[0xD840, 0xDC00])];
3889 for p in pairs.iter() {
3890 let (s, u) = (*p).clone();
3891 assert!(is_utf16(u));
3892 assert_eq!(s.to_utf16(), u);
3894 assert_eq!(from_utf16(u).unwrap(), s);
3895 assert_eq!(from_utf16_lossy(u), s);
3897 assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
3898 assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
// `from_utf16` returns `None` for ill-formed input.
3903 fn test_utf16_invalid() {
3904 // completely positive cases tested above.
// A lone lead surrogate.
3906 assert_eq!(from_utf16([0xD800]), None);
// Two lead surrogates in a row.
3908 assert_eq!(from_utf16([0xD800, 0xD800]), None);
// A trail surrogate that does not follow a lead surrogate.
3911 assert_eq!(from_utf16([0x0061, 0xDC00]), None);
// A valid pair (0xd801, 0xdc8b) surrounded by unpaired lead surrogates.
3914 assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
// `from_utf16_lossy` substitutes U+FFFD for each unpaired surrogate and
// decodes everything else normally.
3918 fn test_utf16_lossy() {
3919 // completely positive cases tested above.
3921 assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
3923 assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");
3926 assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");
3929 assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
// Checks the slice returned by `truncate_utf16_at_nul` for several inputs.
// Each input `v` is declared on a line not visible in this excerpt.
3933 fn test_truncate_utf16_at_nul() {
3935 assert_eq!(truncate_utf16_at_nul(v), &[]);
3938 assert_eq!(truncate_utf16_at_nul(v), &[]);
3941 assert_eq!(truncate_utf16_at_nul(v), &[1]);
3944 assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);
3947 assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
3952 let s = ~"ศไทย中华Việt Nam";
3953 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3955 for ch in v.iter() {
3956 assert!(s.char_at(pos) == *ch);
3957 pos += from_char(*ch).len();
// Walks the string backwards with `char_at_reverse`, starting at the end and
// stepping back by each character's encoded length.
3962 fn test_char_at_reverse() {
3963 let s = ~"ศไทย中华Việt Nam";
3964 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3965 let mut pos = s.len();
3966 for ch in v.rev_iter() {
3967 assert!(s.char_at_reverse(pos) == *ch);
// Move `pos` back by the byte length of the character just checked.
3968 pos -= from_char(*ch).len();
// `escape_unicode` escapes every character. The expected strings use `\x`,
// `\u` and `\U` forms with 2, 4 and 8 hex digits respectively.
3973 fn test_escape_unicode() {
3974 assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3975 assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3976 assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3977 assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3978 assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3979 assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3980 assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3981 assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3982 assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
// `escape_default` leaves the letters and the space in these inputs as they
// are, uses short escapes for \r, \n, \t, quotes and backslash, and numeric
// escapes for the non-ASCII characters.
3986 fn test_escape_default() {
3987 assert_eq!("abc".escape_default(), ~"abc");
3988 assert_eq!("a c".escape_default(), ~"a c");
3989 assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3990 assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3991 assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
3992 assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
3993 assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
3994 assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
// Checks the total ordering of string slices through `cmp`.
//
// Every comparison is wrapped in `assert!`. A bare `a.cmp(b) == X;`
// statement only computes a boolean and throws it away, so a test written
// that way can never fail.
fn test_total_ord() {
    // A longer string with an equal prefix sorts after the shorter one.
    assert!("1234".cmp(& &"123") == Greater);
    assert!("123".cmp(& &"1234") == Less);
    assert!("1234".cmp(& &"1234") == Equal);
    // The first differing byte decides, regardless of length.
    assert!("12345555".cmp(& &"123456") == Less);
    assert!("22".cmp(& &"1234") == Greater);
}
// `char_range_at` is indexed by byte offset: the offsets below are the start
// positions of characters whose encodings are 1, 2, 3 and 4 bytes wide.
4007 fn test_char_range_at() {
4008 let data = ~"b¢€𤭢𤭢€¢b";
4009 assert_eq!('b', data.char_range_at(0).ch);
4010 assert_eq!('¢', data.char_range_at(1).ch);
4011 assert_eq!('€', data.char_range_at(3).ch);
4012 assert_eq!('𤭢', data.char_range_at(6).ch);
4013 assert_eq!('𤭢', data.char_range_at(10).ch);
4014 assert_eq!('€', data.char_range_at(14).ch);
4015 assert_eq!('¢', data.char_range_at(17).ch);
4016 assert_eq!('b', data.char_range_at(19).ch);
// Asking for the character before offset 0 must report `next == 0` rather
// than wrapping below zero.
4020 fn test_char_range_at_reverse_underflow() {
4021 assert_eq!("abc".char_range_at_reverse(0).next, 0);
4026 #[allow(unnecessary_allocation)];
4028 ($s1:expr, $s2:expr, $e:expr) => { {
4032 assert_eq!(s1 + s2, e.to_owned());
4033 assert_eq!(s1.to_owned() + s2, e.to_owned());
4037 t!("foo", "bar", "foobar");
4038 t!("foo", ~"bar", "foobar");
4039 t!("ศไทย中", "华Việt Nam", "ศไทย中华Việt Nam");
4040 t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
// `chars()` must yield the characters of `s` in order. The loop that
// advances `pos` is partly on lines not visible in this excerpt.
4044 fn test_iterator() {
4046 let s = ~"ศไทย中华Việt Nam";
4047 let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
4050 let mut it = s.chars();
4053 assert_eq!(c, v[pos]);
// Every expected character must have been seen.
4056 assert_eq!(pos, v.len());
// `chars_rev()` must yield the characters of `s` from last to first. The
// loop that advances `pos` is partly on lines not visible in this excerpt.
4060 fn test_rev_iterator() {
4062 let s = ~"ศไทย中华Việt Nam";
4063 let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
4066 let mut it = s.chars_rev();
4069 assert_eq!(c, v[pos]);
// Every expected character must have been seen.
4072 assert_eq!(pos, v.len());
// A cloned `chars()` iterator must yield the same sequence as the original
// from the point of cloning. A line between the creation and the zip is not
// visible in this excerpt.
4076 fn test_iterator_clone() {
4077 let s = "ศไทย中华Việt Nam";
4078 let mut it = s.chars();
4080 assert!(it.zip(it.clone()).all(|(x,y)| x == y));
// `bytes()` must yield the UTF-8 bytes of `s` in order. The numbers below
// are part of the expected table `v`, whose first and last lines, like the
// `pos` bookkeeping, are not visible in this excerpt.
4084 fn test_bytesator() {
4085 let s = ~"ศไทย中华Việt Nam";
4087 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
4088 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
4093 for b in s.bytes() {
4094 assert_eq!(b, v[pos]);
// `bytes_rev()` must yield the UTF-8 bytes of `s` from last to first; `pos`
// starts at the end of the expected table. The first and last lines of `v`
// and the `pos` update are not visible in this excerpt.
4100 fn test_bytes_revator() {
4101 let s = ~"ศไทย中华Việt Nam";
4103 224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
4104 184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
4107 let mut pos = v.len();
4109 for b in s.bytes_rev() {
4111 assert_eq!(b, v[pos]);
// `char_indices()` must yield (byte offset, char) pairs in order.
4116 fn test_char_indicesator() {
4118 let s = "ศไทย中华Việt Nam";
// `p` holds the expected byte offset of each character listed in `v`.
4119 let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
4120 let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
4123 let mut it = s.char_indices();
4126 assert_eq!(c, (p[pos], v[pos]));
// Both tables must have been consumed completely.
4129 assert_eq!(pos, v.len());
4130 assert_eq!(pos, p.len());
// Checks `char_indices_rev()`: every yielded item must equal
// `(p[pos], v[pos])`, where `p` and `v` list the expected offsets and chars
// from the end of the string towards its start. The final assertions require
// `pos` to equal the length of both tables.
// NOTE(review): the loop that binds `c` and advances `pos` is not visible in
// this listing.
4134 fn test_char_indices_revator() {
4136 let s = "ศไทย中华Việt Nam";
4137 let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
4138 let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
4141 let mut it = s.char_indices_rev();
4144 assert_eq!(c, (p[pos], v[pos]));
4147 assert_eq!(pos, v.len());
4148 assert_eq!(pos, p.len());
// Exercises `split` and `rsplit` with a `char` separator and with an
// equivalent closure predicate, first on an ASCII separator (' ') and then on
// a multibyte one ('ä').
// NOTE(review): the `rsplit` results are bound `mut` and compared against the
// forward-order expectation; presumably they are reversed in lines that are
// not visible in this listing -- confirm.
4152 fn test_split_char_iterator() {
4153 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4155 let split: ~[&str] = data.split(' ').collect();
4156 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4158 let mut rsplit: ~[&str] = data.rsplit(' ').collect();
4160 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4162 let split: ~[&str] = data.split(|c: char| c == ' ').collect();
4163 assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4165 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
4167 assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
4170 let split: ~[&str] = data.split('ä').collect();
4171 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4173 let mut rsplit: ~[&str] = data.rsplit('ä').collect();
4175 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4177 let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
4178 assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
4180 let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
4182 assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
// Exercises `splitn` with a count of 3, using a `char` separator and an
// equivalent closure, for both ' ' and 'ä'. Per the expected values, three
// splits are performed and the unsplit remainder is returned as a fourth
// piece.
4186 fn test_splitn_char_iterator() {
4187 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4189 let split: ~[&str] = data.splitn(' ', 3).collect();
4190 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
4192 let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
4193 assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
4196 let split: ~[&str] = data.splitn('ä', 3).collect();
4197 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
4199 let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
4200 assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
// Exercises `rsplitn` with a count of 3, using a `char` separator and an
// equivalent closure, for both ' ' and 'ä'. Per the expected values, the
// three splits are taken from the end of the string and the unsplit prefix
// forms the remaining piece.
// NOTE(review): the results are bound `mut` and the expectations are written
// in forward order; presumably each vector is reversed in lines that are not
// visible in this listing -- confirm.
4204 fn test_rsplitn_char_iterator() {
4205 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4207 let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
4209 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
4211 let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
4213 assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
4216 let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
4218 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
4220 let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
4222 assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
// Contrasts `split('\n')` with `split_terminator('\n')` on input that ends
// with the separator: per the expected values, `split` yields a trailing
// empty piece while `split_terminator` does not. The leading empty piece is
// kept by both.
4226 fn test_split_char_iterator_no_trailing() {
4227 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4229 let split: ~[&str] = data.split('\n').collect();
4230 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
4232 let split: ~[&str] = data.split_terminator('\n').collect();
4233 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
// Same contrast between `split('\n')` and `split_terminator('\n')`, but with
// the iterators consumed through `.rev()`.
// NOTE(review): the results are bound `mut` and compared against
// forward-order expectations; presumably each vector is reversed again in
// lines that are not visible in this listing -- confirm.
4237 fn test_rev_split_char_iterator_no_trailing() {
4238 let data = "\nMäry häd ä little lämb\nLittle lämb\n";
4240 let mut split: ~[&str] = data.split('\n').rev().collect();
4242 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
4244 let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
4246 assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
// Checks `words()`: the input mixes newlines, spaces and tabs (including a
// leading "\n \t" run), and the expected result contains only the non-empty
// words, with no empty pieces.
// NOTE(review): the enclosing fn header is not visible in this listing.
4251 let data = "\n \tMäry häd\tä little lämb\nLittle lämb\n";
4252 let words: ~[&str] = data.words().collect();
4253 assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
// Pins the output of `nfd_chars()` (collected into `~str`) for a set of
// inputs. Per the asserted values: plain ASCII and "\u2026" pass through
// unchanged, "\u1e0b" expands to "d\u0307", "\u2126" maps to "\u03a9", the
// marks \u0323/\u0307 always come out in the order \u0323 \u0307, a mark that
// precedes its base ("\u0301a") is left in place, and "\ud4db"/"\uac1c"
// expand to sequences of \u11xx code points.
4257 fn test_nfd_chars() {
4258 assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
4259 assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
4260 assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
4261 assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
4262 assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
4263 assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
4264 assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
4265 assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
4266 assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
4267 assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
// Pins the output of `nfkd_chars()` (collected into `~str`) for a set of
// inputs. Per the asserted values: "\u01c4" expands to "DZ\u030c" and
// "\u2026" to "...", "\u1e0b" expands to "d\u0307", "\u2126" maps to
// "\u03a9", the marks \u0323/\u0307 always come out in the order
// \u0323 \u0307, and "\ud4db"/"\uac1c" expand to sequences of \u11xx code
// points.
4271 fn test_nfkd_chars() {
4272 assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
4273 assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
4274 assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
4275 assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
4276 assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
4277 assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
4278 assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
4279 assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
4280 assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
4281 assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
// Checks `lines()` on input with and without a final "\n": both variants are
// expected to yield the same four lines, including the empty first line and
// the empty line produced by the "\n\n" pair.
// NOTE(review): the enclosing fn header is not visible in this listing.
4286 let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
4287 let lines: ~[&str] = data.lines().collect();
4288 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
4290 let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
4291 let lines: ~[&str] = data.lines().collect();
4292 assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
// Table-driven checks for `split_str`. The helper `t` collects
// `s.split_str(sep)` into `v`; `u` carries the expected pieces.
// NOTE(review): the comparison of `v` with `u` is not visible in this
// listing.
4296 fn test_split_strator() {
4297 fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
4298 let v: ~[&str] = s.split_str(sep).collect();
// Separator absent, then separators in the middle, at the start, at the end
// and at both ends; a multibyte separator; and single-char separators.
4301 t("--1233345--", "12345", ~["--1233345--"]);
4302 t("abc::hello::there", "::", ~["abc", "hello", "there"]);
4303 t("::hello::there", "::", ~["", "hello", "there"]);
4304 t("hello::there::", "::", ~["hello", "there", ""]);
4305 t("::hello::there::", "::", ~["", "hello", "there", ""]);
4306 t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
4307 t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
4308 t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
4309 t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
// Inputs made up of the separator's own character: per the expectations,
// matches are consumed left to right without overlapping.
4311 t("zz", "zz", ~["",""]);
4312 t("ok", "z", ~["ok"]);
4313 t("zzz", "zz", ~["","z"]);
4314 t("zzzzz", "zz", ~["","","z"]);
// Checks that the `Default` value of a string-like type is the empty string:
// the generic helper `t` builds `S::default()` for any `S: Default + Str` and
// asserts that its `as_slice()` is "".
// NOTE(review): the concrete instantiations of `t` are not visible in this
// listing.
4318 fn test_str_default() {
4319 use default::Default;
4320 fn t<S: Default + Str>() {
4321 let s: S = Default::default();
4322 assert_eq!(s.as_slice(), "");
// Checks `len()` through the `Container` bound: the helper `sum_len` adds up
// `len()` over a slice of any `S: Container`, and each call below is expected
// to total 5 for borrowed strings, owned strings, and a slice taken from `s`.
// NOTE(review): the binding of `s` is not visible in this listing.
4330 fn test_str_container() {
4331 fn sum_len<S: Container>(v: &[S]) -> uint {
4332 v.iter().map(|x| x.len()).sum()
4336 assert_eq!(5, sum_len(["012", "", "34"]));
4337 assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
4338 assert_eq!(5, sum_len([s.as_slice()]));
// Truncation test on an owned "12345": the visible assertions expect the
// contents to be "12345", then "123", then "".
// NOTE(review): the mutating calls between the assertions are not visible in
// this listing. The second half re-creates the string and captures
// `s.as_ptr()` in `p_`; what `p_` is compared against is likewise not
// visible.
4342 fn test_str_truncate() {
4343 let mut s = ~"12345";
4345 assert_eq!(s.as_slice(), "12345");
4347 assert_eq!(s.as_slice(), "123");
4349 assert_eq!(s.as_slice(), "");
4351 let mut s = ~"12345";
4355 let p_ = s.as_ptr();
// Sets up an owned "12345" for a truncation check with an invalid length
// (per the test name).
// NOTE(review): the call under test and any expected-failure attribute are
// not visible in this listing.
4361 fn test_str_truncate_invalid_len() {
4362 let mut s = ~"12345";
// Sets up an owned string holding the single char \u00FC for a truncation
// check that would cut inside a code point (per the test name).
// NOTE(review): the call under test and any expected-failure attribute are
// not visible in this listing.
4368 fn test_str_truncate_split_codepoint() {
4369 let mut s = ~"\u00FC"; // ü
// Checks `from_utf8` on byte slices: ASCII and multibyte text yield
// `Some(..)` with the same text, while input with a trailing 0xff byte yields
// `None`.
4374 fn test_str_from_utf8() {
4375 let xs = bytes!("hello");
4376 assert_eq!(from_utf8(xs), Some("hello"));
4378 let xs = bytes!("ศไทย中华Việt Nam");
4379 assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
4381 let xs = bytes!("hello", 0xff);
4382 assert_eq!(from_utf8(xs), None);
// Checks `from_utf8_owned` on owned byte vectors: ASCII and multibyte text
// yield `Some(..)` with the same text as an owned string, while input with a
// trailing 0xff byte yields `None`.
4386 fn test_str_from_utf8_owned() {
4387 let xs = bytes!("hello").to_owned();
4388 assert_eq!(from_utf8_owned(xs), Some(~"hello"));
4390 let xs = bytes!("ศไทย中华Việt Nam").to_owned();
4391 assert_eq!(from_utf8_owned(xs), Some(~"ศไทย中华Việt Nam"));
4393 let xs = bytes!("hello", 0xff).to_owned();
4394 assert_eq!(from_utf8_owned(xs), None);
// Pins the output of `from_utf8_lossy`. Valid input is expected back as
// `Slice(..)`; input containing invalid bytes is expected as `Owned(..)` with
// \uFFFD substituted. The cases below fix how many \uFFFD each malformed
// sequence produces.
4398 fn test_str_from_utf8_lossy() {
// Valid ASCII and valid multibyte input: returned as `Slice`.
4399 let xs = bytes!("hello");
4400 assert_eq!(from_utf8_lossy(xs), Slice("hello"));
4402 let xs = bytes!("ศไทย中华Việt Nam");
4403 assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
// A lone 0xC2 and a lone 0xFF: one \uFFFD each.
4405 let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
4406 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
// 0xC0 0x80 gives two \uFFFD; the pair 0xE6 0x83 gives a single one.
4408 let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4409 assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
// 0xF5 alone gives one \uFFFD; 0xF5 0x80 gives two.
4411 let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
4412 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
// 0xF1 followed by zero, one or two 0x80 bytes: a single \uFFFD each time.
4414 let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
4415 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
// 0xF4 alone and 0xF4 0x80 give one \uFFFD each; 0xF4 0xBF gives two.
4417 let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
4418 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
// 0xF0 0x80 0x80 0x80 gives four \uFFFD, whereas 0xF0 0x90 0x80 0x80 decodes
// to \U00010000.
4420 let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
4421 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
// 0xED 0xA0 0x80 and 0xED 0xBF 0xBF: three \uFFFD each, one per byte.
4424 let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4425 assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
// Checks that `from_str` with an `Option<~str>` target turns "string" into
// `Some(~"string")`.
4429 fn test_from_str() {
4430 let owned: Option<~str> = from_str(&"string");
4431 assert_eq!(owned, Some(~"string"));
// Exercises the trait surface of the `Slice` and `Owned` variants with the
// same text: `len`, `as_slice`, `to_str`, `format!`, `lt`, the `Default`
// value, and finally `cmp`/`equals`/`equiv` between the two variants in both
// directions.
4435 fn test_maybe_owned_traits() {
4436 let s = Slice("abcde");
4437 assert_eq!(s.len(), 5);
4438 assert_eq!(s.as_slice(), "abcde");
4439 assert_eq!(s.to_str(), ~"abcde");
4440 assert_eq!(format!("{}", s), ~"abcde");
4441 assert!(s.lt(&Owned(~"bcdef")));
4442 assert_eq!(Slice(""), Default::default());
4444 let o = Owned(~"abcde");
4445 assert_eq!(o.len(), 5);
4446 assert_eq!(o.as_slice(), "abcde");
4447 assert_eq!(o.to_str(), ~"abcde");
4448 assert_eq!(format!("{}", o), ~"abcde");
4449 assert!(o.lt(&Slice("bcdef")));
4450 assert_eq!(Owned(~""), Default::default());
// Comparisons across variants: a `Slice` and an `Owned` holding the same text
// are expected to compare as equal.
4452 assert!(s.cmp(&o) == Equal);
4453 assert!(s.equals(&o));
4454 assert!(s.equiv(&o));
4456 assert!(o.cmp(&s) == Equal);
4457 assert!(o.equals(&s));
4458 assert!(o.equiv(&s));
// Checks the variant predicates: a `Slice` reports `is_slice()` and not
// `is_owned()`, and an `Owned` reports the opposite.
4462 fn test_maybe_owned_methods() {
4463 let s = Slice("abcde");
4464 assert!(s.is_slice());
4465 assert!(!s.is_owned());
4467 let o = Owned(~"abcde");
4468 assert!(!o.is_slice());
4469 assert!(o.is_owned());
// Checks that cloning either variant yields a value equal to both a `Slice`
// and an `Owned` with the same text (all four combinations are asserted).
4473 fn test_maybe_owned_clone() {
4474 assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4475 assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4476 assert_eq!(Slice("abcde"), Slice("abcde").clone());
4477 assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
// Checks that `into_owned()` on either variant produces the owned string
// ~"abcde".
4481 fn test_maybe_owned_into_owned() {
4482 assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4483 assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
// Checks `into_maybe_owned()` on a borrowed and on an owned string: each
// result is expected to compare equal to both `Slice("abcde")` and
// `Owned(~"abcde")`.
4487 fn test_into_maybe_owned() {
4488 assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4489 assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4490 assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4491 assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4498 use self::test::BenchHarness;
// Benchmark: counts the items of `s.chars()` on mixed multibyte/ASCII text.
// `len` is computed once outside the timed closure via `char_len()`; each
// iteration asserts that the iterator length matches it.
4503 fn char_iterator(bh: &mut BenchHarness) {
4504 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4505 let len = s.char_len();
4507 bh.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: counts the items of `s.chars()` on a multi-line, ASCII-only
// string literal. `len` is computed once outside the timed closure via
// `char_len()`; each iteration asserts that the iterator length matches it.
4511 fn char_iterator_ascii(bh: &mut BenchHarness) {
4512 let s = "Mary had a little lamb, Little lamb
4513 Mary had a little lamb, Little lamb
4514 Mary had a little lamb, Little lamb
4515 Mary had a little lamb, Little lamb
4516 Mary had a little lamb, Little lamb
4517 Mary had a little lamb, Little lamb";
4518 let len = s.char_len();
4520 bh.iter(|| assert_eq!(s.chars().len(), len));
// Benchmark: counts the items of `s.chars_rev()` on mixed multibyte/ASCII
// text. `len` is computed once outside the timed closure via `char_len()`;
// each iteration asserts that the iterator length matches it.
4524 fn char_iterator_rev(bh: &mut BenchHarness) {
4525 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4526 let len = s.char_len();
4528 bh.iter(|| assert_eq!(s.chars_rev().len(), len));
// Benchmark: counts the items of `s.char_indices()` on mixed multibyte/ASCII
// text. `len` is computed once outside the timed closure via `char_len()`;
// each iteration asserts that the iterator length matches it.
4532 fn char_indicesator(bh: &mut BenchHarness) {
4533 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4534 let len = s.char_len();
4536 bh.iter(|| assert_eq!(s.char_indices().len(), len));
// Benchmark: counts the items of `s.char_indices_rev()` on mixed
// multibyte/ASCII text. `len` is computed once outside the timed closure via
// `char_len()`; each iteration asserts that the iterator length matches it.
4540 fn char_indicesator_rev(bh: &mut BenchHarness) {
4541 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4542 let len = s.char_len();
4544 bh.iter(|| assert_eq!(s.char_indices_rev().len(), len));
// Benchmark: splits multibyte text on the ASCII char 'V', which occurs twice
// in `s`, and asserts that three pieces are produced.
4548 fn split_unicode_ascii(bh: &mut BenchHarness) {
4549 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4551 bh.iter(|| assert_eq!(s.split('V').len(), 3));
// Benchmark: same split as on a plain 'V', but through the local `NotAscii`
// wrapper, a `CharEq` implementation whose `only_ascii()` returns false.
// NOTE(review): presumably this steers `split` away from an ASCII-only fast
// path so the general path is measured -- confirm against `split`'s
// implementation. The comparison inside `matches` is not visible in this
// listing.
4555 fn split_unicode_not_ascii(bh: &mut BenchHarness) {
4556 struct NotAscii(char);
4557 impl CharEq for NotAscii {
4558 fn matches(&self, c: char) -> bool {
4559 let NotAscii(cc) = *self;
4562 fn only_ascii(&self) -> bool { false }
4564 let s = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";
4566 bh.iter(|| assert_eq!(s.split(NotAscii('V')).len(), 3));
// Benchmark: splits ASCII text on ' '. The expected piece count is computed
// once outside the timed closure with the same call, so each iteration only
// asserts that the count is reproduced.
4571 fn split_ascii(bh: &mut BenchHarness) {
4572 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4573 let len = s.split(' ').len();
4575 bh.iter(|| assert_eq!(s.split(' ').len(), len));
// Benchmark: splits ASCII text on ' ' through the local `NotAscii` wrapper, a
// `CharEq` implementation whose `only_ascii()` returns false. The expected
// piece count comes from a plain `split(' ')` done once outside the timed
// closure.
// NOTE(review): presumably the wrapper steers `split` away from an ASCII-only
// fast path -- confirm against `split`'s implementation. The comparison
// inside `matches` is not visible in this listing.
4579 fn split_not_ascii(bh: &mut BenchHarness) {
4580 struct NotAscii(char);
4581 impl CharEq for NotAscii {
4583 fn matches(&self, c: char) -> bool {
4584 let NotAscii(cc) = *self;
4587 fn only_ascii(&self) -> bool { false }
4589 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4590 let len = s.split(' ').len();
4592 bh.iter(|| assert_eq!(s.split(NotAscii(' ')).len(), len));
// Benchmark: splits ASCII text using the named function `pred` as the
// separator predicate. The expected piece count comes from a plain
// `split(' ')` done once outside the timed closure.
4596 fn split_extern_fn(bh: &mut BenchHarness) {
4597 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4598 let len = s.split(' ').len();
4599 fn pred(c: char) -> bool { c == ' ' }
4601 bh.iter(|| assert_eq!(s.split(pred).len(), len));
// Benchmark: splits ASCII text using a closure as the separator predicate.
// The expected piece count comes from a plain `split(' ')` done once outside
// the timed closure.
4605 fn split_closure(bh: &mut BenchHarness) {
4606 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4607 let len = s.split(' ').len();
4609 bh.iter(|| assert_eq!(s.split(|c: char| c == ' ').len(), len));
// Benchmark: splits ASCII text using a one-element char slice `&[' ']` as the
// separator set. The expected piece count comes from a plain `split(' ')`
// done once outside the timed closure.
4613 fn split_slice(bh: &mut BenchHarness) {
4614 let s = "Mary had a little lamb, Little lamb, little-lamb.";
4615 let len = s.split(' ').len();
4617 bh.iter(|| assert_eq!(s.split(&[' ']).len(), len));
// Benchmark setup for UTF-8 validation of ASCII input: builds a byte string
// and asserts it is exactly 100 bytes long, so the measured input size is
// fixed.
// NOTE(review): the timed `bh.iter` body is not visible in this listing.
4621 fn is_utf8_100_ascii(bh: &mut BenchHarness) {
4623 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4624 Lorem ipsum dolor sit amet, consectetur. ");
4626 assert_eq!(100, s.len());
// Benchmark setup for UTF-8 validation of multibyte input: builds a byte
// string from non-ASCII text and asserts it is exactly 100 bytes long.
// NOTE(review): the timed `bh.iter` body is not visible in this listing.
4633 fn is_utf8_100_multibyte(bh: &mut BenchHarness) {
4634 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4635 assert_eq!(100, s.len());
// Benchmark: runs `from_utf8_lossy` over a 100-byte ASCII input (the length
// is asserted up front). The result is discarded with `let _`.
// NOTE(review): the `bh.iter` wrapper around the call is not visible in this
// listing.
4642 fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
4643 let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
4644 Lorem ipsum dolor sit amet, consectetur. ");
4646 assert_eq!(100, s.len());
4648 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a 100-byte multibyte input (the
// length is asserted up front). The result is discarded with `let _`.
// NOTE(review): the `bh.iter` wrapper around the call is not visible in this
// listing.
4653 fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
4654 let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
4655 assert_eq!(100, s.len());
4657 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over a short input that mixes ASCII text
// with the byte sequences 0xC0 0x80 and 0xE6 0x83. The result is discarded
// with `let _`.
// NOTE(review): the `bh.iter` wrapper around the call is not visible in this
// listing.
4662 fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
4663 let s = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4665 let _ = from_utf8_lossy(s);
// Benchmark: runs `from_utf8_lossy` over 100 copies of the byte 0xF5, built
// with `::slice::from_elem`. The result is discarded with `let _`.
// NOTE(review): the `bh.iter` wrapper around the call is not visible in this
// listing.
4670 fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
4671 let s = ::slice::from_elem(100, 0xF5u8);
4673 let _ = from_utf8_lossy(s);
// Benchmark entry point taking the harness `bh`.
// NOTE(review): the body is not visible in this listing; by its name it
// presumably measures creating a string with a preset capacity -- confirm.
4678 fn bench_with_capacity(bh: &mut BenchHarness) {
// Benchmark entry point that prepares the mixed multibyte/ASCII sample `s`.
// NOTE(review): the timed body is not visible in this listing; by its name it
// presumably appends `s` to a string -- confirm.
4685 fn bench_push_str(bh: &mut BenchHarness) {
4686 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
// Benchmark: joins ten copies of `s` with `connect(sep)` and asserts the
// resulting length is ten times `s.len()` plus nine separators.
// NOTE(review): the binding of `sep` and the `bh.iter` wrapper are not
// visible in this listing.
4694 fn bench_connect(bh: &mut BenchHarness) {
4695 let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
4697 let v = [s, s, s, s, s, s, s, s, s, s];
4699 assert_eq!(v.connect(sep).len(), s.len() * 10 + sep.len() * 9);