src/libstd/str.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 /*!
  12
  13 Unicode string manipulation (`str` type)
  14
  15 # Basic Usage
  16
  17 Rust's string type is one of the core primitive types of the language. While
  18 represented by the name `str`, the name `str` is not actually a valid type in
  19 Rust. Each string must also be decorated with its ownership. This means that
  20 there are two common kinds of strings in Rust:
  21
  22 * `~str` - This is an owned string. This type obeys all of the normal semantics
  23            of the `~T` types, meaning that it has one, and only one, owner. This
  24            type cannot be implicitly copied, and is moved out of when passed to
  25            other functions.
  26
  27 * `&str` - This is the borrowed string type. This type of string can only be
  28            created from the other kind of string. As the name "borrowed"
  29            implies, this type of string is owned elsewhere, and this string
  30            cannot be moved out of.
  31
  32 As an example, here are a few different kinds of strings.
  33
  34 ```rust
  35 fn main() {
  36     let owned_string = ~"I am an owned string";
  37     let borrowed_string1 = "This string is borrowed with the 'static lifetime";
  38     let borrowed_string2: &str = owned_string;   // owned strings can be borrowed
  39 }
  40  ```
  41
  42 From the example above, you can see that Rust has 2 different kinds of string
  43 literals. The owned literals correspond to the owned string types, but the
  44 "borrowed literal" is actually more akin to C's concept of a static string.
  45
  46 When a string is declared without a `~` sigil, then the string is allocated
  47 statically in the rodata of the executable/library. The string then has the
  48 type `&'static str` meaning that the string is valid for the `'static`
  49 lifetime, otherwise known as the lifetime of the entire program. As can be
  50 inferred from the type, these static strings are not mutable.
  51
  52 # Mutability
  53
  54 Many languages have immutable strings by default, and Rust has its own
  55 flavor of this idea. As with the rest of Rust's types, strings are immutable by
  56 default. If a string is declared as `mut`, however, it may be mutated. This
  57 works the same way as the rest of Rust's type system in the sense that if
  58 there's a mutable reference to a string, there may only be one mutable reference
  59 to that string. With these guarantees, strings can easily transition between
  60 being mutable/immutable with the same benefits of having mutable strings in
  61 other languages.
  62
  63 # Representation
  64
  65 Rust's string type, `str`, is a sequence of unicode codepoints encoded as a
  66 stream of UTF-8 bytes. All safely-created strings are guaranteed to be validly
  67 encoded UTF-8 sequences. Additionally, strings are not null-terminated
  68 and can contain null codepoints.
  69
  70 The actual representations of strings map directly onto vectors:
  71
  72 * `~str` is the same as `~[u8]`
  73 * `&str` is the same as `&[u8]`
  74
  75 */
  76
  77 use cast;
  78 use cast::transmute;
  79 use char;
  80 use char::Char;
  81 use clone::Clone;
  82 use cmp::{Eq, TotalEq, Ord, TotalOrd, Equiv, Ordering};
  83 use container::{Container, Mutable};
  84 use fmt;
  85 use io::Writer;
  86 use iter::{Iterator, FromIterator, Extendable, range};
  87 use iter::{Filter, AdditiveIterator, Map};
  88 use iter::{Rev, DoubleEndedIterator, ExactSize};
  89 use libc;
  90 use num::Saturating;
  91 use option::{None, Option, Some};
  92 use ptr;
  93 use from_str::FromStr;
  94 use slice;
  95 use slice::{OwnedVector, OwnedCloneableVector, ImmutableVector, MutableVector};
  96 use slice::{Vector};
  97 use vec::Vec;
  98 use default::Default;
  99 use raw::Repr;
 100 use strbuf::StrBuf;
 101
 102 /*
 103 Section: Creating a string
 104 */
 105
 106 /// Consumes a vector of bytes to create a new utf-8 string.
 107 /// Returns None if the vector contains invalid UTF-8.
 108 pub fn from_utf8_owned(vv: ~[u8]) -> Option<~str> {
 109     if is_utf8(vv) {
 110         Some(unsafe { raw::from_utf8_owned(vv) })
 111     } else {
 112         None
 113     }
 114 }
 115
 116 /// Converts a vector to a string slice without performing any allocations.
 117 ///
 118 /// Once the slice has been validated as utf-8, it is transmuted in-place and
 119 /// returned as a '&str' instead of a '&[u8]'
 120 ///
 121 /// Returns None if the slice is not utf-8.
 122 pub fn from_utf8<'a>(v: &'a [u8]) -> Option<&'a str> {
 123     if is_utf8(v) {
 124         Some(unsafe { raw::from_utf8(v) })
 125     } else { None }
 126 }
 127
 128 impl FromStr for ~str {
 129     #[inline]
 130     fn from_str(s: &str) -> Option<~str> { Some(s.to_owned()) }
 131 }
 132
 133 /// Convert a byte to a UTF-8 string
 134 ///
 135 /// # Failure
 136 ///
 137 /// Fails if invalid UTF-8
 138 pub fn from_byte(b: u8) -> ~str {
 139     assert!(b < 128u8);
 140     unsafe { ::cast::transmute(~[b]) }
 141 }
 142
 143 /// Convert a char to a string
 144 pub fn from_char(ch: char) -> ~str {
 145     let mut buf = StrBuf::new();
 146     buf.push_char(ch);
 147     buf.into_owned()
 148 }
 149
 150 /// Convert a vector of chars to a string
 151 pub fn from_chars(chs: &[char]) -> ~str {
 152     chs.iter().map(|c| *c).collect()
 153 }
 154
/// Methods for vectors of strings
///
/// This file implements the trait for `&[S]` and `Vec<S>` where `S: Str`.
pub trait StrVector {
    /// Concatenate a vector of strings.
    ///
    /// The result is returned as a new owned string.
    fn concat(&self) -> ~str;

    /// Concatenate a vector of strings, placing a given separator between each.
    ///
    /// The result is returned as a new owned string.
    fn connect(&self, sep: &str) -> ~str;
}
 163
 164 impl<'a, S: Str> StrVector for &'a [S] {
 165     fn concat(&self) -> ~str {
 166         if self.is_empty() { return ~""; }
 167
 168         // `len` calculation may overflow but push_str but will check boundaries
 169         let len = self.iter().map(|s| s.as_slice().len()).sum();
 170
 171         let mut result = StrBuf::with_capacity(len);
 172
 173         for s in self.iter() {
 174             result.push_str(s.as_slice())
 175         }
 176
 177         result.into_owned()
 178     }
 179
 180     fn connect(&self, sep: &str) -> ~str {
 181         if self.is_empty() { return ~""; }
 182
 183         // concat is faster
 184         if sep.is_empty() { return self.concat(); }
 185
 186         // this is wrong without the guarantee that `self` is non-empty
 187         // `len` calculation may overflow but push_str but will check boundaries
 188         let len = sep.len() * (self.len() - 1)
 189             + self.iter().map(|s| s.as_slice().len()).sum();
 190         let mut result = StrBuf::with_capacity(len);
 191         let mut first = true;
 192
 193         for s in self.iter() {
 194             if first {
 195                 first = false;
 196             } else {
 197                 result.push_str(sep);
 198             }
 199             result.push_str(s.as_slice());
 200         }
 201         result.into_owned()
 202     }
 203 }
 204
 205 impl<'a, S: Str> StrVector for Vec<S> {
 206     #[inline]
 207     fn concat(&self) -> ~str {
 208         self.as_slice().concat()
 209     }
 210
 211     #[inline]
 212     fn connect(&self, sep: &str) -> ~str {
 213         self.as_slice().connect(sep)
 214     }
 215 }
 216
/// Something that can be used to compare against a character
///
/// This file implements it for `char`, for `|char| -> bool` closures, for
/// `extern "Rust" fn(char) -> bool` and for slices of other `CharEq` values.
pub trait CharEq {
    /// Determine if the splitter should split at the given character
    fn matches(&self, char) -> bool;
    /// Indicate if this is only concerned about ASCII characters,
    /// which can allow for a faster implementation.
    ///
    /// `CharSplits` scans the string byte by byte instead of character by
    /// character when its `only_ascii` flag is set.
    fn only_ascii(&self) -> bool;
}
 225
 226 impl CharEq for char {
 227     #[inline]
 228     fn matches(&self, c: char) -> bool { *self == c }
 229
 230     fn only_ascii(&self) -> bool { (*self as uint) < 128 }
 231 }
 232
 233 impl<'a> CharEq for |char|: 'a -> bool {
 234     #[inline]
 235     fn matches(&self, c: char) -> bool { (*self)(c) }
 236
 237     fn only_ascii(&self) -> bool { false }
 238 }
 239
 240 impl CharEq for extern "Rust" fn(char) -> bool {
 241     #[inline]
 242     fn matches(&self, c: char) -> bool { (*self)(c) }
 243
 244     fn only_ascii(&self) -> bool { false }
 245 }
 246
 247 impl<'a, C: CharEq> CharEq for &'a [C] {
 248     #[inline]
 249     fn matches(&self, c: char) -> bool {
 250         self.iter().any(|m| m.matches(c))
 251     }
 252
 253     fn only_ascii(&self) -> bool {
 254         self.iter().all(|m| m.only_ascii())
 255     }
 256 }
 257
 258 /*
 259 Section: Iterators
 260 */
 261
/// External iterator for a string's characters.
/// Use with the `std::iter` module.
///
/// Characters can be taken from either end; each one taken shrinks the
/// remaining slice.
#[deriving(Clone)]
pub struct Chars<'a> {
    /// The slice remaining to be iterated
    string: &'a str,
}
 269
 270 impl<'a> Iterator<char> for Chars<'a> {
 271     #[inline]
 272     fn next(&mut self) -> Option<char> {
 273         // Decode the next codepoint, then update
 274         // the slice to be just the remaining part
 275         if self.string.len() != 0 {
 276             let CharRange {ch, next} = self.string.char_range_at(0);
 277             unsafe {
 278                 self.string = raw::slice_unchecked(self.string, next, self.string.len());
 279             }
 280             Some(ch)
 281         } else {
 282             None
 283         }
 284     }
 285
 286     #[inline]
 287     fn size_hint(&self) -> (uint, Option<uint>) {
 288         (self.string.len().saturating_add(3)/4, Some(self.string.len()))
 289     }
 290 }
 291
 292 impl<'a> DoubleEndedIterator<char> for Chars<'a> {
 293     #[inline]
 294     fn next_back(&mut self) -> Option<char> {
 295         if self.string.len() != 0 {
 296             let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
 297             unsafe {
 298                 self.string = raw::slice_unchecked(self.string, 0, next);
 299             }
 300             Some(ch)
 301         } else {
 302             None
 303         }
 304     }
 305 }
 306
/// External iterator for a string's characters and their byte offsets.
/// Use with the `std::iter` module.
#[deriving(Clone)]
pub struct CharOffsets<'a> {
    /// The original string to be iterated
    ///
    /// Kept so that offsets can be computed relative to its start.
    string: &'a str,
    /// The character iterator over what is left of `string`
    iter: Chars<'a>,
}
 315
 316 impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
 317     #[inline]
 318     fn next(&mut self) -> Option<(uint, char)> {
 319         // Compute the byte offset by using the pointer offset between
 320         // the original string slice and the iterator's remaining part
 321         let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
 322         self.iter.next().map(|ch| (offset, ch))
 323     }
 324
 325     #[inline]
 326     fn size_hint(&self) -> (uint, Option<uint>) {
 327         self.iter.size_hint()
 328     }
 329 }
 330
 331 impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
 332     #[inline]
 333     fn next_back(&mut self) -> Option<(uint, char)> {
 334         self.iter.next_back().map(|ch| {
 335             let offset = self.iter.string.len() +
 336                     self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
 337             (offset, ch)
 338         })
 339     }
 340 }
 341
/// External iterator for a string's characters in reverse order.
/// Use with the `std::iter` module.
///
/// This is `Chars` wrapped in the `Rev` adaptor.
pub type RevChars<'a> = Rev<Chars<'a>>;

/// External iterator for a string's characters and their byte offsets in reverse order.
/// Use with the `std::iter` module.
///
/// This is `CharOffsets` wrapped in the `Rev` adaptor.
pub type RevCharOffsets<'a> = Rev<CharOffsets<'a>>;

/// External iterator for a string's bytes.
/// Use with the `std::iter` module.
///
/// This is a slice iterator over `u8` whose `&u8` items are mapped to `u8`.
pub type Bytes<'a> =
    Map<'a, &'a u8, u8, slice::Items<'a, u8>>;

/// External iterator for a string's bytes in reverse order.
/// Use with the `std::iter` module.
///
/// This is `Bytes` wrapped in the `Rev` adaptor.
pub type RevBytes<'a> = Rev<Bytes<'a>>;
 358
/// An iterator over the substrings of a string, separated by `sep`.
#[deriving(Clone)]
pub struct CharSplits<'a, Sep> {
    /// The slice remaining to be iterated
    string: &'a str,
    /// The separator; `Sep` must implement `CharEq` for iteration
    sep: Sep,
    /// Whether an empty string at the end is allowed
    allow_trailing_empty: bool,
    /// When set, separators are searched byte by byte instead of character
    /// by character. NOTE(review): presumably initialised from
    /// `sep.only_ascii()`; the constructor is not in this part of the file.
    only_ascii: bool,
    /// Set once the final piece has been handed out; afterwards `next` and
    /// `next_back` return `None`
    finished: bool,
}
 370
/// An iterator over the substrings of a string, separated by `sep`,
/// starting from the back of the string.
///
/// This is `CharSplits` wrapped in the `Rev` adaptor.
pub type RevCharSplits<'a, Sep> = Rev<CharSplits<'a, Sep>>;
 374
/// An iterator over the substrings of a string, separated by `sep`,
/// splitting at most `count` times.
#[deriving(Clone)]
pub struct CharSplitsN<'a, Sep> {
    /// The underlying unlimited splitter
    iter: CharSplits<'a, Sep>,
    /// The number of splits remaining
    count: uint,
    /// When true, pieces are taken from the back of the string
    /// (`next_back` on the inner iterator) instead of the front
    invert: bool,
}
 384
/// An iterator over the words of a string, separated by a sequence of whitespace
///
/// This is a `Filter` over a `CharSplits` whose separator is a plain
/// function. NOTE(review): presumably the filter drops empty pieces; the
/// predicate is not in this part of the file.
pub type Words<'a> =
    Filter<'a, &'a str, CharSplits<'a, extern "Rust" fn(char) -> bool>>;

/// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
///
/// This is a `Map` over a `CharSplits` whose separator is a single `char`.
/// NOTE(review): presumably the mapping strips a trailing `\r` from each
/// piece; the mapping function is not in this part of the file.
pub type AnyLines<'a> =
    Map<'a, &'a str, &'a str, CharSplits<'a, char>>;
 392
 393 impl<'a, Sep> CharSplits<'a, Sep> {
 394     #[inline]
 395     fn get_end(&mut self) -> Option<&'a str> {
 396         if !self.finished && (self.allow_trailing_empty || self.string.len() > 0) {
 397             self.finished = true;
 398             Some(self.string)
 399         } else {
 400             None
 401         }
 402     }
 403 }
 404
 405 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplits<'a, Sep> {
 406     #[inline]
 407     fn next(&mut self) -> Option<&'a str> {
 408         if self.finished { return None }
 409
 410         let mut next_split = None;
 411         if self.only_ascii {
 412             for (idx, byte) in self.string.bytes().enumerate() {
 413                 if self.sep.matches(byte as char) && byte < 128u8 {
 414                     next_split = Some((idx, idx + 1));
 415                     break;
 416                 }
 417             }
 418         } else {
 419             for (idx, ch) in self.string.char_indices() {
 420                 if self.sep.matches(ch) {
 421                     next_split = Some((idx, self.string.char_range_at(idx).next));
 422                     break;
 423                 }
 424             }
 425         }
 426         match next_split {
 427             Some((a, b)) => unsafe {
 428                 let elt = raw::slice_unchecked(self.string, 0, a);
 429                 self.string = raw::slice_unchecked(self.string, b, self.string.len());
 430                 Some(elt)
 431             },
 432             None => self.get_end(),
 433         }
 434     }
 435 }
 436
impl<'a, Sep: CharEq> DoubleEndedIterator<&'a str>
for CharSplits<'a, Sep> {
    /// Returns the piece after the last separator and shrinks the remaining
    /// slice to end just before that separator. When no separator is left,
    /// the whole rest is returned and the iterator is marked finished.
    #[inline]
    fn next_back(&mut self) -> Option<&'a str> {
        if self.finished { return None }

        // A trailing empty piece is not allowed: flip the flag (so this
        // branch runs only once) and fetch the last piece through a
        // recursive call. A non-empty piece is returned as is. An empty one
        // is dropped, and unless that call finished the iterator, the
        // search below looks for the piece before it.
        if !self.allow_trailing_empty {
            self.allow_trailing_empty = true;
            match self.next_back() {
                Some(elt) if !elt.is_empty() => return Some(elt),
                _ => if self.finished { return None }
            }
        }
        let len = self.string.len();
        // (byte index where the separator starts, byte index just after it)
        let mut next_split = None;

        if self.only_ascii {
            // ASCII-only separators can be found by looking at single
            // bytes, scanning from the back.
            for (idx, byte) in self.string.bytes().enumerate().rev() {
                if self.sep.matches(byte as char) && byte < 128u8 {
                    next_split = Some((idx, idx + 1));
                    break;
                }
            }
        } else {
            for (idx, ch) in self.string.char_indices_rev() {
                if self.sep.matches(ch) {
                    next_split = Some((idx, self.string.char_range_at(idx).next));
                    break;
                }
            }
        }
        match next_split {
            Some((a, b)) => unsafe {
                // The piece is everything after the separator; what remains
                // is everything before it.
                let elt = raw::slice_unchecked(self.string, b, len);
                self.string = raw::slice_unchecked(self.string, 0, a);
                Some(elt)
            },
            // No separator left: the rest is the final piece.
            None => { self.finished = true; Some(self.string) }
        }
    }
}
 478
 479 impl<'a, Sep: CharEq> Iterator<&'a str> for CharSplitsN<'a, Sep> {
 480     #[inline]
 481     fn next(&mut self) -> Option<&'a str> {
 482         if self.count != 0 {
 483             self.count -= 1;
 484             if self.invert { self.iter.next_back() } else { self.iter.next() }
 485         } else {
 486             self.iter.get_end()
 487         }
 488     }
 489 }
 490
/// An iterator over the start and end indices of the matches of a
/// substring within a larger string
#[deriving(Clone)]
pub struct MatchIndices<'a> {
    /// The string being searched
    haystack: &'a str,
    /// The substring being searched for
    needle: &'a str,
    /// Byte index into `haystack` at which the next search resumes
    position: uint,
}
 499
/// An iterator over the substrings of a string separated by a given
/// search string
#[deriving(Clone)]
pub struct StrSplits<'a> {
    /// Finds the occurrences of the search string
    it: MatchIndices<'a>,
    /// Byte index just after the previous match, i.e. where the next
    /// piece starts
    last_end: uint,
    /// Set once the final piece has been returned
    finished: bool
}
 508
 509 impl<'a> Iterator<(uint, uint)> for MatchIndices<'a> {
 510     #[inline]
 511     fn next(&mut self) -> Option<(uint, uint)> {
 512         // See Issue #1932 for why this is a naive search
 513         let (h_len, n_len) = (self.haystack.len(), self.needle.len());
 514         let mut match_start = 0;
 515         let mut match_i = 0;
 516
 517         while self.position < h_len {
 518             if self.haystack[self.position] == self.needle[match_i] {
 519                 if match_i == 0 { match_start = self.position; }
 520                 match_i += 1;
 521                 self.position += 1;
 522
 523                 if match_i == n_len {
 524                     // found a match!
 525                     return Some((match_start, self.position));
 526                 }
 527             } else {
 528                 // failed match, backtrack
 529                 if match_i > 0 {
 530                     match_i = 0;
 531                     self.position = match_start;
 532                 }
 533                 self.position += 1;
 534             }
 535         }
 536         None
 537     }
 538 }
 539
 540 impl<'a> Iterator<&'a str> for StrSplits<'a> {
 541     #[inline]
 542     fn next(&mut self) -> Option<&'a str> {
 543         if self.finished { return None; }
 544
 545         match self.it.next() {
 546             Some((from, to)) => {
 547                 let ret = Some(self.it.haystack.slice(self.last_end, from));
 548                 self.last_end = to;
 549                 ret
 550             }
 551             None => {
 552                 self.finished = true;
 553                 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
 554             }
 555         }
 556     }
 557 }
 558
 559 // Helper functions used for Unicode normalization
 560 fn canonical_sort(comb: &mut [(char, u8)]) {
 561     use iter::range;
 562     use tuple::Tuple2;
 563
 564     let len = comb.len();
 565     for i in range(0, len) {
 566         let mut swapped = false;
 567         for j in range(1, len-i) {
 568             let class_a = *comb[j-1].ref1();
 569             let class_b = *comb[j].ref1();
 570             if class_a != 0 && class_b != 0 && class_a > class_b {
 571                 comb.swap(j-1, j);
 572                 swapped = true;
 573             }
 574         }
 575         if !swapped { break; }
 576     }
 577 }
 578
/// Selects which decomposition `Normalizations` applies to each character.
#[deriving(Clone)]
enum NormalizationForm {
    /// Uses `char::decompose_canonical`
    NFD,
    /// Uses `char::decompose_compatible`
    NFKD
}
 584
/// External iterator for a string's normalization's characters.
/// Use with the `std::iter` module.
#[deriving(Clone)]
pub struct Normalizations<'a> {
    /// Which decomposition to apply
    kind: NormalizationForm,
    /// The source characters that have not been decomposed yet
    iter: Chars<'a>,
    /// Decomposed characters waiting to be emitted, each paired with its
    /// canonical combining class
    buffer: ~[(char, u8)],
    /// Whether `buffer` has been put through `canonical_sort` since it was
    /// last refilled
    sorted: bool
}
 594
impl<'a> Iterator<char> for Normalizations<'a> {
    /// Returns the next character of the decomposed, canonically ordered
    /// output, refilling the buffer from the source characters when needed.
    #[inline]
    fn next(&mut self) -> Option<char> {
        use unicode::decompose::canonical_combining_class;

        // Fast path: emit straight from the buffer when that is safe.
        match self.buffer.head() {
            // A class-0 character at the front can be emitted immediately.
            Some(&(c, 0)) => {
                self.sorted = false;
                self.buffer.shift();
                return Some(c);
            }
            // A non-zero class may only be emitted once the buffer is
            // known to be sorted.
            Some(&(c, _)) if self.sorted => {
                self.buffer.shift();
                return Some(c);
            }
            // Empty or unsorted buffer: fall through and refill.
            _ => self.sorted = false
        }

        let decomposer = match self.kind {
            NFD => char::decompose_canonical,
            NFKD => char::decompose_compatible
        };

        if !self.sorted {
            for ch in self.iter {
                let buffer = &mut self.buffer;
                let sorted = &mut self.sorted;
                decomposer(ch, |d| {
                    let class = canonical_combining_class(d);
                    // The first class-0 character closes the pending run:
                    // sort what is buffered so far, then append it.
                    if class == 0 && !*sorted {
                        canonical_sort(*buffer);
                        *sorted = true;
                    }
                    buffer.push((d, class));
                });
                // Stop pulling source characters once the buffer is sorted.
                if *sorted { break }
            }
        }

        // The source ran out before a class-0 character was seen: sort
        // whatever is left.
        if !self.sorted {
            canonical_sort(self.buffer);
            self.sorted = true;
        }

        match self.buffer.shift() {
            // Emitting a class-0 character marks the buffer as needing a
            // fresh look on the next call.
            Some((c, 0)) => {
                self.sorted = false;
                Some(c)
            }
            Some((c, _)) => Some(c),
            None => None
        }
    }

    /// Passes on the lower bound of the underlying character iterator and
    /// reports no upper bound.
    fn size_hint(&self) -> (uint, Option<uint>) {
        let (lower, _) = self.iter.size_hint();
        (lower, None)
    }
}
 654
 655 /// Replace all occurrences of one string with another
 656 ///
 657 /// # Arguments
 658 ///
 659 /// * s - The string containing substrings to replace
 660 /// * from - The string to replace
 661 /// * to - The replacement string
 662 ///
 663 /// # Return value
 664 ///
 665 /// The original string with all occurances of `from` replaced with `to`
 666 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
 667     let mut result = StrBuf::new();
 668     let mut last_end = 0;
 669     for (start, end) in s.match_indices(from) {
 670         result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
 671         result.push_str(to);
 672         last_end = end;
 673     }
 674     result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
 675     result.into_owned()
 676 }
 677
 678 /*
 679 Section: Comparing strings
 680 */
 681
 682 // share the implementation of the lang-item vs. non-lang-item
 683 // eq_slice.
 684 #[inline]
 685 fn eq_slice_(a: &str, b: &str) -> bool {
 686     a.len() == b.len() && unsafe {
 687         libc::memcmp(a.as_ptr() as *libc::c_void,
 688                      b.as_ptr() as *libc::c_void,
 689                      a.len() as libc::size_t) == 0
 690     }
 691 }
 692
/// Bytewise slice equality
///
/// This is the variant compiled outside of test builds; it is registered
/// as the `str_eq` lang item and forwards to `eq_slice_`.
#[cfg(not(test))]
#[lang="str_eq"]
#[inline]
pub fn eq_slice(a: &str, b: &str) -> bool {
    eq_slice_(a, b)
}
 700
/// Bytewise slice equality
///
/// This is the variant compiled in test builds; it carries no lang-item
/// attribute and forwards to `eq_slice_`.
#[cfg(test)]
#[inline]
pub fn eq_slice(a: &str, b: &str) -> bool {
    eq_slice_(a, b)
}
 707
/// Bytewise string equality
///
/// This is the variant compiled outside of test builds; it is registered
/// as the `uniq_str_eq` lang item and forwards to `eq_slice`.
#[cfg(not(test))]
#[lang="uniq_str_eq"]
#[inline]
pub fn eq(a: &~str, b: &~str) -> bool {
    eq_slice(*a, *b)
}
 715
/// Bytewise string equality
///
/// This is the variant compiled in test builds; it carries no lang-item
/// attribute and forwards to `eq_slice`.
#[cfg(test)]
#[inline]
pub fn eq(a: &~str, b: &~str) -> bool {
    eq_slice(*a, *b)
}
 721
 722 /*
 723 Section: Misc
 724 */
 725
/// Walk through `iter` checking that it's a valid UTF-8 sequence,
/// returning `true` in that case, or, if it is invalid, `false` with
/// `iter` reset such that it is pointing at the first byte in the
/// invalid sequence.
///
/// On success the iterator has been run to its end.
#[inline(always)]
fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
    // Each pass of this loop validates one encoded codepoint.
    loop {
        // save the current thing we're pointing at.
        let old = *iter;

        // restore the iterator we had at the start of this codepoint.
        macro_rules! err ( () => { {*iter = old; return false} });
        // fetch one more byte of the current codepoint, failing the whole
        // validation if the input ends in the middle of it.
        macro_rules! next ( () => {
                match iter.next() {
                    Some(a) => *a,
                    // we needed data, but there was none: error!
                    None => err!()
                }
            });

        let first = match iter.next() {
            Some(&b) => b,
            // we're at the end of the iterator and a codepoint
            // boundary at the same time, so this string is valid.
            None => return true
        };

        // ASCII characters are always valid, so only large
        // bytes need more examination.
        if first >= 128 {
            let w = utf8_char_width(first);
            // Every branch below needs a second byte, so it is fetched
            // before dispatching on the width.
            let second = next!();
            // 2-byte encoding is for codepoints  \u0080 to  \u07ff
            //        first  C2 80        last DF BF
            // 3-byte encoding is for codepoints  \u0800 to  \uffff
            //        first  E0 A0 80     last EF BF BF
            //   excluding surrogates codepoints  \ud800 to  \udfff
            //               ED A0 80 to       ED BF BF
            // 4-byte encoding is for codepoints \u10000 to \u10ffff
            //        first  F0 90 80 80  last F4 8F BF BF
            //
            // Use the UTF-8 syntax from the RFC
            //
            // https://tools.ietf.org/html/rfc3629
            // UTF8-1      = %x00-7F
            // UTF8-2      = %xC2-DF UTF8-tail
            // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
            //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
            // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
            //               %xF4 %x80-8F 2( UTF8-tail )
            //
            // Masking a byte with 192 keeps its top two bits, which are
            // then compared against the continuation-byte tag.
            match w {
                2 => if second & 192 != TAG_CONT_U8 {err!()},
                3 => {
                    match (first, second, next!() & 192) {
                        (0xE0        , 0xA0 .. 0xBF, TAG_CONT_U8) |
                        (0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
                        (0xED        , 0x80 .. 0x9F, TAG_CONT_U8) |
                        (0xEE .. 0xEF, 0x80 .. 0xBF, TAG_CONT_U8) => {}
                        _ => err!()
                    }
                }
                4 => {
                    match (first, second, next!() & 192, next!() & 192) {
                        (0xF0        , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
                        (0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
                        (0xF4        , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
                        _ => err!()
                    }
                }
                // any other width reported for the first byte is rejected.
                _ => err!()
            }
        }
    }
}
 800
 801 /// Determines if a vector of bytes contains valid UTF-8.
 802 pub fn is_utf8(v: &[u8]) -> bool {
 803     run_utf8_validation_iterator(&mut v.iter())
 804 }
 805
 806 #[inline(always)]
 807 fn first_non_utf8_index(v: &[u8]) -> Option<uint> {
 808     let mut it = v.iter();
 809
 810     let ok = run_utf8_validation_iterator(&mut it);
 811     if ok {
 812         None
 813     } else {
 814         // work out how many valid bytes we've consumed
 815         // (run_utf8_validation_iterator resets the iterator to just
 816         // after the last good byte), which we can do because the
 817         // vector iterator size_hint is exact.
 818         let (remaining, _) = it.size_hint();
 819         Some(v.len() - remaining)
 820     }
 821 }
 822
 823 /// Determines if a vector of `u16` contains valid UTF-16
 824 pub fn is_utf16(v: &[u16]) -> bool {
 825     let mut it = v.iter();
 826     macro_rules! next ( ($ret:expr) => {
 827             match it.next() { Some(u) => *u, None => return $ret }
 828         }
 829     )
 830     loop {
 831         let u = next!(true);
 832
 833         match char::from_u32(u as u32) {
 834             Some(_) => {}
 835             None => {
 836                 let u2 = next!(false);
 837                 if u < 0xD7FF || u > 0xDBFF ||
 838                     u2 < 0xDC00 || u2 > 0xDFFF { return false; }
 839             }
 840         }
 841     }
 842 }
 843
/// An iterator that decodes UTF-16 encoded codepoints from a vector
/// of `u16`s.
///
/// Created by `utf16_items`; yields `UTF16Item`s.
#[deriving(Clone)]
pub struct UTF16Items<'a> {
    /// The `u16` elements that have not been decoded yet
    iter: slice::Items<'a, u16>
}
/// The possibilities for values decoded from a `u16` stream.
#[deriving(Eq, TotalEq, Clone, Show)]
pub enum UTF16Item {
    /// A valid codepoint.
    ScalarValue(char),
    /// An invalid surrogate without its pair.
    ///
    /// The payload is the offending `u16` itself.
    LoneSurrogate(u16)
}
 858
 859 impl UTF16Item {
 860     /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
 861     /// replacement character (U+FFFD).
 862     #[inline]
 863     pub fn to_char_lossy(&self) -> char {
 864         match *self {
 865             ScalarValue(c) => c,
 866             LoneSurrogate(_) => '\uFFFD'
 867         }
 868     }
 869 }
 870
// Decodes the wrapped `u16` iterator one `UTF16Item` at a time: a
// non-surrogate unit or a valid leading/trailing surrogate pair becomes
// a `ScalarValue`; any other surrogate is reported as a `LoneSurrogate`.
impl<'a> Iterator<UTF16Item> for UTF16Items<'a> {
    /// Decode and return the next item, or `None` once the input is
    /// exhausted. Consumes one `u16`, or two when they form a valid
    /// surrogate pair.
    fn next(&mut self) -> Option<UTF16Item> {
        let u = match self.iter.next() {
            Some(u) => *u,
            None => return None
        };

        if u < 0xD800 || 0xDFFF < u {
            // not a surrogate
            // (the unit's value is used directly as the code point)
            Some(ScalarValue(unsafe {cast::transmute(u as u32)}))
        } else if u >= 0xDC00 {
            // a trailing surrogate
            // (0xDC00..0xDFFF) that was not preceded by a leading one
            Some(LoneSurrogate(u))
        } else {
            // `u` is a leading surrogate (0xD800..0xDBFF) from here on.
            // preserve state for rewinding.
            let old = self.iter;

            let u2 = match self.iter.next() {
                Some(u2) => *u2,
                // eof
                None => return Some(LoneSurrogate(u))
            };
            if u2 < 0xDC00 || u2 > 0xDFFF {
                // not a trailing surrogate so we're not a valid
                // surrogate pair, so rewind to redecode u2 next time.
                self.iter = old;
                return Some(LoneSurrogate(u))
            }

            // all ok, so lets decode it.
            // The leading surrogate supplies the high 10 bits and the
            // trailing one the low 10 bits of (code point - 0x1_0000).
            let c = ((u - 0xD800) as u32 << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
            Some(ScalarValue(unsafe {cast::transmute(c)}))
        }
    }

    /// Bounds on the number of remaining items, derived from the bounds
    /// of the underlying `u16` iterator: every item consumes either one
    /// or two units.
    #[inline]
    fn size_hint(&self) -> (uint, Option<uint>) {
        let (low, high) = self.iter.size_hint();
        // we could be entirely valid surrogates (2 elements per
        // char), or entirely non-surrogates (1 element per char)
        (low / 2, high)
    }
}
 914
 915 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
 916 /// returning invalid surrogates as `LoneSurrogate`s.
 917 ///
 918 /// # Example
 919 ///
 920 /// ```rust
 921 /// use std::str;
 922 /// use std::str::{ScalarValue, LoneSurrogate};
 923 ///
 924 /// // 𝄞mus<invalid>ic<invalid>
 925 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 926 ///          0x0073, 0xDD1E, 0x0069, 0x0063,
 927 ///          0xD834];
 928 ///
 929 /// assert_eq!(str::utf16_items(v).collect::<~[_]>(),
 930 ///            ~[ScalarValue('𝄞'),
 931 ///              ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
 932 ///              LoneSurrogate(0xDD1E),
 933 ///              ScalarValue('i'), ScalarValue('c'),
 934 ///              LoneSurrogate(0xD834)]);
 935 /// ```
 936 pub fn utf16_items<'a>(v: &'a [u16]) -> UTF16Items<'a> {
 937     UTF16Items { iter : v.iter() }
 938 }
 939
 940 /// Return a slice of `v` ending at (and not including) the first NUL
 941 /// (0).
 942 ///
 943 /// # Example
 944 ///
 945 /// ```rust
 946 /// use std::str;
 947 ///
 948 /// // "abcd"
 949 /// let mut v = ['a' as u16, 'b' as u16, 'c' as u16, 'd' as u16];
 950 /// // no NULs so no change
 951 /// assert_eq!(str::truncate_utf16_at_nul(v), v.as_slice());
 952 ///
 953 /// // "ab\0d"
 954 /// v[2] = 0;
 955 /// assert_eq!(str::truncate_utf16_at_nul(v),
 956 ///            &['a' as u16, 'b' as u16]);
 957 /// ```
 958 pub fn truncate_utf16_at_nul<'a>(v: &'a [u16]) -> &'a [u16] {
 959     match v.iter().position(|c| *c == 0) {
 960         // don't include the 0
 961         Some(i) => v.slice_to(i),
 962         None => v
 963     }
 964 }
 965
 966 /// Decode a UTF-16 encoded vector `v` into a string, returning `None`
 967 /// if `v` contains any invalid data.
 968 ///
 969 /// # Example
 970 ///
 971 /// ```rust
 972 /// use std::str;
 973 ///
 974 /// // 𝄞music
 975 /// let mut v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 976 ///              0x0073, 0x0069, 0x0063];
 977 /// assert_eq!(str::from_utf16(v), Some(~"𝄞music"));
 978 ///
 979 /// // 𝄞mu<invalid>ic
 980 /// v[4] = 0xD800;
 981 /// assert_eq!(str::from_utf16(v), None);
 982 /// ```
 983 pub fn from_utf16(v: &[u16]) -> Option<~str> {
 984     let mut s = StrBuf::with_capacity(v.len() / 2);
 985     for c in utf16_items(v) {
 986         match c {
 987             ScalarValue(c) => s.push_char(c),
 988             LoneSurrogate(_) => return None
 989         }
 990     }
 991     Some(s.into_owned())
 992 }
 993
 994 /// Decode a UTF-16 encoded vector `v` into a string, replacing
 995 /// invalid data with the replacement character (U+FFFD).
 996 ///
 997 /// # Example
 998 /// ```rust
 999 /// use std::str;
1000 ///
1001 /// // 𝄞mus<invalid>ic<invalid>
1002 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
1003 ///          0x0073, 0xDD1E, 0x0069, 0x0063,
1004 ///          0xD834];
1005 ///
1006 /// assert_eq!(str::from_utf16_lossy(v),
1007 ///            ~"𝄞mus\uFFFDic\uFFFD");
1008 /// ```
1009 pub fn from_utf16_lossy(v: &[u16]) -> ~str {
1010     utf16_items(v).map(|c| c.to_char_lossy()).collect()
1011 }
1012
// https://tools.ietf.org/html/rfc3629
// Lookup table indexed by the first byte of a UTF-8 sequence, giving the
// total number of bytes (1 to 4) in that sequence. An entry of 0 marks a
// byte that cannot start a sequence: 0x80..0xBF, 0xC0, 0xC1 and
// 0xF5..0xFF. The trailing comment on a row names the last byte value
// that row (or pair of rows) covers.
static UTF8_CHAR_WIDTH: [u8, ..256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
1032
1033 /// Given a first byte, determine how many bytes are in this UTF-8 character
1034 #[inline]
1035 pub fn utf8_char_width(b: u8) -> uint {
1036     return UTF8_CHAR_WIDTH[b as uint] as uint;
1037 }
1038
/// Struct that contains a `char` and the index of the first byte of
/// the next `char` in a string.  This can be used as a data structure
/// for iterating over the UTF-8 bytes of a string: decode the `char`
/// at some byte index, then continue from `next`.
pub struct CharRange {
    /// Current `char`
    pub ch: char,
    /// Index of the first byte of the next `char`
    /// (a byte offset into the string, not a `char` count)
    pub next: uint,
}
1048
// Return the initial codepoint accumulator for the first byte.
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
// for width 3, and 3 bits for width 4
// `0x7F >> $width` builds exactly that mask (0x1F, 0x0F and 0x07 for
// widths 2, 3 and 4); the masked byte is widened to `u32` so that
// continuation bytes can be shifted in afterwards.
macro_rules! utf8_first_byte(
    ($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
)
1055
// return the value of $ch updated with continuation byte $byte
// The accumulator is shifted left by 6 bits and the low 6 bits of the
// continuation byte (`$byte & 63`) are placed in the vacated positions.
macro_rules! utf8_acc_cont_byte(
    ($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
)
1060
// Tag carried in the top two bits of a UTF-8 continuation byte: a byte
// `b` is a continuation byte when `b & 192 == TAG_CONT_U8`.
static TAG_CONT_U8: u8 = 0x80u8;
1062
/// Converts a vector of bytes to a new utf-8 string.
/// Any invalid utf-8 sequences are replaced with U+FFFD REPLACEMENT CHARACTER.
///
/// The result is a borrowed `Slice` of `v` when `first_non_utf8_index`
/// reports no invalid position, and an `Owned` string otherwise.
///
/// # Example
///
/// ```rust
/// let input = bytes!("Hello ", 0xF0, 0x90, 0x80, "World");
/// let output = std::str::from_utf8_lossy(input);
/// assert_eq!(output.as_slice(), "Hello \uFFFDWorld");
/// ```
pub fn from_utf8_lossy<'a>(v: &'a [u8]) -> MaybeOwned<'a> {
    // Fast path: nothing to repair, so the bytes are reinterpreted as a
    // `&str` without copying.
    let firstbad = match first_non_utf8_index(v) {
        None => return Slice(unsafe { cast::transmute(v) }),
        Some(i) => i
    };

    static REPLACEMENT: &'static [u8] = bytes!(0xEF, 0xBF, 0xBD); // U+FFFD in UTF-8
    let mut i = firstbad;
    let total = v.len();
    // Unchecked read; only called with an index already known to be in bounds.
    fn unsafe_get(xs: &[u8], i: uint) -> u8 {
        unsafe { *xs.unsafe_ref(i) }
    }
    // Bounds-aware read: past the end it yields 0, which is neither a
    // continuation byte nor inside any of the second-byte ranges matched
    // below, so a truncated sequence is handled as an error.
    fn safe_get(xs: &[u8], i: uint, total: uint) -> u8 {
        if i >= total {
            0
        } else {
            unsafe_get(xs, i)
        }
    }

    let mut res = StrBuf::with_capacity(total);

    // Everything before the first reported bad position is copied verbatim.
    if i > 0 {
        unsafe {
            res.push_bytes(v.slice_to(i))
        };
    }

    // subseqidx is the index of the first byte of the subsequence we're looking at.
    // It's used to copy a bunch of contiguous good codepoints at once instead of copying
    // them one by one.
    let mut subseqidx = firstbad;

    while i < total {
        // i_ remembers where the current sequence starts; i moves on as
        // its bytes are accepted.
        let i_ = i;
        let byte = unsafe_get(v, i);
        i += 1;

        // Flush the pending run of good bytes [subseqidx, i_), emit one
        // replacement character for the bad sequence, and start the next
        // run at i. At that point i names the byte that failed its check
        // (not yet consumed), so that byte is examined again as the
        // start of a new sequence.
        macro_rules! error(() => ({
            unsafe {
                if subseqidx != i_ {
                    res.push_bytes(v.slice(subseqidx, i_));
                }
                subseqidx = i;
                res.push_bytes(REPLACEMENT);
            }
        }))

        if byte < 128u8 {
            // subseqidx handles this
        } else {
            let w = utf8_char_width(byte);

            match w {
                2 => {
                    // one continuation byte must follow
                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
                        error!();
                        continue;
                    }
                    i += 1;
                }
                3 => {
                    // The allowed range of the second byte depends on the
                    // first: 0xE0 needs 0xA0..0xBF (no overlong forms) and
                    // 0xED needs 0x80..0x9F (no surrogate code points).
                    match (byte, safe_get(v, i, total)) {
                        (0xE0        , 0xA0 .. 0xBF) => (),
                        (0xE1 .. 0xEC, 0x80 .. 0xBF) => (),
                        (0xED        , 0x80 .. 0x9F) => (),
                        (0xEE .. 0xEF, 0x80 .. 0xBF) => (),
                        _ => {
                            error!();
                            continue;
                        }
                    }
                    i += 1;
                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
                        error!();
                        continue;
                    }
                    i += 1;
                }
                4 => {
                    // Second-byte range again depends on the first byte:
                    // 0xF0 needs 0x90..0xBF (no overlong forms) and 0xF4
                    // needs 0x80..0x8F (nothing above U+10FFFF).
                    match (byte, safe_get(v, i, total)) {
                        (0xF0        , 0x90 .. 0xBF) => (),
                        (0xF1 .. 0xF3, 0x80 .. 0xBF) => (),
                        (0xF4        , 0x80 .. 0x8F) => (),
                        _ => {
                            error!();
                            continue;
                        }
                    }
                    i += 1;
                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
                        error!();
                        continue;
                    }
                    i += 1;
                    if safe_get(v, i, total) & 192u8 != TAG_CONT_U8 {
                        error!();
                        continue;
                    }
                    i += 1;
                }
                _ => {
                    // width 0: this byte cannot start a sequence
                    error!();
                    continue;
                }
            }
        }
    }
    // Copy whatever good bytes remain after the last replacement.
    if subseqidx < total {
        unsafe {
            res.push_bytes(v.slice(subseqidx, total))
        };
    }
    Owned(res.into_owned())
}
1188
1189 /*
1190 Section: MaybeOwned
1191 */
1192
/// A MaybeOwned is a string that can hold either a ~str or a &str.
/// This can be useful as an optimization when an allocation is sometimes
/// needed but not always.
///
/// The lifetime `'a` is that of the borrowed data in the `Slice` variant.
pub enum MaybeOwned<'a> {
    /// A borrowed string
    Slice(&'a str),
    /// An owned string
    Owned(~str)
}
1202
/// SendStr is a specialization of `MaybeOwned` to be sendable: its
/// borrowed variant can only hold a `&'static str`.
pub type SendStr = MaybeOwned<'static>;
1205
1206 impl<'a> MaybeOwned<'a> {
1207     /// Returns `true` if this `MaybeOwned` wraps an owned string
1208     #[inline]
1209     pub fn is_owned(&self) -> bool {
1210         match *self {
1211             Slice(_) => false,
1212             Owned(_) => true
1213         }
1214     }
1215
1216     /// Returns `true` if this `MaybeOwned` wraps a borrowed string
1217     #[inline]
1218     pub fn is_slice(&self) -> bool {
1219         match *self {
1220             Slice(_) => true,
1221             Owned(_) => false
1222         }
1223     }
1224 }
1225
/// Trait for moving into a `MaybeOwned`
///
/// `'a` is the lifetime parameter of the `MaybeOwned` that is produced.
pub trait IntoMaybeOwned<'a> {
    /// Moves self into a `MaybeOwned`
    fn into_maybe_owned(self) -> MaybeOwned<'a>;
}
1231
1232 impl<'a> IntoMaybeOwned<'a> for ~str {
1233     #[inline]
1234     fn into_maybe_owned(self) -> MaybeOwned<'a> { Owned(self) }
1235 }
1236
1237 impl<'a> IntoMaybeOwned<'a> for &'a str {
1238     #[inline]
1239     fn into_maybe_owned(self) -> MaybeOwned<'a> { Slice(self) }
1240 }
1241
1242 impl<'a> IntoMaybeOwned<'a> for MaybeOwned<'a> {
1243     #[inline]
1244     fn into_maybe_owned(self) -> MaybeOwned<'a> { self }
1245 }
1246
1247 impl<'a> Eq for MaybeOwned<'a> {
1248     #[inline]
1249     fn eq(&self, other: &MaybeOwned) -> bool {
1250         self.as_slice() == other.as_slice()
1251     }
1252 }
1253
// Marker impl with an empty body: no methods are defined here.
impl<'a> TotalEq for MaybeOwned<'a> {}
1255
1256 impl<'a> Ord for MaybeOwned<'a> {
1257     #[inline]
1258     fn lt(&self, other: &MaybeOwned) -> bool {
1259         self.as_slice().lt(&other.as_slice())
1260     }
1261 }
1262
1263 impl<'a> TotalOrd for MaybeOwned<'a> {
1264     #[inline]
1265     fn cmp(&self, other: &MaybeOwned) -> Ordering {
1266         self.as_slice().cmp(&other.as_slice())
1267     }
1268 }
1269
1270 impl<'a, S: Str> Equiv<S> for MaybeOwned<'a> {
1271     #[inline]
1272     fn equiv(&self, other: &S) -> bool {
1273         self.as_slice() == other.as_slice()
1274     }
1275 }
1276
1277 impl<'a> Str for MaybeOwned<'a> {
1278     #[inline]
1279     fn as_slice<'b>(&'b self) -> &'b str {
1280         match *self {
1281             Slice(s) => s,
1282             Owned(ref s) => s.as_slice()
1283         }
1284     }
1285
1286     #[inline]
1287     fn into_owned(self) -> ~str {
1288         match self {
1289             Slice(s) => s.to_owned(),
1290             Owned(s) => s
1291         }
1292     }
1293 }
1294
1295 impl<'a> Container for MaybeOwned<'a> {
1296     #[inline]
1297     fn len(&self) -> uint { self.as_slice().len() }
1298 }
1299
1300 impl<'a> Clone for MaybeOwned<'a> {
1301     #[inline]
1302     fn clone(&self) -> MaybeOwned<'a> {
1303         match *self {
1304             Slice(s) => Slice(s),
1305             Owned(ref s) => Owned(s.to_owned())
1306         }
1307     }
1308 }
1309
1310 impl<'a> Default for MaybeOwned<'a> {
1311     #[inline]
1312     fn default() -> MaybeOwned<'a> { Slice("") }
1313 }
1314
1315 impl<'a, H: Writer> ::hash::Hash<H> for MaybeOwned<'a> {
1316     #[inline]
1317     fn hash(&self, hasher: &mut H) {
1318         match *self {
1319             Slice(s) => s.hash(hasher),
1320             Owned(ref s) => s.hash(hasher),
1321         }
1322     }
1323 }
1324
1325 impl<'a> fmt::Show for MaybeOwned<'a> {
1326     #[inline]
1327     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1328         match *self {
1329             Slice(ref s) => s.fmt(f),
1330             Owned(ref s) => s.fmt(f)
1331         }
1332     }
1333 }
1334
/// Unsafe operations
///
/// Functions for building strings and slices from raw pointers and
/// unchecked bytes. Callers are responsible for the validity of the
/// pointers, lengths and (where no check is made) the UTF-8 contents.
pub mod raw {
    use cast;
    use container::Container;
    use libc;
    use ptr;
    use ptr::RawPtr;
    use str::{is_utf8, OwnedStr, StrSlice};
    use slice;
    use slice::{MutableVector, ImmutableVector, OwnedVector};
    use raw::Slice;

    /// Create a Rust string from a *u8 buffer of the given length
    ///
    /// Copies `len` bytes starting at `buf` into a new vector.
    ///
    /// # Failure
    ///
    /// Fails if the copied bytes are not valid UTF-8.
    pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
        let mut v: ~[u8] = slice::with_capacity(len);
        ptr::copy_memory(v.as_mut_ptr(), buf, len);
        // the bytes were written through the raw pointer, so the length
        // has to be set by hand
        v.set_len(len);

        assert!(is_utf8(v));
        ::cast::transmute(v)
    }

    // Registered as the `strdup_uniq` lang item (omitted from test
    // builds); simply forwards to `from_buf_len`.
    #[lang="strdup_uniq"]
    #[cfg(not(test))]
    #[inline]
    unsafe fn strdup_uniq(ptr: *u8, len: uint) -> ~str {
        from_buf_len(ptr, len)
    }

    /// Create a Rust string from a null-terminated C string
    ///
    /// The bytes before the terminating NUL are copied via
    /// `from_buf_len`, so this fails if they are not valid UTF-8.
    pub unsafe fn from_c_str(buf: *libc::c_char) -> ~str {
        let mut curr = buf;
        let mut i = 0;
        // count the bytes that precede the terminating NUL
        while *curr != 0 {
            i += 1;
            curr = buf.offset(i);
        }
        from_buf_len(buf as *u8, i as uint)
    }

    /// Converts a slice of bytes to a string slice without checking
    /// that the string contains valid UTF-8.
    pub unsafe fn from_utf8<'a>(v: &'a [u8]) -> &'a str {
        cast::transmute(v)
    }

    /// Converts an owned vector of bytes to a new owned string. This assumes
    /// that the utf-8-ness of the vector has already been validated
    #[inline]
    pub unsafe fn from_utf8_owned(v: ~[u8]) -> ~str {
        cast::transmute(v)
    }

    /// Converts a byte to a string.
    ///
    /// No UTF-8 check is made on the resulting one-byte string.
    pub unsafe fn from_byte(u: u8) -> ~str { from_utf8_owned(~[u]) }

    /// Form a slice from a C string. Unsafe because the caller must ensure the
    /// C string has the static lifetime, or else the return value may be
    /// invalidated later.
    ///
    /// # Failure
    ///
    /// Fails if the bytes before the terminating NUL are not valid UTF-8.
    pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
        let s = s as *u8;
        let mut curr = s;
        let mut len = 0u;
        // count the bytes that precede the terminating NUL
        while *curr != 0u8 {
            len += 1u;
            curr = s.offset(len as int);
        }
        // no copy: the slice points straight at the C buffer
        let v = Slice { data: s, len: len };
        assert!(is_utf8(::cast::transmute(v)));
        ::cast::transmute(v)
    }

    /// Takes a bytewise (not UTF-8) slice from a string.
    ///
    /// Returns the substring from [`begin`..`end`).
    ///
    /// # Failure
    ///
    /// If begin is greater than end.
    /// If end is greater than the length of the string.
    #[inline]
    pub unsafe fn slice_bytes<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
        assert!(begin <= end);
        assert!(end <= s.len());
        slice_unchecked(s, begin, end)
    }

    /// Takes a bytewise (not UTF-8) slice from a string.
    ///
    /// Returns the substring from [`begin`..`end`).
    ///
    /// Caller must check slice boundaries!
    #[inline]
    pub unsafe fn slice_unchecked<'a>(s: &'a str, begin: uint, end: uint) -> &'a str {
        cast::transmute(Slice {
                data: s.as_ptr().offset(begin as int),
                len: end - begin,
            })
    }

    /// Access the str in its vector representation.
    /// The caller must preserve the valid UTF-8 property when modifying.
    #[inline]
    pub unsafe fn as_owned_vec<'a>(s: &'a mut ~str) -> &'a mut ~[u8] {
        cast::transmute(s)
    }

    // Checks that `from_buf_len` copies exactly the requested number of
    // bytes: only the first 3 bytes of an 8-byte buffer are taken.
    #[test]
    fn test_from_buf_len() {
        unsafe {
            let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
            let b = a.as_ptr();
            let c = from_buf_len(b, 3u);
            assert_eq!(c, ~"AAA");
        }
    }
}
1457
1458 /*
1459 Section: Trait implementations
1460 */
1461
// Operator and comparison trait implementations for `&str` and `~str`.
// Only compiled outside of test builds.
#[cfg(not(test))]
#[allow(missing_doc)]
pub mod traits {
    use container::Container;
    use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
    use iter::Iterator;
    use ops::Add;
    use option::{Some, None};
    use str::{Str, StrSlice, eq_slice};
    use strbuf::StrBuf;

    // `a + b` on two slices: copies `a` into a new buffer and appends `b`.
    impl<'a> Add<&'a str,~str> for &'a str {
        #[inline]
        fn add(&self, rhs: & &'a str) -> ~str {
            let mut ret = StrBuf::from_owned_str(self.to_owned());
            ret.push_str(*rhs);
            ret.into_owned()
        }
    }

    // Byte-wise lexicographic ordering.
    impl<'a> TotalOrd for &'a str {
        #[inline]
        fn cmp(&self, other: & &'a str) -> Ordering {
            // the first pair of differing bytes decides the result
            for (s_b, o_b) in self.bytes().zip(other.bytes()) {
                match s_b.cmp(&o_b) {
                    Greater => return Greater,
                    Less => return Less,
                    Equal => ()
                }
            }

            // one string is a prefix of the other (or they are equal),
            // so the lengths decide
            self.len().cmp(&other.len())
        }
    }

    // Delegates to the `&str` comparison above.
    impl TotalOrd for ~str {
        #[inline]
        fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
    }

    // Equality is delegated to `eq_slice`; `ne` is the negation of `eq`.
    impl<'a> Eq for &'a str {
        #[inline]
        fn eq(&self, other: & &'a str) -> bool {
            eq_slice((*self), (*other))
        }
        #[inline]
        fn ne(&self, other: & &'a str) -> bool { !(*self).eq(other) }
    }

    // Equality is delegated to `eq_slice`.
    impl Eq for ~str {
        #[inline]
        fn eq(&self, other: &~str) -> bool {
            eq_slice((*self), (*other))
        }
    }

    impl<'a> TotalEq for &'a str {}

    impl TotalEq for ~str {}

    // `lt` is defined in terms of the `cmp` implementation above.
    impl<'a> Ord for &'a str {
        #[inline]
        fn lt(&self, other: & &'a str) -> bool { self.cmp(other) == Less }
    }

    // `lt` is defined in terms of the `cmp` implementation above.
    impl Ord for ~str {
        #[inline]
        fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
    }

    // Comparison against any `Str` implementor, through its slice view.
    impl<'a, S: Str> Equiv<S> for &'a str {
        #[inline]
        fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
    }

    // Comparison against any `Str` implementor, through its slice view.
    impl<'a, S: Str> Equiv<S> for ~str {
        #[inline]
        fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
    }
}
1542
// Empty stand-in for the `traits` module in test builds, so that the
// module path exists under both configurations.
// NOTE(review): presumably the trait impls are supplied from elsewhere
// when testing -- confirm.
#[cfg(test)]
pub mod traits {}
1545
1546 /// Any string that can be represented as a slice
1547 pub trait Str {
1548     /// Work with `self` as a slice.
1549     fn as_slice<'a>(&'a self) -> &'a str;
1550
1551     /// Convert `self` into a ~str, not making a copy if possible.
1552     fn into_owned(self) -> ~str;
1553
1554     /// Convert `self` into a `StrBuf`.
1555     #[inline]
1556     fn to_strbuf(&self) -> StrBuf {
1557         StrBuf::from_str(self.as_slice())
1558     }
1559
1560     /// Convert `self` into a `StrBuf`, not making a copy if possible.
1561     #[inline]
1562     fn into_strbuf(self) -> StrBuf {
1563         StrBuf::from_owned_str(self.into_owned())
1564     }
1565 }
1566
1567 impl<'a> Str for &'a str {
1568     #[inline]
1569     fn as_slice<'a>(&'a self) -> &'a str { *self }
1570
1571     #[inline]
1572     fn into_owned(self) -> ~str { self.to_owned() }
1573 }
1574
1575 impl<'a> Str for ~str {
1576     #[inline]
1577     fn as_slice<'a>(&'a self) -> &'a str {
1578         let s: &'a str = *self; s
1579     }
1580
1581     #[inline]
1582     fn into_owned(self) -> ~str { self }
1583 }
1584
1585 impl<'a> Container for &'a str {
1586     #[inline]
1587     fn len(&self) -> uint {
1588         self.repr().len
1589     }
1590 }
1591
1592 impl Container for ~str {
1593     #[inline]
1594     fn len(&self) -> uint { self.as_slice().len() }
1595 }
1596
1597 impl Mutable for ~str {
1598     /// Remove all content, make the string empty
1599     #[inline]
1600     fn clear(&mut self) {
1601         unsafe {
1602             self.set_len(0)
1603         }
1604     }
1605 }
1606
1607 /// Methods for string slices
1608 pub trait StrSlice<'a> {
1609     /// Returns true if one string contains another
1610     ///
1611     /// # Arguments
1612     ///
1613     /// - needle - The string to look for
1614     fn contains<'a>(&self, needle: &'a str) -> bool;
1615
1616     /// Returns true if a string contains a char.
1617     ///
1618     /// # Arguments
1619     ///
1620     /// - needle - The char to look for
1621     fn contains_char(&self, needle: char) -> bool;
1622
1623     /// An iterator over the characters of `self`. Note, this iterates
1624     /// over unicode code-points, not unicode graphemes.
1625     ///
1626     /// # Example
1627     ///
1628     /// ```rust
1629     /// let v: ~[char] = "abc åäö".chars().collect();
1630     /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1631     /// ```
1632     fn chars(&self) -> Chars<'a>;
1633
1634     /// An iterator over the characters of `self`, in reverse order.
1635     fn chars_rev(&self) -> RevChars<'a>;
1636
1637     /// An iterator over the bytes of `self`
1638     fn bytes(&self) -> Bytes<'a>;
1639
1640     /// An iterator over the bytes of `self`, in reverse order
1641     fn bytes_rev(&self) -> RevBytes<'a>;
1642
1643     /// An iterator over the characters of `self` and their byte offsets.
1644     fn char_indices(&self) -> CharOffsets<'a>;
1645
1646     /// An iterator over the characters of `self` and their byte offsets,
1647     /// in reverse order.
1648     fn char_indices_rev(&self) -> RevCharOffsets<'a>;
1649
1650     /// An iterator over substrings of `self`, separated by characters
1651     /// matched by `sep`.
1652     ///
1653     /// # Example
1654     ///
1655     /// ```rust
1656     /// let v: ~[&str] = "Mary had a little lamb".split(' ').collect();
1657     /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1658     ///
1659     /// let v: ~[&str] = "abc1def2ghi".split(|c: char| c.is_digit()).collect();
1660     /// assert_eq!(v, ~["abc", "def", "ghi"]);
1661     ///
1662     /// let v: ~[&str] = "lionXXtigerXleopard".split('X').collect();
1663     /// assert_eq!(v, ~["lion", "", "tiger", "leopard"]);
1664     /// ```
1665     fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1666
1667     /// An iterator over substrings of `self`, separated by characters
1668     /// matched by `sep`, restricted to splitting at most `count`
1669     /// times.
1670     ///
1671     /// # Example
1672     ///
1673     /// ```rust
1674     /// let v: ~[&str] = "Mary had a little lambda".splitn(' ', 2).collect();
1675     /// assert_eq!(v, ~["Mary", "had", "a little lambda"]);
1676     ///
1677     /// let v: ~[&str] = "abc1def2ghi".splitn(|c: char| c.is_digit(), 1).collect();
1678     /// assert_eq!(v, ~["abc", "def2ghi"]);
1679     ///
1680     /// let v: ~[&str] = "lionXXtigerXleopard".splitn('X', 2).collect();
1681     /// assert_eq!(v, ~["lion", "", "tigerXleopard"]);
1682     /// ```
1683     fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1684
1685     /// An iterator over substrings of `self`, separated by characters
1686     /// matched by `sep`.
1687     ///
1688     /// Equivalent to `split`, except that the trailing substring
1689     /// is skipped if empty (terminator semantics).
1690     ///
1691     /// # Example
1692     ///
1693     /// ```rust
1694     /// let v: ~[&str] = "A.B.".split_terminator('.').collect();
1695     /// assert_eq!(v, ~["A", "B"]);
1696     ///
1697     /// let v: ~[&str] = "A..B..".split_terminator('.').collect();
1698     /// assert_eq!(v, ~["A", "", "B", ""]);
1699     /// ```
1700     fn split_terminator<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep>;
1701
1702     /// An iterator over substrings of `self`, separated by characters
1703     /// matched by `sep`, in reverse order.
1704     ///
1705     /// # Example
1706     ///
1707     /// ```rust
1708     /// let v: ~[&str] = "Mary had a little lamb".rsplit(' ').collect();
1709     /// assert_eq!(v, ~["lamb", "little", "a", "had", "Mary"]);
1710     ///
1711     /// let v: ~[&str] = "abc1def2ghi".rsplit(|c: char| c.is_digit()).collect();
1712     /// assert_eq!(v, ~["ghi", "def", "abc"]);
1713     ///
1714     /// let v: ~[&str] = "lionXXtigerXleopard".rsplit('X').collect();
1715     /// assert_eq!(v, ~["leopard", "tiger", "", "lion"]);
1716     /// ```
1717     fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep>;
1718
1719     /// An iterator over substrings of `self`, separated by characters
1720     /// matched by `sep`, starting from the end of the string.
1721     /// Restricted to splitting at most `count` times.
1722     ///
1723     /// # Example
1724     ///
1725     /// ```rust
1726     /// let v: ~[&str] = "Mary had a little lamb".rsplitn(' ', 2).collect();
1727     /// assert_eq!(v, ~["lamb", "little", "Mary had a"]);
1728     ///
1729     /// let v: ~[&str] = "abc1def2ghi".rsplitn(|c: char| c.is_digit(), 1).collect();
1730     /// assert_eq!(v, ~["ghi", "abc1def"]);
1731     ///
1732     /// let v: ~[&str] = "lionXXtigerXleopard".rsplitn('X', 2).collect();
1733     /// assert_eq!(v, ~["leopard", "tiger", "lionX"]);
1734     /// ```
1735     fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint) -> CharSplitsN<'a, Sep>;
1736
1737     /// An iterator over the start and end indices of the disjoint
1738     /// matches of `sep` within `self`.
1739     ///
1740     /// That is, each returned value `(start, end)` satisfies
1741     /// `self.slice(start, end) == sep`. For matches of `sep` within
    /// `self` that overlap, only the indices corresponding to the
1743     /// first match are returned.
1744     ///
1745     /// # Example
1746     ///
1747     /// ```rust
1748     /// let v: ~[(uint, uint)] = "abcXXXabcYYYabc".match_indices("abc").collect();
1749     /// assert_eq!(v, ~[(0,3), (6,9), (12,15)]);
1750     ///
1751     /// let v: ~[(uint, uint)] = "1abcabc2".match_indices("abc").collect();
1752     /// assert_eq!(v, ~[(1,4), (4,7)]);
1753     ///
1754     /// let v: ~[(uint, uint)] = "ababa".match_indices("aba").collect();
1755     /// assert_eq!(v, ~[(0, 3)]); // only the first `aba`
1756     /// ```
1757     fn match_indices(&self, sep: &'a str) -> MatchIndices<'a>;
1758
1759     /// An iterator over the substrings of `self` separated by `sep`.
1760     ///
1761     /// # Example
1762     ///
1763     /// ```rust
1764     /// let v: ~[&str] = "abcXXXabcYYYabc".split_str("abc").collect();
1765     /// assert_eq!(v, ~["", "XXX", "YYY", ""]);
1766     ///
1767     /// let v: ~[&str] = "1abcabc2".split_str("abc").collect();
1768     /// assert_eq!(v, ~["1", "", "2"]);
1769     /// ```
1770     fn split_str(&self, &'a str) -> StrSplits<'a>;
1771
1772     /// An iterator over the lines of a string (subsequences separated
1773     /// by `\n`). This does not include the empty string after a
1774     /// trailing `\n`.
1775     ///
1776     /// # Example
1777     ///
1778     /// ```rust
1779     /// let four_lines = "foo\nbar\n\nbaz\n";
1780     /// let v: ~[&str] = four_lines.lines().collect();
1781     /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1782     /// ```
1783     fn lines(&self) -> CharSplits<'a, char>;
1784
1785     /// An iterator over the lines of a string, separated by either
1786     /// `\n` or `\r\n`. As with `.lines()`, this does not include an
1787     /// empty trailing line.
1788     ///
1789     /// # Example
1790     ///
1791     /// ```rust
1792     /// let four_lines = "foo\r\nbar\n\r\nbaz\n";
1793     /// let v: ~[&str] = four_lines.lines_any().collect();
1794     /// assert_eq!(v, ~["foo", "bar", "", "baz"]);
1795     /// ```
1796     fn lines_any(&self) -> AnyLines<'a>;
1797
1798     /// An iterator over the words of a string (subsequences separated
1799     /// by any sequence of whitespace). Sequences of whitespace are
1800     /// collapsed, so empty "words" are not included.
1801     ///
1802     /// # Example
1803     ///
1804     /// ```rust
1805     /// let some_words = " Mary   had\ta little  \n\t lamb";
1806     /// let v: ~[&str] = some_words.words().collect();
1807     /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1808     /// ```
1809     fn words(&self) -> Words<'a>;
1810
1811     /// An Iterator over the string in Unicode Normalization Form D
1812     /// (canonical decomposition).
1813     fn nfd_chars(&self) -> Normalizations<'a>;
1814
1815     /// An Iterator over the string in Unicode Normalization Form KD
1816     /// (compatibility decomposition).
1817     fn nfkd_chars(&self) -> Normalizations<'a>;
1818
1819     /// Returns true if the string contains only whitespace.
1820     ///
1821     /// Whitespace characters are determined by `char::is_whitespace`.
1822     ///
1823     /// # Example
1824     ///
1825     /// ```rust
1826     /// assert!(" \t\n".is_whitespace());
1827     /// assert!("".is_whitespace());
1828     ///
1829     /// assert!( !"abc".is_whitespace());
1830     /// ```
1831     fn is_whitespace(&self) -> bool;
1832
1833     /// Returns true if the string contains only alphanumeric code
1834     /// points.
1835     ///
1836     /// Alphanumeric characters are determined by `char::is_alphanumeric`.
1837     ///
1838     /// # Example
1839     ///
1840     /// ```rust
1841     /// assert!("Löwe老虎Léopard123".is_alphanumeric());
1842     /// assert!("".is_alphanumeric());
1843     ///
1844     /// assert!( !" &*~".is_alphanumeric());
1845     /// ```
1846     fn is_alphanumeric(&self) -> bool;
1847
1848     /// Returns the number of Unicode code points (`char`) that a
1849     /// string holds.
1850     ///
1851     /// This does not perform any normalization, and is `O(n)`, since
1852     /// UTF-8 is a variable width encoding of code points.
1853     ///
1854     /// *Warning*: The number of code points in a string does not directly
1855     /// correspond to the number of visible characters or width of the
1856     /// visible text due to composing characters, and double- and
1857     /// zero-width ones.
1858     ///
1859     /// See also `.len()` for the byte length.
1860     ///
1861     /// # Example
1862     ///
1863     /// ```rust
1864     /// // composed forms of `ö` and `é`
1865     /// let c = "Löwe 老虎 Léopard"; // German, Simplified Chinese, French
1866     /// // decomposed forms of `ö` and `é`
1867     /// let d = "Lo\u0308we 老虎 Le\u0301opard";
1868     ///
1869     /// assert_eq!(c.char_len(), 15);
1870     /// assert_eq!(d.char_len(), 17);
1871     ///
1872     /// assert_eq!(c.len(), 21);
1873     /// assert_eq!(d.len(), 23);
1874     ///
1875     /// // the two strings *look* the same
1876     /// println!("{}", c);
1877     /// println!("{}", d);
1878     /// ```
1879     fn char_len(&self) -> uint;
1880
1881     /// Returns a slice of the given string from the byte range
1882     /// [`begin`..`end`).
1883     ///
1884     /// This operation is `O(1)`.
1885     ///
1886     /// Fails when `begin` and `end` do not point to valid characters
1887     /// or point beyond the last character of the string.
1888     ///
1889     /// See also `slice_to` and `slice_from` for slicing prefixes and
1890     /// suffixes of strings, and `slice_chars` for slicing based on
1891     /// code point counts.
1892     ///
1893     /// # Example
1894     ///
1895     /// ```rust
1896     /// let s = "Löwe 老虎 Léopard";
1897     /// assert_eq!(s.slice(0, 1), "L");
1898     ///
1899     /// assert_eq!(s.slice(1, 9), "öwe 老");
1900     ///
1901     /// // these will fail:
1902     /// // byte 2 lies within `ö`:
1903     /// // s.slice(2, 3);
1904     ///
1905     /// // byte 8 lies within `老`
1906     /// // s.slice(1, 8);
1907     ///
1908     /// // byte 100 is outside the string
1909     /// // s.slice(3, 100);
1910     /// ```
1911     fn slice(&self, begin: uint, end: uint) -> &'a str;
1912
1913     /// Returns a slice of the string from `begin` to its end.
1914     ///
1915     /// Equivalent to `self.slice(begin, self.len())`.
1916     ///
1917     /// Fails when `begin` does not point to a valid character, or is
1918     /// out of bounds.
1919     ///
1920     /// See also `slice`, `slice_to` and `slice_chars`.
1921     fn slice_from(&self, begin: uint) -> &'a str;
1922
1923     /// Returns a slice of the string from the beginning to byte
1924     /// `end`.
1925     ///
1926     /// Equivalent to `self.slice(0, end)`.
1927     ///
1928     /// Fails when `end` does not point to a valid character, or is
1929     /// out of bounds.
1930     ///
1931     /// See also `slice`, `slice_from` and `slice_chars`.
1932     fn slice_to(&self, end: uint) -> &'a str;
1933
1934     /// Returns a slice of the string from the character range
1935     /// [`begin`..`end`).
1936     ///
1937     /// That is, start at the `begin`-th code point of the string and
1938     /// continue to the `end`-th code point. This does not detect or
1939     /// handle edge cases such as leaving a combining character as the
1940     /// first code point of the string.
1941     ///
1942     /// Due to the design of UTF-8, this operation is `O(end)`.
1943     /// See `slice`, `slice_to` and `slice_from` for `O(1)`
1944     /// variants that use byte indices rather than code point
1945     /// indices.
1946     ///
    /// Fails if `begin` > `end`, or if either `begin` or `end` is
    /// beyond the last character of the string.
1949     ///
1950     /// # Example
1951     ///
1952     /// ```rust
1953     /// let s = "Löwe 老虎 Léopard";
1954     /// assert_eq!(s.slice_chars(0, 4), "Löwe");
1955     /// assert_eq!(s.slice_chars(5, 7), "老虎");
1956     /// ```
1957     fn slice_chars(&self, begin: uint, end: uint) -> &'a str;
1958
1959     /// Returns true if `needle` is a prefix of the string.
1960     fn starts_with(&self, needle: &str) -> bool;
1961
1962     /// Returns true if `needle` is a suffix of the string.
1963     fn ends_with(&self, needle: &str) -> bool;
1964
    /// Escape each char in the string with `char::escape_default`.
1966     fn escape_default(&self) -> ~str;
1967
    /// Escape each char in the string with `char::escape_unicode`.
1969     fn escape_unicode(&self) -> ~str;
1970
1971     /// Returns a string with leading and trailing whitespace removed.
1972     fn trim(&self) -> &'a str;
1973
1974     /// Returns a string with leading whitespace removed.
1975     fn trim_left(&self) -> &'a str;
1976
1977     /// Returns a string with trailing whitespace removed.
1978     fn trim_right(&self) -> &'a str;
1979
1980     /// Returns a string with characters that match `to_trim` removed.
1981     ///
1982     /// # Arguments
1983     ///
1984     /// * to_trim - a character matcher
1985     ///
1986     /// # Example
1987     ///
1988     /// ```rust
1989     /// assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1990     /// assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1991     /// assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1992     /// ```
1993     fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
1994
    /// Returns a string with leading characters that match `to_trim` removed.
1996     ///
1997     /// # Arguments
1998     ///
1999     /// * to_trim - a character matcher
2000     ///
2001     /// # Example
2002     ///
2003     /// ```rust
2004     /// assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
2005     /// assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
2006     /// assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
2007     /// ```
2008     fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2009
    /// Returns a string with trailing characters that match `to_trim` removed.
2011     ///
2012     /// # Arguments
2013     ///
2014     /// * to_trim - a character matcher
2015     ///
2016     /// # Example
2017     ///
2018     /// ```rust
2019     /// assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
2020     /// assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
2021     /// assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
2022     /// ```
2023     fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str;
2024
2025     /// Replace all occurrences of one string with another.
2026     ///
2027     /// # Arguments
2028     ///
2029     /// * `from` - The string to replace
2030     /// * `to` - The replacement string
2031     ///
2032     /// # Return value
2033     ///
    /// The original string with all occurrences of `from` replaced with `to`.
2035     ///
2036     /// # Example
2037     ///
2038     /// ```rust
2039     /// let s = ~"Do you know the muffin man,
2040     /// The muffin man, the muffin man, ...";
2041     ///
2042     /// assert_eq!(s.replace("muffin man", "little lamb"),
2043     ///            ~"Do you know the little lamb,
2044     /// The little lamb, the little lamb, ...");
2045     ///
2046     /// // not found, so no change.
2047     /// assert_eq!(s.replace("cookie monster", "little lamb"), s);
2048     /// ```
2049     fn replace(&self, from: &str, to: &str) -> ~str;
2050
2051     /// Copy a slice into a new owned str.
2052     fn to_owned(&self) -> ~str;
2053
2054     /// Converts to a vector of `u16` encoded as UTF-16.
2055     fn to_utf16(&self) -> ~[u16];
2056
2057     /// Check that `index`-th byte lies at the start and/or end of a
2058     /// UTF-8 code point sequence.
2059     ///
2060     /// The start and end of the string (when `index == self.len()`)
2061     /// are considered to be boundaries.
2062     ///
2063     /// Fails if `index` is greater than `self.len()`.
2064     ///
2065     /// # Example
2066     ///
2067     /// ```rust
2068     /// let s = "Löwe 老虎 Léopard";
2069     /// assert!(s.is_char_boundary(0));
2070     /// // start of `老`
2071     /// assert!(s.is_char_boundary(6));
2072     /// assert!(s.is_char_boundary(s.len()));
2073     ///
2074     /// // second byte of `ö`
2075     /// assert!(!s.is_char_boundary(2));
2076     ///
2077     /// // third byte of `老`
2078     /// assert!(!s.is_char_boundary(8));
2079     /// ```
2080     fn is_char_boundary(&self, index: uint) -> bool;
2081
2082     /// Pluck a character out of a string and return the index of the next
2083     /// character.
2084     ///
2085     /// This function can be used to iterate over the unicode characters of a
2086     /// string.
2087     ///
2088     /// # Example
2089     ///
    /// This example manually iterates through the characters of a
    /// string; this should normally be done by `.chars()` or
    /// `.char_indices()`.
2093     ///
2094     /// ```rust
2095     /// use std::str::CharRange;
2096     ///
2097     /// let s = "中华Việt Nam";
2098     /// let mut i = 0u;
2099     /// while i < s.len() {
2100     ///     let CharRange {ch, next} = s.char_range_at(i);
2101     ///     println!("{}: {}", i, ch);
2102     ///     i = next;
2103     /// }
2104     /// ```
2105     ///
2106     /// ## Output
2107     ///
2108     /// ```ignore
2109     /// 0: 中
2110     /// 3: 华
2111     /// 6: V
2112     /// 7: i
2113     /// 8: ệ
2114     /// 11: t
2115     /// 12:
2116     /// 13: N
2117     /// 14: a
2118     /// 15: m
2119     /// ```
2120     ///
2121     /// # Arguments
2122     ///
2123     /// * s - The string
2124     /// * i - The byte offset of the char to extract
2125     ///
2126     /// # Return value
2127     ///
2128     /// A record {ch: char, next: uint} containing the char value and the byte
2129     /// index of the next unicode character.
2130     ///
2131     /// # Failure
2132     ///
2133     /// If `i` is greater than or equal to the length of the string.
2134     /// If `i` is not the index of the beginning of a valid UTF-8 character.
2135     fn char_range_at(&self, start: uint) -> CharRange;
2136
2137     /// Given a byte position and a str, return the previous char and its position.
2138     ///
2139     /// This function can be used to iterate over a unicode string in reverse.
2140     ///
2141     /// Returns 0 for next index if called on start index 0.
2142     fn char_range_at_reverse(&self, start: uint) -> CharRange;
2143
2144     /// Plucks the character starting at the `i`th byte of a string
2145     fn char_at(&self, i: uint) -> char;
2146
2147     /// Plucks the character ending at the `i`th byte of a string
2148     fn char_at_reverse(&self, i: uint) -> char;
2149
2150     /// Work with the byte buffer of a string as a byte slice.
2151     fn as_bytes(&self) -> &'a [u8];
2152
2153     /// Returns the byte index of the first character of `self` that
2154     /// matches `search`.
2155     ///
2156     /// # Return value
2157     ///
    /// `Some` containing the byte index of the first matching character
    /// or `None` if there is no match.
2160     ///
2161     /// # Example
2162     ///
2163     /// ```rust
2164     /// let s = "Löwe 老虎 Léopard";
2165     ///
2166     /// assert_eq!(s.find('L'), Some(0));
2167     /// assert_eq!(s.find('é'), Some(14));
2168     ///
2169     /// // the first space
2170     /// assert_eq!(s.find(|c: char| c.is_whitespace()), Some(5));
2171     ///
2172     /// // neither are found
2173     /// assert_eq!(s.find(&['1', '2']), None);
2174     /// ```
2175     fn find<C: CharEq>(&self, search: C) -> Option<uint>;
2176
2177     /// Returns the byte index of the last character of `self` that
2178     /// matches `search`.
2179     ///
2180     /// # Return value
2181     ///
2182     /// `Some` containing the byte index of the last matching character
2183     /// or `None` if there is no match.
2184     ///
2185     /// # Example
2186     ///
2187     /// ```rust
2188     /// let s = "Löwe 老虎 Léopard";
2189     ///
2190     /// assert_eq!(s.rfind('L'), Some(13));
2191     /// assert_eq!(s.rfind('é'), Some(14));
2192     ///
2193     /// // the second space
2194     /// assert_eq!(s.rfind(|c: char| c.is_whitespace()), Some(12));
2195     ///
2196     /// // searches for an occurrence of either `1` or `2`, but neither are found
2197     /// assert_eq!(s.rfind(&['1', '2']), None);
2198     /// ```
2199     fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
2200
2201     /// Returns the byte index of the first matching substring
2202     ///
2203     /// # Arguments
2204     ///
2205     /// * `needle` - The string to search for
2206     ///
2207     /// # Return value
2208     ///
2209     /// `Some` containing the byte index of the first matching substring
2210     /// or `None` if there is no match.
2211     ///
2212     /// # Example
2213     ///
2214     /// ```rust
2215     /// let s = "Löwe 老虎 Léopard";
2216     ///
2217     /// assert_eq!(s.find_str("老虎 L"), Some(6));
2218     /// assert_eq!(s.find_str("muffin man"), None);
2219     /// ```
2220     fn find_str(&self, &str) -> Option<uint>;
2221
2222     /// Given a string, make a new string with repeated copies of it.
2223     fn repeat(&self, nn: uint) -> ~str;
2224
2225     /// Retrieves the first character from a string slice and returns
2226     /// it. This does not allocate a new string; instead, it returns a
    /// slice that points one character beyond the character that was
2228     /// shifted. If the string does not contain any characters,
2229     /// a tuple of None and an empty string is returned instead.
2230     ///
2231     /// # Example
2232     ///
2233     /// ```rust
2234     /// let s = "Löwe 老虎 Léopard";
2235     /// let (c, s1) = s.slice_shift_char();
2236     /// assert_eq!(c, Some('L'));
2237     /// assert_eq!(s1, "öwe 老虎 Léopard");
2238     ///
2239     /// let (c, s2) = s1.slice_shift_char();
2240     /// assert_eq!(c, Some('ö'));
2241     /// assert_eq!(s2, "we 老虎 Léopard");
2242     /// ```
2243     fn slice_shift_char(&self) -> (Option<char>, &'a str);
2244
2245     /// Levenshtein Distance between two strings.
2246     fn lev_distance(&self, t: &str) -> uint;
2247
2248     /// Returns the byte offset of an inner slice relative to an enclosing outer slice.
2249     ///
2250     /// Fails if `inner` is not a direct slice contained within self.
2251     ///
2252     /// # Example
2253     ///
2254     /// ```rust
2255     /// let string = "a\nb\nc";
2256     /// let lines: ~[&str] = string.lines().collect();
2257     ///
2258     /// assert!(string.subslice_offset(lines[0]) == 0); // &"a"
2259     /// assert!(string.subslice_offset(lines[1]) == 2); // &"b"
2260     /// assert!(string.subslice_offset(lines[2]) == 4); // &"c"
2261     /// ```
2262     fn subslice_offset(&self, inner: &str) -> uint;
2263
    /// Return an unsafe pointer to the string's buffer.
2265     ///
2266     /// The caller must ensure that the string outlives this pointer,
2267     /// and that it is not reallocated (e.g. by pushing to the
2268     /// string).
2269     fn as_ptr(&self) -> *u8;
2270 }
2271
2272 impl<'a> StrSlice<'a> for &'a str {
2273     #[inline]
2274     fn contains<'a>(&self, needle: &'a str) -> bool {
2275         self.find_str(needle).is_some()
2276     }
2277
2278     #[inline]
2279     fn contains_char(&self, needle: char) -> bool {
2280         self.find(needle).is_some()
2281     }
2282
2283     #[inline]
2284     fn chars(&self) -> Chars<'a> {
2285         Chars{string: *self}
2286     }
2287
2288     #[inline]
2289     fn chars_rev(&self) -> RevChars<'a> {
2290         self.chars().rev()
2291     }
2292
2293     #[inline]
2294     fn bytes(&self) -> Bytes<'a> {
2295         self.as_bytes().iter().map(|&b| b)
2296     }
2297
2298     #[inline]
2299     fn bytes_rev(&self) -> RevBytes<'a> {
2300         self.bytes().rev()
2301     }
2302
2303     #[inline]
2304     fn char_indices(&self) -> CharOffsets<'a> {
2305         CharOffsets{string: *self, iter: self.chars()}
2306     }
2307
2308     #[inline]
2309     fn char_indices_rev(&self) -> RevCharOffsets<'a> {
2310         self.char_indices().rev()
2311     }
2312
2313     #[inline]
2314     fn split<Sep: CharEq>(&self, sep: Sep) -> CharSplits<'a, Sep> {
2315         CharSplits {
2316             string: *self,
2317             only_ascii: sep.only_ascii(),
2318             sep: sep,
2319             allow_trailing_empty: true,
2320             finished: false,
2321         }
2322     }
2323
2324     #[inline]
2325     fn splitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2326         -> CharSplitsN<'a, Sep> {
2327         CharSplitsN {
2328             iter: self.split(sep),
2329             count: count,
2330             invert: false,
2331         }
2332     }
2333
2334     #[inline]
2335     fn split_terminator<Sep: CharEq>(&self, sep: Sep)
2336         -> CharSplits<'a, Sep> {
2337         CharSplits {
2338             allow_trailing_empty: false,
2339             ..self.split(sep)
2340         }
2341     }
2342
2343     #[inline]
2344     fn rsplit<Sep: CharEq>(&self, sep: Sep) -> RevCharSplits<'a, Sep> {
2345         self.split(sep).rev()
2346     }
2347
2348     #[inline]
2349     fn rsplitn<Sep: CharEq>(&self, sep: Sep, count: uint)
2350         -> CharSplitsN<'a, Sep> {
2351         CharSplitsN {
2352             iter: self.split(sep),
2353             count: count,
2354             invert: true,
2355         }
2356     }
2357
2358     #[inline]
2359     fn match_indices(&self, sep: &'a str) -> MatchIndices<'a> {
2360         assert!(!sep.is_empty())
2361         MatchIndices {
2362             haystack: *self,
2363             needle: sep,
2364             position: 0
2365         }
2366     }
2367
2368     #[inline]
2369     fn split_str(&self, sep: &'a str) -> StrSplits<'a> {
2370         StrSplits {
2371             it: self.match_indices(sep),
2372             last_end: 0,
2373             finished: false
2374         }
2375     }
2376
2377     #[inline]
2378     fn lines(&self) -> CharSplits<'a, char> {
2379         self.split_terminator('\n')
2380     }
2381
2382     fn lines_any(&self) -> AnyLines<'a> {
2383         self.lines().map(|line| {
2384             let l = line.len();
2385             if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
2386             else { line }
2387         })
2388     }
2389
2390     #[inline]
2391     fn words(&self) -> Words<'a> {
2392         self.split(char::is_whitespace).filter(|s| !s.is_empty())
2393     }
2394
2395     #[inline]
2396     fn nfd_chars(&self) -> Normalizations<'a> {
2397         Normalizations {
2398             iter: self.chars(),
2399             buffer: ~[],
2400             sorted: false,
2401             kind: NFD
2402         }
2403     }
2404
2405     #[inline]
2406     fn nfkd_chars(&self) -> Normalizations<'a> {
2407         Normalizations {
2408             iter: self.chars(),
2409             buffer: ~[],
2410             sorted: false,
2411             kind: NFKD
2412         }
2413     }
2414
2415     #[inline]
2416     fn is_whitespace(&self) -> bool { self.chars().all(char::is_whitespace) }
2417
2418     #[inline]
2419     fn is_alphanumeric(&self) -> bool { self.chars().all(char::is_alphanumeric) }
2420
2421     #[inline]
2422     fn char_len(&self) -> uint { self.chars().len() }
2423
2424     #[inline]
2425     fn slice(&self, begin: uint, end: uint) -> &'a str {
2426         assert!(self.is_char_boundary(begin) && self.is_char_boundary(end));
2427         unsafe { raw::slice_bytes(*self, begin, end) }
2428     }
2429
2430     #[inline]
2431     fn slice_from(&self, begin: uint) -> &'a str {
2432         self.slice(begin, self.len())
2433     }
2434
2435     #[inline]
2436     fn slice_to(&self, end: uint) -> &'a str {
2437         assert!(self.is_char_boundary(end));
2438         unsafe { raw::slice_bytes(*self, 0, end) }
2439     }
2440
2441     fn slice_chars(&self, begin: uint, end: uint) -> &'a str {
2442         assert!(begin <= end);
2443         let mut count = 0;
2444         let mut begin_byte = None;
2445         let mut end_byte = None;
2446
2447         // This could be even more efficient by not decoding,
2448         // only finding the char boundaries
2449         for (idx, _) in self.char_indices() {
2450             if count == begin { begin_byte = Some(idx); }
2451             if count == end { end_byte = Some(idx); break; }
2452             count += 1;
2453         }
2454         if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
2455         if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
2456
2457         match (begin_byte, end_byte) {
2458             (None, _) => fail!("slice_chars: `begin` is beyond end of string"),
2459             (_, None) => fail!("slice_chars: `end` is beyond end of string"),
2460             (Some(a), Some(b)) => unsafe { raw::slice_bytes(*self, a, b) }
2461         }
2462     }
2463
2464     #[inline]
2465     fn starts_with<'a>(&self, needle: &'a str) -> bool {
2466         let n = needle.len();
2467         self.len() >= n && needle.as_bytes() == self.as_bytes().slice_to(n)
2468     }
2469
2470     #[inline]
2471     fn ends_with(&self, needle: &str) -> bool {
2472         let (m, n) = (self.len(), needle.len());
2473         m >= n && needle.as_bytes() == self.as_bytes().slice_from(m - n)
2474     }
2475
2476     fn escape_default(&self) -> ~str {
2477         let mut out = StrBuf::with_capacity(self.len());
2478         for c in self.chars() {
2479             c.escape_default(|c| out.push_char(c));
2480         }
2481         out.into_owned()
2482     }
2483
2484     fn escape_unicode(&self) -> ~str {
2485         let mut out = StrBuf::with_capacity(self.len());
2486         for c in self.chars() {
2487             c.escape_unicode(|c| out.push_char(c));
2488         }
2489         out.into_owned()
2490     }
2491
2492     #[inline]
2493     fn trim(&self) -> &'a str {
2494         self.trim_left().trim_right()
2495     }
2496
2497     #[inline]
2498     fn trim_left(&self) -> &'a str {
2499         self.trim_left_chars(&char::is_whitespace)
2500     }
2501
2502     #[inline]
2503     fn trim_right(&self) -> &'a str {
2504         self.trim_right_chars(&char::is_whitespace)
2505     }
2506
2507     #[inline]
2508     fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2509         self.trim_left_chars(to_trim).trim_right_chars(to_trim)
2510     }
2511
2512     #[inline]
2513     fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2514         match self.find(|c: char| !to_trim.matches(c)) {
2515             None => "",
2516             Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
2517         }
2518     }
2519
2520     #[inline]
2521     fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'a str {
2522         match self.rfind(|c: char| !to_trim.matches(c)) {
2523             None => "",
2524             Some(last) => {
2525                 let next = self.char_range_at(last).next;
2526                 unsafe { raw::slice_bytes(*self, 0u, next) }
2527             }
2528         }
2529     }
2530
2531     fn replace(&self, from: &str, to: &str) -> ~str {
2532         let mut result = StrBuf::new();
2533         let mut last_end = 0;
2534         for (start, end) in self.match_indices(from) {
2535             result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
2536             result.push_str(to);
2537             last_end = end;
2538         }
2539         result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
2540         result.into_owned()
2541     }
2542
2543     #[inline]
2544     fn to_owned(&self) -> ~str {
2545         let len = self.len();
2546         unsafe {
2547             let mut v = slice::with_capacity(len);
2548
2549             ptr::copy_memory(v.as_mut_ptr(), self.as_ptr(), len);
2550             v.set_len(len);
2551             ::cast::transmute(v)
2552         }
2553     }
2554
2555     fn to_utf16(&self) -> ~[u16] {
2556         let mut u = ~[];
2557         for ch in self.chars() {
2558             // Arithmetic with u32 literals is easier on the eyes than chars.
2559             let mut ch = ch as u32;
2560
2561             if (ch & 0xFFFF_u32) == ch {
2562                 // The BMP falls through (assuming non-surrogate, as it
2563                 // should)
2564                 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
2565                 u.push(ch as u16)
2566             } else {
2567                 // Supplementary planes break into surrogates.
2568                 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
2569                 ch -= 0x1_0000_u32;
2570                 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
2571                 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
2572                 u.push_all([w1, w2])
2573             }
2574         }
2575         u
2576     }
2577
2578     #[inline]
2579     fn is_char_boundary(&self, index: uint) -> bool {
2580         if index == self.len() { return true; }
2581         let b = self[index];
2582         return b < 128u8 || b >= 192u8;
2583     }
2584
2585     #[inline]
2586     fn char_range_at(&self, i: uint) -> CharRange {
2587         if self[i] < 128u8 {
2588             return CharRange {ch: self[i] as char, next: i + 1 };
2589         }
2590
2591         // Multibyte case is a fn to allow char_range_at to inline cleanly
2592         fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
2593             let mut val = s[i] as u32;
2594             let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2595             assert!((w != 0));
2596
2597             val = utf8_first_byte!(val, w);
2598             val = utf8_acc_cont_byte!(val, s[i + 1]);
2599             if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2600             if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2601
2602             return CharRange {ch: unsafe { transmute(val) }, next: i + w};
2603         }
2604
2605         return multibyte_char_range_at(*self, i);
2606     }
2607
2608     #[inline]
2609     fn char_range_at_reverse(&self, start: uint) -> CharRange {
2610         let mut prev = start;
2611
2612         prev = prev.saturating_sub(1);
2613         if self[prev] < 128 { return CharRange{ch: self[prev] as char, next: prev} }
2614
2615         // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
2616         fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
2617             // while there is a previous byte == 10......
2618             while i > 0 && s[i] & 192u8 == TAG_CONT_U8 {
2619                 i -= 1u;
2620             }
2621
2622             let mut val = s[i] as u32;
2623             let w = UTF8_CHAR_WIDTH[val as uint] as uint;
2624             assert!((w != 0));
2625
2626             val = utf8_first_byte!(val, w);
2627             val = utf8_acc_cont_byte!(val, s[i + 1]);
2628             if w > 2 { val = utf8_acc_cont_byte!(val, s[i + 2]); }
2629             if w > 3 { val = utf8_acc_cont_byte!(val, s[i + 3]); }
2630
2631             return CharRange {ch: unsafe { transmute(val) }, next: i};
2632         }
2633
2634         return multibyte_char_range_at_reverse(*self, prev);
2635     }
2636
2637     #[inline]
2638     fn char_at(&self, i: uint) -> char {
2639         self.char_range_at(i).ch
2640     }
2641
2642     #[inline]
2643     fn char_at_reverse(&self, i: uint) -> char {
2644         self.char_range_at_reverse(i).ch
2645     }
2646
2647     #[inline]
2648     fn as_bytes(&self) -> &'a [u8] {
2649         unsafe { cast::transmute(*self) }
2650     }
2651
2652     fn find<C: CharEq>(&self, search: C) -> Option<uint> {
2653         if search.only_ascii() {
2654             self.bytes().position(|b| search.matches(b as char))
2655         } else {
2656             for (index, c) in self.char_indices() {
2657                 if search.matches(c) { return Some(index); }
2658             }
2659             None
2660         }
2661     }
2662
2663     fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
2664         if search.only_ascii() {
2665             self.bytes().rposition(|b| search.matches(b as char))
2666         } else {
2667             for (index, c) in self.char_indices_rev() {
2668                 if search.matches(c) { return Some(index); }
2669             }
2670             None
2671         }
2672     }
2673
2674     fn find_str(&self, needle: &str) -> Option<uint> {
2675         if needle.is_empty() {
2676             Some(0)
2677         } else {
2678             self.match_indices(needle)
2679                 .next()
2680                 .map(|(start, _end)| start)
2681         }
2682     }
2683
2684     fn repeat(&self, nn: uint) -> ~str {
2685         let mut ret = StrBuf::with_capacity(nn * self.len());
2686         for _ in range(0, nn) {
2687             ret.push_str(*self);
2688         }
2689         ret.into_owned()
2690     }
2691
2692     #[inline]
2693     fn slice_shift_char(&self) -> (Option<char>, &'a str) {
2694         if self.is_empty() {
2695             return (None, *self);
2696         } else {
2697             let CharRange {ch, next} = self.char_range_at(0u);
2698             let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
2699             return (Some(ch), next_s);
2700         }
2701     }
2702
2703     fn lev_distance(&self, t: &str) -> uint {
2704         let slen = self.len();
2705         let tlen = t.len();
2706
2707         if slen == 0 { return tlen; }
2708         if tlen == 0 { return slen; }
2709
2710         let mut dcol = slice::from_fn(tlen + 1, |x| x);
2711
2712         for (i, sc) in self.chars().enumerate() {
2713
2714             let mut current = i;
2715             dcol[0] = current + 1;
2716
2717             for (j, tc) in t.chars().enumerate() {
2718
2719                 let next = dcol[j + 1];
2720
2721                 if sc == tc {
2722                     dcol[j + 1] = current;
2723                 } else {
2724                     dcol[j + 1] = ::cmp::min(current, next);
2725                     dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
2726                 }
2727
2728                 current = next;
2729             }
2730         }
2731
2732         return dcol[tlen];
2733     }
2734
2735     fn subslice_offset(&self, inner: &str) -> uint {
2736         let a_start = self.as_ptr() as uint;
2737         let a_end = a_start + self.len();
2738         let b_start = inner.as_ptr() as uint;
2739         let b_end = b_start + inner.len();
2740
2741         assert!(a_start <= b_start);
2742         assert!(b_end <= a_end);
2743         b_start - a_start
2744     }
2745
2746     #[inline]
2747     fn as_ptr(&self) -> *u8 {
2748         self.repr().data
2749     }
2750 }
2751
/// Methods for owned strings
pub trait OwnedStr {
    /// Shorten a string to the specified length (which must be <= the current length)
    ///
    /// The implementation for `~str` fails if `len` is greater than the
    /// current length, or if `len` does not lie on a character boundary.
    fn truncate(&mut self, len: uint);

    /// Consumes the string, returning the underlying byte buffer.
    ///
    /// The buffer does not have a null terminator.
    fn into_bytes(self) -> ~[u8];

    /// Sets the length of a string
    ///
    /// This will explicitly set the size of the string, without actually
    /// modifying its buffers, so it is up to the caller to ensure that
    /// the string is actually the specified size.
    ///
    /// This is `unsafe` because nothing is checked: the caller is
    /// responsible for the resulting contents being valid.
    unsafe fn set_len(&mut self, new_len: uint);

    /// Pushes the given string onto this string, returning the concatenation of the two strings.
    ///
    /// The receiver is consumed; `rhs` is only borrowed.
    fn append(self, rhs: &str) -> ~str;
}
2772
2773 impl OwnedStr for ~str {
2774     #[inline]
2775     fn truncate(&mut self, len: uint) {
2776         assert!(len <= self.len());
2777         assert!(self.is_char_boundary(len));
2778         unsafe { self.set_len(len); }
2779     }
2780
2781     #[inline]
2782     fn into_bytes(self) -> ~[u8] {
2783         unsafe { cast::transmute(self) }
2784     }
2785
2786     #[inline]
2787     unsafe fn set_len(&mut self, new_len: uint) {
2788         raw::as_owned_vec(self).set_len(new_len)
2789     }
2790
2791     #[inline]
2792     fn append(self, rhs: &str) -> ~str {
2793         let mut new_str = StrBuf::from_owned_str(self);
2794         new_str.push_str(rhs);
2795         new_str.into_owned()
2796     }
2797 }
2798
impl Clone for ~str {
    // Cloning an owned string delegates to `to_owned()`.
    #[inline]
    fn clone(&self) -> ~str {
        self.to_owned()
    }
}
2805
2806 impl FromIterator<char> for ~str {
2807     #[inline]
2808     fn from_iter<T: Iterator<char>>(iterator: T) -> ~str {
2809         let (lower, _) = iterator.size_hint();
2810         let mut buf = StrBuf::with_capacity(lower);
2811         buf.extend(iterator);
2812         buf.into_owned()
2813     }
2814 }
2815
// The literal `""` is borrowed with the 'static lifetime; returning it as
// `&'a str` works because every lifetime is a sub-lifetime of 'static
impl<'a> Default for &'a str {
    fn default() -> &'a str { "" }
}
2820
// The default owned string is the empty string.
impl Default for ~str {
    fn default() -> ~str { ~"" }
}
2824
2825 #[cfg(test)]
2826 mod tests {
2827     use iter::AdditiveIterator;
2828     use default::Default;
2829     use prelude::*;
2830     use str::*;
2831     use strbuf::StrBuf;
2832
2833     #[test]
2834     fn test_eq() {
2835         assert!((eq(&~"", &~"")));
2836         assert!((eq(&~"foo", &~"foo")));
2837         assert!((!eq(&~"foo", &~"bar")));
2838     }
2839
2840     #[test]
2841     fn test_eq_slice() {
2842         assert!((eq_slice("foobar".slice(0, 3), "foo")));
2843         assert!((eq_slice("barfoo".slice(3, 6), "foo")));
2844         assert!((!eq_slice("foo1", "foo2")));
2845     }
2846
2847     #[test]
2848     fn test_le() {
2849         assert!("" <= "");
2850         assert!("" <= "foo");
2851         assert!("foo" <= "foo");
2852         assert!("foo" != "bar");
2853     }
2854
    // Compares `len` and `char_len` on ASCII and multi-byte strings: the
    // expected `len` grows with the encoded width of the character, while
    // `char_len` stays at one per character.
    #[test]
    fn test_len() {
        assert_eq!("".len(), 0u);
        assert_eq!("hello world".len(), 11u);
        assert_eq!("\x63".len(), 1u);
        assert_eq!("\xa2".len(), 2u);
        assert_eq!("\u03c0".len(), 2u);
        assert_eq!("\u2620".len(), 3u);
        assert_eq!("\U0001d11e".len(), 4u);

        assert_eq!("".char_len(), 0u);
        assert_eq!("hello world".char_len(), 11u);
        assert_eq!("\x63".char_len(), 1u);
        assert_eq!("\xa2".char_len(), 1u);
        assert_eq!("\u03c0".char_len(), 1u);
        assert_eq!("\u2620".char_len(), 1u);
        assert_eq!("\U0001d11e".char_len(), 1u);
        assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
    }
2874
    // `find` with a `char` and with a closure predicate. Expected positions
    // are byte offsets: '华' follows ten 3-byte characters, hence 30.
    #[test]
    fn test_find() {
        assert_eq!("hello".find('l'), Some(2u));
        assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
        assert!("hello".find('x').is_none());
        assert!("hello".find(|c:char| c == 'x').is_none());
        assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
        assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
    }
2884
    // `rfind` with a `char` and with a closure predicate; for "hello" the
    // expected match for 'l' is the later one, at offset 3.
    #[test]
    fn test_rfind() {
        assert_eq!("hello".rfind('l'), Some(3u));
        assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
        assert!("hello".rfind('x').is_none());
        assert!("hello".rfind(|c:char| c == 'x').is_none());
        assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
        assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
    }
2894
2895     #[test]
2896     fn test_collect() {
2897         let empty = ~"";
2898         let s: ~str = empty.chars().collect();
2899         assert_eq!(empty, s);
2900         let data = ~"ประเทศไทย中";
2901         let s: ~str = data.chars().collect();
2902         assert_eq!(data, s);
2903     }
2904
2905     #[test]
2906     fn test_into_bytes() {
2907         let data = ~"asdf";
2908         let buf = data.into_bytes();
2909         assert_eq!(bytes!("asdf"), buf.as_slice());
2910     }
2911
    // `find_str` on ASCII and multi-byte haystacks, including searches inside
    // sub-slices, where the result is relative to the start of the slice.
    #[test]
    fn test_find_str() {
        // byte positions
        assert_eq!("".find_str(""), Some(0u));
        assert!("banana".find_str("apple pie").is_none());

        let data = "abcabc";
        assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
        assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
        assert!(data.slice(2u, 4u).find_str("ab").is_none());

        // The string is doubled below, so bytes 0..43 and 43..86 hold the
        // two identical copies.
        let mut data = ~"ประเทศไทย中华Việt Nam";
        data = data + data;
        assert!(data.find_str("ไท华").is_none());
        assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
        assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));

        assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
        assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
        assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
        assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
        assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));

        assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
        assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
        assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
        assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
        assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
    }
2941
    // `slice_chars` takes character indices rather than byte offsets.
    #[test]
    fn test_slice_chars() {
        // Asserts that the `b.char_len()` characters of `a` beginning at
        // character index `start` are equal to `b`.
        fn t(a: &str, b: &str, start: uint) {
            assert_eq!(a.slice_chars(start, start + b.char_len()), b);
        }
        t("", "", 0);
        t("hello", "llo", 2);
        t("hello", "el", 1);
        t("αβλ", "β", 1);
        t("αβλ", "", 3);
        assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
    }
2954
2955     #[test]
2956     fn test_concat() {
2957         fn t(v: &[~str], s: &str) {
2958             assert_eq!(v.concat(), s.to_str());
2959         }
2960         t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
2961         let v: &[~str] = [];
2962         t(v, "");
2963         t([~"hi"], "hi");
2964     }
2965
2966     #[test]
2967     fn test_connect() {
2968         fn t(v: &[~str], sep: &str, s: &str) {
2969             assert_eq!(v.connect(sep), s.to_str());
2970         }
2971         t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
2972           " ", "you know I'm no good");
2973         let v: &[~str] = [];
2974         t(v, " ", "");
2975         t([~"hi"], " ", "hi");
2976     }
2977
2978     #[test]
2979     fn test_concat_slices() {
2980         fn t(v: &[&str], s: &str) {
2981             assert_eq!(v.concat(), s.to_str());
2982         }
2983         t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2984         let v: &[&str] = [];
2985         t(v, "");
2986         t(["hi"], "hi");
2987     }
2988
2989     #[test]
2990     fn test_connect_slices() {
2991         fn t(v: &[&str], sep: &str, s: &str) {
2992             assert_eq!(v.connect(sep), s.to_str());
2993         }
2994         t(["you", "know", "I'm", "no", "good"],
2995           " ", "you know I'm no good");
2996         t([], " ", "");
2997         t(["hi"], " ", "hi");
2998     }
2999
    // `repeat` with ASCII and multi-byte input, an empty input, and a
    // repeat count of zero.
    #[test]
    fn test_repeat() {
        assert_eq!("x".repeat(4), ~"xxxx");
        assert_eq!("hi".repeat(4), ~"hihihihi");
        assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
        assert_eq!("".repeat(4), ~"");
        assert_eq!("hi".repeat(0), ~"");
    }
3008
3009     #[test]
3010     fn test_unsafe_slice() {
3011         assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
3012         assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
3013         assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
3014         fn a_million_letter_a() -> ~str {
3015             let mut i = 0;
3016             let mut rs = StrBuf::new();
3017             while i < 100000 {
3018                 rs.push_str("aaaaaaaaaa");
3019                 i += 1;
3020             }
3021             rs.into_owned()
3022         }
3023         fn half_a_million_letter_a() -> ~str {
3024             let mut i = 0;
3025             let mut rs = StrBuf::new();
3026             while i < 100000 {
3027                 rs.push_str("aaaaa");
3028                 i += 1;
3029             }
3030             rs.into_owned()
3031         }
3032         let letters = a_million_letter_a();
3033         assert!(half_a_million_letter_a() ==
3034             unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
3035     }
3036
3037     #[test]
3038     fn test_starts_with() {
3039         assert!(("".starts_with("")));
3040         assert!(("abc".starts_with("")));
3041         assert!(("abc".starts_with("a")));
3042         assert!((!"a".starts_with("abc")));
3043         assert!((!"".starts_with("abc")));
3044         assert!((!"ödd".starts_with("-")));
3045         assert!(("ödd".starts_with("öd")));
3046     }
3047
3048     #[test]
3049     fn test_ends_with() {
3050         assert!(("".ends_with("")));
3051         assert!(("abc".ends_with("")));
3052         assert!(("abc".ends_with("c")));
3053         assert!((!"a".ends_with("abc")));
3054         assert!((!"".ends_with("abc")));
3055         assert!((!"ddö".ends_with("-")));
3056         assert!(("ddö".ends_with("dö")));
3057     }
3058
    // `is_empty` is true for "" and false for a one-character string.
    #[test]
    fn test_is_empty() {
        assert!("".is_empty());
        assert!(!"a".is_empty());
    }
3064
    // `replace` on ASCII input: empty haystack, single and repeated matches,
    // and replacement with the empty string.
    #[test]
    fn test_replace() {
        let a = "a";
        assert_eq!("".replace(a, "b"), ~"");
        assert_eq!("a".replace(a, "b"), ~"b");
        assert_eq!("ab".replace(a, "b"), ~"bb");
        let test = "test";
        assert!(" test test ".replace(test, "toast") ==
            ~" toast toast ");
        // Removing both matches leaves only the three surrounding spaces.
        assert_eq!(" test test ".replace(test, ""), ~"   ");
    }
3076
    // `replace` with multi-byte text: the pattern matches at the start.
    #[test]
    fn test_replace_2a() {
        let data = ~"ประเทศไทย中华";
        let repl = ~"دولة الكويت";

        let a = ~"ประเ";
        let a2 = ~"دولة الكويتทศไทย中华";
        assert_eq!(data.replace(a, repl), a2);
    }
3086
    // `replace` with multi-byte text: the pattern matches in the interior.
    #[test]
    fn test_replace_2b() {
        let data = ~"ประเทศไทย中华";
        let repl = ~"دولة الكويت";

        let b = ~"ะเ";
        let b2 = ~"ปรدولة الكويتทศไทย中华";
        assert_eq!(data.replace(b, repl), b2);
    }
3096
    // `replace` with multi-byte text: the pattern matches at the end.
    #[test]
    fn test_replace_2c() {
        let data = ~"ประเทศไทย中华";
        let repl = ~"دولة الكويت";

        let c = ~"中华";
        let c2 = ~"ประเทศไทยدولة الكويت";
        assert_eq!(data.replace(c, repl), c2);
    }
3106
    // `replace` with multi-byte text: the pattern's characters all occur in
    // `data` but never contiguously, so the result must equal the input.
    #[test]
    fn test_replace_2d() {
        let data = ~"ประเทศไทย中华";
        let repl = ~"دولة الكويت";

        let d = ~"ไท华";
        assert_eq!(data.replace(d, repl), data);
    }
3115
3116     #[test]
3117     fn test_slice() {
3118         assert_eq!("ab", "abc".slice(0, 2));
3119         assert_eq!("bc", "abc".slice(1, 3));
3120         assert_eq!("", "abc".slice(1, 1));
3121         assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
3122
3123         let data = "ประเทศไทย中华";
3124         assert_eq!("ป", data.slice(0, 3));
3125         assert_eq!("ร", data.slice(3, 6));
3126         assert_eq!("", data.slice(3, 3));
3127         assert_eq!("华", data.slice(30, 33));
3128
3129         fn a_million_letter_X() -> ~str {
3130             let mut i = 0;
3131             let mut rs = StrBuf::new();
3132             while i < 100000 {
3133                 rs.push_str("华华华华华华华华华华");
3134                 i += 1;
3135             }
3136             rs.into_owned()
3137         }
3138         fn half_a_million_letter_X() -> ~str {
3139             let mut i = 0;
3140             let mut rs = StrBuf::new();
3141             while i < 100000 {
3142                 rs.push_str("华华华华华");
3143                 i += 1;
3144             }
3145             rs.into_owned()
3146         }
3147         let letters = a_million_letter_X();
3148         assert!(half_a_million_letter_X() ==
3149             letters.slice(0u, 3u * 500000u).to_owned());
3150     }
3151
    // `slice` on a string mixing 3-byte and 1-byte characters; the table at
    // the end of the body lists the byte offset at which each character of
    // `ss` starts.
    #[test]
    fn test_slice_2() {
        let ss = "中华Việt Nam";

        assert_eq!("华", ss.slice(3u, 6u));
        assert_eq!("Việt Nam", ss.slice(6u, 16u));

        assert_eq!("ab", "abc".slice(0u, 2u));
        assert_eq!("bc", "abc".slice(1u, 3u));
        assert_eq!("", "abc".slice(1u, 1u));

        assert_eq!("中", ss.slice(0u, 3u));
        assert_eq!("华V", ss.slice(3u, 7u));
        assert_eq!("", ss.slice(3u, 3u));
        /*0: 中
          3: 华
          6: V
          7: i
          8: ệ
         11: t
         12:
         13: N
         14: a
         15: m */
    }
3177
    // Expected to fail: the end offset 2 lies inside the first character,
    // which occupies bytes 0..3.
    #[test]
    #[should_fail]
    fn test_slice_fail() {
        "中华Việt Nam".slice(0u, 2u);
    }
3183
    // `slice_from` at the start, in the middle, and at the very end.
    #[test]
    fn test_slice_from() {
        assert_eq!("abcd".slice_from(0), "abcd");
        assert_eq!("abcd".slice_from(2), "cd");
        assert_eq!("abcd".slice_from(4), "");
    }
    // `slice_to` at the start, in the middle, and at the very end.
    #[test]
    fn test_slice_to() {
        assert_eq!("abcd".slice_to(0), "");
        assert_eq!("abcd".slice_to(2), "ab");
        assert_eq!("abcd".slice_to(4), "abcd");
    }
3196
    // `trim_left_chars` driven by a char slice (empty and non-empty), by a
    // single `char`, and by a closure predicate.
    #[test]
    fn test_trim_left_chars() {
        let v: &[char] = &[];
        assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
        assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
        assert_eq!(" ***  *** ".trim_left_chars(& &['*', ' ']), "");
        assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");

        assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
        assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
        assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
    }
3209
    // `trim_right_chars` driven by a char slice (empty and non-empty), by a
    // single `char`, and by a closure predicate.
    #[test]
    fn test_trim_right_chars() {
        let v: &[char] = &[];
        assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
        assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
        assert_eq!(" ***  *** ".trim_right_chars(& &['*', ' ']), "");
        assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");

        assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
        assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
        assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
    }
3222
    // `trim_chars` (both ends) driven by a char slice (empty and non-empty),
    // by a single `char`, and by a closure predicate.
    #[test]
    fn test_trim_chars() {
        let v: &[char] = &[];
        assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
        assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
        assert_eq!(" ***  *** ".trim_chars(& &['*', ' ']), "");
        assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");

        assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
        assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
        assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
    }
3235
    // `trim_left` strips leading whitespace only, including the non-ASCII
    // U+3000, and leaves trailing whitespace in place.
    #[test]
    fn test_trim_left() {
        assert_eq!("".trim_left(), "");
        assert_eq!("a".trim_left(), "a");
        assert_eq!("    ".trim_left(), "");
        assert_eq!("     blah".trim_left(), "blah");
        assert_eq!("   \u3000  wut".trim_left(), "wut");
        assert_eq!("hey ".trim_left(), "hey ");
    }
3245
    // `trim_right` strips trailing whitespace only, including the non-ASCII
    // U+3000, and leaves leading whitespace in place.
    #[test]
    fn test_trim_right() {
        assert_eq!("".trim_right(), "");
        assert_eq!("a".trim_right(), "a");
        assert_eq!("    ".trim_right(), "");
        assert_eq!("blah     ".trim_right(), "blah");
        assert_eq!("wut   \u3000  ".trim_right(), "wut");
        assert_eq!(" hey".trim_right(), " hey");
    }
3255
    // `trim` strips whitespace from both ends and keeps interior whitespace.
    #[test]
    fn test_trim() {
        assert_eq!("".trim(), "");
        assert_eq!("a".trim(), "a");
        assert_eq!("    ".trim(), "");
        assert_eq!("    blah     ".trim(), "blah");
        assert_eq!("\nwut   \u3000  ".trim(), "wut");
        assert_eq!(" hey dude ".trim(), "hey dude");
    }
3265
    // `is_whitespace` holds for the empty string and for all-whitespace
    // strings, and fails as soon as one non-whitespace character is present.
    #[test]
    fn test_is_whitespace() {
        assert!("".is_whitespace());
        assert!(" ".is_whitespace());
        assert!("\u2009".is_whitespace()); // Thin space
        assert!("  \n\t   ".is_whitespace());
        assert!(!"   _   ".is_whitespace());
    }
3274
    // `slice_shift_char` on a non-empty string yields the first character
    // and the remainder of the string.
    #[test]
    fn test_slice_shift_char() {
        let data = "ประเทศไทย中";
        assert_eq!(data.slice_shift_char(), (Some('ป'), "ระเทศไทย中"));
    }
3280
    // `slice_shift_char` on the empty string yields `None` and "".
    #[test]
    fn test_slice_shift_char_2() {
        let empty = "";
        assert_eq!(empty.slice_shift_char(), (None, ""));
    }
3286
    // `is_utf8` must reject malformed byte sequences and accept the
    // well-formed sequences at the edges of each encoded length.
    #[test]
    fn test_is_utf8() {
        // deny overlong encodings
        assert!(!is_utf8([0xc0, 0x80]));
        assert!(!is_utf8([0xc0, 0xae]));
        assert!(!is_utf8([0xe0, 0x80, 0x80]));
        assert!(!is_utf8([0xe0, 0x80, 0xaf]));
        assert!(!is_utf8([0xe0, 0x81, 0x81]));
        assert!(!is_utf8([0xf0, 0x82, 0x82, 0xac]));
        // (this last sequence is out of range, above U+10FFFF, rather than
        // overlong)
        assert!(!is_utf8([0xf4, 0x90, 0x80, 0x80]));

        // deny surrogates
        assert!(!is_utf8([0xED, 0xA0, 0x80]));
        assert!(!is_utf8([0xED, 0xBF, 0xBF]));

        // accept: U+0080, U+07FF, U+0800, U+D7FF, U+E000, U+FFFF, U+10000
        // and U+10FFFF
        assert!(is_utf8([0xC2, 0x80]));
        assert!(is_utf8([0xDF, 0xBF]));
        assert!(is_utf8([0xE0, 0xA0, 0x80]));
        assert!(is_utf8([0xED, 0x9F, 0xBF]));
        assert!(is_utf8([0xEE, 0x80, 0x80]));
        assert!(is_utf8([0xEF, 0xBF, 0xBF]));
        assert!(is_utf8([0xF0, 0x90, 0x80, 0x80]));
        assert!(is_utf8([0xF4, 0x8F, 0xBF, 0xBF]));
    }
3311
    // `is_utf16` on sequences of code units: `pos!` asserts that every
    // listed sequence is accepted, `neg!` that every listed one is rejected.
    #[test]
    fn test_is_utf16() {
        macro_rules! pos ( ($($e:expr),*) => { { $(assert!(is_utf16($e));)* } });

        // non-surrogates
        pos!([0x0000],
             [0x0001, 0x0002],
             [0xD7FF],
             [0xE000]);

        // surrogate pairs (randomly generated with Python 3's
        // .encode('utf-16be'))
        pos!([0xdb54, 0xdf16, 0xd880, 0xdee0, 0xdb6a, 0xdd45],
             [0xd91f, 0xdeb1, 0xdb31, 0xdd84, 0xd8e2, 0xde14],
             [0xdb9f, 0xdc26, 0xdb6f, 0xde58, 0xd850, 0xdfae]);

        // mixtures (also random)
        pos!([0xd921, 0xdcc2, 0x002d, 0x004d, 0xdb32, 0xdf65],
             [0xdb45, 0xdd2d, 0x006a, 0xdacd, 0xddfe, 0x0006],
             [0x0067, 0xd8ff, 0xddb7, 0x000f, 0xd900, 0xdc80]);

        // negative tests
        macro_rules! neg ( ($($e:expr),*) => { { $(assert!(!is_utf16($e));)* } });

        neg!(
            // surrogate + regular unit
            [0xdb45, 0x0000],
            // surrogate + lead surrogate
            [0xd900, 0xd900],
            // unterminated surrogate
            [0xd8ff],
            // trail surrogate without a lead
            [0xddb7]);

        // random byte sequences that Python 3's .decode('utf-16be')
        // failed on
        neg!([0x5b3d, 0x0141, 0xde9e, 0x8fdc, 0xc6e7],
             [0xdf5a, 0x82a5, 0x62b9, 0xb447, 0x92f3],
             [0xda4e, 0x42bc, 0x4462, 0xee98, 0xc2ca],
             [0xbe00, 0xb04a, 0x6ecb, 0xdd89, 0xe278],
             [0x0465, 0xab56, 0xdbb6, 0xa893, 0x665e],
             [0x6b7f, 0x0a19, 0x40f4, 0xa657, 0xdcc5],
             [0x9b50, 0xda5e, 0x24ec, 0x03ad, 0x6dee],
             [0x8d17, 0xcaa7, 0xf4ae, 0xdf6e, 0xbed7],
             [0xdaee, 0x2584, 0x7d30, 0xa626, 0x121a],
             [0xd956, 0x4b43, 0x7570, 0xccd6, 0x4f4a],
             [0x9dcf, 0x1b49, 0x4ba5, 0xfce9, 0xdffe],
             [0x6572, 0xce53, 0xb05a, 0xf6af, 0xdacf],
             [0x1b90, 0x728c, 0x9906, 0xdb68, 0xf46e],
             [0x1606, 0xbeca, 0xbe76, 0x860f, 0xdfa5],
             [0x8b4f, 0xde7a, 0xd220, 0x9fac, 0x2b6f],
             [0xb8fe, 0xebbe, 0xda32, 0x1a5f, 0x8b8b],
             [0x934b, 0x8956, 0xc434, 0x1881, 0xddf7],
             [0x5a95, 0x13fc, 0xf116, 0xd89b, 0x93f9],
             [0xd640, 0x71f1, 0xdd7d, 0x77eb, 0x1cd8],
             [0x348b, 0xaef0, 0xdb2c, 0xebf1, 0x1282],
             [0x50d7, 0xd824, 0x5010, 0xb369, 0x22ea]);
    }
3370
    // `raw::from_c_str` on a buffer of seven 'A' bytes (65) followed by a
    // zero byte must produce the seven-character string.
    #[test]
    fn test_raw_from_c_str() {
        unsafe {
            let a = ~[65, 65, 65, 65, 65, 65, 65, 0];
            let b = a.as_ptr();
            let c = raw::from_c_str(b);
            assert_eq!(c, ~"AAAAAAA");
        }
    }
3380
    // `as_bytes` on empty, ASCII and multi-byte strings; `v` holds the bytes
    // expected for the multi-byte string in the last assertion.
    #[test]
    fn test_as_bytes() {
        // no null
        let v = [
            224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
            184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
            109
        ];
        assert_eq!("".as_bytes(), &[]);
        assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
        assert_eq!("ศไทย中华Việt Nam".as_bytes(), v.as_slice());
    }
3393
    // Fails on purpose (via `fail!()`) while a byte view of an owned string
    // is still alive, so unwinding runs with `_bytes` borrowed from `s`.
    #[test]
    #[should_fail]
    fn test_as_bytes_fail() {
        // Don't double free. (I'm not sure if this exercises the
        // original problem code path anymore.)
        let s = ~"";
        let _bytes = s.as_bytes();
        fail!();
    }
3403
    // Reads the five bytes of "hello" back through the pointer returned by
    // `as_ptr`, one offset at a time.
    #[test]
    fn test_as_ptr() {
        let buf = "hello".as_ptr();
        unsafe {
            assert_eq!(*buf.offset(0), 'h' as u8);
            assert_eq!(*buf.offset(1), 'e' as u8);
            assert_eq!(*buf.offset(2), 'l' as u8);
            assert_eq!(*buf.offset(3), 'l' as u8);
            assert_eq!(*buf.offset(4), 'o' as u8);
        }
    }
3415
3416     #[test]
3417     fn test_subslice_offset() {
3418         let a = "kernelsprite";
3419         let b = a.slice(7, a.len());
3420         let c = a.slice(0, a.len() - 6);
3421         assert_eq!(a.subslice_offset(b), 7);
3422         assert_eq!(a.subslice_offset(c), 0);
3423
3424         let string = "a\nb\nc";
3425         let mut lines = ~[];
3426         for line in string.lines() { lines.push(line) }
3427         assert_eq!(string.subslice_offset(lines[0]), 0);
3428         assert_eq!(string.subslice_offset(lines[1]), 2);
3429         assert_eq!(string.subslice_offset(lines[2]), 4);
3430     }
3431
    // Expected to fail: `b` is a separate literal, not a slice of `a`.
    #[test]
    #[should_fail]
    fn test_subslice_offset_2() {
        let a = "alchemiter";
        let b = "cruxtruder";
        a.subslice_offset(b);
    }
3439
3440     #[test]
3441     fn vec_str_conversions() {
3442         let s1: ~str = ~"All mimsy were the borogoves";
3443
3444         let v: ~[u8] = s1.as_bytes().to_owned();
3445         let s2: ~str = from_utf8(v).unwrap().to_owned();
3446         let mut i: uint = 0u;
3447         let n1: uint = s1.len();
3448         let n2: uint = v.len();
3449         assert_eq!(n1, n2);
3450         while i < n1 {
3451             let a: u8 = s1[i];
3452             let b: u8 = s2[i];
3453             debug!("{}", a);
3454             debug!("{}", b);
3455             assert_eq!(a, b);
3456             i += 1u;
3457         }
3458     }
3459
    // `contains` with needles at the start, middle and end, the empty
    // needle, and needles that are absent.
    #[test]
    fn test_contains() {
        assert!("abcde".contains("bcd"));
        assert!("abcde".contains("abcd"));
        assert!("abcde".contains("bcde"));
        assert!("abcde".contains(""));
        assert!("".contains(""));
        assert!(!"abcde".contains("def"));
        assert!(!"".contains("a"));

        let data = ~"ประเทศไทย中华Việt Nam";
        assert!(data.contains("ประเ"));
        assert!(data.contains("ะเ"));
        assert!(data.contains("中华"));
        // Each character occurs in `data`, but never in this order.
        assert!(!data.contains("ไท华"));
    }
3476
    // `contains_char` for present and absent characters, and for the empty
    // string.
    #[test]
    fn test_contains_char() {
        assert!("abc".contains_char('b'));
        assert!("a".contains_char('a'));
        assert!(!"abc".contains_char('d'));
        assert!(!"".contains_char('a'));
    }
3484
    // Each entry of `pairs` is a string together with its expected UTF-16
    // code units. For every pair the loop checks `is_utf16`, `to_utf16`,
    // `from_utf16`, `from_utf16_lossy`, and both round trips.
    #[test]
    fn test_utf16() {
        let pairs =
            [(~"𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n",
              ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
                0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
                0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
                0xd800_u16, 0xdf30_u16, 0x000a_u16]),

             (~"𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n",
              ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
                0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
                0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
                0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
                0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
                0x000a_u16]),

             (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
              ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
                0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
                0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
                0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
                0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
                0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
                0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),

             (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
              ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
                0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
                0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
                0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
                0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
                0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
                0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
                0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
                0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
                0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
                0x000a_u16 ]),
             // Issue #12318, even-numbered non-BMP planes
             (~"\U00020000",
              ~[0xD840, 0xDC00])];

        for p in pairs.iter() {
            let (s, u) = (*p).clone();
            assert!(is_utf16(u));
            assert_eq!(s.to_utf16(), u);

            assert_eq!(from_utf16(u).unwrap(), s);
            assert_eq!(from_utf16_lossy(u), s);

            // Round trips in both directions.
            assert_eq!(from_utf16(s.to_utf16()).unwrap(), s);
            assert_eq!(from_utf16(u).unwrap().to_utf16(), u);
        }
    }
3539
    // `from_utf16` must return `None` for each malformed surrogate sequence.
    #[test]
    fn test_utf16_invalid() {
        // completely positive cases tested above.
        // lead + eof
        assert_eq!(from_utf16([0xD800]), None);
        // lead + lead
        assert_eq!(from_utf16([0xD800, 0xD800]), None);

        // isolated trail
        assert_eq!(from_utf16([0x0061, 0xDC00]), None);

        // general
        assert_eq!(from_utf16([0xD800, 0xd801, 0xdc8b, 0xD800]), None);
    }
3554
    // `from_utf16_lossy` must emit U+FFFD for each unpaired surrogate while
    // keeping the valid units around it.
    #[test]
    fn test_utf16_lossy() {
        // completely positive cases tested above.
        // lead + eof
        assert_eq!(from_utf16_lossy([0xD800]), ~"\uFFFD");
        // lead + lead
        assert_eq!(from_utf16_lossy([0xD800, 0xD800]), ~"\uFFFD\uFFFD");

        // isolated trail
        assert_eq!(from_utf16_lossy([0x0061, 0xDC00]), ~"a\uFFFD");

        // general
        assert_eq!(from_utf16_lossy([0xD800, 0xd801, 0xdc8b, 0xD800]), ~"\uFFFD𐒋\uFFFD");
    }
3569
    // `truncate_utf16_at_nul` must return the units before the first 0, or
    // the whole input when it contains no 0.
    #[test]
    fn test_truncate_utf16_at_nul() {
        let v = [];
        assert_eq!(truncate_utf16_at_nul(v), &[]);

        let v = [0, 2, 3];
        assert_eq!(truncate_utf16_at_nul(v), &[]);

        let v = [1, 0, 3];
        assert_eq!(truncate_utf16_at_nul(v), &[1]);

        let v = [1, 2, 0];
        assert_eq!(truncate_utf16_at_nul(v), &[1, 2]);

        let v = [1, 2, 3];
        assert_eq!(truncate_utf16_at_nul(v), &[1, 2, 3]);
    }
3587
    // Walks `s` front to back with `char_at`, comparing against the expected
    // characters in `v`.
    #[test]
    fn test_char_at() {
        let s = ~"ศไทย中华Việt Nam";
        let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
        let mut pos = 0;
        for ch in v.iter() {
            assert!(s.char_at(pos) == *ch);
            // Advance by the length of the character just checked.
            pos += from_char(*ch).len();
        }
    }
3598
    // Walks `s` back to front with `char_at_reverse`, starting at `s.len()`
    // and comparing against the expected characters in `v` in reverse order.
    #[test]
    fn test_char_at_reverse() {
        let s = ~"ศไทย中华Việt Nam";
        let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
        let mut pos = s.len();
        for ch in v.rev_iter() {
            assert!(s.char_at_reverse(pos) == *ch);
            // Step back by the length of the character just checked.
            pos -= from_char(*ch).len();
        }
    }
3609
    // `escape_unicode` escapes every character; the expected outputs use
    // `\x`, `\u` or `\U` escapes depending on the character's value.
    #[test]
    fn test_escape_unicode() {
        assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
        assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
        assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
        assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
        assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
        assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
        assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
        assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
        assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
    }
3622
    // `escape_default` leaves plain letters and spaces untouched, uses
    // backslash escapes for control characters, quotes and the backslash,
    // and `\u` / `\U` escapes for the non-ASCII characters tested here.
    #[test]
    fn test_escape_default() {
        assert_eq!("abc".escape_default(), ~"abc");
        assert_eq!("a c".escape_default(), ~"a c");
        assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
        assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
        assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
        assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
        assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
        assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
    }
3634
3635     #[test]
3636     fn test_total_ord() {
3637         "1234".cmp(& &"123") == Greater;
3638         "123".cmp(& &"1234") == Less;
3639         "1234".cmp(& &"1234") == Equal;
3640         "12345555".cmp(& &"123456") == Less;
3641         "22".cmp(& &"1234") == Greater;
3642     }
3643
    // `char_range_at` at the byte offset where each character of `data`
    // starts; the characters are 1, 2, 3, 4, 4, 3, 2 and 1 bytes long.
    #[test]
    fn test_char_range_at() {
        let data = ~"b¢€𤭢𤭢€¢b";
        assert_eq!('b', data.char_range_at(0).ch);
        assert_eq!('¢', data.char_range_at(1).ch);
        assert_eq!('€', data.char_range_at(3).ch);
        assert_eq!('𤭢', data.char_range_at(6).ch);
        assert_eq!('𤭢', data.char_range_at(10).ch);
        assert_eq!('€', data.char_range_at(14).ch);
        assert_eq!('¢', data.char_range_at(17).ch);
        assert_eq!('b', data.char_range_at(19).ch);
    }
3656
3657     #[test]
3658     fn test_char_range_at_reverse_underflow() {
3659         assert_eq!("abc".char_range_at_reverse(0).next, 0);
3660     }
3661
3662     #[test]
3663     fn test_add() {
3664         #![allow(unnecessary_allocation)]
3665         macro_rules! t (
3666             ($s1:expr, $s2:expr, $e:expr) => { {
3667                 let s1 = $s1;
3668                 let s2 = $s2;
3669                 let e = $e;
3670                 assert_eq!(s1 + s2, e.to_owned());
3671                 assert_eq!(s1.to_owned() + s2, e.to_owned());
3672             } }
3673         );
3674
3675         t!("foo",  "bar", "foobar");
3676         t!("foo", ~"bar", "foobar");
3677         t!("ศไทย中",  "华Việt Nam", "ศไทย中华Việt Nam");
3678         t!("ศไทย中", ~"华Việt Nam", "ศไทย中华Việt Nam");
3679     }
3680
3681     #[test]
3682     fn test_iterator() {
3683         use iter::*;
3684         let s = ~"ศไทย中华Việt Nam";
3685         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3686
3687         let mut pos = 0;
3688         let mut it = s.chars();
3689
3690         for c in it {
3691             assert_eq!(c, v[pos]);
3692             pos += 1;
3693         }
3694         assert_eq!(pos, v.len());
3695     }
3696
3697     #[test]
3698     fn test_rev_iterator() {
3699         use iter::*;
3700         let s = ~"ศไทย中华Việt Nam";
3701         let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3702
3703         let mut pos = 0;
3704         let mut it = s.chars_rev();
3705
3706         for c in it {
3707             assert_eq!(c, v[pos]);
3708             pos += 1;
3709         }
3710         assert_eq!(pos, v.len());
3711     }
3712
3713     #[test]
3714     fn test_iterator_clone() {
3715         let s = "ศไทย中华Việt Nam";
3716         let mut it = s.chars();
3717         it.next();
3718         assert!(it.zip(it.clone()).all(|(x,y)| x == y));
3719     }
3720
3721     #[test]
3722     fn test_bytesator() {
3723         let s = ~"ศไทย中华Việt Nam";
3724         let v = [
3725             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3726             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3727             109
3728         ];
3729         let mut pos = 0;
3730
3731         for b in s.bytes() {
3732             assert_eq!(b, v[pos]);
3733             pos += 1;
3734         }
3735     }
3736
3737     #[test]
3738     fn test_bytes_revator() {
3739         let s = ~"ศไทย中华Việt Nam";
3740         let v = [
3741             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3742             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3743             109
3744         ];
3745         let mut pos = v.len();
3746
3747         for b in s.bytes_rev() {
3748             pos -= 1;
3749             assert_eq!(b, v[pos]);
3750         }
3751     }
3752
3753     #[test]
3754     fn test_char_indicesator() {
3755         use iter::*;
3756         let s = "ศไทย中华Việt Nam";
3757         let p = [0, 3, 6, 9, 12, 15, 18, 19, 20, 23, 24, 25, 26, 27];
3758         let v = ['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3759
3760         let mut pos = 0;
3761         let mut it = s.char_indices();
3762
3763         for c in it {
3764             assert_eq!(c, (p[pos], v[pos]));
3765             pos += 1;
3766         }
3767         assert_eq!(pos, v.len());
3768         assert_eq!(pos, p.len());
3769     }
3770
3771     #[test]
3772     fn test_char_indices_revator() {
3773         use iter::*;
3774         let s = "ศไทย中华Việt Nam";
3775         let p = [27, 26, 25, 24, 23, 20, 19, 18, 15, 12, 9, 6, 3, 0];
3776         let v = ['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3777
3778         let mut pos = 0;
3779         let mut it = s.char_indices_rev();
3780
3781         for c in it {
3782             assert_eq!(c, (p[pos], v[pos]));
3783             pos += 1;
3784         }
3785         assert_eq!(pos, v.len());
3786         assert_eq!(pos, p.len());
3787     }
3788
3789     #[test]
3790     fn test_split_char_iterator() {
3791         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3792
3793         let split: ~[&str] = data.split(' ').collect();
3794         assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3795
3796         let mut rsplit: ~[&str] = data.rsplit(' ').collect();
3797         rsplit.reverse();
3798         assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3799
3800         let split: ~[&str] = data.split(|c: char| c == ' ').collect();
3801         assert_eq!( split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3802
3803         let mut rsplit: ~[&str] = data.rsplit(|c: char| c == ' ').collect();
3804         rsplit.reverse();
3805         assert_eq!(rsplit, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3806
3807         // Unicode
3808         let split: ~[&str] = data.split('ä').collect();
3809         assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3810
3811         let mut rsplit: ~[&str] = data.rsplit('ä').collect();
3812         rsplit.reverse();
3813         assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3814
3815         let split: ~[&str] = data.split(|c: char| c == 'ä').collect();
3816         assert_eq!( split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3817
3818         let mut rsplit: ~[&str] = data.rsplit(|c: char| c == 'ä').collect();
3819         rsplit.reverse();
3820         assert_eq!(rsplit, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3821     }
3822
3823     #[test]
3824     fn test_splitn_char_iterator() {
3825         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3826
3827         let split: ~[&str] = data.splitn(' ', 3).collect();
3828         assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3829
3830         let split: ~[&str] = data.splitn(|c: char| c == ' ', 3).collect();
3831         assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3832
3833         // Unicode
3834         let split: ~[&str] = data.splitn('ä', 3).collect();
3835         assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3836
3837         let split: ~[&str] = data.splitn(|c: char| c == 'ä', 3).collect();
3838         assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3839     }
3840
3841     #[test]
3842     fn test_rsplitn_char_iterator() {
3843         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3844
3845         let mut split: ~[&str] = data.rsplitn(' ', 3).collect();
3846         split.reverse();
3847         assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3848
3849         let mut split: ~[&str] = data.rsplitn(|c: char| c == ' ', 3).collect();
3850         split.reverse();
3851         assert_eq!(split, ~["\nMäry häd ä", "little", "lämb\nLittle", "lämb\n"]);
3852
3853         // Unicode
3854         let mut split: ~[&str] = data.rsplitn('ä', 3).collect();
3855         split.reverse();
3856         assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3857
3858         let mut split: ~[&str] = data.rsplitn(|c: char| c == 'ä', 3).collect();
3859         split.reverse();
3860         assert_eq!(split, ~["\nMäry häd ", " little l", "mb\nLittle l", "mb\n"]);
3861     }
3862
3863     #[test]
3864     fn test_split_char_iterator_no_trailing() {
3865         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3866
3867         let split: ~[&str] = data.split('\n').collect();
3868         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3869
3870         let split: ~[&str] = data.split_terminator('\n').collect();
3871         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3872     }
3873
3874     #[test]
3875     fn test_rev_split_char_iterator_no_trailing() {
3876         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3877
3878         let mut split: ~[&str] = data.split('\n').rev().collect();
3879         split.reverse();
3880         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3881
3882         let mut split: ~[&str] = data.split_terminator('\n').rev().collect();
3883         split.reverse();
3884         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3885     }
3886
3887     #[test]
3888     fn test_words() {
3889         let data = "\n \tMäry   häd\tä  little lämb\nLittle lämb\n";
3890         let words: ~[&str] = data.words().collect();
3891         assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
3892     }
3893
3894     #[test]
3895     fn test_nfd_chars() {
3896         assert_eq!("abc".nfd_chars().collect::<~str>(), ~"abc");
3897         assert_eq!("\u1e0b\u01c4".nfd_chars().collect::<~str>(), ~"d\u0307\u01c4");
3898         assert_eq!("\u2026".nfd_chars().collect::<~str>(), ~"\u2026");
3899         assert_eq!("\u2126".nfd_chars().collect::<~str>(), ~"\u03a9");
3900         assert_eq!("\u1e0b\u0323".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3901         assert_eq!("\u1e0d\u0307".nfd_chars().collect::<~str>(), ~"d\u0323\u0307");
3902         assert_eq!("a\u0301".nfd_chars().collect::<~str>(), ~"a\u0301");
3903         assert_eq!("\u0301a".nfd_chars().collect::<~str>(), ~"\u0301a");
3904         assert_eq!("\ud4db".nfd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3905         assert_eq!("\uac1c".nfd_chars().collect::<~str>(), ~"\u1100\u1162");
3906     }
3907
3908     #[test]
3909     fn test_nfkd_chars() {
3910         assert_eq!("abc".nfkd_chars().collect::<~str>(), ~"abc");
3911         assert_eq!("\u1e0b\u01c4".nfkd_chars().collect::<~str>(), ~"d\u0307DZ\u030c");
3912         assert_eq!("\u2026".nfkd_chars().collect::<~str>(), ~"...");
3913         assert_eq!("\u2126".nfkd_chars().collect::<~str>(), ~"\u03a9");
3914         assert_eq!("\u1e0b\u0323".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3915         assert_eq!("\u1e0d\u0307".nfkd_chars().collect::<~str>(), ~"d\u0323\u0307");
3916         assert_eq!("a\u0301".nfkd_chars().collect::<~str>(), ~"a\u0301");
3917         assert_eq!("\u0301a".nfkd_chars().collect::<~str>(), ~"\u0301a");
3918         assert_eq!("\ud4db".nfkd_chars().collect::<~str>(), ~"\u1111\u1171\u11b6");
3919         assert_eq!("\uac1c".nfkd_chars().collect::<~str>(), ~"\u1100\u1162");
3920     }
3921
3922     #[test]
3923     fn test_lines() {
3924         let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3925         let lines: ~[&str] = data.lines().collect();
3926         assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3927
3928         let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3929         let lines: ~[&str] = data.lines().collect();
3930         assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3931     }
3932
3933     #[test]
3934     fn test_split_strator() {
3935         fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3936             let v: ~[&str] = s.split_str(sep).collect();
3937             assert_eq!(v, u);
3938         }
3939         t("--1233345--", "12345", ~["--1233345--"]);
3940         t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3941         t("::hello::there", "::", ~["", "hello", "there"]);
3942         t("hello::there::", "::", ~["hello", "there", ""]);
3943         t("::hello::there::", "::", ~["", "hello", "there", ""]);
3944         t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3945         t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3946         t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3947         t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3948         t("", ".", ~[""]);
3949         t("zz", "zz", ~["",""]);
3950         t("ok", "z", ~["ok"]);
3951         t("zzz", "zz", ~["","z"]);
3952         t("zzzzz", "zz", ~["","","z"]);
3953     }
3954
3955     #[test]
3956     fn test_str_default() {
3957         use default::Default;
3958         fn t<S: Default + Str>() {
3959             let s: S = Default::default();
3960             assert_eq!(s.as_slice(), "");
3961         }
3962
3963         t::<&str>();
3964         t::<~str>();
3965     }
3966
3967     #[test]
3968     fn test_str_container() {
3969         fn sum_len<S: Container>(v: &[S]) -> uint {
3970             v.iter().map(|x| x.len()).sum()
3971         }
3972
3973         let s = ~"01234";
3974         assert_eq!(5, sum_len(["012", "", "34"]));
3975         assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
3976         assert_eq!(5, sum_len([s.as_slice()]));
3977     }
3978
3979     #[test]
3980     fn test_str_from_utf8() {
3981         let xs = bytes!("hello");
3982         assert_eq!(from_utf8(xs), Some("hello"));
3983
3984         let xs = bytes!("ศไทย中华Việt Nam");
3985         assert_eq!(from_utf8(xs), Some("ศไทย中华Việt Nam"));
3986
3987         let xs = bytes!("hello", 0xff);
3988         assert_eq!(from_utf8(xs), None);
3989     }
3990
3991     #[test]
3992     fn test_str_from_utf8_owned() {
3993         let xs = bytes!("hello").to_owned();
3994         assert_eq!(from_utf8_owned(xs), Some(~"hello"));
3995
3996         let xs = bytes!("ศไทย中华Việt Nam").to_owned();
3997         assert_eq!(from_utf8_owned(xs), Some(~"ศไทย中华Việt Nam"));
3998
3999         let xs = bytes!("hello", 0xff).to_owned();
4000         assert_eq!(from_utf8_owned(xs), None);
4001     }
4002
4003     #[test]
4004     fn test_str_from_utf8_lossy() {
4005         let xs = bytes!("hello");
4006         assert_eq!(from_utf8_lossy(xs), Slice("hello"));
4007
4008         let xs = bytes!("ศไทย中华Việt Nam");
4009         assert_eq!(from_utf8_lossy(xs), Slice("ศไทย中华Việt Nam"));
4010
4011         let xs = bytes!("Hello", 0xC2, " There", 0xFF, " Goodbye");
4012         assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD There\uFFFD Goodbye"));
4013
4014         let xs = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
4015         assert_eq!(from_utf8_lossy(xs), Owned(~"Hello\uFFFD\uFFFD There\uFFFD Goodbye"));
4016
4017         let xs = bytes!(0xF5, "foo", 0xF5, 0x80, "bar");
4018         assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFD\uFFFDbar"));
4019
4020         let xs = bytes!(0xF1, "foo", 0xF1, 0x80, "bar", 0xF1, 0x80, 0x80, "baz");
4021         assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFDbaz"));
4022
4023         let xs = bytes!(0xF4, "foo", 0xF4, 0x80, "bar", 0xF4, 0xBF, "baz");
4024         assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFDfoo\uFFFDbar\uFFFD\uFFFDbaz"));
4025
4026         let xs = bytes!(0xF0, 0x80, 0x80, 0x80, "foo", 0xF0, 0x90, 0x80, 0x80, "bar");
4027         assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFD\uFFFDfoo\U00010000bar"));
4028
4029         // surrogates
4030         let xs = bytes!(0xED, 0xA0, 0x80, "foo", 0xED, 0xBF, 0xBF, "bar");
4031         assert_eq!(from_utf8_lossy(xs), Owned(~"\uFFFD\uFFFD\uFFFDfoo\uFFFD\uFFFD\uFFFDbar"));
4032     }
4033
4034     #[test]
4035     fn test_from_str() {
4036       let owned: Option<~str> = from_str(&"string");
4037       assert_eq!(owned, Some(~"string"));
4038     }
4039
4040     #[test]
4041     fn test_maybe_owned_traits() {
4042         let s = Slice("abcde");
4043         assert_eq!(s.len(), 5);
4044         assert_eq!(s.as_slice(), "abcde");
4045         assert_eq!(s.to_str(), ~"abcde");
4046         assert_eq!(format!("{}", s), ~"abcde");
4047         assert!(s.lt(&Owned(~"bcdef")));
4048         assert_eq!(Slice(""), Default::default());
4049
4050         let o = Owned(~"abcde");
4051         assert_eq!(o.len(), 5);
4052         assert_eq!(o.as_slice(), "abcde");
4053         assert_eq!(o.to_str(), ~"abcde");
4054         assert_eq!(format!("{}", o), ~"abcde");
4055         assert!(o.lt(&Slice("bcdef")));
4056         assert_eq!(Owned(~""), Default::default());
4057
4058         assert!(s.cmp(&o) == Equal);
4059         assert!(s.equiv(&o));
4060
4061         assert!(o.cmp(&s) == Equal);
4062         assert!(o.equiv(&s));
4063     }
4064
4065     #[test]
4066     fn test_maybe_owned_methods() {
4067         let s = Slice("abcde");
4068         assert!(s.is_slice());
4069         assert!(!s.is_owned());
4070
4071         let o = Owned(~"abcde");
4072         assert!(!o.is_slice());
4073         assert!(o.is_owned());
4074     }
4075
4076     #[test]
4077     fn test_maybe_owned_clone() {
4078         assert_eq!(Owned(~"abcde"), Slice("abcde").clone());
4079         assert_eq!(Owned(~"abcde"), Owned(~"abcde").clone());
4080         assert_eq!(Slice("abcde"), Slice("abcde").clone());
4081         assert_eq!(Slice("abcde"), Owned(~"abcde").clone());
4082     }
4083
4084     #[test]
4085     fn test_maybe_owned_into_owned() {
4086         assert_eq!(Slice("abcde").into_owned(), ~"abcde");
4087         assert_eq!(Owned(~"abcde").into_owned(), ~"abcde");
4088     }
4089
4090     #[test]
4091     fn test_into_maybe_owned() {
4092         assert_eq!("abcde".into_maybe_owned(), Slice("abcde"));
4093         assert_eq!((~"abcde").into_maybe_owned(), Slice("abcde"));
4094         assert_eq!("abcde".into_maybe_owned(), Owned(~"abcde"));
4095         assert_eq!((~"abcde").into_maybe_owned(), Owned(~"abcde"));
4096     }
4097 }
4098
// Micro-benchmarks for the string iterators, the splitters, UTF-8 validation,
// lossy decoding and `connect`.
#[cfg(test)]
mod bench {
    extern crate test;
    use self::test::BenchHarness;
    use super::*;
    use prelude::*;

    // Counting characters of a mixed-script string with `chars()`.
    #[bench]
    fn char_iterator(bh: &mut BenchHarness) {
        let text = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
        let expected = text.char_len();

        bh.iter(|| assert_eq!(text.chars().len(), expected));
    }

    // Counting characters of an ASCII-only, multi-line string with `chars()`.
    #[bench]
    fn char_iterator_ascii(bh: &mut BenchHarness) {
        let text = "Mary had a little lamb, Little lamb
        Mary had a little lamb, Little lamb
        Mary had a little lamb, Little lamb
        Mary had a little lamb, Little lamb
        Mary had a little lamb, Little lamb
        Mary had a little lamb, Little lamb";
        let expected = text.char_len();

        bh.iter(|| assert_eq!(text.chars().len(), expected));
    }

    // Counting characters from the back with `chars_rev()`.
    #[bench]
    fn char_iterator_rev(bh: &mut BenchHarness) {
        let text = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
        let expected = text.char_len();

        bh.iter(|| assert_eq!(text.chars_rev().len(), expected));
    }

    // Counting (offset, char) pairs with `char_indices()`.
    #[bench]
    fn char_indicesator(bh: &mut BenchHarness) {
        let text = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
        let expected = text.char_len();

        bh.iter(|| assert_eq!(text.char_indices().len(), expected));
    }

    // Counting (offset, char) pairs from the back with `char_indices_rev()`.
    #[bench]
    fn char_indicesator_rev(bh: &mut BenchHarness) {
        let text = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
        let expected = text.char_len();

        bh.iter(|| assert_eq!(text.char_indices_rev().len(), expected));
    }

    // Splitting mostly non-ASCII text on an ASCII `char`.
    #[bench]
    fn split_unicode_ascii(bh: &mut BenchHarness) {
        let text = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";

        bh.iter(|| assert_eq!(text.split('V').len(), 3));
    }

    // Same split as above, but through a `CharEq` implementation whose
    // `only_ascii` returns false.
    #[bench]
    fn split_unicode_not_ascii(bh: &mut BenchHarness) {
        struct NotAscii(char);
        impl CharEq for NotAscii {
            fn matches(&self, c: char) -> bool {
                let NotAscii(wanted) = *self;
                wanted == c
            }
            fn only_ascii(&self) -> bool { false }
        }
        let text = "ประเทศไทย中华Việt Namประเทศไทย中华Việt Nam";

        bh.iter(|| assert_eq!(text.split(NotAscii('V')).len(), 3));
    }


    // Splitting ASCII text on an ASCII `char`.
    #[bench]
    fn split_ascii(bh: &mut BenchHarness) {
        let text = "Mary had a little lamb, Little lamb, little-lamb.";
        let expected = text.split(' ').len();

        bh.iter(|| assert_eq!(text.split(' ').len(), expected));
    }

    // Splitting ASCII text through a `CharEq` implementation whose
    // `only_ascii` returns false.
    #[bench]
    fn split_not_ascii(bh: &mut BenchHarness) {
        struct NotAscii(char);
        impl CharEq for NotAscii {
            #[inline]
            fn matches(&self, c: char) -> bool {
                let NotAscii(wanted) = *self;
                wanted == c
            }
            fn only_ascii(&self) -> bool { false }
        }
        let text = "Mary had a little lamb, Little lamb, little-lamb.";
        let expected = text.split(' ').len();

        bh.iter(|| assert_eq!(text.split(NotAscii(' ')).len(), expected));
    }

    // Splitting with a plain function as the predicate.
    #[bench]
    fn split_extern_fn(bh: &mut BenchHarness) {
        fn pred(c: char) -> bool { c == ' ' }
        let text = "Mary had a little lamb, Little lamb, little-lamb.";
        let expected = text.split(' ').len();

        bh.iter(|| assert_eq!(text.split(pred).len(), expected));
    }

    // Splitting with a closure as the predicate.
    #[bench]
    fn split_closure(bh: &mut BenchHarness) {
        let text = "Mary had a little lamb, Little lamb, little-lamb.";
        let expected = text.split(' ').len();

        bh.iter(|| assert_eq!(text.split(|c: char| c == ' ').len(), expected));
    }

    // Splitting with a slice of characters as the separator set.
    #[bench]
    fn split_slice(bh: &mut BenchHarness) {
        let text = "Mary had a little lamb, Little lamb, little-lamb.";
        let expected = text.split(' ').len();

        bh.iter(|| assert_eq!(text.split(&[' ']).len(), expected));
    }

    // Validating 100 bytes of ASCII.
    #[bench]
    fn is_utf8_100_ascii(bh: &mut BenchHarness) {

        let input = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
                            Lorem ipsum dolor sit amet, consectetur. ");

        assert_eq!(100, input.len());
        bh.iter(|| {
            is_utf8(input)
        });
    }

    // Validating 100 bytes of multi-byte text.
    #[bench]
    fn is_utf8_100_multibyte(bh: &mut BenchHarness) {
        let input = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
        assert_eq!(100, input.len());
        bh.iter(|| {
            is_utf8(input)
        });
    }

    // Lossy decoding of 100 bytes of ASCII.
    #[bench]
    fn from_utf8_lossy_100_ascii(bh: &mut BenchHarness) {
        let input = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
                            Lorem ipsum dolor sit amet, consectetur. ");

        assert_eq!(100, input.len());
        bh.iter(|| {
            let _ = from_utf8_lossy(input);
        });
    }

    // Lossy decoding of 100 bytes of valid multi-byte text.
    #[bench]
    fn from_utf8_lossy_100_multibyte(bh: &mut BenchHarness) {
        let input = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
        assert_eq!(100, input.len());
        bh.iter(|| {
            let _ = from_utf8_lossy(input);
        });
    }

    // Lossy decoding of a short input containing a few invalid sequences.
    #[bench]
    fn from_utf8_lossy_invalid(bh: &mut BenchHarness) {
        let input = bytes!("Hello", 0xC0, 0x80, " There", 0xE6, 0x83, " Goodbye");
        bh.iter(|| {
            let _ = from_utf8_lossy(input);
        });
    }

    // Lossy decoding of 100 copies of the byte 0xF5.
    #[bench]
    fn from_utf8_lossy_100_invalid(bh: &mut BenchHarness) {
        let input = ::slice::from_elem(100, 0xF5u8);
        bh.iter(|| {
            let _ = from_utf8_lossy(input);
        });
    }

    // Joining ten copies of a string with a multi-byte separator.
    #[bench]
    fn bench_connect(bh: &mut BenchHarness) {
        let piece = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
        let sep = "→";
        let pieces = [piece, piece, piece, piece, piece, piece, piece, piece, piece, piece];
        bh.iter(|| {
            // Ten pieces are joined by nine separators.
            assert_eq!(pieces.connect(sep).len(), piece.len() * 10 + sep.len() * 9);
        })
    }
}