src/libstd/str.rs

   1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
/*!
 * String manipulation
 *
 * Strings are a packed UTF-8 representation of text, stored as
 * null-terminated buffers of u8 bytes. Strings should be indexed in bytes,
 * for efficiency, but operations that are not UTF-8 safe should be avoided.
 */
  18
  19 use at_vec;
  20 use cast;
  21 use char;
  22 use char::Char;
  23 use clone::Clone;
  24 use container::{Container, Mutable};
  25 use iter::Times;
  26 use iterator::{Iterator, IteratorUtil, FilterIterator, AdditiveIterator, MapIterator};
  27 use libc;
  28 use num::Zero;
  29 use option::{None, Option, Some};
  30 use ptr;
  31 use ptr::RawPtr;
  32 use to_str::ToStr;
  33 use uint;
  34 use vec;
  35 use vec::{OwnedVector, OwnedCopyableVector, ImmutableVector, MutableVector};
  36
  37 /*
  38 Section: Conditions
  39 */
// Declares the `not_utf8` condition. It is raised with an error message
// (`~str`); the `~str` produced by the handler becomes the result of the
// `cond.raise(..)` call at the raise site.
condition! {
    not_utf8: (~str) -> ~str;
}
  43
  44 /*
  45 Section: Creating a string
  46 */
  47
  48 /**
  49  * Convert a vector of bytes to a new UTF-8 string
  50  *
  51  * # Failure
  52  *
  53  * Raises the `not_utf8` condition if invalid UTF-8
  54  */
  55 pub fn from_bytes(vv: &[u8]) -> ~str {
  56     use str::not_utf8::cond;
  57
  58     if !is_utf8(vv) {
  59         let first_bad_byte = *vv.iter().find_(|&b| !is_utf8([*b])).get();
  60         cond.raise(fmt!("from_bytes: input is not UTF-8; first bad byte is %u",
  61                         first_bad_byte as uint))
  62     }
  63     else {
  64         return unsafe { raw::from_bytes(vv) }
  65     }
  66 }
  67
  68 /**
  69  * Consumes a vector of bytes to create a new utf-8 string
  70  *
  71  * # Failure
  72  *
  73  * Raises the `not_utf8` condition if invalid UTF-8
  74  */
  75 pub fn from_bytes_owned(vv: ~[u8]) -> ~str {
  76     use str::not_utf8::cond;
  77
  78     if !is_utf8(vv) {
  79         let first_bad_byte = *vv.iter().find_(|&b| !is_utf8([*b])).get();
  80         cond.raise(fmt!("from_bytes: input is not UTF-8; first bad byte is %u",
  81                         first_bad_byte as uint))
  82     } else {
  83         return unsafe { raw::from_bytes_owned(vv) }
  84     }
  85 }
  86
  87 /**
  88  * Convert a vector of bytes to a UTF-8 string.
  89  * The vector needs to be one byte longer than the string, and end with a 0 byte.
  90  *
  91  * Compared to `from_bytes()`, this fn doesn't need to allocate a new owned str.
  92  *
  93  * # Failure
  94  *
  95  * Fails if invalid UTF-8
  96  * Fails if not null terminated
  97  */
  98 pub fn from_bytes_with_null<'a>(vv: &'a [u8]) -> &'a str {
  99     assert_eq!(vv[vv.len() - 1], 0);
 100     assert!(is_utf8(vv));
 101     return unsafe { raw::from_bytes_with_null(vv) };
 102 }
 103
/**
 * Converts a vector to a string slice without performing any allocations.
 *
 * Once the slice has been validated as utf-8, it is transmuted in-place and
 * returned as a '&str' instead of a '&[u8]'
 *
 * # Failure
 *
 * Fails if invalid UTF-8
 */
pub fn from_bytes_slice<'a>(vector: &'a [u8]) -> &'a str {
    unsafe {
        assert!(is_utf8(vector));
        // Take the vector slice apart into its (pointer, length) pair.
        let (ptr, len): (*u8, uint) = ::cast::transmute(vector);
        // The string slice is rebuilt with a raw length one larger than the
        // vector's. NOTE(review): other code in this file treats a str's raw
        // length as byte count + 1 (terminator included); the extra byte is
        // not part of `vector` here — confirm nothing reads it.
        let string: &'a str = ::cast::transmute((ptr, len + 1));
        string
    }
}
 122
/// `to_str` on an owned string returns a copy made with `to_owned`.
impl ToStr for ~str {
    #[inline]
    fn to_str(&self) -> ~str { self.to_owned() }
}
/// `to_str` on a string slice returns a copy made with `to_owned`.
impl<'self> ToStr for &'self str {
    #[inline]
    fn to_str(&self) -> ~str { self.to_owned() }
}
/// `to_str` on a managed string returns a copy made with `to_owned`.
impl ToStr for @str {
    #[inline]
    fn to_str(&self) -> ~str { self.to_owned() }
}
 135
 136 /**
 137  * Convert a byte to a UTF-8 string
 138  *
 139  * # Failure
 140  *
 141  * Fails if invalid UTF-8
 142  */
 143 pub fn from_byte(b: u8) -> ~str {
 144     assert!(b < 128u8);
 145     unsafe { ::cast::transmute(~[b, 0u8]) }
 146 }
 147
 148 /// Convert a char to a string
 149 pub fn from_char(ch: char) -> ~str {
 150     let mut buf = ~"";
 151     buf.push_char(ch);
 152     buf
 153 }
 154
 155 /// Convert a vector of chars to a string
 156 pub fn from_chars(chs: &[char]) -> ~str {
 157     let mut buf = ~"";
 158     buf.reserve(chs.len());
 159     for chs.iter().advance |ch| {
 160         buf.push_char(*ch)
 161     }
 162     buf
 163 }
 164
// Free-function form of `lhs.push_str(rhs)`: appends `rhs` to `lhs`.
// Hidden from the generated documentation.
#[doc(hidden)]
pub fn push_str(lhs: &mut ~str, rhs: &str) {
    lhs.push_str(rhs)
}
 169
/// Operations that combine a vector of strings into one string
#[allow(missing_doc)]
pub trait StrVector {
    /// Concatenate all the strings
    pub fn concat(&self) -> ~str;
    /// Concatenate all the strings, placing `sep` between neighbours
    pub fn connect(&self, sep: &str) -> ~str;
}
 175
impl<'self, S: Str> StrVector for &'self [S] {
    /// Concatenate a vector of strings.
    ///
    /// Returns the empty string for an empty vector. Otherwise the total
    /// byte length is computed first, a buffer of that capacity is obtained,
    /// and each element is copied into it in order.
    pub fn concat(&self) -> ~str {
        if self.is_empty() { return ~""; }

        let len = self.iter().transform(|s| s.as_slice().len()).sum();

        let mut s = with_capacity(len);

        unsafe {
            do s.as_mut_buf |buf, _| {
                // `buf` is the write cursor; it moves past each copied element
                let mut buf = buf;
                for self.iter().advance |ss| {
                    do ss.as_slice().as_imm_buf |ssbuf, sslen| {
                        // one byte fewer than the reported length is copied
                        // (presumably the reported length counts the
                        // terminator — TODO confirm against `as_imm_buf`)
                        let sslen = sslen - 1;
                        ptr::copy_memory(buf, ssbuf, sslen);
                        buf = buf.offset(sslen);
                    }
                }
            }
            // the bytes were written behind the string's back, so the
            // length is recorded only now
            raw::set_len(&mut s, len);
        }
        s
    }

    /// Concatenate a vector of strings, placing a given separator between each.
    ///
    /// Returns the empty string for an empty vector and defers to `concat`
    /// when `sep` is empty.
    pub fn connect(&self, sep: &str) -> ~str {
        if self.is_empty() { return ~""; }

        // concat is faster
        if sep.is_empty() { return self.concat(); }

        // this is wrong without the guarantee that `self` is non-empty
        let len = sep.len() * (self.len() - 1)
            + self.iter().transform(|s| s.as_slice().len()).sum();
        let mut s = ~"";
        // true until the first element has been written; suppresses the
        // separator in front of it
        let mut first = true;

        s.reserve(len);

        unsafe {
            do s.as_mut_buf |buf, _| {
                do sep.as_imm_buf |sepbuf, seplen| {
                    // as in `concat`, one byte fewer than the reported
                    // length is copied for the separator and each element
                    let seplen = seplen - 1;
                    // write cursor into the result buffer
                    let mut buf = ::cast::transmute_mut_unsafe(buf);
                    for self.iter().advance |ss| {
                        do ss.as_slice().as_imm_buf |ssbuf, sslen| {
                            let sslen = sslen - 1;
                            if first {
                                first = false;
                            } else {
                                ptr::copy_memory(buf, sepbuf, seplen);
                                buf = buf.offset(seplen);
                            }
                            ptr::copy_memory(buf, ssbuf, sslen);
                            buf = buf.offset(sslen);
                        }
                    }
                }
            }
            // record the final length now that all bytes are in place
            raw::set_len(&mut s, len);
        }
        s
    }
}
 241
/// Something that can be used to compare against a character
///
/// Implemented in this file for a single `char`, for functions and closures
/// taking a `char`, and for slices of other `CharEq` values.
pub trait CharEq {
    /// Determine if the splitter should split at the given character
    fn matches(&self, char) -> bool;
    /// Indicate if this is only concerned about ASCII characters,
    /// which can allow for a faster implementation.
    fn only_ascii(&self) -> bool;
}
/// A `char` matches exactly itself.
impl CharEq for char {
    #[inline]
    fn matches(&self, c: char) -> bool { *self == c }

    // ASCII-only exactly when the character's value is below 128
    fn only_ascii(&self) -> bool { (*self as uint) < 128 }
}
/// A closure matches the characters for which it returns true.
impl<'self> CharEq for &'self fn(char) -> bool {
    #[inline]
    fn matches(&self, c: char) -> bool { (*self)(c) }

    // nothing is known about what a closure accepts, so never ASCII-only
    fn only_ascii(&self) -> bool { false }
}
/// A plain function matches the characters for which it returns true.
impl CharEq for extern "Rust" fn(char) -> bool {
    #[inline]
    fn matches(&self, c: char) -> bool { (*self)(c) }

    // nothing is known about what a function accepts, so never ASCII-only
    fn only_ascii(&self) -> bool { false }
}

/// A slice of matchers matches a character if any of its elements does.
impl<'self, C: CharEq> CharEq for &'self [C] {
    #[inline]
    fn matches(&self, c: char) -> bool {
        self.iter().any(|m| m.matches(c))
    }

    // ASCII-only only if every element is
    fn only_ascii(&self) -> bool {
        self.iter().all(|m| m.only_ascii())
    }
}
 279
 280
/// An iterator over the substrings of a string, separated by `sep`.
#[deriving(Clone)]
pub struct StrCharSplitIterator<'self,Sep> {
    /// The string being split
    priv string: &'self str,
    /// Byte offset at which the next piece starts
    priv position: uint,
    /// The separator matcher
    priv sep: Sep,
    /// The number of splits remaining
    priv count: uint,
    /// Whether an empty string at the end is allowed
    priv allow_trailing_empty: bool,
    /// Set once the final piece has been produced (or skipped)
    priv finished: bool,
    /// Selects the byte-wise scan in `next` instead of the char-wise one
    priv only_ascii: bool
}
 294
/// An iterator over the words of a string, separated by a sequence of whitespace
pub type WordIterator<'self> =
    FilterIterator<'self, &'self str,
             StrCharSplitIterator<'self, extern "Rust" fn(char) -> bool>>;

/// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
pub type AnyLineIterator<'self> =
    MapIterator<'self, &'self str, &'self str, StrCharSplitIterator<'self, char>>;
 303
impl<'self, Sep: CharEq> Iterator<&'self str> for StrCharSplitIterator<'self, Sep> {
    /// Returns the next piece of the string, or `None` once exhausted.
    ///
    /// Scans forward from `position` for a separator while splits remain
    /// (`count > 0`). When one is found, the piece before it is returned and
    /// scanning resumes after it on the next call. Otherwise the rest of the
    /// string is returned as the last piece; an empty last piece is returned
    /// only if `allow_trailing_empty` is set.
    #[inline]
    fn next(&mut self) -> Option<&'self str> {
        if self.finished { return None }

        let l = self.string.len();
        // start of the piece currently being scanned
        let start = self.position;

        if self.only_ascii {
            // this gives a *huge* speed up for splitting on ASCII
            // characters (e.g. '\n' or ' ')
            while self.position < l && self.count > 0 {
                let byte = self.string[self.position];

                if self.sep.matches(byte as char) {
                    let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
                    // step over the one-byte separator
                    self.position += 1;
                    self.count -= 1;
                    return Some(slice);
                }
                self.position += 1;
            }
        } else {
            while self.position < l && self.count > 0 {
                // decode the char at `position`; `next` is the offset just past it
                let CharRange {ch, next} = self.string.char_range_at(self.position);

                if self.sep.matches(ch) {
                    let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
                    // step over the (possibly multi-byte) separator
                    self.position = next;
                    self.count -= 1;
                    return Some(slice);
                }
                self.position = next;
            }
        }
        // no further separator (or no splits left): whatever remains is the
        // final piece
        self.finished = true;
        if self.allow_trailing_empty || start < l {
            Some(unsafe { raw::slice_bytes(self.string, start, l) })
        } else {
            None
        }
    }
}
 347
/// An iterator over the start and end indices of the matches of a
/// substring within a larger string
#[deriving(Clone)]
pub struct StrMatchesIndexIterator<'self> {
    /// The string being searched
    priv haystack: &'self str,
    /// The substring being searched for
    priv needle: &'self str,
    /// Byte offset in `haystack` at which the search resumes
    priv position: uint,
}
 356
/// An iterator over the substrings of a string separated by a given
/// search string
#[deriving(Clone)]
pub struct StrStrSplitIterator<'self> {
    /// Yields the (start, end) byte offsets of each separator match
    priv it: StrMatchesIndexIterator<'self>,
    /// End offset of the previous match, i.e. the start of the next piece
    priv last_end: uint,
    /// Set once the trailing piece has been returned
    priv finished: bool
}
 365
impl<'self> Iterator<(uint, uint)> for StrMatchesIndexIterator<'self> {
    /// Returns the `(start, end)` byte offsets of the next match of
    /// `needle` in `haystack` at or after `position`, or `None` if there
    /// is none. The search resumes at `end` on the following call.
    ///
    /// NOTE(review): `needle[0]` is read on the first comparison, so an
    /// empty needle looks unsupported here — confirm callers rule it out.
    #[inline]
    fn next(&mut self) -> Option<(uint, uint)> {
        // See Issue #1932 for why this is a naive search
        let (h_len, n_len) = (self.haystack.len(), self.needle.len());
        // where the candidate match currently being compared began
        let mut match_start = 0;
        // number of needle bytes matched so far
        let mut match_i = 0;

        while self.position < h_len {
            if self.haystack[self.position] == self.needle[match_i] {
                if match_i == 0 { match_start = self.position; }
                match_i += 1;
                self.position += 1;

                if match_i == n_len {
                    // found a match!
                    return Some((match_start, self.position));
                }
            } else {
                // failed match, backtrack
                if match_i > 0 {
                    match_i = 0;
                    self.position = match_start;
                }
                // retry one byte after the candidate start (or after the
                // current byte if no candidate was in progress)
                self.position += 1;
            }
        }
        None
    }
}
 396
 397 impl<'self> Iterator<&'self str> for StrStrSplitIterator<'self> {
 398     #[inline]
 399     fn next(&mut self) -> Option<&'self str> {
 400         if self.finished { return None; }
 401
 402         match self.it.next() {
 403             Some((from, to)) => {
 404                 let ret = Some(self.it.haystack.slice(self.last_end, from));
 405                 self.last_end = to;
 406                 ret
 407             }
 408             None => {
 409                 self.finished = true;
 410                 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
 411             }
 412         }
 413     }
 414 }
 415
/** Splits a string into substrings with possibly internal whitespace,
 *  each of them at most `lim` bytes long. The substrings have leading and trailing
 *  whitespace removed, and are only cut at whitespace boundaries.
 *
 *  Each substring is passed to `it`; iteration stops as soon as `it`
 *  returns false. The return value is the last value returned by `it`
 *  (true if `it` was never called).
 *
 *  # Failure
 *
 *  Fails during iteration if the string contains a non-whitespace
 *  sequence longer than the limit.
 */
pub fn each_split_within<'a>(ss: &'a str,
                              lim: uint,
                              it: &fn(&'a str) -> bool) -> bool {
    // Just for fun, let's write this as an state machine:

    enum SplitWithinState {
        A,  // leading whitespace, initial state
        B,  // words
        C,  // internal and trailing whitespace
    }
    enum Whitespace {
        Ws, // current char is whitespace
        Cr  // current char is not whitespace
    }
    enum LengthLimit {
        UnderLim, // current char makes current substring still fit in limit
        OverLim   // current char makes current substring no longer fit in limit
    }

    // start of the substring currently being assembled
    let mut slice_start = 0;
    // start of the most recently begun word
    let mut last_start = 0;
    // end of the most recently completed word
    let mut last_end = 0;
    let mut state = A;
    // index handed to the machine for the artificial trailing whitespace
    let mut fake_i = ss.len();
    let mut lim = lim;

    // becomes false once `it` asks to stop
    let mut cont = true;
    // emits the substring assembled so far
    let slice: &fn() = || { cont = it(ss.slice(slice_start, last_end)) };

    // if the limit is larger than the string, lower it to save cycles
    if (lim >= fake_i) {
        lim = fake_i;
    }

    // One step of the automaton: classify the char at index `i`, update the
    // state and offsets, emit a substring where required, and report
    // whether to keep going.
    let machine: &fn((uint, char)) -> bool = |(i, c)| {
        let whitespace = if char::is_whitespace(c)       { Ws }       else { Cr };
        let limit      = if (i - slice_start + 1) <= lim { UnderLim } else { OverLim };

        state = match (state, whitespace, limit) {
            (A, Ws, _)        => { A }
            (A, Cr, _)        => { slice_start = i; last_start = i; B }

            (B, Cr, UnderLim) => { B }
            (B, Cr, OverLim)  if (i - last_start + 1) > lim
                              => fail!("word starting with %? longer than limit!",
                                       ss.slice(last_start, i + 1)),
            (B, Cr, OverLim)  => { slice(); slice_start = last_start; B }
            (B, Ws, UnderLim) => { last_end = i; C }
            (B, Ws, OverLim)  => { last_end = i; slice(); A }

            (C, Cr, UnderLim) => { last_start = i; B }
            (C, Cr, OverLim)  => { slice(); slice_start = i; last_start = i; last_end = i; B }
            (C, Ws, OverLim)  => { slice(); A }
            (C, Ws, UnderLim) => { C }
        };

        cont
    };

    ss.iter().enumerate().advance(|x| machine(x));

    // Let the automaton 'run out' by supplying trailing whitespace
    while cont && match state { B | C => true, A => false } {
        machine((fake_i, ' '));
        fake_i += 1;
    }
    return cont;
}
 493
 494 /**
 495  * Replace all occurrences of one string with another
 496  *
 497  * # Arguments
 498  *
 499  * * s - The string containing substrings to replace
 500  * * from - The string to replace
 501  * * to - The replacement string
 502  *
 503  * # Return value
 504  *
 505  * The original string with all occurances of `from` replaced with `to`
 506  */
 507 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
 508     let mut result = ~"";
 509     let mut last_end = 0;
 510     for s.matches_index_iter(from).advance |(start, end)| {
 511         result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
 512         result.push_str(to);
 513         last_end = end;
 514     }
 515     result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
 516     result
 517 }
 518
 519 /*
 520 Section: Comparing strings
 521 */
 522
/// Bytewise slice equality
///
/// Two slices are equal when their reported lengths agree and `memcmp`
/// finds no difference. This variant is the `str_eq` lang item.
#[cfg(not(test))]
#[lang="str_eq"]
#[inline]
pub fn eq_slice(a: &str, b: &str) -> bool {
    do a.as_imm_buf |ap, alen| {
        do b.as_imm_buf |bp, blen| {
            if (alen != blen) { false }
            else {
                unsafe {
                    // the last of the `alen` reported bytes is left out of
                    // the comparison
                    libc::memcmp(ap as *libc::c_void,
                                 bp as *libc::c_void,
                                 (alen - 1) as libc::size_t) == 0
                }
            }
        }
    }
}

/// Bytewise slice equality
///
/// Same body as the non-test variant, compiled for test builds without
/// the lang-item attribute.
#[cfg(test)]
#[inline]
pub fn eq_slice(a: &str, b: &str) -> bool {
    do a.as_imm_buf |ap, alen| {
        do b.as_imm_buf |bp, blen| {
            if (alen != blen) { false }
            else {
                unsafe {
                    // the last of the `alen` reported bytes is left out of
                    // the comparison
                    libc::memcmp(ap as *libc::c_void,
                                 bp as *libc::c_void,
                                 (alen - 1) as libc::size_t) == 0
                }
            }
        }
    }
}
 558
/// Bytewise string equality
///
/// Compares two owned strings with `eq_slice`. This variant is the
/// `uniq_str_eq` lang item.
#[cfg(not(test))]
#[lang="uniq_str_eq"]
#[inline]
pub fn eq(a: &~str, b: &~str) -> bool {
    eq_slice(*a, *b)
}

/// Bytewise string equality
///
/// Same body as the non-test variant, compiled for test builds without
/// the lang-item attribute.
#[cfg(test)]
#[inline]
pub fn eq(a: &~str, b: &~str) -> bool {
    eq_slice(*a, *b)
}
 572
 573 /*
 574 Section: Searching
 575 */
 576
 577 // Utility used by various searching functions
 578 fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
 579     let mut i = at;
 580     for needle.bytes_iter().advance |c| { if haystack[i] != c { return false; } i += 1u; }
 581     return true;
 582 }
 583
 584 /*
 585 Section: Misc
 586 */
 587
 588 /// Determines if a vector of bytes contains valid UTF-8
 589 pub fn is_utf8(v: &[u8]) -> bool {
 590     let mut i = 0u;
 591     let total = v.len();
 592     while i < total {
 593         if v[i] < 128u8 {
 594             i += 1u;
 595         } else {
 596             let w = utf8_char_width(v[i]);
 597             if w == 0u { return false; }
 598
 599             let nexti = i + w;
 600             if nexti > total { return false; }
 601
 602             if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
 603             if w > 2 {
 604                 if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
 605                 if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
 606             }
 607
 608             i = nexti;
 609         }
 610     }
 611     true
 612 }
 613
 614 /// Determines if a vector of `u16` contains valid UTF-16
 615 pub fn is_utf16(v: &[u16]) -> bool {
 616     let len = v.len();
 617     let mut i = 0u;
 618     while (i < len) {
 619         let u = v[i];
 620
 621         if  u <= 0xD7FF_u16 || u >= 0xE000_u16 {
 622             i += 1u;
 623
 624         } else {
 625             if i+1u < len { return false; }
 626             let u2 = v[i+1u];
 627             if u < 0xD7FF_u16 || u > 0xDBFF_u16 { return false; }
 628             if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false; }
 629             i += 2u;
 630         }
 631     }
 632     return true;
 633 }
 634
 635 /// Iterates over the utf-16 characters in the specified slice, yielding each
 636 /// decoded unicode character to the function provided.
 637 ///
 638 /// # Failures
 639 ///
 640 /// * Fails on invalid utf-16 data
 641 pub fn utf16_chars(v: &[u16], f: &fn(char)) {
 642     let len = v.len();
 643     let mut i = 0u;
 644     while (i < len && v[i] != 0u16) {
 645         let u = v[i];
 646
 647         if  u <= 0xD7FF_u16 || u >= 0xE000_u16 {
 648             f(u as char);
 649             i += 1u;
 650
 651         } else {
 652             let u2 = v[i+1u];
 653             assert!(u >= 0xD800_u16 && u <= 0xDBFF_u16);
 654             assert!(u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16);
 655             let mut c = (u - 0xD800_u16) as char;
 656             c = c << 10;
 657             c |= (u2 - 0xDC00_u16) as char;
 658             c |= 0x1_0000_u32 as char;
 659             f(c);
 660             i += 2u;
 661         }
 662     }
 663 }
 664
 665 /**
 666  * Allocates a new string from the utf-16 slice provided
 667  */
 668 pub fn from_utf16(v: &[u16]) -> ~str {
 669     let mut buf = ~"";
 670     buf.reserve(v.len());
 671     utf16_chars(v, |ch| buf.push_char(ch));
 672     buf
 673 }
 674
 675 /**
 676  * Allocates a new string with the specified capacity. The string returned is
 677  * the empty string, but has capacity for much more.
 678  */
 679 #[inline]
 680 pub fn with_capacity(capacity: uint) -> ~str {
 681     let mut buf = ~"";
 682     buf.reserve(capacity);
 683     buf
 684 }
 685
 686 /**
 687  * As char_len but for a slice of a string
 688  *
 689  * # Arguments
 690  *
 691  * * s - A valid string
 692  * * start - The position inside `s` where to start counting in bytes
 693  * * end - The position where to stop counting
 694  *
 695  * # Return value
 696  *
 697  * The number of Unicode characters in `s` between the given indices.
 698  */
 699 pub fn count_chars(s: &str, start: uint, end: uint) -> uint {
 700     assert!(s.is_char_boundary(start));
 701     assert!(s.is_char_boundary(end));
 702     let mut i = start;
 703     let mut len = 0u;
 704     while i < end {
 705         let next = s.char_range_at(i).next;
 706         len += 1u;
 707         i = next;
 708     }
 709     return len;
 710 }
 711
 712 /// Counts the number of bytes taken by the first `n` chars in `s`
 713 /// starting from `start`.
 714 pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
 715     assert!(s.is_char_boundary(start));
 716     let mut end = start;
 717     let mut cnt = n;
 718     let l = s.len();
 719     while cnt > 0u {
 720         assert!(end < l);
 721         let next = s.char_range_at(end).next;
 722         cnt -= 1u;
 723         end = next;
 724     }
 725     end - start
 726 }
 727
// https://tools.ietf.org/html/rfc3629
//
// Width in bytes of a UTF-8 sequence, indexed by its first byte. A width of
// 0 marks a byte that cannot start a sequence: the continuation bytes
// 0x80-0xBF and the bytes 0xF5-0xFF.
// NOTE(review): 0xC0 and 0xC1 are listed with width 2 although RFC 3629
// says they never appear in valid UTF-8 — confirm whether this is intended.
static UTF8_CHAR_WIDTH: [u8, ..256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];
 747
 748 /// Given a first byte, determine how many bytes are in this UTF-8 character
 749 pub fn utf8_char_width(b: u8) -> uint {
 750     return UTF8_CHAR_WIDTH[b] as uint;
 751 }
 752
/// A decoded character together with the position just past it, as
/// produced by `char_range_at`
#[allow(missing_doc)]
pub struct CharRange {
    // the character that was decoded
    ch: char,
    // byte offset at which the following character starts
    next: uint
}
 758
// UTF-8 tags and ranges
// Tag carried in the top two bits of a continuation byte; `is_utf8` checks
// it after masking with 192.
static TAG_CONT_U8: u8 = 128u8;
// Same tag as a uint.
static TAG_CONT: uint = 128u;
// Presumably the first code point that no longer fits in one byte.
static MAX_ONE_B: uint = 128u;
// Presumably the tag bits of the first byte of a two-byte sequence.
static TAG_TWO_B: uint = 192u;
// Presumably the first code point that no longer fits in two bytes.
static MAX_TWO_B: uint = 2048u;
// Presumably the tag bits of the first byte of a three-byte sequence.
static TAG_THREE_B: uint = 224u;
// Presumably the first code point that no longer fits in three bytes.
static MAX_THREE_B: uint = 65536u;
// Presumably the tag bits of the first byte of a four-byte sequence.
static TAG_FOUR_B: uint = 240u;
 768
 769 /// Unsafe operations
 770 pub mod raw {
 771     use cast;
 772     use libc;
 773     use ptr;
 774     use str::raw;
 775     use str::{is_utf8};
 776     use vec;
 777     use vec::MutableVector;
 778
 779     /// Create a Rust string from a null-terminated *u8 buffer
 780     pub unsafe fn from_buf(buf: *u8) -> ~str {
 781         let mut curr = buf;
 782         let mut i = 0u;
 783         while *curr != 0u8 {
 784             i += 1u;
 785             curr = ptr::offset(buf, i);
 786         }
 787         return from_buf_len(buf, i);
 788     }
 789
 790     /// Create a Rust string from a *u8 buffer of the given length
 791     pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
 792         let mut v: ~[u8] = vec::with_capacity(len + 1);
 793         v.as_mut_buf(|vbuf, _len| {
 794             ptr::copy_memory(vbuf, buf as *u8, len)
 795         });
 796         vec::raw::set_len(&mut v, len);
 797         v.push(0u8);
 798
 799         assert!(is_utf8(v));
 800         return ::cast::transmute(v);
 801     }
 802
    /// Create a Rust string from a null-terminated C string.
    /// The pointer is reinterpreted as `*u8` and handed to `from_buf`.
    pub unsafe fn from_c_str(c_str: *libc::c_char) -> ~str {
        from_buf(::cast::transmute(c_str))
    }

    /// Create a Rust string from a `*c_char` buffer of the given length.
    /// The pointer is reinterpreted as `*u8` and handed to `from_buf_len`.
    pub unsafe fn from_c_str_len(c_str: *libc::c_char, len: uint) -> ~str {
        from_buf_len(::cast::transmute(c_str), len)
    }
 812
 813     /// Converts a vector of bytes to a new owned string.
 814     pub unsafe fn from_bytes(v: &[u8]) -> ~str {
 815         do v.as_imm_buf |buf, len| {
 816             from_buf_len(buf, len)
 817         }
 818     }
 819
    /// Converts an owned vector of bytes to a new owned string. This assumes
    /// that the utf-8-ness of the vector has already been validated
    pub unsafe fn from_bytes_owned(mut v: ~[u8]) -> ~str {
        // append the terminator, then reinterpret the vector as a string
        v.push(0u8);
        cast::transmute(v)
    }

    /// Converts a vector of bytes to a string.
    /// The byte slice needs to contain valid utf8 and needs to be one byte longer than
    /// the string, if possible ending in a 0 byte.
    ///
    /// Nothing is checked or copied here: the slice is reinterpreted as is.
    pub unsafe fn from_bytes_with_null<'a>(v: &'a [u8]) -> &'a str {
        cast::transmute(v)
    }

    /// Converts a byte to a string.
    /// Goes through `raw::from_bytes` with a one-element vector.
    pub unsafe fn from_byte(u: u8) -> ~str { raw::from_bytes([u]) }
 836
 837     /// Form a slice from a C string. Unsafe because the caller must ensure the
 838     /// C string has the static lifetime, or else the return value may be
 839     /// invalidated later.
 840     pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
 841         let s = s as *u8;
 842         let mut curr = s;
 843         let mut len = 0u;
 844         while *curr != 0u8 {
 845             len += 1u;
 846             curr = ptr::offset(s, len);
 847         }
 848         let v = (s, len + 1);
 849         assert!(is_utf8(::cast::transmute(v)));
 850         ::cast::transmute(v)
 851     }
 852
 853     /**
 854      * Takes a bytewise (not UTF-8) slice from a string.
 855      *
 856      * Returns the substring from [`begin`..`end`).
 857      *
 858      * # Failure
 859      *
 860      * If begin is greater than end.
 861      * If end is greater than the length of the string.
 862      */
 863     #[inline]
 864     pub unsafe fn slice_bytes(s: &str, begin: uint, end: uint) -> &str {
 865         do s.as_imm_buf |sbuf, n| {
 866              assert!((begin <= end));
 867              assert!((end <= n));
 868
 869              let tuple = (ptr::offset(sbuf, begin), end - begin + 1);
 870              ::cast::transmute(tuple)
 871         }
 872     }
 873
 874     /// Appends a byte to a string. (Not UTF-8 safe).
 875     pub unsafe fn push_byte(s: &mut ~str, b: u8) {
 876         let new_len = s.len() + 1;
 877         s.reserve_at_least(new_len);
 878         do s.as_mut_buf |buf, len| {
 879             *ptr::mut_offset(buf, len) = b;
 880         }
 881         set_len(&mut *s, new_len);
 882     }
 883
 884     /// Appends a vector of bytes to a string. (Not UTF-8 safe).
 885     unsafe fn push_bytes(s: &mut ~str, bytes: &[u8]) {
 886         let new_len = s.len() + bytes.len();
 887         s.reserve_at_least(new_len);
 888         for bytes.iter().advance |byte| { push_byte(&mut *s, *byte); }
 889     }
 890
 891     /// Removes the last byte from a string and returns it. (Not UTF-8 safe).
 892     pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
 893         let len = s.len();
 894         assert!((len > 0u));
 895         let b = s[len - 1u];
 896         set_len(s, len - 1u);
 897         return b;
 898     }
 899
 900     /// Removes the first byte from a string and returns it. (Not UTF-8 safe).
 901     pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
 902         let len = s.len();
 903         assert!((len > 0u));
 904         let b = s[0];
 905         *s = s.slice(1, len).to_owned();
 906         return b;
 907     }
 908
    /// Sets the length of the string and adds the null terminator
    ///
    /// Nothing is allocated here; NOTE(review): the caller presumably has to
    /// have reserved room for `new_len` bytes plus the terminator already.
    #[inline]
    pub unsafe fn set_len(v: &mut ~str, new_len: uint) {
        // reach through the owned pointer to the vector representation
        let v: **mut vec::UnboxedVecRepr = cast::transmute(v);
        let repr: *mut vec::UnboxedVecRepr = *v;
        // the stored fill is one more than the string length
        (*repr).fill = new_len + 1u;
        // write the terminating 0 right after the last string byte
        let null = ptr::mut_offset(cast::transmute(&((*repr).data)),
                                   new_len);
        *null = 0u8;
    }
 919
    // `from_buf_len` copies only the requested number of bytes: 3 of the
    // 7 'A's (65) in the buffer.
    #[test]
    fn test_from_buf_len() {
        unsafe {
            let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
            let b = vec::raw::to_ptr(a);
            let c = from_buf_len(b, 3u);
            assert_eq!(c, ~"AAA");
        }
    }
 929
 930 }
 931
 932 #[cfg(not(test))]
 933 pub mod traits {
 934     use ops::Add;
 935     use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
 936     use super::{Str, eq_slice};
 937
 938     impl<'self> Add<&'self str,~str> for &'self str {
 939         #[inline]
 940         fn add(&self, rhs: & &'self str) -> ~str {
 941             let mut ret = self.to_owned();
 942             ret.push_str(*rhs);
 943             ret
 944         }
 945     }
 946
 947     impl<'self> TotalOrd for &'self str {
 948         #[inline]
 949         fn cmp(&self, other: & &'self str) -> Ordering {
 950             for self.bytes_iter().zip(other.bytes_iter()).advance |(s_b, o_b)| {
 951                 match s_b.cmp(&o_b) {
 952                     Greater => return Greater,
 953                     Less => return Less,
 954                     Equal => ()
 955                 }
 956             }
 957
 958             self.len().cmp(&other.len())
 959         }
 960     }
 961
 962     impl TotalOrd for ~str {
 963         #[inline]
 964         fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
 965     }
 966
 967     impl TotalOrd for @str {
 968         #[inline]
 969         fn cmp(&self, other: &@str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
 970     }
 971
 972     impl<'self> Eq for &'self str {
 973         #[inline]
 974         fn eq(&self, other: & &'self str) -> bool {
 975             eq_slice((*self), (*other))
 976         }
 977         #[inline]
 978         fn ne(&self, other: & &'self str) -> bool { !(*self).eq(other) }
 979     }
 980
 981     impl Eq for ~str {
 982         #[inline]
 983         fn eq(&self, other: &~str) -> bool {
 984             eq_slice((*self), (*other))
 985         }
 986         #[inline]
 987         fn ne(&self, other: &~str) -> bool { !(*self).eq(other) }
 988     }
 989
 990     impl Eq for @str {
 991         #[inline]
 992         fn eq(&self, other: &@str) -> bool {
 993             eq_slice((*self), (*other))
 994         }
 995         #[inline]
 996         fn ne(&self, other: &@str) -> bool { !(*self).eq(other) }
 997     }
 998
 999     impl<'self> TotalEq for &'self str {
1000         #[inline]
1001         fn equals(&self, other: & &'self str) -> bool {
1002             eq_slice((*self), (*other))
1003         }
1004     }
1005
1006     impl TotalEq for ~str {
1007         #[inline]
1008         fn equals(&self, other: &~str) -> bool {
1009             eq_slice((*self), (*other))
1010         }
1011     }
1012
1013     impl TotalEq for @str {
1014         #[inline]
1015         fn equals(&self, other: &@str) -> bool {
1016             eq_slice((*self), (*other))
1017         }
1018     }
1019
1020     impl<'self> Ord for &'self str {
1021         #[inline]
1022         fn lt(&self, other: & &'self str) -> bool { self.cmp(other) == Less }
1023         #[inline]
1024         fn le(&self, other: & &'self str) -> bool { self.cmp(other) != Greater }
1025         #[inline]
1026         fn ge(&self, other: & &'self str) -> bool { self.cmp(other) != Less }
1027         #[inline]
1028         fn gt(&self, other: & &'self str) -> bool { self.cmp(other) == Greater }
1029     }
1030
1031     impl Ord for ~str {
1032         #[inline]
1033         fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1034         #[inline]
1035         fn le(&self, other: &~str) -> bool { self.cmp(other) != Greater }
1036         #[inline]
1037         fn ge(&self, other: &~str) -> bool { self.cmp(other) != Less }
1038         #[inline]
1039         fn gt(&self, other: &~str) -> bool { self.cmp(other) == Greater }
1040     }
1041
1042     impl Ord for @str {
1043         #[inline]
1044         fn lt(&self, other: &@str) -> bool { self.cmp(other) == Less }
1045         #[inline]
1046         fn le(&self, other: &@str) -> bool { self.cmp(other) != Greater }
1047         #[inline]
1048         fn ge(&self, other: &@str) -> bool { self.cmp(other) != Less }
1049         #[inline]
1050         fn gt(&self, other: &@str) -> bool { self.cmp(other) == Greater }
1051     }
1052
1053     impl<'self, S: Str> Equiv<S> for &'self str {
1054         #[inline]
1055         fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1056     }
1057
1058     impl<'self, S: Str> Equiv<S> for @str {
1059         #[inline]
1060         fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1061     }
1062
1063     impl<'self, S: Str> Equiv<S> for ~str {
1064         #[inline]
1065         fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1066     }
1067 }
1068
// Test builds get this empty stand-in for `traits`; the real operator
// implementations are only compiled under `not(test)`.
#[cfg(test)]
pub mod traits {}
1071
/// Any string that can be represented as a slice
///
/// Implemented in this file for `&str`, `~str` and `@str`, so generic code
/// can accept any of them and work on the borrowed `&str` view.
pub trait Str {
    /// Work with `self` as a slice.
    ///
    /// The returned slice borrows from `self` for the lifetime `'a`.
    fn as_slice<'a>(&'a self) -> &'a str;
}
1077
1078 impl<'self> Str for &'self str {
1079     #[inline]
1080     fn as_slice<'a>(&'a self) -> &'a str { *self }
1081 }
1082 impl<'self> Str for ~str {
1083     #[inline]
1084     fn as_slice<'a>(&'a self) -> &'a str {
1085         let s: &'a str = *self; s
1086     }
1087 }
1088 impl<'self> Str for @str {
1089     #[inline]
1090     fn as_slice<'a>(&'a self) -> &'a str {
1091         let s: &'a str = *self; s
1092     }
1093 }
1094
1095 impl<'self> Container for &'self str {
1096     #[inline]
1097     fn len(&self) -> uint {
1098         do self.as_imm_buf |_p, n| { n - 1u }
1099     }
1100     #[inline]
1101     fn is_empty(&self) -> bool {
1102         self.len() == 0
1103     }
1104 }
1105
1106 impl Container for ~str {
1107     #[inline]
1108     fn len(&self) -> uint { self.as_slice().len() }
1109     #[inline]
1110     fn is_empty(&self) -> bool { self.len() == 0 }
1111 }
1112
1113 impl Container for @str {
1114     #[inline]
1115     fn len(&self) -> uint { self.as_slice().len() }
1116     #[inline]
1117     fn is_empty(&self) -> bool { self.len() == 0 }
1118 }
1119
1120 impl Mutable for ~str {
1121     /// Remove all content, make the string empty
1122     #[inline]
1123     fn clear(&mut self) {
1124         unsafe {
1125             raw::set_len(self, 0)
1126         }
1127     }
1128 }
1129
1130
#[allow(missing_doc)]
/// Methods available on string slices.
///
/// Offsets and lengths taken or returned by these methods are byte
/// positions unless a method name says otherwise (`char_len`,
/// `slice_chars`). The implementation for `&'self str` carries the
/// per-method documentation.
pub trait StrSlice<'self> {
    // Containment tests and the various iterators over a string
    fn contains<'a>(&self, needle: &'a str) -> bool;
    fn contains_char(&self, needle: char) -> bool;
    fn iter(&self) -> StrCharIterator<'self>;
    fn rev_iter(&self) -> StrCharRevIterator<'self>;
    fn bytes_iter(&self) -> StrBytesIterator<'self>;
    fn bytes_rev_iter(&self) -> StrBytesRevIterator<'self>;
    fn split_iter<Sep: CharEq>(&self, sep: Sep) -> StrCharSplitIterator<'self, Sep>;
    fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> StrCharSplitIterator<'self, Sep>;
    fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
        -> StrCharSplitIterator<'self, Sep>;
    fn matches_index_iter(&self, sep: &'self str) -> StrMatchesIndexIterator<'self>;
    fn split_str_iter(&self, &'self str) -> StrStrSplitIterator<'self>;
    fn line_iter(&self) -> StrCharSplitIterator<'self, char>;
    fn any_line_iter(&self) -> AnyLineIterator<'self>;
    fn word_iter(&self) -> WordIterator<'self>;
    fn ends_with(&self, needle: &str) -> bool;
    fn is_whitespace(&self) -> bool;
    fn is_alphanumeric(&self) -> bool;
    fn char_len(&self) -> uint;

    // Slicing by byte range; results borrow from the original string
    fn slice(&self, begin: uint, end: uint) -> &'self str;
    fn slice_from(&self, begin: uint) -> &'self str;
    fn slice_to(&self, end: uint) -> &'self str;

    // Slicing by character (code point) range
    fn slice_chars(&self, begin: uint, end: uint) -> &'self str;

    // Prefix test, escaping, trimming, conversions and per-character access
    fn starts_with(&self, needle: &str) -> bool;
    fn escape_default(&self) -> ~str;
    fn escape_unicode(&self) -> ~str;
    fn trim(&self) -> &'self str;
    fn trim_left(&self) -> &'self str;
    fn trim_right(&self) -> &'self str;
    fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
    fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
    fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
    fn replace(&self, from: &str, to: &str) -> ~str;
    fn to_owned(&self) -> ~str;
    fn to_managed(&self) -> @str;
    fn to_utf16(&self) -> ~[u16];
    fn is_char_boundary(&self, index: uint) -> bool;
    fn char_range_at(&self, start: uint) -> CharRange;
    fn char_at(&self, i: uint) -> char;
    fn char_range_at_reverse(&self, start: uint) -> CharRange;
    fn char_at_reverse(&self, i: uint) -> char;
    fn as_bytes(&self) -> &'self [u8];

    // Searching; results are byte indices
    fn find<C: CharEq>(&self, search: C) -> Option<uint>;
    fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
    fn find_str(&self, &str) -> Option<uint>;

    fn repeat(&self, nn: uint) -> ~str;

    fn slice_shift_char(&self) -> (char, &'self str);

    fn map_chars(&self, ff: &fn(char) -> char) -> ~str;

    fn lev_distance(&self, t: &str) -> uint;

    fn subslice_offset(&self, inner: &str) -> uint;

    // Raw buffer access for the duration of the callback
    fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T;
    fn as_c_str<T>(&self, f: &fn(*libc::c_char) -> T) -> T;
}
1196
1197 /// Extension methods for strings
1198 impl<'self> StrSlice<'self> for &'self str {
1199     /**
1200      * Returns true if one string contains another
1201      *
1202      * # Arguments
1203      *
1204      * * needle - The string to look for
1205      */
1206     #[inline]
1207     fn contains<'a>(&self, needle: &'a str) -> bool {
1208         self.find_str(needle).is_some()
1209     }
1210     /**
1211      * Returns true if a string contains a char.
1212      *
1213      * # Arguments
1214      *
1215      * * needle - The char to look for
1216      */
1217     #[inline]
1218     fn contains_char(&self, needle: char) -> bool {
1219         self.find(needle).is_some()
1220     }
1221     /// An iterator over the characters of `self`. Note, this iterates
1222     /// over unicode code-points, not unicode graphemes.
1223     ///
1224     /// # Example
1225     ///
1226     /// ~~~ {.rust}
1227     /// let v: ~[char] = "abc åäö".iter().collect();
1228     /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1229     /// ~~~
1230     #[inline]
1231     fn iter(&self) -> StrCharIterator<'self> {
1232         StrCharIterator {
1233             index: 0,
1234             string: *self
1235         }
1236     }
1237     /// An iterator over the characters of `self`, in reverse order.
1238     #[inline]
1239     fn rev_iter(&self) -> StrCharRevIterator<'self> {
1240         StrCharRevIterator {
1241             index: self.len(),
1242             string: *self
1243         }
1244     }
1245
1246     /// An iterator over the bytes of `self`
1247     #[inline]
1248     fn bytes_iter(&self) -> StrBytesIterator<'self> {
1249         StrBytesIterator { it: self.as_bytes().iter() }
1250     }
1251     /// An iterator over the bytes of `self`, in reverse order
1252     #[inline]
1253     fn bytes_rev_iter(&self) -> StrBytesRevIterator<'self> {
1254         StrBytesRevIterator { it: self.as_bytes().rev_iter() }
1255     }
1256
1257     /// An iterator over substrings of `self`, separated by characters
1258     /// matched by `sep`.
1259     ///
1260     /// # Example
1261     ///
1262     /// ~~~ {.rust}
1263     /// let v: ~[&str] = "Mary had a little lamb".split_iter(' ').collect();
1264     /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1265     ///
1266     /// let v: ~[&str] = "abc1def2ghi".split_iter(|c: char| c.is_digit()).collect();
1267     /// assert_eq!(v, ~["abc", "def", "ghi"]);
1268     /// ~~~
1269     #[inline]
1270     fn split_iter<Sep: CharEq>(&self, sep: Sep) -> StrCharSplitIterator<'self, Sep> {
1271         self.split_options_iter(sep, self.len(), true)
1272     }
1273
1274     /// An iterator over substrings of `self`, separated by characters
1275     /// matched by `sep`, restricted to splitting at most `count`
1276     /// times.
1277     #[inline]
1278     fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> StrCharSplitIterator<'self, Sep> {
1279         self.split_options_iter(sep, count, true)
1280     }
1281
1282     /// An iterator over substrings of `self`, separated by characters
1283     /// matched by `sep`, splitting at most `count` times, and
1284     /// possibly not including the trailing empty substring, if it
1285     /// exists.
1286     #[inline]
1287     fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
1288         -> StrCharSplitIterator<'self, Sep> {
1289         let only_ascii = sep.only_ascii();
1290         StrCharSplitIterator {
1291             string: *self,
1292             position: 0,
1293             sep: sep,
1294             count: count,
1295             allow_trailing_empty: allow_trailing_empty,
1296             finished: false,
1297             only_ascii: only_ascii
1298         }
1299     }
1300     /// An iterator over the start and end indices of each match of
1301     /// `sep` within `self`.
1302     #[inline]
1303     fn matches_index_iter(&self, sep: &'self str) -> StrMatchesIndexIterator<'self> {
1304         assert!(!sep.is_empty())
1305         StrMatchesIndexIterator {
1306             haystack: *self,
1307             needle: sep,
1308             position: 0
1309         }
1310     }
1311     /**
1312      * An iterator over the substrings of `self` separated by `sep`.
1313      *
1314      * # Example
1315      *
1316      * ~~~ {.rust}
1317      * let v: ~[&str] = "abcXXXabcYYYabc".split_str_iter("abc").collect()
1318      * assert_eq!(v, ["", "XXX", "YYY", ""]);
1319      * ~~~
1320      */
1321     #[inline]
1322     fn split_str_iter(&self, sep: &'self str) -> StrStrSplitIterator<'self> {
1323         StrStrSplitIterator {
1324             it: self.matches_index_iter(sep),
1325             last_end: 0,
1326             finished: false
1327         }
1328     }
1329
1330     /// An iterator over the lines of a string (subsequences separated
1331     /// by `\n`).
1332     #[inline]
1333     fn line_iter(&self) -> StrCharSplitIterator<'self, char> {
1334         self.split_options_iter('\n', self.len(), false)
1335     }
1336
1337     /// An iterator over the lines of a string, separated by either
1338     /// `\n` or (`\r\n`).
1339     fn any_line_iter(&self) -> AnyLineIterator<'self> {
1340         do self.line_iter().transform |line| {
1341             let l = line.len();
1342             if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
1343             else { line }
1344         }
1345     }
1346
1347     /// An iterator over the words of a string (subsequences separated
1348     /// by any sequence of whitespace).
1349     #[inline]
1350     fn word_iter(&self) -> WordIterator<'self> {
1351         self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
1352     }
1353
1354     /**
1355      * Returns true if the string contains only whitespace
1356      *
1357      * Whitespace characters are determined by `char::is_whitespace`
1358      */
1359     #[inline]
1360     fn is_whitespace(&self) -> bool { self.iter().all(char::is_whitespace) }
1361     /**
1362      * Returns true if the string contains only alphanumerics
1363      *
1364      * Alphanumeric characters are determined by `char::is_alphanumeric`
1365      */
1366     #[inline]
1367     fn is_alphanumeric(&self) -> bool { self.iter().all(char::is_alphanumeric) }
1368     /// Returns the number of characters that a string holds
1369     #[inline]
1370     fn char_len(&self) -> uint { self.iter().len_() }
1371
    /**
     * Returns a slice of the given string from the byte range
     * [`begin`..`end`)
     *
     * The result borrows from the original string; nothing is copied here.
     *
     * # Failure
     *
     * Fails when `begin` or `end` does not lie on a character boundary
     * (as decided by `is_char_boundary`), or points beyond the last
     * character of the string
     */
    #[inline]
    fn slice(&self, begin: uint, end: uint) -> &'self str {
        // Check both ends before handing over to the unchecked byte slicer
        assert!(self.is_char_boundary(begin));
        assert!(self.is_char_boundary(end));
        unsafe { raw::slice_bytes(*self, begin, end) }
    }
1385     /// Returns a slice of the string from `begin` to its end.
1386     ///
1387     /// Fails when `begin` does not point to a valid character, or is
1388     /// out of bounds.
1389     #[inline]
1390     fn slice_from(&self, begin: uint) -> &'self str {
1391         self.slice(begin, self.len())
1392     }
1393     /// Returns a slice of the string from the beginning to byte
1394     /// `end`.
1395     ///
1396     /// Fails when `end` does not point to a valid character, or is
1397     /// out of bounds.
1398     #[inline]
1399     fn slice_to(&self, end: uint) -> &'self str {
1400         self.slice(0, end)
1401     }
1402
1403     /// Returns a slice of the string from the char range
1404     /// [`begin`..`end`).
1405     ///
1406     /// Fails if `begin` > `end` or the either `begin` or `end` are
1407     /// beyond the last character of the string.
1408     fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
1409         assert!(begin <= end);
1410         // not sure how to use the iterators for this nicely.
1411         let mut position = 0;
1412         let mut count = 0;
1413         let l = self.len();
1414         while count < begin && position < l {
1415             position = self.char_range_at(position).next;
1416             count += 1;
1417         }
1418         if count < begin { fail!("Attempted to begin slice_chars beyond end of string") }
1419         let start_byte = position;
1420         while count < end && position < l {
1421             position = self.char_range_at(position).next;
1422             count += 1;
1423         }
1424         if count < end { fail!("Attempted to end slice_chars beyond end of string") }
1425
1426         self.slice(start_byte, position)
1427     }
1428
1429     /// Returns true if `needle` is a prefix of the string.
1430     fn starts_with<'a>(&self, needle: &'a str) -> bool {
1431         let (self_len, needle_len) = (self.len(), needle.len());
1432         if needle_len == 0u { true }
1433         else if needle_len > self_len { false }
1434         else { match_at(*self, needle, 0u) }
1435     }
1436     /// Returns true if `needle` is a suffix of the string.
1437     fn ends_with(&self, needle: &str) -> bool {
1438         let (self_len, needle_len) = (self.len(), needle.len());
1439         if needle_len == 0u { true }
1440         else if needle_len > self_len { false }
1441         else { match_at(*self, needle, self_len - needle_len) }
1442     }
1443
1444     /// Escape each char in `s` with char::escape_default.
1445     fn escape_default(&self) -> ~str {
1446         let mut out: ~str = ~"";
1447         out.reserve_at_least(self.len());
1448         for self.iter().advance |c| {
1449             do c.escape_default |c| {
1450                 out.push_char(c);
1451             }
1452         }
1453         out
1454     }
1455
1456     /// Escape each char in `s` with char::escape_unicode.
1457     fn escape_unicode(&self) -> ~str {
1458         let mut out: ~str = ~"";
1459         out.reserve_at_least(self.len());
1460         for self.iter().advance |c| {
1461             do c.escape_unicode |c| {
1462                 out.push_char(c);
1463             }
1464         }
1465         out
1466     }
1467
1468     /// Returns a string with leading and trailing whitespace removed
1469     #[inline]
1470     fn trim(&self) -> &'self str {
1471         self.trim_left().trim_right()
1472     }
1473     /// Returns a string with leading whitespace removed
1474     #[inline]
1475     fn trim_left(&self) -> &'self str {
1476         self.trim_left_chars(&char::is_whitespace)
1477     }
1478     /// Returns a string with trailing whitespace removed
1479     #[inline]
1480     fn trim_right(&self) -> &'self str {
1481         self.trim_right_chars(&char::is_whitespace)
1482     }
1483
1484     /**
1485      * Returns a string with characters that match `to_trim` removed.
1486      *
1487      * # Arguments
1488      *
1489      * * to_trim - a character matcher
1490      *
1491      * # Example
1492      *
1493      * ~~~ {.rust}
1494      * assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1495      * assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1496      * assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1497      * ~~~
1498      */
1499     #[inline]
1500     fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
1501         self.trim_left_chars(to_trim).trim_right_chars(to_trim)
1502     }
1503     /**
1504      * Returns a string with leading `chars_to_trim` removed.
1505      *
1506      * # Arguments
1507      *
1508      * * to_trim - a character matcher
1509      *
1510      * # Example
1511      *
1512      * ~~~ {.rust}
1513      * assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
1514      * assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
1515      * assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
1516      * ~~~
1517      */
1518     #[inline]
1519     fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
1520         match self.find(|c: char| !to_trim.matches(c)) {
1521             None => "",
1522             Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
1523         }
1524     }
1525     /**
1526      * Returns a string with trailing `chars_to_trim` removed.
1527      *
1528      * # Arguments
1529      *
1530      * * to_trim - a character matcher
1531      *
1532      * # Example
1533      *
1534      * ~~~ {.rust}
1535      * assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
1536      * assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
1537      * assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
1538      * ~~~
1539      */
1540     #[inline]
1541     fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
1542         match self.rfind(|c: char| !to_trim.matches(c)) {
1543             None => "",
1544             Some(last) => {
1545                 let next = self.char_range_at(last).next;
1546                 unsafe { raw::slice_bytes(*self, 0u, next) }
1547             }
1548         }
1549     }
1550
1551     /**
1552      * Replace all occurrences of one string with another
1553      *
1554      * # Arguments
1555      *
1556      * * from - The string to replace
1557      * * to - The replacement string
1558      *
1559      * # Return value
1560      *
1561      * The original string with all occurances of `from` replaced with `to`
1562      */
1563     pub fn replace(&self, from: &str, to: &str) -> ~str {
1564         let mut result = ~"";
1565         let mut last_end = 0;
1566         for self.matches_index_iter(from).advance |(start, end)| {
1567             result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
1568             result.push_str(to);
1569             last_end = end;
1570         }
1571         result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
1572         result
1573     }
1574
    /// Copy a slice into a new unique str
    ///
    /// The content bytes are copied into a freshly allocated buffer and a
    /// null terminator is appended.
    #[inline]
    fn to_owned(&self) -> ~str {
        do self.as_imm_buf |src, len| {
            // `len` is the buffer length including the trailing null
            // (`Container::len` subtracts one from it), so it is never zero
            assert!(len > 0);
            unsafe {
                let mut v = vec::with_capacity(len);

                do v.as_mut_buf |dst, _| {
                    // Content bytes only; the terminator is pushed below
                    ptr::copy_memory(dst, src, len - 1);
                }
                // Record the copied bytes as the vector's contents
                vec::raw::set_len(&mut v, len - 1);
                v.push(0u8);
                // Reinterpret the byte vector as an owned string
                ::cast::transmute(v)
            }
        }
    }
1592
1593     #[inline]
1594     fn to_managed(&self) -> @str {
1595         let v = at_vec::from_fn(self.len() + 1, |i| {
1596             if i == self.len() { 0 } else { self[i] }
1597         });
1598         unsafe { ::cast::transmute(v) }
1599     }
1600
1601     /// Converts to a vector of `u16` encoded as UTF-16.
1602     fn to_utf16(&self) -> ~[u16] {
1603         let mut u = ~[];
1604         for self.iter().advance |ch| {
1605             // Arithmetic with u32 literals is easier on the eyes than chars.
1606             let mut ch = ch as u32;
1607
1608             if (ch & 0xFFFF_u32) == ch {
1609                 // The BMP falls through (assuming non-surrogate, as it
1610                 // should)
1611                 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
1612                 u.push(ch as u16)
1613             } else {
1614                 // Supplementary planes break into surrogates.
1615                 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
1616                 ch -= 0x1_0000_u32;
1617                 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
1618                 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
1619                 u.push_all([w1, w2])
1620             }
1621         }
1622         u
1623     }
1624
1625     /**
1626      * Returns false if the index points into the middle of a multi-byte
1627      * character sequence.
1628      */
1629     fn is_char_boundary(&self, index: uint) -> bool {
1630         if index == self.len() { return true; }
1631         let b = self[index];
1632         return b < 128u8 || b >= 192u8;
1633     }
1634
1635     /**
1636      * Pluck a character out of a string and return the index of the next
1637      * character.
1638      *
1639      * This function can be used to iterate over the unicode characters of a
1640      * string.
1641      *
1642      * # Example
1643      *
1644      * ~~~ {.rust}
1645      * let s = "中华Việt Nam";
1646      * let i = 0u;
1647      * while i < s.len() {
1648      *     let CharRange {ch, next} = s.char_range_at(i);
1649      *     printfln!("%u: %c", i, ch);
1650      *     i = next;
1651      * }
1652      * ~~~
1653      *
1654      * # Example output
1655      *
1656      * ~~~
1657      * 0: 中
1658      * 3: 华
1659      * 6: V
1660      * 7: i
1661      * 8: ệ
1662      * 11: t
1663      * 12:
1664      * 13: N
1665      * 14: a
1666      * 15: m
1667      * ~~~
1668      *
1669      * # Arguments
1670      *
1671      * * s - The string
1672      * * i - The byte offset of the char to extract
1673      *
1674      * # Return value
1675      *
1676      * A record {ch: char, next: uint} containing the char value and the byte
1677      * index of the next unicode character.
1678      *
1679      * # Failure
1680      *
1681      * If `i` is greater than or equal to the length of the string.
1682      * If `i` is not the index of the beginning of a valid UTF-8 character.
1683      */
1684     #[inline]
1685     fn char_range_at(&self, i: uint) -> CharRange {
1686         if (self[i] < 128u8) {
1687             return CharRange {ch: self[i] as char, next: i + 1 };
1688         }
1689
1690         // Multibyte case is a fn to allow char_range_at to inline cleanly
1691         fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1692             let mut val = s[i] as uint;
1693             let w = UTF8_CHAR_WIDTH[val] as uint;
1694             assert!((w != 0));
1695
1696             // First byte is special, only want bottom 5 bits for width 2, 4 bits
1697             // for width 3, and 3 bits for width 4
1698             val &= 0x7Fu >> w;
1699             val = (val << 6) | (s[i + 1] & 63u8) as uint;
1700             if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1701             if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1702
1703             return CharRange {ch: val as char, next: i + w};
1704         }
1705
1706         return multibyte_char_range_at(*self, i);
1707     }
1708
1709     /// Plucks the character starting at the `i`th byte of a string
1710     #[inline]
1711     fn char_at(&self, i: uint) -> char { self.char_range_at(i).ch }
1712
1713     /**
1714      * Given a byte position and a str, return the previous char and its position.
1715      *
1716      * This function can be used to iterate over a unicode string in reverse.
1717      *
1718      * Returns 0 for next index if called on start index 0.
1719      */
1720     fn char_range_at_reverse(&self, start: uint) -> CharRange {
1721         let mut prev = start;
1722
1723         // while there is a previous byte == 10......
1724         while prev > 0u && self[prev - 1u] & 192u8 == TAG_CONT_U8 {
1725             prev -= 1u;
1726         }
1727
1728         // now refer to the initial byte of previous char
1729         if prev > 0u {
1730             prev -= 1u;
1731         } else {
1732             prev = 0u;
1733         }
1734
1735
1736         let ch = self.char_at(prev);
1737         return CharRange {ch:ch, next:prev};
1738     }
1739
1740     /// Plucks the character ending at the `i`th byte of a string
1741     #[inline]
1742     fn char_at_reverse(&self, i: uint) -> char {
1743         self.char_range_at_reverse(i).ch
1744     }
1745
1746     /**
1747      * Work with the byte buffer of a string as a byte slice.
1748      *
1749      * The byte slice does not include the null terminator.
1750      */
1751     fn as_bytes(&self) -> &'self [u8] {
1752         unsafe {
1753             let (ptr, len): (*u8, uint) = ::cast::transmute(*self);
1754             let outgoing_tuple: (*u8, uint) = (ptr, len - 1);
1755             ::cast::transmute(outgoing_tuple)
1756         }
1757     }
1758
1759     /**
1760      * Returns the byte index of the first character of `self` that matches `search`
1761      *
1762      * # Return value
1763      *
1764      * `Some` containing the byte index of the last matching character
1765      * or `None` if there is no match
1766      */
1767     fn find<C: CharEq>(&self, search: C) -> Option<uint> {
1768         if search.only_ascii() {
1769             for self.bytes_iter().enumerate().advance |(i, b)| {
1770                 if search.matches(b as char) { return Some(i) }
1771             }
1772         } else {
1773             let mut index = 0;
1774             for self.iter().advance |c| {
1775                 if search.matches(c) { return Some(index); }
1776                 index += c.len_utf8_bytes();
1777             }
1778         }
1779
1780         None
1781     }
1782     /**
1783      * Returns the byte index of the last character of `self` that matches `search`
1784      *
1785      * # Return value
1786      *
1787      * `Some` containing the byte index of the last matching character
1788      * or `None` if there is no match
1789      */
1790     fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
1791         let mut index = self.len();
1792         if search.only_ascii() {
1793             for self.bytes_rev_iter().advance |b| {
1794                 index -= 1;
1795                 if search.matches(b as char) { return Some(index); }
1796             }
1797         } else {
1798             for self.rev_iter().advance |c| {
1799                 index -= c.len_utf8_bytes();
1800                 if search.matches(c) { return Some(index); }
1801             }
1802         }
1803
1804         None
1805     }
1806
1807     /**
1808      * Returns the byte index of the first matching substring
1809      *
1810      * # Arguments
1811      *
1812      * * `needle` - The string to search for
1813      *
1814      * # Return value
1815      *
1816      * `Some` containing the byte index of the first matching substring
1817      * or `None` if there is no match
1818      */
1819     fn find_str(&self, needle: &str) -> Option<uint> {
1820         if needle.is_empty() {
1821             Some(0)
1822         } else {
1823             self.matches_index_iter(needle)
1824                 .next()
1825                 .map_consume(|(start, _end)| start)
1826         }
1827     }
1828
1829     /// Given a string, make a new string with repeated copies of it.
         ///
         /// The result holds `nn` back-to-back copies of `self`. When `nn` is 0
         /// the copy loop never runs and the result length is set to 0.
1830     fn repeat(&self, nn: uint) -> ~str {
1831         do self.as_imm_buf |buf, len| {
1832             // `as_imm_buf` reports one byte more than the indexable length;
                 // drop it so only the string's own bytes are copied.
1833             let len = len - 1;
1834             let mut ret = with_capacity(nn * len);
1835
1836             unsafe {
1837                 do ret.as_mut_buf |rbuf, _len| {
                         // Write cursor; advanced by `len` after each copy.
1838                     let mut rbuf = rbuf;
1839
1840                     for nn.times {
1841                         ptr::copy_memory(rbuf, buf, len);
1842                         rbuf = rbuf.offset(len);
1843                     }
1844                 }
                     // Record the number of bytes written above.
1845                 raw::set_len(&mut ret, nn * len);
1846             }
1847             ret
1848         }
1849     }
1850
1851     /**
1852      * Retrieves the first character from a string slice and returns
1853      * it. This does not allocate a new string; instead, it returns a
1854      * slice that point one character beyond the character that was
1855      * shifted.
1856      *
1857      * # Failure
1858      *
1859      * If the string does not contain any characters
1860      */
1861     #[inline]
1862     fn slice_shift_char(&self) -> (char, &'self str) {
1863         let CharRange {ch, next} = self.char_range_at(0u);
1864         let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
1865         return (ch, next_s);
1866     }
1867
1868
1869     /// Apply a function to each character.
1870     fn map_chars(&self, ff: &fn(char) -> char) -> ~str {
1871         let mut result = with_capacity(self.len());
1872         for self.iter().advance |cc| {
1873             result.push_char(ff(cc));
1874         }
1875         result
1876     }
1877
1878     /// Levenshtein Distance between two strings.
1879     fn lev_distance(&self, t: &str) -> uint {
1880         let slen = self.len();
1881         let tlen = t.len();
1882
1883         if slen == 0 { return tlen; }
1884         if tlen == 0 { return slen; }
1885
1886         let mut dcol = vec::from_fn(tlen + 1, |x| x);
1887
1888         for self.iter().enumerate().advance |(i, sc)| {
1889
1890             let mut current = i;
1891             dcol[0] = current + 1;
1892
1893             for t.iter().enumerate().advance |(j, tc)| {
1894
1895                 let next = dcol[j + 1];
1896
1897                 if sc == tc {
1898                     dcol[j + 1] = current;
1899                 } else {
1900                     dcol[j + 1] = ::cmp::min(current, next);
1901                     dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
1902                 }
1903
1904                 current = next;
1905             }
1906         }
1907
1908         return dcol[tlen];
1909     }
1910
1911
1912     /**
1913      * Returns the byte offset of an inner slice relative to an enclosing outer slice.
1914      *
1915      * Fails if `inner` is not a direct slice contained within self.
          * Containment is decided by comparing buffer addresses, not contents.
1916      *
1917      * # Example
1918      *
1919      * ~~~ {.rust}
1920      * let string = "a\nb\nc";
1921      * let mut lines = ~[];
1922      * for string.line_iter().advance |line| { lines.push(line) }
1923      *
1924      * assert!(string.subslice_offset(lines[0]) == 0); // &"a"
1925      * assert!(string.subslice_offset(lines[1]) == 2); // &"b"
1926      * assert!(string.subslice_offset(lines[2]) == 4); // &"c"
1927      * ~~~
1928      */
1929     #[inline]
1930     fn subslice_offset(&self, inner: &str) -> uint {
1931         do self.as_imm_buf |a, a_len| {
1932             do inner.as_imm_buf |b, b_len| {
                     // Start and end addresses of both buffers as plain integers.
1933                 let a_start: uint;
1934                 let a_end: uint;
1935                 let b_start: uint;
1936                 let b_end: uint;
1937                 unsafe {
1938                     a_start = cast::transmute(a); a_end = a_len + cast::transmute(a);
1939                     b_start = cast::transmute(b); b_end = b_len + cast::transmute(b);
1940                 }
                     // `inner` must begin and end inside `self`'s buffer.
1941                 assert!(a_start <= b_start);
1942                 assert!(b_end <= a_end);
1943                 b_start - a_start
1944             }
1945         }
1946     }
1947
1948     /**
1949      * Work with the byte buffer and length of a slice.
1950      *
1951      * The given length is one byte longer than the 'official' indexable
1952      * length of the string. This is to permit probing the byte past the
1953      * indexable area for a null byte, as is the case in slices pointing
1954      * to full strings, or suffixes of them.
1955      */
1956     #[inline]
1957     fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T {
1958         let v: &[u8] = unsafe { cast::transmute(*self) };
1959         v.as_imm_buf(f)
1960     }
1961
1962     /**
1963      * Work with the byte buffer of a string as a null-terminated C string.
1964      *
1965      * Allows for unsafe manipulation of strings, which is useful for foreign
1966      * interop. This is similar to `str::as_buf`, but guarantees null-termination.
1967      * If the given slice is not already null-terminated, this function will
1968      * allocate a temporary, copy the slice, null terminate it, and pass
1969      * that instead.
1970      *
1971      * # Example
1972      *
1973      * ~~~ {.rust}
1974      * let s = "PATH".as_c_str(|path| libc::getenv(path));
1975      * ~~~
1976      */
1977     #[inline]
1978     fn as_c_str<T>(&self, f: &fn(*libc::c_char) -> T) -> T {
1979         do self.as_imm_buf |buf, len| {
1980             // NB: len includes the trailing null.
1981             assert!(len > 0);
                 // Probe the last reported byte: a non-zero value means the
                 // slice is not followed by a terminator, so retry on an owned
                 // copy of the slice instead.
1982             if unsafe { *(ptr::offset(buf, len - 1)) != 0 } {
1983                 self.to_owned().as_c_str(|s| f(s))
1984             } else {
1985                 f(buf as *libc::c_char)
1986             }
1987         }
1988     }
1989 }
1990
1991 #[allow(missing_doc)]
     // Byte-level access for string types; implemented below for `~str`
     // and `@str`.
1992 pub trait NullTerminatedStr {
         /// View the string as a byte slice that includes the null terminator.
1993     fn as_bytes_with_null<'a>(&'a self) -> &'a [u8];
1994 }
1995
1996 impl NullTerminatedStr for ~str {
1997     /**
1998      * Work with the byte buffer of a string as a byte slice.
1999      *
2000      * The byte slice does include the null terminator.
2001      */
2002     #[inline]
2003     fn as_bytes_with_null<'a>(&'a self) -> &'a [u8] {
2004         let ptr: &'a ~[u8] = unsafe { ::cast::transmute(self) };
2005         let slice: &'a [u8] = *ptr;
2006         slice
2007     }
2008 }
2009 impl NullTerminatedStr for @str {
2010     /**
2011      * Work with the byte buffer of a string as a byte slice.
2012      *
2013      * The byte slice does include the null terminator.
2014      */
2015     #[inline]
2016     fn as_bytes_with_null<'a>(&'a self) -> &'a [u8] {
2017         let ptr: &'a @[u8] = unsafe { ::cast::transmute(self) };
2018         let slice: &'a [u8] = *ptr;
2019         slice
2020     }
2021 }
2022
2023 #[allow(missing_doc)]
     // Mutating and consuming operations on owned strings; implemented
     // below for `~str`.
2024 pub trait OwnedStr {
         /// Appends a string slice to the back of a string, without overallocating
2025     fn push_str_no_overallocate(&mut self, rhs: &str);
         /// Appends a string slice to the back of a string
2026     fn push_str(&mut self, rhs: &str);
         /// Appends a character to the back of a string
2027     fn push_char(&mut self, c: char);
         /// Remove the final character from a string and return it
2028     fn pop_char(&mut self) -> char;
         /// Remove the first character from a string and return it
2029     fn shift_char(&mut self) -> char;
         /// Prepend a char to a string
2030     fn unshift_char(&mut self, ch: char);
         /// Concatenate two strings together, consuming `self`
2031     fn append(self, rhs: &str) -> ~str;
         /// Reserve capacity for exactly `n` bytes, not counting the null terminator
2032     fn reserve(&mut self, n: uint);
         /// Reserve capacity for at least `n` bytes, not counting the null terminator
2033     fn reserve_at_least(&mut self, n: uint);
         /// Number of single-byte characters the string can hold without reallocating
2034     fn capacity(&self) -> uint;
         /// Convert to a vector of bytes that includes the null terminator
2035     fn to_bytes_with_null(self) -> ~[u8];
2036
2037     /**
2038      * Work with the mutable byte buffer and length of a slice.
2039      *
2040      * The given length is one byte longer than the 'official' indexable
2041      * length of the string. This is to permit probing the byte past the
2042      * indexable area for a null byte, as is the case in slices pointing
2043      * to full strings, or suffixes of them.
2044      *
2045      * Make sure any mutations to this buffer keep this string valid UTF8.
2046      */
2047     fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T;
2048 }
2049
2050 impl OwnedStr for ~str {
2051     /// Appends a string slice to the back of a string, without overallocating
2052     #[inline]
2053     fn push_str_no_overallocate(&mut self, rhs: &str) {
2054         unsafe {
2055             let llen = self.len();
2056             let rlen = rhs.len();
2057             self.reserve(llen + rlen);
2058             do self.as_imm_buf |lbuf, _llen| {
2059                 do rhs.as_imm_buf |rbuf, _rlen| {
2060                     let dst = ptr::offset(lbuf, llen);
2061                     let dst = ::cast::transmute_mut_unsafe(dst);
2062                     ptr::copy_memory(dst, rbuf, rlen);
2063                 }
2064             }
2065             raw::set_len(self, llen + rlen);
2066         }
2067     }
2068
2069     /// Appends a string slice to the back of a string
2070     #[inline]
2071     fn push_str(&mut self, rhs: &str) {
2072         unsafe {
2073             let llen = self.len();
2074             let rlen = rhs.len();
2075             self.reserve_at_least(llen + rlen);
2076             do self.as_imm_buf |lbuf, _llen| {
2077                 do rhs.as_imm_buf |rbuf, _rlen| {
2078                     let dst = ptr::offset(lbuf, llen);
2079                     let dst = ::cast::transmute_mut_unsafe(dst);
2080                     ptr::copy_memory(dst, rbuf, rlen);
2081                 }
2082             }
2083             raw::set_len(self, llen + rlen);
2084         }
2085     }
2086     /// Appends a character to the back of a string
2087     #[inline]
2088     fn push_char(&mut self, c: char) {
2089         assert!(c as uint <= 0x10ffff); // FIXME: #7609: should be enforced on all `char`
2090         unsafe {
2091             let code = c as uint;
2092             let nb = if code < MAX_ONE_B { 1u }
2093             else if code < MAX_TWO_B { 2u }
2094             else if code < MAX_THREE_B { 3u }
2095             else { 4u };
2096             let len = self.len();
2097             let new_len = len + nb;
2098             self.reserve_at_least(new_len);
2099             let off = len;
2100             do self.as_mut_buf |buf, _len| {
2101                 match nb {
2102                     1u => {
2103                         *ptr::mut_offset(buf, off) = code as u8;
2104                     }
2105                     2u => {
2106                         *ptr::mut_offset(buf, off) = (code >> 6u & 31u | TAG_TWO_B) as u8;
2107                         *ptr::mut_offset(buf, off + 1u) = (code & 63u | TAG_CONT) as u8;
2108                     }
2109                     3u => {
2110                         *ptr::mut_offset(buf, off) = (code >> 12u & 15u | TAG_THREE_B) as u8;
2111                         *ptr::mut_offset(buf, off + 1u) = (code >> 6u & 63u | TAG_CONT) as u8;
2112                         *ptr::mut_offset(buf, off + 2u) = (code & 63u | TAG_CONT) as u8;
2113                     }
2114                     4u => {
2115                         *ptr::mut_offset(buf, off) = (code >> 18u & 7u | TAG_FOUR_B) as u8;
2116                         *ptr::mut_offset(buf, off + 1u) = (code >> 12u & 63u | TAG_CONT) as u8;
2117                         *ptr::mut_offset(buf, off + 2u) = (code >> 6u & 63u | TAG_CONT) as u8;
2118                         *ptr::mut_offset(buf, off + 3u) = (code & 63u | TAG_CONT) as u8;
2119                     }
2120                     _ => {}
2121                 }
2122             }
2123             raw::set_len(self, new_len);
2124         }
2125     }
2126     /**
2127      * Remove the final character from a string and return it
2128      *
2129      * # Failure
2130      *
2131      * If the string does not contain any characters
2132      */
2133     fn pop_char(&mut self) -> char {
2134         let end = self.len();
2135         assert!(end > 0u);
2136         let CharRange {ch, next} = self.char_range_at_reverse(end);
2137         unsafe { raw::set_len(self, next); }
2138         return ch;
2139     }
2140
2141     /**
2142      * Remove the first character from a string and return it
2143      *
2144      * # Failure
2145      *
2146      * If the string does not contain any characters
2147      */
2148     fn shift_char(&mut self) -> char {
2149         let CharRange {ch, next} = self.char_range_at(0u);
2150         *self = self.slice(next, self.len()).to_owned();
2151         return ch;
2152     }
2153
2154     /// Prepend a char to a string
2155     fn unshift_char(&mut self, ch: char) {
2156         // This could be more efficient.
2157         let mut new_str = ~"";
2158         new_str.push_char(ch);
2159         new_str.push_str(*self);
2160         *self = new_str;
2161     }
2162
2163     /// Concatenate two strings together.
2164     #[inline]
2165     fn append(self, rhs: &str) -> ~str {
2166         let mut new_str = self;
2167         new_str.push_str_no_overallocate(rhs);
2168         new_str
2169     }
2170
2171     /**
2172      * Reserves capacity for exactly `n` bytes in the given string, not including
2173      * the null terminator.
2174      *
2175      * Assuming single-byte characters, the resulting string will be large
2176      * enough to hold a string of length `n`. To account for the null terminator,
2177      * the underlying buffer will have the size `n` + 1.
2178      *
2179      * If the capacity for `s` is already equal to or greater than the requested
2180      * capacity, then no action is taken.
2181      *
2182      * # Arguments
2183      *
2184      * * s - A string
2185      * * n - The number of bytes to reserve space for
2186      */
2187     #[inline]
2188     pub fn reserve(&mut self, n: uint) {
2189         unsafe {
2190             let v: *mut ~[u8] = cast::transmute(self);
2191             (*v).reserve(n + 1);
2192         }
2193     }
2194
2195     /**
2196      * Reserves capacity for at least `n` bytes in the given string, not including
2197      * the null terminator.
2198      *
2199      * Assuming single-byte characters, the resulting string will be large
2200      * enough to hold a string of length `n`. To account for the null terminator,
2201      * the underlying buffer will have the size `n` + 1.
2202      *
2203      * This function will over-allocate in order to amortize the allocation costs
2204      * in scenarios where the caller may need to repeatedly reserve additional
2205      * space.
2206      *
2207      * If the capacity for `s` is already equal to or greater than the requested
2208      * capacity, then no action is taken.
2209      *
2210      * # Arguments
2211      *
2212      * * s - A string
2213      * * n - The number of bytes to reserve space for
2214      */
2215     #[inline]
2216     fn reserve_at_least(&mut self, n: uint) {
2217         self.reserve(uint::next_power_of_two(n + 1u) - 1u)
2218     }
2219
2220     /**
2221      * Returns the number of single-byte characters the string can hold without
2222      * reallocating
2223      */
2224     fn capacity(&self) -> uint {
2225         let buf: &~[u8] = unsafe { cast::transmute(self) };
2226         let vcap = buf.capacity();
2227         assert!(vcap > 0u);
2228         vcap - 1u
2229     }
2230
2231     /// Convert to a vector of bytes. This does not allocate a new
2232     /// string, and includes the null terminator.
2233     #[inline]
2234     fn to_bytes_with_null(self) -> ~[u8] {
2235         unsafe { ::cast::transmute(self) }
2236     }
2237
2238     #[inline]
2239     fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T {
2240         let v: &mut ~[u8] = unsafe { cast::transmute(self) };
2241         v.as_mut_buf(f)
2242     }
2243 }
2244
2245 impl Clone for ~str {
         /// Produce a new owned string by calling `to_owned`.
2246     #[inline]
2247     fn clone(&self) -> ~str {
2248         self.to_owned()
2249     }
2250 }
2251
2252 impl Clone for @str {
         /// Return the `@str` value itself; `to_owned` is not called here.
2253     #[inline]
2254     fn clone(&self) -> @str {
2255         *self
2256     }
2257 }
2258
2259 /// External iterator for a string's characters. Use with the `std::iterator`
2260 /// module.
2261 #[deriving(Clone)]
2262 pub struct StrCharIterator<'self> {
         // Byte offset of the next character to decode.
2263     priv index: uint,
         // The string slice being iterated.
2264     priv string: &'self str,
2265 }
2266
2267 impl<'self> Iterator<char> for StrCharIterator<'self> {
2268     #[inline]
2269     fn next(&mut self) -> Option<char> {
2270         if self.index < self.string.len() {
2271             let CharRange {ch, next} = self.string.char_range_at(self.index);
2272             self.index = next;
2273             Some(ch)
2274         } else {
2275             None
2276         }
2277     }
2278 }
2279 /// External iterator for a string's characters in reverse order. Use
2280 /// with the `std::iterator` module.
2281 #[deriving(Clone)]
2282 pub struct StrCharRevIterator<'self> {
         // Byte offset passed to `char_range_at_reverse`; iteration stops
         // once it reaches 0.
2283     priv index: uint,
         // The string slice being iterated.
2284     priv string: &'self str,
2285 }
2286
2287 impl<'self> Iterator<char> for StrCharRevIterator<'self> {
2288     #[inline]
2289     fn next(&mut self) -> Option<char> {
2290         if self.index > 0 {
2291             let CharRange {ch, next} = self.string.char_range_at_reverse(self.index);
2292             self.index = next;
2293             Some(ch)
2294         } else {
2295             None
2296         }
2297     }
2298 }
2299
2300 /// External iterator for a string's bytes. Use with the `std::iterator`
2301 /// module.
2302 #[deriving(Clone)]
2303 pub struct StrBytesIterator<'self> {
         // Underlying vector iterator over `u8` elements.
2304     priv it: vec::VecIterator<'self, u8>
2305 }
2306
2307 impl<'self> Iterator<u8> for StrBytesIterator<'self> {
2308     #[inline]
2309     fn next(&mut self) -> Option<u8> {
2310         self.it.next().map_consume(|&x| x)
2311     }
2312 }
2313
2314 /// External iterator for a string's bytes in reverse order. Use with
2315 /// the `std::iterator` module.
2316 #[deriving(Clone)]
2317 pub struct StrBytesRevIterator<'self> {
         // Underlying reverse vector iterator over `u8` elements.
2318     priv it: vec::VecRevIterator<'self, u8>
2319 }
2320
2321 impl<'self> Iterator<u8> for StrBytesRevIterator<'self> {
2322     #[inline]
2323     fn next(&mut self) -> Option<u8> {
2324         self.it.next().map_consume(|&x| x)
2325     }
2326 }
2327
2328 // This works because every lifetime is a sub-lifetime of 'static
     // (the literal "" can therefore be returned for any 'self).
2329 impl<'self> Zero for &'self str {
         // The zero value is the empty string slice.
2330     fn zero() -> &'self str { "" }
         // A slice is zero exactly when it is empty.
2331     fn is_zero(&self) -> bool { self.is_empty() }
2332 }
2333
2334 impl Zero for ~str {
         // The zero value is a fresh empty owned string.
2335     fn zero() -> ~str { ~"" }
         // Zero exactly when the byte length is 0.
2336     fn is_zero(&self) -> bool { self.len() == 0 }
2337 }
2338
2339 impl Zero for @str {
         // The zero value is an empty managed string.
2340     fn zero() -> @str { @"" }
         // Zero exactly when the byte length is 0.
2341     fn is_zero(&self) -> bool { self.len() == 0 }
2342 }
2343
2344 #[cfg(test)]
2345 mod tests {
2346     use iterator::IteratorUtil;
2347     use container::Container;
2348     use option::Some;
2349     use libc::c_char;
2350     use libc;
2351     use ptr;
2352     use str::*;
2353     use uint;
2354     use vec;
2355     use vec::{ImmutableVector, CopyableVector};
2356     use cmp::{TotalOrd, Less, Equal, Greater};
2357
2358     #[test]
2359     fn test_eq() {
2360         assert!((eq(&~"", &~"")));
2361         assert!((eq(&~"foo", &~"foo")));
2362         assert!((!eq(&~"foo", &~"bar")));
2363     }
2364
2365     #[test]
2366     fn test_eq_slice() {
2367         assert!((eq_slice("foobar".slice(0, 3), "foo")));
2368         assert!((eq_slice("barfoo".slice(3, 6), "foo")));
2369         assert!((!eq_slice("foo1", "foo2")));
2370     }
2371
2372     #[test]
2373     fn test_le() {
             // Ordering checks on string slices. Note that the final assertion
             // checks inequality (`!=`), not `<=`.
2374         assert!("" <= "");
2375         assert!("" <= "foo");
2376         assert!("foo" <= "foo");
2377         assert!("foo" != "bar");
2378     }
2379
2380     #[test]
2381     fn test_len() {
             // `len` counts bytes, so multi-byte characters count more than once.
2382         assert_eq!("".len(), 0u);
2383         assert_eq!("hello world".len(), 11u);
2384         assert_eq!("\x63".len(), 1u);
2385         assert_eq!("\xa2".len(), 2u);
2386         assert_eq!("\u03c0".len(), 2u);
2387         assert_eq!("\u2620".len(), 3u);
2388         assert_eq!("\U0001d11e".len(), 4u);
2389
             // `char_len` counts characters: the same inputs are one each.
2390         assert_eq!("".char_len(), 0u);
2391         assert_eq!("hello world".char_len(), 11u);
2392         assert_eq!("\x63".char_len(), 1u);
2393         assert_eq!("\xa2".char_len(), 1u);
2394         assert_eq!("\u03c0".char_len(), 1u);
2395         assert_eq!("\u2620".char_len(), 1u);
2396         assert_eq!("\U0001d11e".char_len(), 1u);
2397         assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2398     }
2399
2400     #[test]
2401     fn test_find() {
             // Each case is run with both a `char` and a closure as the matcher;
             // results are byte indices of the first match.
2402         assert_eq!("hello".find('l'), Some(2u));
2403         assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2404         assert!("hello".find('x').is_none());
2405         assert!("hello".find(|c:char| c == 'x').is_none());
2406         assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2407         assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2408     }
2409
2410     #[test]
2411     fn test_rfind() {
             // Same cases as `test_find`, searching from the back: "hello" has
             // two 'l's and the later one (byte 3) is reported.
2412         assert_eq!("hello".rfind('l'), Some(3u));
2413         assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2414         assert!("hello".rfind('x').is_none());
2415         assert!("hello".rfind(|c:char| c == 'x').is_none());
2416         assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2417         assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2418     }
2419
2420     #[test]
2421     fn test_push_str() {
2422         let mut s = ~"";
2423         s.push_str("");
2424         assert_eq!(s.slice_from(0), "");
2425         s.push_str("abc");
2426         assert_eq!(s.slice_from(0), "abc");
2427         s.push_str("ประเทศไทย中华Việt Nam");
2428         assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
2429     }
2430     #[test]
2431     fn test_append() {
2432         let mut s = ~"";
2433         s = s.append("");
2434         assert_eq!(s.slice_from(0), "");
2435         s = s.append("abc");
2436         assert_eq!(s.slice_from(0), "abc");
2437         s = s.append("ประเทศไทย中华Việt Nam");
2438         assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
2439     }
2440
2441     #[test]
2442     fn test_pop_char() {
2443         let mut data = ~"ประเทศไทย中华";
2444         let cc = data.pop_char();
2445         assert_eq!(~"ประเทศไทย中", data);
2446         assert_eq!('华', cc);
2447     }
2448
2449     #[test]
2450     fn test_pop_char_2() {
2451         let mut data2 = ~"华";
2452         let cc2 = data2.pop_char();
2453         assert_eq!(~"", data2);
2454         assert_eq!('华', cc2);
2455     }
2456
2457     #[test]
2458     #[should_fail]
2459     #[ignore(cfg(windows))]
2460     fn test_pop_char_fail() {
             // Popping from an empty string is expected to fail.
2461         let mut data = ~"";
2462         let _cc3 = data.pop_char();
2463     }
2464
2465     #[test]
2466     fn test_push_char() {
             // Push one character of each encoded length (1 to 4 bytes) onto a
             // non-empty string and compare the final contents.
2467         let mut data = ~"ประเทศไทย中";
2468         data.push_char('华');
2469         data.push_char('b'); // 1 byte
2470         data.push_char('¢'); // 2 byte
2471         data.push_char('€'); // 3 byte
2472         data.push_char('𤭢'); // 4 byte
2473         assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
2474     }
2475
2476     #[test]
2477     fn test_shift_char() {
2478         let mut data = ~"ประเทศไทย中";
2479         let cc = data.shift_char();
2480         assert_eq!(~"ระเทศไทย中", data);
2481         assert_eq!('ป', cc);
2482     }
2483
2484     #[test]
2485     fn test_unshift_char() {
2486         let mut data = ~"ประเทศไทย中";
2487         data.unshift_char('华');
2488         assert_eq!(~"华ประเทศไทย中", data);
2489     }
2490
2491     #[test]
2492     fn test_clear() {
             // Clearing an already-empty string leaves it empty.
2493         let mut empty = ~"";
2494         empty.clear();
2495         assert_eq!("", empty.as_slice());
             // Clearing a non-empty string empties it, and the string can
             // still be pushed to afterwards.
2496         let mut data = ~"ประเทศไทย中";
2497         data.clear();
2498         assert_eq!("", data.as_slice());
2499         data.push_char('华');
2500         assert_eq!("华", data.as_slice());
2501     }
2502
2503     #[test]
2504     fn test_split_within() {
             // Helper: collect the pieces produced by `each_split_within(s, i)`
             // and compare them pairwise with the expected pieces `u`.
2505         fn t(s: &str, i: uint, u: &[~str]) {
2506             let mut v = ~[];
2507             for each_split_within(s, i) |s| { v.push(s.to_owned()) }
                 // NOTE(review): `zip` stops at the shorter sequence, so a
                 // difference in the number of pieces is not detected here.
2508             assert!(v.iter().zip(u.iter()).all(|(a,b)| a == b));
2509         }
2510         t("", 0, []);
2511         t("", 15, []);
2512         t("hello", 15, [~"hello"]);
2513         t("\nMary had a little lamb\nLittle lamb\n", 15,
2514             [~"Mary had a", ~"little lamb", ~"Little lamb"]);
2515         t("\nMary had a little lamb\nLittle lamb\n", uint::max_value,
2516             [~"Mary had a little lamb\nLittle lamb"]);
2517     }
2518
2519     #[test]
2520     fn test_find_str() {
2521         // byte positions
2522         assert_eq!("".find_str(""), Some(0u));
2523         assert!("banana".find_str("apple pie").is_none());
2524
             // Results are relative to the start of the slice being searched.
2525         let data = "abcabc";
2526         assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2527         assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
2528         assert!(data.slice(2u, 4u).find_str("ab").is_none());
2529
             // The sample text is doubled; each half is sliced out at the
             // 43-byte mark below.
2530         let mut data = ~"ประเทศไทย中华Việt Nam";
2531         data = data + data;
2532         assert!(data.find_str("ไท华").is_none());
2533         assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
2534         assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
2535
             // First half.
2536         assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
2537         assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
2538         assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
2539         assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
2540         assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
2541
             // Second half: same needles, same slice-relative offsets.
2542         assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
2543         assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
2544         assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
2545         assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
2546         assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
2547     }
2548
2549     #[test]
2550     fn test_slice_chars() {
             // Helper: slicing `a` by character index from `start`, for as many
             // characters as `b` has, must give `b`.
2551         fn t(a: &str, b: &str, start: uint) {
2552             assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2553         }
2554         t("hello", "llo", 2);
2555         t("hello", "el", 1);
2556         assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2557     }
2558
2559     #[test]
2560     fn test_concat() {
2561         fn t(v: &[~str], s: &str) {
2562             assert_eq!(v.concat(), s.to_str());
2563         }
2564         t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
2565         let v: &[~str] = [];
2566         t(v, "");
2567         t([~"hi"], "hi");
2568     }
2569
2570     #[test]
2571     fn test_connect() {
2572         fn t(v: &[~str], sep: &str, s: &str) {
2573             assert_eq!(v.connect(sep), s.to_str());
2574         }
2575         t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
2576           " ", "you know I'm no good");
2577         let v: &[~str] = [];
2578         t(v, " ", "");
2579         t([~"hi"], " ", "hi");
2580     }
2581
2582     #[test]
2583     fn test_concat_slices() {
2584         fn t(v: &[&str], s: &str) {
2585             assert_eq!(v.concat(), s.to_str());
2586         }
2587         t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2588         let v: &[&str] = [];
2589         t(v, "");
2590         t(["hi"], "hi");
2591     }
2592
2593     #[test]
2594     fn test_connect_slices() {
2595         fn t(v: &[&str], sep: &str, s: &str) {
2596             assert_eq!(v.connect(sep), s.to_str());
2597         }
2598         t(["you", "know", "I'm", "no", "good"],
2599           " ", "you know I'm no good");
2600         t([], " ", "");
2601         t(["hi"], " ", "hi");
2602     }
2603
2604     #[test]
2605     fn test_repeat() {
             // ASCII and multi-byte inputs, plus the two empty-result cases:
             // an empty source string and a repeat count of zero.
2606         assert_eq!("x".repeat(4), ~"xxxx");
2607         assert_eq!("hi".repeat(4), ~"hihihihi");
2608         assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
2609         assert_eq!("".repeat(4), ~"");
2610         assert_eq!("hi".repeat(0), ~"");
2611     }
2612
2613     #[test]
2614     fn test_unsafe_slice() {
             // Small byte-range slices, including an empty range.
2615         assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
2616         assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
2617         assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
             // 100000 pushes of ten 'a's.
2618         fn a_million_letter_a() -> ~str {
2619             let mut i = 0;
2620             let mut rs = ~"";
2621             while i < 100000 { rs.push_str("aaaaaaaaaa"); i += 1; }
2622             rs
2623         }
             // 100000 pushes of five 'a's.
2624         fn half_a_million_letter_a() -> ~str {
2625             let mut i = 0;
2626             let mut rs = ~"";
2627             while i < 100000 { rs.push_str("aaaaa"); i += 1; }
2628             rs
2629         }
             // The first 500000 bytes of the long string must equal the
             // independently built short string.
2630         let letters = a_million_letter_a();
2631         assert!(half_a_million_letter_a() ==
2632             unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
2633     }
2634
2635     #[test]
2636     fn test_starts_with() {
2637         assert!(("".starts_with("")));
2638         assert!(("abc".starts_with("")));
2639         assert!(("abc".starts_with("a")));
2640         assert!((!"a".starts_with("abc")));
2641         assert!((!"".starts_with("abc")));
2642     }
2643
2644     #[test]
2645     fn test_ends_with() {
2646         assert!(("".ends_with("")));
2647         assert!(("abc".ends_with("")));
2648         assert!(("abc".ends_with("c")));
2649         assert!((!"a".ends_with("abc")));
2650         assert!((!"".ends_with("abc")));
2651     }
2652
2653     #[test]
2654     fn test_is_empty() {
             // Only the empty string reports empty.
2655         assert!("".is_empty());
2656         assert!(!"a".is_empty());
2657     }
2658
2659     #[test]
2660     fn test_replace() {
             // Single-character needle: no match, whole-string match, and a
             // match at the front.
2661         let a = "a";
2662         assert_eq!("".replace(a, "b"), ~"");
2663         assert_eq!("a".replace(a, "b"), ~"b");
2664         assert_eq!("ab".replace(a, "b"), ~"bb");
             // Every occurrence is replaced; an empty replacement deletes them.
2665         let test = "test";
2666         assert!(" test test ".replace(test, "toast") ==
2667             ~" toast toast ");
2668         assert_eq!(" test test ".replace(test, ""), ~"   ");
2669     }
2670
2671     #[test]
2672     fn test_replace_2a() {
2673         let data = ~"ประเทศไทย中华";
2674         let repl = ~"دولة الكويت";
2675
2676         let a = ~"ประเ";
2677         let A = ~"دولة الكويتทศไทย中华";
2678         assert_eq!(data.replace(a, repl), A);
2679     }
2680
2681     #[test]
2682     fn test_replace_2b() {
2683         let data = ~"ประเทศไทย中华";
2684         let repl = ~"دولة الكويت";
2685
2686         let b = ~"ะเ";
2687         let B = ~"ปรدولة الكويتทศไทย中华";
2688         assert_eq!(data.replace(b,   repl), B);
2689     }
2690
2691     #[test]
2692     fn test_replace_2c() {
2693         let data = ~"ประเทศไทย中华";
2694         let repl = ~"دولة الكويت";
2695
2696         let c = ~"中华";
2697         let C = ~"ประเทศไทยدولة الكويت";
2698         assert_eq!(data.replace(c, repl), C);
2699     }
2700
2701     #[test]
2702     fn test_replace_2d() {
2703         let data = ~"ประเทศไทย中华";
2704         let repl = ~"دولة الكويت";
2705
2706         let d = ~"ไท华";
2707         assert_eq!(data.replace(d, repl), data);
2708     }
2709
2710     #[test]
2711     fn test_slice() {
             // Byte-range slices of ASCII and multi-byte strings.
2712         assert_eq!("ab", "abc".slice(0, 2));
2713         assert_eq!("bc", "abc".slice(1, 3));
2714         assert_eq!("", "abc".slice(1, 1));
2715         assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
2716
             // Slice boundaries here sit on 3-byte character boundaries.
2717         let data = "ประเทศไทย中华";
2718         assert_eq!("ป", data.slice(0, 3));
2719         assert_eq!("ร", data.slice(3, 6));
2720         assert_eq!("", data.slice(3, 3));
2721         assert_eq!("华", data.slice(30, 33));
2722
             // 100000 pushes of ten '华' characters.
2723         fn a_million_letter_X() -> ~str {
2724             let mut i = 0;
2725             let mut rs = ~"";
2726             while i < 100000 {
2727                 push_str(&mut rs, "华华华华华华华华华华");
2728                 i += 1;
2729             }
2730             rs
2731         }
             // 100000 pushes of five '华' characters.
2732         fn half_a_million_letter_X() -> ~str {
2733             let mut i = 0;
2734             let mut rs = ~"";
2735             while i < 100000 { push_str(&mut rs, "华华华华华"); i += 1; }
2736             rs
2737         }
             // The slice end is given in bytes: 3 bytes for each of the
             // 500000 characters.
2738         let letters = a_million_letter_X();
2739         assert!(half_a_million_letter_X() ==
2740             letters.slice(0u, 3u * 500000u).to_owned());
2741     }
2742
2743     #[test]
2744     fn test_slice_2() {
             // Mixed 3-byte and 1-byte characters; the table at the end of this
             // function lists the byte offset of each character in `ss`.
2745         let ss = "中华Việt Nam";
2746
2747         assert_eq!("华", ss.slice(3u, 6u));
2748         assert_eq!("Việt Nam", ss.slice(6u, 16u));
2749
2750         assert_eq!("ab", "abc".slice(0u, 2u));
2751         assert_eq!("bc", "abc".slice(1u, 3u));
2752         assert_eq!("", "abc".slice(1u, 1u));
2753
2754         assert_eq!("中", ss.slice(0u, 3u));
2755         assert_eq!("华V", ss.slice(3u, 7u));
2756         assert_eq!("", ss.slice(3u, 3u));
2757         /*0: 中
2758           3: 华
2759           6: V
2760           7: i
2761           8: ệ
2762          11: t
2763          12:
2764          13: N
2765          14: a
2766          15: m */
2767     }
2768
2769     #[test]
2770     #[should_fail]
2771     #[ignore(cfg(windows))]
         // Expected to fail: offset 2 lies inside the first character, which
         // spans bytes 0..3 (see the offset table in test_slice_2).
2772     fn test_slice_fail() {
2773         "中华Việt Nam".slice(0u, 2u);
2774     }
2775
2776     #[test]
         // `slice_from(n)` keeps everything from byte offset `n` to the end,
         // including the empty result when `n` equals the length.
2777     fn test_slice_from() {
2778         assert_eq!("abcd".slice_from(0), "abcd");
2779         assert_eq!("abcd".slice_from(2), "cd");
2780         assert_eq!("abcd".slice_from(4), "");
2781     }
2782     #[test]
         // `slice_to(n)` keeps the first `n` bytes, from the empty prefix up to
         // the whole string.
2783     fn test_slice_to() {
2784         assert_eq!("abcd".slice_to(0), "");
2785         assert_eq!("abcd".slice_to(2), "ab");
2786         assert_eq!("abcd".slice_to(4), "abcd");
2787     }
2788
2789     #[test]
         // Leading characters are stripped when they match a char slice, a
         // single char, or a char predicate; an empty slice strips nothing and
         // the right-hand side is never touched.
2790     fn test_trim_left_chars() {
2791         let v: &[char] = &[];
2792         assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
2793         assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
2794         assert_eq!(" ***  *** ".trim_left_chars(& &['*', ' ']), "");
2795         assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
2796
2797         assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
2798         assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
2799         assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
2800     }
2801
2802     #[test]
         // Mirror of test_trim_left_chars: only trailing matches are removed,
         // for a char slice, a single char, and a char predicate.
2803     fn test_trim_right_chars() {
2804         let v: &[char] = &[];
2805         assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
2806         assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
2807         assert_eq!(" ***  *** ".trim_right_chars(& &['*', ' ']), "");
2808         assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
2809
2810         assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
2811         assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
2812         assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
2813     }
2814
2815     #[test]
         // `trim_chars` strips matching characters from both ends while
         // leaving interior matches (the '1' in "foo1bar") in place.
2816     fn test_trim_chars() {
2817         let v: &[char] = &[];
2818         assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
2819         assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
2820         assert_eq!(" ***  *** ".trim_chars(& &['*', ' ']), "");
2821         assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
2822
2823         assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
2824         assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
2825         assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
2826     }
2827
2828     #[test]
         // Leading whitespace, including U+3000, is removed; trailing
         // whitespace is preserved.
2829     fn test_trim_left() {
2830         assert_eq!("".trim_left(), "");
2831         assert_eq!("a".trim_left(), "a");
2832         assert_eq!("    ".trim_left(), "");
2833         assert_eq!("     blah".trim_left(), "blah");
2834         assert_eq!("   \u3000  wut".trim_left(), "wut");
2835         assert_eq!("hey ".trim_left(), "hey ");
2836     }
2837
2838     #[test]
         // Trailing whitespace, including U+3000, is removed; leading
         // whitespace is preserved.
2839     fn test_trim_right() {
2840         assert_eq!("".trim_right(), "");
2841         assert_eq!("a".trim_right(), "a");
2842         assert_eq!("    ".trim_right(), "");
2843         assert_eq!("blah     ".trim_right(), "blah");
2844         assert_eq!("wut   \u3000  ".trim_right(), "wut");
2845         assert_eq!(" hey".trim_right(), " hey");
2846     }
2847
2848     #[test]
         // `trim` removes whitespace on both sides (spaces, '\n', U+3000) and
         // keeps the single space inside "hey dude".
2849     fn test_trim() {
2850         assert_eq!("".trim(), "");
2851         assert_eq!("a".trim(), "a");
2852         assert_eq!("    ".trim(), "");
2853         assert_eq!("    blah     ".trim(), "blah");
2854         assert_eq!("\nwut   \u3000  ".trim(), "wut");
2855         assert_eq!(" hey dude ".trim(), "hey dude");
2856     }
2857
2858     #[test]
         // The empty string counts as whitespace; a single non-whitespace
         // character ('_') makes the check fail.
2859     fn test_is_whitespace() {
2860         assert!("".is_whitespace());
2861         assert!(" ".is_whitespace());
2862         assert!("\u2009".is_whitespace()); // Thin space
2863         assert!("  \n\t   ".is_whitespace());
2864         assert!(!"   _   ".is_whitespace());
2865     }
2866
2867     #[test]
2868     fn test_shift_byte() {
2869         let mut s = ~"ABC";
2870         let b = unsafe{raw::shift_byte(&mut s)};
2871         assert_eq!(s, ~"BC");
2872         assert_eq!(b, 65u8);
2873     }
2874
2875     #[test]
2876     fn test_pop_byte() {
2877         let mut s = ~"ABC";
2878         let b = unsafe{raw::pop_byte(&mut s)};
2879         assert_eq!(s, ~"AB");
2880         assert_eq!(b, 67u8);
2881     }
2882
2883     #[test]
2884     fn test_unsafe_from_bytes() {
2885         let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8];
2886         let b = unsafe { raw::from_bytes(a) };
2887         assert_eq!(b, ~"AAAAAAA");
2888     }
2889
2890     #[test]
         // Converts a hand-written UTF-8 byte sequence with `from_bytes` and
         // compares the result against the expected mixed-script string.
2891     fn test_from_bytes() {
2892         let ss = ~"ศไทย中华Việt Nam";
2893         let bb = ~[0xe0_u8, 0xb8_u8, 0xa8_u8,
2894                   0xe0_u8, 0xb9_u8, 0x84_u8,
2895                   0xe0_u8, 0xb8_u8, 0x97_u8,
2896                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2897                   0xe4_u8, 0xb8_u8, 0xad_u8,
2898                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2899                   0x56_u8, 0x69_u8, 0xe1_u8,
2900                   0xbb_u8, 0x87_u8, 0x74_u8,
2901                   0x20_u8, 0x4e_u8, 0x61_u8,
2902                   0x6d_u8];
2903
2904         assert_eq!(ss, from_bytes(bb));
2905     }
2906
2907     #[test]
2908     #[ignore(cfg(windows))]
         // Same bytes as test_from_bytes except the first one is 0xff. The
         // `not_utf8` condition is trapped so the test can check the message
         // and confirm that the handler actually ran.
2909     fn test_from_bytes_fail() {
2910         use str::not_utf8::cond;
2911
2912         let bb = ~[0xff_u8, 0xb8_u8, 0xa8_u8,
2913                   0xe0_u8, 0xb9_u8, 0x84_u8,
2914                   0xe0_u8, 0xb8_u8, 0x97_u8,
2915                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2916                   0xe4_u8, 0xb8_u8, 0xad_u8,
2917                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2918                   0x56_u8, 0x69_u8, 0xe1_u8,
2919                   0xbb_u8, 0x87_u8, 0x74_u8,
2920                   0x20_u8, 0x4e_u8, 0x61_u8,
2921                   0x6d_u8];
2922
2923         let mut error_happened = false;
             // The handler returns an empty string as the replacement value.
2924         let _x = do cond.trap(|err| {
2925             assert_eq!(err, ~"from_bytes: input is not UTF-8; first bad byte is 255");
2926             error_happened = true;
2927             ~""
2928         }).in {
2929             from_bytes(bb)
2930         };
2931         assert!(error_happened);
2932     }
2933
2934     #[test]
         // A buffer of seven 65s followed by a 0 byte compares equal to
         // "AAAAAAA" after `raw::from_bytes_with_null`.
2935     fn test_unsafe_from_bytes_with_null() {
2936         let a = [65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
2937         let b = unsafe { raw::from_bytes_with_null(a) };
2938         assert_eq!(b, "AAAAAAA");
2939     }
2940
2941     #[test]
         // Same byte sequence as test_from_bytes with a trailing 0 byte added;
         // `from_bytes_with_null` must yield the expected string.
2942     fn test_from_bytes_with_null() {
2943         let ss = "ศไทย中华Việt Nam";
2944         let bb = [0xe0_u8, 0xb8_u8, 0xa8_u8,
2945                   0xe0_u8, 0xb9_u8, 0x84_u8,
2946                   0xe0_u8, 0xb8_u8, 0x97_u8,
2947                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2948                   0xe4_u8, 0xb8_u8, 0xad_u8,
2949                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2950                   0x56_u8, 0x69_u8, 0xe1_u8,
2951                   0xbb_u8, 0x87_u8, 0x74_u8,
2952                   0x20_u8, 0x4e_u8, 0x61_u8,
2953                   0x6d_u8, 0x0_u8];
2954
2955         assert_eq!(ss, from_bytes_with_null(bb));
2956     }
2957
2958     #[test]
2959     #[should_fail]
2960     #[ignore(cfg(windows))]
         // Expected to fail: the buffer ends with a 0 byte but begins with
         // 0xff, the byte test_from_bytes_fail reports as not UTF-8.
2961     fn test_from_bytes_with_null_fail() {
2962         let bb = [0xff_u8, 0xb8_u8, 0xa8_u8,
2963                   0xe0_u8, 0xb9_u8, 0x84_u8,
2964                   0xe0_u8, 0xb8_u8, 0x97_u8,
2965                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2966                   0xe4_u8, 0xb8_u8, 0xad_u8,
2967                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2968                   0x56_u8, 0x69_u8, 0xe1_u8,
2969                   0xbb_u8, 0x87_u8, 0x74_u8,
2970                   0x20_u8, 0x4e_u8, 0x61_u8,
2971                   0x6d_u8, 0x0_u8];
2972
2973          let _x = from_bytes_with_null(bb);
2974     }
2975
2976     #[test]
2977     #[should_fail]
2978     #[ignore(cfg(windows))]
         // Expected to fail: the final byte is 0x60 instead of 0.
         // NOTE(review): the buffer also still begins with 0xff, exactly like
         // test_from_bytes_with_null_fail, so this may fail because of the bad
         // leading byte rather than the missing terminator. Confirm against
         // `from_bytes_with_null` whether the first byte should be 0xe0 here.
2979     fn test_from_bytes_with_null_fail_2() {
2980         let bb = [0xff_u8, 0xb8_u8, 0xa8_u8,
2981                   0xe0_u8, 0xb9_u8, 0x84_u8,
2982                   0xe0_u8, 0xb8_u8, 0x97_u8,
2983                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2984                   0xe4_u8, 0xb8_u8, 0xad_u8,
2985                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2986                   0x56_u8, 0x69_u8, 0xe1_u8,
2987                   0xbb_u8, 0x87_u8, 0x74_u8,
2988                   0x20_u8, 0x4e_u8, 0x61_u8,
2989                   0x6d_u8, 0x60_u8];
2990
2991          let _x = from_bytes_with_null(bb);
2992     }
2993
2994     #[test]
2995     fn test_from_buf() {
2996         unsafe {
2997             let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
2998             let b = vec::raw::to_ptr(a);
2999             let c = raw::from_buf(b);
3000             assert_eq!(c, ~"AAAAAAA");
3001         }
3002     }
3003
3004     #[test]
         // `as_bytes` exposes the UTF-8 bytes of the string with no trailing 0:
         // the empty string maps to an empty slice.
3005     fn test_as_bytes() {
3006         // no null
3007         let v = [
3008             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3009             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3010             109
3011         ];
3012         assert_eq!("".as_bytes(), &[]);
3013         assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3014         assert_eq!("ศไทย中华Việt Nam".as_bytes(), v);
3015     }
3016
3017     #[test]
         // `as_bytes_with_null` includes the trailing 0 byte; the same three
         // inputs are checked first as `@str` and then as `~str`.
3018     fn test_as_bytes_with_null() {
3019         // has null
3020         let v = [
3021             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3022             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3023             109, 0
3024         ];
3025
3026         let s1 = @"";
3027         let s2 = @"abc";
3028         let s3 = @"ศไทย中华Việt Nam";
3029         assert_eq!(s1.as_bytes_with_null(), &[0]);
3030         assert_eq!(s2.as_bytes_with_null(), &['a' as u8, 'b' as u8, 'c' as u8, 0]);
3031         assert_eq!(s3.as_bytes_with_null(), v);
3032
3033         let s1 = ~"";
3034         let s2 = ~"abc";
3035         let s3 = ~"ศไทย中华Việt Nam";
3036         assert_eq!(s1.as_bytes_with_null(), &[0]);
3037         assert_eq!(s2.as_bytes_with_null(), &['a' as u8, 'b' as u8, 'c' as u8, 0]);
3038         assert_eq!(s3.as_bytes_with_null(), v);
3039     }
3040
3041     #[test]
         // `to_bytes_with_null` returns an owned byte vector that ends with a
         // 0 byte, even for the empty string.
3042     fn test_to_bytes_with_null() {
3043         let s = ~"ศไทย中华Việt Nam";
3044         let v = ~[
3045             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3046             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3047             109, 0
3048         ];
3049         assert_eq!((~"").to_bytes_with_null(), ~[0]);
3050         assert_eq!((~"abc").to_bytes_with_null(),
3051                    ~['a' as u8, 'b' as u8, 'c' as u8, 0]);
3052         assert_eq!(s.to_bytes_with_null(), v);
3053     }
3054
3055     #[test]
3056     #[ignore(cfg(windows))]
3057     #[should_fail]
         // Fails on purpose while a byte view borrowed from an owned string is
         // still alive; a regression guard, per the comment in the body.
3058     fn test_as_bytes_fail() {
3059         // Don't double free. (I'm not sure if this exercises the
3060         // original problem code path anymore.)
3061         let s = ~"";
3062         let _bytes = s.as_bytes_with_null();
3063         fail!();
3064     }
3065
3066     #[test]
         // The length handed to the closure counts the trailing 0 byte: 1 for
         // the empty string and 6 for "hello".
3067     fn test_as_imm_buf() {
3068         do "".as_imm_buf |buf, len| {
3069             assert_eq!(len, 1);
3070             unsafe {
3071                 assert_eq!(*ptr::offset(buf, 0), 0);
3072             }
3073         }
3074
3075         do "hello".as_imm_buf |buf, len| {
3076             assert_eq!(len, 6);
3077             unsafe {
3078                 assert_eq!(*ptr::offset(buf, 0), 'h' as u8);
3079                 assert_eq!(*ptr::offset(buf, 1), 'e' as u8);
3080                 assert_eq!(*ptr::offset(buf, 2), 'l' as u8);
3081                 assert_eq!(*ptr::offset(buf, 3), 'l' as u8);
3082                 assert_eq!(*ptr::offset(buf, 4), 'o' as u8);
3083                 assert_eq!(*ptr::offset(buf, 5), 0);
3084             }
3085         }
3086     }
3087
3088     #[test]
         // The buffer passed to the `as_c_str` closure holds the string's
         // characters as `libc::c_char` values followed by a 0.
3089     fn test_as_c_str() {
3090         let a = ~"";
3091         do a.as_c_str |buf| {
3092             unsafe {
3093                 assert_eq!(*ptr::offset(buf, 0), 0);
3094             }
3095         }
3096
3097         let a = ~"hello";
3098         do a.as_c_str |buf| {
3099             unsafe {
3100                 assert_eq!(*ptr::offset(buf, 0), 'h' as libc::c_char);
3101                 assert_eq!(*ptr::offset(buf, 1), 'e' as libc::c_char);
3102                 assert_eq!(*ptr::offset(buf, 2), 'l' as libc::c_char);
3103                 assert_eq!(*ptr::offset(buf, 3), 'l' as libc::c_char);
3104                 assert_eq!(*ptr::offset(buf, 4), 'o' as libc::c_char);
3105                 assert_eq!(*ptr::offset(buf, 5), 0);
3106             }
3107         }
3108     }
3109
3110     #[test]
         // `subslice_offset` reports the byte offset at which a slice starts
         // within the string it was taken from, both for explicit slices and
         // for the pieces produced by `line_iter`.
3111     fn test_subslice_offset() {
3112         let a = "kernelsprite";
3113         let b = a.slice(7, a.len());
3114         let c = a.slice(0, a.len() - 6);
3115         assert_eq!(a.subslice_offset(b), 7);
3116         assert_eq!(a.subslice_offset(c), 0);
3117
3118         let string = "a\nb\nc";
3119         let mut lines = ~[];
3120         for string.line_iter().advance |line| { lines.push(line) }
3121         assert_eq!(string.subslice_offset(lines[0]), 0);
3122         assert_eq!(string.subslice_offset(lines[1]), 2);
3123         assert_eq!(string.subslice_offset(lines[2]), 4);
3124     }
3125
3126     #[test]
3127     #[should_fail]
         // Expected to fail: `b` is a separate literal, not a slice of `a`.
3128     fn test_subslice_offset_2() {
3129         let a = "alchemiter";
3130         let b = "cruxtruder";
3131         a.subslice_offset(b);
3132     }
3133
3134     #[test]
3135     fn vec_str_conversions() {
3136         let s1: ~str = ~"All mimsy were the borogoves";
3137
3138         let v: ~[u8] = s1.as_bytes().to_owned();
3139         let s2: ~str = from_bytes(v);
3140         let mut i: uint = 0u;
3141         let n1: uint = s1.len();
3142         let n2: uint = v.len();
3143         assert_eq!(n1, n2);
3144         while i < n1 {
3145             let a: u8 = s1[i];
3146             let b: u8 = s2[i];
3147             debug!(a);
3148             debug!(b);
3149             assert_eq!(a, b);
3150             i += 1u;
3151         }
3152     }
3153
3154     #[test]
         // Substring search: the empty needle is always found, and a needle
         // whose characters appear only non-contiguously ("ไท华") is not.
3155     fn test_contains() {
3156         assert!("abcde".contains("bcd"));
3157         assert!("abcde".contains("abcd"));
3158         assert!("abcde".contains("bcde"));
3159         assert!("abcde".contains(""));
3160         assert!("".contains(""));
3161         assert!(!"abcde".contains("def"));
3162         assert!(!"".contains("a"));
3163
3164         let data = ~"ประเทศไทย中华Việt Nam";
3165         assert!(data.contains("ประเ"));
3166         assert!(data.contains("ะเ"));
3167         assert!(data.contains("中华"));
3168         assert!(!data.contains("ไท华"));
3169     }
3170
3171     #[test]
         // Single-character search, including the empty-string case.
3172     fn test_contains_char() {
3173         assert!("abc".contains_char('b'));
3174         assert!("a".contains_char('a'));
3175         assert!(!"abc".contains_char('d'));
3176         assert!(!"".contains_char('a'));
3177     }
3178
3179     #[test]
         // `map_chars` applies the closure to every character and returns an
         // owned string; the closure here upper-cases via `libc::toupper`.
3180     fn test_map() {
3181         assert_eq!(~"", "".map_chars(|c| unsafe {libc::toupper(c as c_char)} as char));
3182         assert_eq!(~"YMCA", "ymca".map_chars(|c| unsafe {libc::toupper(c as c_char)} as char));
3183     }
3184
3185     #[test]
         // Each pair holds a string and its expected UTF-16 code units; most
         // characters are written as two units (0xd8xx followed by 0xdcxx or
         // 0xdfxx). The loop checks both conversion directions and both round
         // trips.
3186     fn test_utf16() {
3187         let pairs =
3188             [(~"𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n",
3189               ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3190                 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3191                 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3192                 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3193
3194              (~"𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n",
3195               ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3196                 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3197                 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3198                 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3199                 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3200                 0x000a_u16]),
3201
3202              (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3203               ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3204                 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3205                 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3206                 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3207                 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3208                 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3209                 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3210
3211              (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3212               ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3213                 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3214                 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3215                 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3216                 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3217                 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3218                 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3219                 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3220                 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3221                 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3222                 0x000a_u16 ]) ];
3223
3224         for pairs.iter().advance |p| {
3225             let (s, u) = (*p).clone();
3226             assert!(s.to_utf16() == u);
3227             assert!(from_utf16(u) == s);
3228             assert!(from_utf16(s.to_utf16()) == s);
3229             assert!(from_utf16(u).to_utf16() == u);
3230         }
3231     }
3232
3233     #[test]
3234     fn test_char_at() {
3235         let s = ~"ศไทย中华Việt Nam";
3236         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3237         let mut pos = 0;
3238         for v.iter().advance |ch| {
3239             assert!(s.char_at(pos) == *ch);
3240             pos += from_char(*ch).len();
3241         }
3242     }
3243
3244     #[test]
3245     fn test_char_at_reverse() {
3246         let s = ~"ศไทย中华Việt Nam";
3247         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3248         let mut pos = s.len();
3249         for v.rev_iter().advance |ch| {
3250             assert!(s.char_at_reverse(pos) == *ch);
3251             pos -= from_char(*ch).len();
3252         }
3253     }
3254
3255     #[test]
         // Every character is escaped: \xNN for code points up to 0xff, \uNNNN
         // up to 0xffff, and \UNNNNNNNN above that.
3256     fn test_escape_unicode() {
3257         assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3258         assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3259         assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3260         assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3261         assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3262         assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3263         assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3264         assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3265         assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
3266     }
3267
3268     #[test]
         // Letters and spaces pass through unchanged; \r, \n, \t, quotes and
         // the backslash get backslash escapes; the non-ASCII characters
         // tested here use \u or \U escapes.
3269     fn test_escape_default() {
3270         assert_eq!("abc".escape_default(), ~"abc");
3271         assert_eq!("a c".escape_default(), ~"a c");
3272         assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3273         assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3274         assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
3275         assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
3276         assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
3277         assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
3278     }
3279
3280     #[test]
         // `to_managed` copies a slice into an `@str`, including a slice taken
         // from the middle of a longer string.
3281     fn test_to_managed() {
3282         assert_eq!("abc".to_managed(), @"abc");
3283         assert_eq!("abcdef".slice(1, 5).to_managed(), @"bcde");
3284     }
3285
3286     #[test]
3287     fn test_total_ord() {
3288         "1234".cmp(& &"123") == Greater;
3289         "123".cmp(& &"1234") == Less;
3290         "1234".cmp(& &"1234") == Equal;
3291         "12345555".cmp(& &"123456") == Less;
3292         "22".cmp(& &"1234") == Greater;
3293     }
3294
3295     #[test]
         // The queried offsets are the starting bytes of consecutive
         // characters whose encodings are 1, 2, 3, 4, 4, 3, 2 and 1 bytes long.
3296     fn test_char_range_at() {
3297         let data = ~"b¢€𤭢𤭢€¢b";
3298         assert_eq!('b', data.char_range_at(0).ch);
3299         assert_eq!('¢', data.char_range_at(1).ch);
3300         assert_eq!('€', data.char_range_at(3).ch);
3301         assert_eq!('𤭢', data.char_range_at(6).ch);
3302         assert_eq!('𤭢', data.char_range_at(10).ch);
3303         assert_eq!('€', data.char_range_at(14).ch);
3304         assert_eq!('¢', data.char_range_at(17).ch);
3305         assert_eq!('b', data.char_range_at(19).ch);
3306     }
3307
3308     #[test]
         // Asking for the character before offset 0 must report `next == 0`
         // rather than wrapping below zero.
3309     fn test_char_range_at_reverse_underflow() {
3310         assert_eq!("abc".char_range_at_reverse(0).next, 0);
3311     }
3312
3313     #[test]
         // The macro checks `+` with the left operand as a slice, an owned
         // string and a managed string; the calls below vary the kind of the
         // right operand (slice, `@str`, `~str`).
3314     fn test_add() {
3315         #[allow(unnecessary_allocation)];
3316         macro_rules! t (
3317             ($s1:expr, $s2:expr, $e:expr) => {
3318                 assert_eq!($s1 + $s2, $e);
3319                 assert_eq!($s1.to_owned() + $s2, $e);
3320                 assert_eq!($s1.to_managed() + $s2, $e);
3321             }
3322         );
3323
3324         t!("foo",  "bar", ~"foobar");
3325         t!("foo", @"bar", ~"foobar");
3326         t!("foo", ~"bar", ~"foobar");
3327         t!("ศไทย中",  "华Việt Nam", ~"ศไทย中华Việt Nam");
3328         t!("ศไทย中", @"华Việt Nam", ~"ศไทย中华Việt Nam");
3329         t!("ศไทย中", ~"华Việt Nam", ~"ศไทย中华Việt Nam");
3330     }
3331
3332     #[test]
3333     fn test_iterator() {
3334         use iterator::*;
3335         let s = ~"ศไทย中华Việt Nam";
3336         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3337
3338         let mut pos = 0;
3339         let mut it = s.iter();
3340
3341         for it.advance |c| {
3342             assert_eq!(c, v[pos]);
3343             pos += 1;
3344         }
3345         assert_eq!(pos, v.len());
3346     }
3347
3348     #[test]
3349     fn test_rev_iterator() {
3350         use iterator::*;
3351         let s = ~"ศไทย中华Việt Nam";
3352         let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3353
3354         let mut pos = 0;
3355         let mut it = s.rev_iter();
3356
3357         for it.advance |c| {
3358             assert_eq!(c, v[pos]);
3359             pos += 1;
3360         }
3361         assert_eq!(pos, v.len());
3362     }
3363
3364     #[test]
3365     fn test_bytes_iterator() {
3366         let s = ~"ศไทย中华Việt Nam";
3367         let v = [
3368             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3369             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3370             109
3371         ];
3372         let mut pos = 0;
3373
3374         for s.bytes_iter().advance |b| {
3375             assert_eq!(b, v[pos]);
3376             pos += 1;
3377         }
3378     }
3379
3380     #[test]
3381     fn test_bytes_rev_iterator() {
3382         let s = ~"ศไทย中华Việt Nam";
3383         let v = [
3384             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3385             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3386             109
3387         ];
3388         let mut pos = v.len();
3389
3390         for s.bytes_rev_iter().advance |b| {
3391             pos -= 1;
3392             assert_eq!(b, v[pos]);
3393         }
3394     }
3395
3396     #[test]
         // Splits on a one-byte and on a multi-byte separator, each given
         // first as a `char` and then as an equivalent predicate closure.
3397     fn test_split_char_iterator() {
3398         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3399
3400         let split: ~[&str] = data.split_iter(' ').collect();
3401         assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3402
3403         let split: ~[&str] = data.split_iter(|c: char| c == ' ').collect();
3404         assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3405
3406         // Unicode
3407         let split: ~[&str] = data.split_iter('ä').collect();
3408         assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3409
3410         let split: ~[&str] = data.split_iter(|c: char| c == 'ä').collect();
3411         assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3412     }
3413     #[test]
         // With a count of 3 the iterator performs three splits and returns
         // the unsplit remainder as a fourth piece.
3414     fn test_splitn_char_iterator() {
3415         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3416
3417         let split: ~[&str] = data.splitn_iter(' ', 3).collect();
3418         assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3419
3420         let split: ~[&str] = data.splitn_iter(|c: char| c == ' ', 3).collect();
3421         assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3422
3423         // Unicode
3424         let split: ~[&str] = data.splitn_iter('ä', 3).collect();
3425         assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3426
3427         let split: ~[&str] = data.splitn_iter(|c: char| c == 'ä', 3).collect();
3428         assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3429     }
3430
3431     #[test]
         // With the last argument `true` the empty piece after the final '\n'
         // is kept; with `false` it is dropped. The leading empty piece is
         // present in both results.
3432     fn test_split_char_iterator_no_trailing() {
3433         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3434
3435         let split: ~[&str] = data.split_options_iter('\n', 1000, true).collect();
3436         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3437
3438         let split: ~[&str] = data.split_options_iter('\n', 1000, false).collect();
3439         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3440     }
3441
3442     #[test]
         // Runs of spaces, tabs and newlines separate words and never produce
         // empty pieces, even at the start or end of the input.
3443     fn test_word_iter() {
3444         let data = "\n \tMäry   häd\tä  little lämb\nLittle lämb\n";
3445         let words: ~[&str] = data.word_iter().collect();
3446         assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
3447     }
3448
3449     #[test]
         // A trailing '\n' does not add an extra empty line: the input with
         // and the input without it produce the same four lines.
3450     fn test_line_iter() {
3451         let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3452         let lines: ~[&str] = data.line_iter().collect();
3453         assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3454
3455         let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3456         let lines: ~[&str] = data.line_iter().collect();
3457         assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3458     }
3459
3460     #[test]
         // Table of (input, separator, expected pieces) for splitting on a
         // string separator. The last two rows show that matches are taken
         // left to right without overlapping.
3461     fn test_split_str_iterator() {
             // Collects the pieces of `s` split on `sep` and compares with `u`.
3462         fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3463             let v: ~[&str] = s.split_str_iter(sep).collect();
3464             assert_eq!(v, u);
3465         }
3466         t("--1233345--", "12345", ~["--1233345--"]);
3467         t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3468         t("::hello::there", "::", ~["", "hello", "there"]);
3469         t("hello::there::", "::", ~["hello", "there", ""]);
3470         t("::hello::there::", "::", ~["", "hello", "there", ""]);
3471         t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3472         t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3473         t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3474         t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3475         t("", ".", ~[""]);
3476         t("zz", "zz", ~["",""]);
3477         t("ok", "z", ~["ok"]);
3478         t("zzz", "zz", ~["","z"]);
3479         t("zzzzz", "zz", ~["","","z"]);
3480     }
3481
3482     #[test]
         // For each of the three string kinds, `Zero::zero()` must be the
         // empty string and must report `is_zero()`.
3483     fn test_str_zero() {
3484         use num::Zero;
3485         fn t<S: Zero + Str>() {
3486             let s: S = Zero::zero();
3487             assert_eq!(s.as_slice(), "");
3488             assert!(s.is_zero());
3489         }
3490
3491         t::<&str>();
3492         t::<@str>();
3493         t::<~str>();
3494     }
3495
3496     #[test]
         // `len` must be usable through the `Container` bound for slices,
         // `@str` and `~str` elements alike.
3497     fn test_str_container() {
             // Sums the lengths of all elements of `v`.
3498         fn sum_len<S: Container>(v: &[S]) -> uint {
3499             v.iter().transform(|x| x.len()).sum()
3500         }
3501
3502         let s = ~"01234";
3503         assert_eq!(5, sum_len(["012", "", "34"]));
3504         assert_eq!(5, sum_len([@"01", @"2", @"34", @""]));
3505         assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
3506         assert_eq!(5, sum_len([s.as_slice()]));
3507     }
3508 }
3509
3510 #[cfg(test)]
     // Micro-benchmarks for `str::is_utf8` and `map_chars`, each run on an
     // ASCII input and on a multi-byte input.
3511 mod bench {
3512     use extra::test::BenchHarness;
3513     use str;
3514
3515     #[bench]
         // Validates a 100-byte ASCII buffer; the length is asserted up front.
3516     fn is_utf8_100_ascii(bh: &mut BenchHarness) {
3517
3518         let s = bytes!("Hello there, the quick brown fox jumped over the lazy dog! \
3519                         Lorem ipsum dolor sit amet, consectetur. ");
3520
3521         assert_eq!(100, s.len());
3522         do bh.iter {
3523             str::is_utf8(s);
3524         }
3525     }
3526
3527     #[bench]
         // Validates a 100-byte buffer of mixed multi-byte text; the length is
         // asserted up front.
3528     fn is_utf8_100_multibyte(bh: &mut BenchHarness) {
3529         let s = bytes!("𐌀𐌖𐌋𐌄𐌑𐌉ปรدولة الكويتทศไทย中华𐍅𐌿𐌻𐍆𐌹𐌻𐌰");
3530         assert_eq!(100, s.len());
3531         do bh.iter {
3532             str::is_utf8(s);
3533         }
3534     }
3535
3536     #[bench]
         // Maps every character of a 100-character ASCII string to the next
         // code point.
3537     fn map_chars_100_ascii(bh: &mut BenchHarness) {
3538         let s = "HelloHelloHelloHelloHelloHelloHelloHelloHelloHello\
3539                  HelloHelloHelloHelloHelloHelloHelloHelloHelloHello";
3540         do bh.iter {
3541             s.map_chars(|c| ((c as uint) + 1) as char);
3542         }
3543     }
3544
3545     #[bench]
         // Same mapping over 100 characters that lie outside the ASCII range
         // (four source lines of 25 characters each).
3546     fn map_chars_100_multibytes(bh: &mut BenchHarness) {
3547         let s = "𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑\
3548                  𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑\
3549                  𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑\
3550                  𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑𐌀𐌖𐌋𐌄𐌑";
3551         do bh.iter {
3552             s.map_chars(|c| ((c as uint) + 1) as char);
3553         }
3554     }
3555 }