src/libstd/str.rs

   1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 /*!
  12  * String manipulation
  13  *
  14  * Strings are a packed UTF-8 representation of text, stored as null
  15  * terminated buffers of u8 bytes.  Strings should be indexed in bytes,
  16  * for efficiency, but UTF-8 unsafe operations should be avoided.
  17  */
  18
  19 use at_vec;
  20 use cast;
  21 use char;
  22 use char::Char;
  23 use clone::Clone;
  24 use container::{Container, Mutable};
  25 use iter::Times;
  26 use iterator::{Iterator, IteratorUtil, FilterIterator, AdditiveIterator, MapIterator};
  27 use libc;
  28 use num::Zero;
  29 use option::{None, Option, Some};
  30 use ptr;
  31 use ptr::RawPtr;
  32 use to_str::ToStr;
  33 use uint;
  34 use vec;
  35 use vec::{OwnedVector, OwnedCopyableVector, ImmutableVector, MutableVector};
  36
   37 /*
   38 Section: Conditions
   39 */
      // Raised by `from_bytes` and `from_bytes_owned` when their input is not
      // valid UTF-8. The handler receives the error message; the string it
      // returns becomes the result of the failed conversion.
   40 condition! {
   41     not_utf8: (~str) -> ~str;
   42 }
  43
  44 /*
  45 Section: Creating a string
  46 */
  47
  48 /**
  49  * Convert a vector of bytes to a new UTF-8 string
  50  *
  51  * # Failure
  52  *
  53  * Raises the `not_utf8` condition if invalid UTF-8
  54  */
  55 pub fn from_bytes(vv: &[u8]) -> ~str {
  56     use str::not_utf8::cond;
  57
  58     if !is_utf8(vv) {
  59         let first_bad_byte = *vv.iter().find_(|&b| !is_utf8([*b])).get();
  60         cond.raise(fmt!("from_bytes: input is not UTF-8; first bad byte is %u",
  61                         first_bad_byte as uint))
  62     }
  63     else {
  64         return unsafe { raw::from_bytes(vv) }
  65     }
  66 }
  67
   68 /**
   69  * Consumes a vector of bytes to create a new utf-8 string
   70  *
       * On success the vector is passed by value to `raw::from_bytes_owned`,
       * which appends the terminator and reinterprets it as a string.
       *
   71  * # Failure
   72  *
   73  * Raises the `not_utf8` condition if invalid UTF-8
       * (the handler's return value is then returned to the caller)
   74  */
   75 pub fn from_bytes_owned(vv: ~[u8]) -> ~str {
   76     use str::not_utf8::cond;
   77
   78     if !is_utf8(vv) {
              // first byte that is not valid UTF-8 when checked on its own
   79         let first_bad_byte = *vv.iter().find_(|&b| !is_utf8([*b])).get();
              // NOTE(review): the message is prefixed `from_bytes:` although this is
              // `from_bytes_owned` -- confirm whether handlers/tests rely on the text
   80         cond.raise(fmt!("from_bytes: input is not UTF-8; first bad byte is %u",
   81                         first_bad_byte as uint))
   82     } else {
   83         return unsafe { raw::from_bytes_owned(vv) }
   84     }
   85 }
  86
   87 /**
   88  * Convert a vector of bytes to a UTF-8 string.
   89  * The vector needs to be one byte longer than the string, and end with a 0 byte.
   90  *
   91  * Compared to `from_bytes()`, this fn doesn't need to allocate a new owned str.
       * The returned slice borrows `vv` for the lifetime `'a`.
   92  *
   93  * # Failure
   94  *
   95  * Fails if invalid UTF-8
   96  * Fails if not null terminated
   97  */
   98 pub fn from_bytes_with_null<'a>(vv: &'a [u8]) -> &'a str {
          // the last byte must be the terminator
          // NOTE(review): an empty `vv` has no last element; presumably this fails
          // on the index rather than on the assertion -- TODO confirm
   99     assert_eq!(vv[vv.len() - 1], 0);
          // validation covers the whole vector, terminator included
  100     assert!(is_utf8(vv));
  101     return unsafe { raw::from_bytes_with_null(vv) };
  102 }
 103
  104 /**
  105  * Converts a vector to a string slice without performing any allocations.
  106  *
  107  * Once the slice has been validated as utf-8, it is transmuted in-place and
  108  * returned as a '&str' instead of a '&[u8]'
  109  *
  110  * # Failure
  111  *
  112  * Fails if invalid UTF-8
  113  */
  114 pub fn from_bytes_slice<'a>(vector: &'a [u8]) -> &'a str {
  115     unsafe {
  116         assert!(is_utf8(vector));
              // take the slice apart into its (pointer, length) pair
  117         let (ptr, len): (*u8, uint) = ::cast::transmute(vector);
              // rebuild it as a string slice whose length is one larger
              // NOTE(review): other code in this file subtracts 1 from string buffer
              // lengths (e.g. `eq_slice`), so the extra slot looks like the terminator;
              // nothing here checks that `vector` is followed by a 0 byte -- confirm
  118         let string: &'a str = ::cast::transmute((ptr, len + 1));
  119         string
  120     }
  121 }
 122
  123 impl ToStr for ~str {
          /// An owned string converts to a string via `to_owned`
  124     #[inline]
  125     fn to_str(&self) -> ~str { self.to_owned() }
  126 }
  127 impl<'self> ToStr for &'self str {
          /// A string slice converts to a string via `to_owned`
  128     #[inline]
  129     fn to_str(&self) -> ~str { self.to_owned() }
  130 }
  131 impl ToStr for @str {
          /// A managed string converts to a string via `to_owned`
  132     #[inline]
  133     fn to_str(&self) -> ~str { self.to_owned() }
  134 }
 135
  136 /**
  137  * Convert a byte to a UTF-8 string
  138  *
  139  * # Failure
  140  *
  141  * Fails if the byte is not ASCII (i.e. is 128 or greater)
  142  */
  143 pub fn from_byte(b: u8) -> ~str {
  144     assert!(b < 128u8);
          // a two-byte vector (the byte plus a 0 terminator) reinterpreted as a string
  145     unsafe { ::cast::transmute(~[b, 0u8]) }
  146 }
 147
 148 /// Convert a char to a string
 149 pub fn from_char(ch: char) -> ~str {
 150     let mut buf = ~"";
 151     buf.push_char(ch);
 152     buf
 153 }
 154
 155 /// Convert a vector of chars to a string
 156 pub fn from_chars(chs: &[char]) -> ~str {
 157     let mut buf = ~"";
 158     buf.reserve(chs.len());
 159     for chs.iter().advance |ch| {
 160         buf.push_char(*ch)
 161     }
 162     buf
 163 }
 164
  165 #[doc(hidden)]
      // Free-function form of appending `rhs` to `lhs`; it only forwards to the
      // `push_str` method.
  166 pub fn push_str(lhs: &mut ~str, rhs: &str) {
  167     lhs.push_str(rhs)
  168 }
 169
  170 #[allow(missing_doc)]
      // Operations that join a collection of strings into one owned string.
  171 pub trait StrVector {
          // all strings back to back, no separator
  172     pub fn concat(&self) -> ~str;
          // all strings with `sep` between each adjacent pair
  173     pub fn connect(&self, sep: &str) -> ~str;
  174 }
 175
  176 impl<'self, S: Str> StrVector for &'self [S] {
  177     /// Concatenate a vector of strings.
          ///
          /// Returns the empty string for an empty vector.
  178     pub fn concat(&self) -> ~str {
  179         if self.is_empty() { return ~""; }
  180
              // total length of the result: sum of the individual string lengths
  181         let len = self.iter().transform(|s| s.as_slice().len()).sum();
  182
  183         let mut s = with_capacity(len);
  184
  185         unsafe {
  186             do s.as_mut_buf |buf, _| {
                      // `buf` is the write cursor; it advances past each copied string
  187                 let mut buf = buf;
  188                 for self.iter().advance |ss| {
  189                     do ss.as_slice().as_imm_buf |ssbuf, sslen| {
                              // the reported length is one more than the bytes to copy
  190                         let sslen = sslen - 1;
  191                         ptr::copy_memory(buf, ssbuf, sslen);
  192                         buf = buf.offset(sslen);
  193                     }
  194                 }
  195             }
                  // the bytes were written behind the string's back; fix up its length
  196             raw::set_len(&mut s, len);
  197         }
  198         s
  199     }
  200
  201     /// Concatenate a vector of strings, placing a given separator between each.
          ///
          /// Returns the empty string for an empty vector.
  202     pub fn connect(&self, sep: &str) -> ~str {
  203         if self.is_empty() { return ~""; }
  204
  205         // concat is faster
  206         if sep.is_empty() { return self.concat(); }
  207
  208         // this is wrong without the guarantee that `self` is non-empty
              // (n strings need n - 1 separators)
  209         let len = sep.len() * (self.len() - 1)
  210             + self.iter().transform(|s| s.as_slice().len()).sum();
  211         let mut s = ~"";
              // no separator is written before the first string
  212         let mut first = true;
  213
  214         s.reserve(len);
  215
  216         unsafe {
  217             do s.as_mut_buf |buf, _| {
  218                 do sep.as_imm_buf |sepbuf, seplen| {
                          // as with the strings below, drop one from the reported length
  219                     let seplen = seplen - 1;
                          // `buf` is the write cursor into the result
  220                     let mut buf = ::cast::transmute_mut_unsafe(buf);
  221                     for self.iter().advance |ss| {
  222                         do ss.as_slice().as_imm_buf |ssbuf, sslen| {
  223                             let sslen = sslen - 1;
  224                             if first {
  225                                 first = false;
  226                             } else {
  227                                 ptr::copy_memory(buf, sepbuf, seplen);
  228                                 buf = buf.offset(seplen);
  229                             }
  230                             ptr::copy_memory(buf, ssbuf, sslen);
  231                             buf = buf.offset(sslen);
  232                         }
  233                     }
  234                 }
  235             }
                  // record the length of what was written
  236             raw::set_len(&mut s, len);
  237         }
  238         s
  239     }
  240 }
 241
  242 /// Something that can be used to compare against a character
      ///
      /// Implemented in this file for a single `char`, for predicates over
      /// `char`, and for slices of other `CharEq` values.
  243 pub trait CharEq {
  244     /// Determine if the splitter should split at the given character
  245     fn matches(&self, char) -> bool;
  246     /// Indicate if this is only concerned about ASCII characters,
  247     /// which can allow for a faster implementation.
  248     fn only_ascii(&self) -> bool;
  249 }
  250 impl CharEq for char {
          /// Matches exactly the character `self`
  251     #[inline]
  252     fn matches(&self, c: char) -> bool { *self == c }
  253
          /// True when `self` is below 128
  254     fn only_ascii(&self) -> bool { (*self as uint) < 128 }
  255 }
  256 impl<'self> CharEq for &'self fn(char) -> bool {
          /// Matches whatever the closure returns true for
  257     #[inline]
  258     fn matches(&self, c: char) -> bool { (*self)(c) }
  259
          /// Always false: nothing is known about what a closure matches
  260     fn only_ascii(&self) -> bool { false }
  261 }
  262 impl CharEq for extern "Rust" fn(char) -> bool {
          /// Matches whatever the function returns true for
  263     #[inline]
  264     fn matches(&self, c: char) -> bool { (*self)(c) }
  265
          /// Always false: nothing is known about what a function matches
  266     fn only_ascii(&self) -> bool { false }
  267 }
 268
  269 impl<'self, C: CharEq> CharEq for &'self [C] {
          /// Matches if any element of the slice matches
  270     #[inline]
  271     fn matches(&self, c: char) -> bool {
  272         self.iter().any(|m| m.matches(c))
  273     }
  274
          /// ASCII-only if every element is ASCII-only
  275     fn only_ascii(&self) -> bool {
  276         self.iter().all(|m| m.only_ascii())
  277     }
  278 }
 279
 280
  281 /// An iterator over the substrings of a string, separated by `sep`.
  282 #[deriving(Clone)]
  283 pub struct StrCharSplitIterator<'self,Sep> {
          // the string being split
  284     priv string: &'self str,
          // byte offset at which the next substring starts
  285     priv position: uint,
          // decides which characters separate substrings
  286     priv sep: Sep,
  287     /// The number of splits remaining
  288     priv count: uint,
  289     /// Whether an empty string at the end is allowed
  290     priv allow_trailing_empty: bool,
          // set once the final substring has been produced
  291     priv finished: bool,
          // when true the string is scanned byte by byte instead of char by char
  292     priv only_ascii: bool
  293 }
 294
  295 /// An iterator over the words of a string, separated by a sequence of whitespace
  296 pub type WordIterator<'self> =
  297     FilterIterator<'self, &'self str,
  298              StrCharSplitIterator<'self, extern "Rust" fn(char) -> bool>>;
 299
  300 /// An iterator over the lines of a string, separated by either `\n` or `\r\n`.
  301 pub type AnyLineIterator<'self> =
  302     MapIterator<'self, &'self str, &'self str, StrCharSplitIterator<'self, char>>;
 303
  304 impl<'self, Sep: CharEq> Iterator<&'self str> for StrCharSplitIterator<'self, Sep> {
          /// Yield the next substring, or `None` once the string is used up.
          ///
          /// Each separator found consumes one of the remaining `count` splits;
          /// when none are left the rest of the string is yielded as one piece.
  305     #[inline]
  306     fn next(&mut self) -> Option<&'self str> {
  307         if self.finished { return None }
  308
  309         let l = self.string.len();
              // the substring being produced starts where the last call stopped
  310         let start = self.position;
  311
  312         if self.only_ascii {
  313             // this gives a *huge* speed up for splitting on ASCII
  314             // characters (e.g. '\n' or ' ')
  315             while self.position < l && self.count > 0 {
  316                 let byte = self.string[self.position];
  317
  318                 if self.sep.matches(byte as char) {
  319                     let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
                          // skip over the one-byte separator
  320                     self.position += 1;
  321                     self.count -= 1;
  322                     return Some(slice);
  323                 }
  324                 self.position += 1;
  325             }
  326         } else {
  327             while self.position < l && self.count > 0 {
  328                 let CharRange {ch, next} = self.string.char_range_at(self.position);
  329
  330                 if self.sep.matches(ch) {
  331                     let slice = unsafe { raw::slice_bytes(self.string, start, self.position) };
                          // skip over the whole separator character
  332                     self.position = next;
  333                     self.count -= 1;
  334                     return Some(slice);
  335                 }
  336                 self.position = next;
  337             }
  338         }
              // no further separator (or no splits left): this is the last item
  339         self.finished = true;
              // an empty final piece is only yielded when explicitly allowed
  340         if self.allow_trailing_empty || start < l {
  341             Some(unsafe { raw::slice_bytes(self.string, start, l) })
  342         } else {
  343             None
  344         }
  345     }
  346 }
 347
  348 /// An iterator over the start and end indices of the matches of a
  349 /// substring within a larger string
  350 #[deriving(Clone)]
  351 pub struct StrMatchesIndexIterator<'self> {
          // the string searched in
  352     priv haystack: &'self str,
          // the string searched for
  353     priv needle: &'self str,
          // byte offset in `haystack` where the next search resumes
  354     priv position: uint,
  355 }
 356
  357 /// An iterator over the substrings of a string separated by a given
  358 /// search string
  359 #[deriving(Clone)]
  360 pub struct StrStrSplitIterator<'self> {
          // yields the (start, end) of each occurrence of the separator
  361     priv it: StrMatchesIndexIterator<'self>,
          // end of the previous separator, i.e. start of the next substring
  362     priv last_end: uint,
          // set once the final substring has been produced
  363     priv finished: bool
  364 }
 365
  366 impl<'self> Iterator<(uint, uint)> for StrMatchesIndexIterator<'self> {
          /// Find the next occurrence of `needle` at or after the current position
          /// and return its `(start, end)` byte offsets (end exclusive), or `None`
          /// when the rest of the haystack holds no further match.
          ///
          /// The search resumes after the end of each match, so the matches
          /// reported do not overlap.
  367     #[inline]
  368     fn next(&mut self) -> Option<(uint, uint)> {
  369         // See Issue #1932 for why this is a naive search
  370         let (h_len, n_len) = (self.haystack.len(), self.needle.len());
              // where the candidate match currently being extended began
  371         let mut match_start = 0;
              // how many needle bytes the candidate has matched so far
  372         let mut match_i = 0;
  373
              // NOTE(review): `self.needle[match_i]` assumes a non-empty needle --
              // TODO confirm the constructor guarantees that
  374         while self.position < h_len {
  375             if self.haystack[self.position] == self.needle[match_i] {
  376                 if match_i == 0 { match_start = self.position; }
  377                 match_i += 1;
  378                 self.position += 1;
  379
  380                 if match_i == n_len {
  381                     // found a match!
  382                     return Some((match_start, self.position));
  383                 }
  384             } else {
  385                 // failed match, backtrack
                      // (retry from the byte after the failed candidate's start)
  386                 if match_i > 0 {
  387                     match_i = 0;
  388                     self.position = match_start;
  389                 }
  390                 self.position += 1;
  391             }
  392         }
  393         None
  394     }
  395 }
 396
  397 impl<'self> Iterator<&'self str> for StrStrSplitIterator<'self> {
          /// Yield the text between the previous separator and the next one; after
          /// the last separator, yield the remainder of the string once and then
          /// `None`.
  398     #[inline]
  399     fn next(&mut self) -> Option<&'self str> {
  400         if self.finished { return None; }
  401
  402         match self.it.next() {
  403             Some((from, to)) => {
                      // text up to the separator; continue after the separator next time
  404                 let ret = Some(self.it.haystack.slice(self.last_end, from));
  405                 self.last_end = to;
  406                 ret
  407             }
  408             None => {
                      // no more separators: the tail (possibly empty) is the last item
  409                 self.finished = true;
  410                 Some(self.it.haystack.slice(self.last_end, self.it.haystack.len()))
  411             }
  412         }
  413     }
  414 }
 415
 416 /** Splits a string into substrings with possibly internal whitespace,
 417  *  each of them at most `lim` bytes long. The substrings have leading and trailing
 418  *  whitespace removed, and are only cut at whitespace boundaries.
 419  *
 420  *  #Failure:
 421  *
 422  *  Fails during iteration if the string contains a non-whitespace
 423  *  sequence longer than the limit.
 424  */
 425 pub fn each_split_within<'a>(ss: &'a str,
 426                               lim: uint,
 427                               it: &fn(&'a str) -> bool) -> bool {
 428     // Just for fun, let's write this as an state machine:
 429
 430     enum SplitWithinState {
 431         A,  // leading whitespace, initial state
 432         B,  // words
 433         C,  // internal and trailing whitespace
 434     }
 435     enum Whitespace {
 436         Ws, // current char is whitespace
 437         Cr  // current char is not whitespace
 438     }
 439     enum LengthLimit {
 440         UnderLim, // current char makes current substring still fit in limit
 441         OverLim   // current char makes current substring no longer fit in limit
 442     }
 443
 444     let mut slice_start = 0;
 445     let mut last_start = 0;
 446     let mut last_end = 0;
 447     let mut state = A;
 448     let mut fake_i = ss.len();
 449     let mut lim = lim;
 450
 451     let mut cont = true;
 452     let slice: &fn() = || { cont = it(ss.slice(slice_start, last_end)) };
 453
 454     // if the limit is larger than the string, lower it to save cycles
 455     if (lim >= fake_i) {
 456         lim = fake_i;
 457     }
 458
 459     let machine: &fn((uint, char)) -> bool = |(i, c)| {
 460         let whitespace = if char::is_whitespace(c)       { Ws }       else { Cr };
 461         let limit      = if (i - slice_start + 1) <= lim { UnderLim } else { OverLim };
 462
 463         state = match (state, whitespace, limit) {
 464             (A, Ws, _)        => { A }
 465             (A, Cr, _)        => { slice_start = i; last_start = i; B }
 466
 467             (B, Cr, UnderLim) => { B }
 468             (B, Cr, OverLim)  if (i - last_start + 1) > lim
 469                               => fail!("word starting with %? longer than limit!",
 470                                        ss.slice(last_start, i + 1)),
 471             (B, Cr, OverLim)  => { slice(); slice_start = last_start; B }
 472             (B, Ws, UnderLim) => { last_end = i; C }
 473             (B, Ws, OverLim)  => { last_end = i; slice(); A }
 474
 475             (C, Cr, UnderLim) => { last_start = i; B }
 476             (C, Cr, OverLim)  => { slice(); slice_start = i; last_start = i; last_end = i; B }
 477             (C, Ws, OverLim)  => { slice(); A }
 478             (C, Ws, UnderLim) => { C }
 479         };
 480
 481         cont
 482     };
 483
 484     ss.iter().enumerate().advance(|x| machine(x));
 485
 486     // Let the automaton 'run out' by supplying trailing whitespace
 487     while cont && match state { B | C => true, A => false } {
 488         machine((fake_i, ' '));
 489         fake_i += 1;
 490     }
 491     return cont;
 492 }
 493
 494 /**
 495  * Replace all occurrences of one string with another
 496  *
 497  * # Arguments
 498  *
 499  * * s - The string containing substrings to replace
 500  * * from - The string to replace
 501  * * to - The replacement string
 502  *
 503  * # Return value
 504  *
 505  * The original string with all occurances of `from` replaced with `to`
 506  */
 507 pub fn replace(s: &str, from: &str, to: &str) -> ~str {
 508     let mut result = ~"";
 509     let mut last_end = 0;
 510     for s.matches_index_iter(from).advance |(start, end)| {
 511         result.push_str(unsafe{raw::slice_bytes(s, last_end, start)});
 512         result.push_str(to);
 513         last_end = end;
 514     }
 515     result.push_str(unsafe{raw::slice_bytes(s, last_end, s.len())});
 516     result
 517 }
 518
 519 /*
 520 Section: Comparing strings
 521 */
 522
  523 /// Bytewise slice equality
      ///
      /// Two slices are equal when their buffers have the same length and the
      /// same bytes. This is the non-test build's `str_eq` lang item.
  524 #[cfg(not(test))]
  525 #[lang="str_eq"]
  526 #[inline]
  527 pub fn eq_slice(a: &str, b: &str) -> bool {
  528     do a.as_imm_buf |ap, alen| {
  529         do b.as_imm_buf |bp, blen| {
                  // different lengths can never be equal
  530             if (alen != blen) { false }
  531             else {
  532                 unsafe {
                          // compare all but the last slot of the reported length
  533                     libc::memcmp(ap as *libc::c_void,
  534                                  bp as *libc::c_void,
  535                                  (alen - 1) as libc::size_t) == 0
  536                 }
  537             }
  538         }
  539     }
  540 }
 541
  542 #[cfg(test)]
      // Test-build twin of `eq_slice`: same body, without the lang-item attribute.
  543 #[inline]
  544 pub fn eq_slice(a: &str, b: &str) -> bool {
  545     do a.as_imm_buf |ap, alen| {
  546         do b.as_imm_buf |bp, blen| {
                  // different lengths can never be equal
  547             if (alen != blen) { false }
  548             else {
  549                 unsafe {
                          // compare all but the last slot of the reported length
  550                     libc::memcmp(ap as *libc::c_void,
  551                                  bp as *libc::c_void,
  552                                  (alen - 1) as libc::size_t) == 0
  553                 }
  554             }
  555         }
  556     }
  557 }
 558
  559 /// Bytewise string equality
      ///
      /// Owned-string form of `eq_slice`; the non-test build's `uniq_str_eq`
      /// lang item.
  560 #[cfg(not(test))]
  561 #[lang="uniq_str_eq"]
  562 #[inline]
  563 pub fn eq(a: &~str, b: &~str) -> bool {
  564     eq_slice(*a, *b)
  565 }
 566
  567 #[cfg(test)]
      // Test-build twin of `eq`: same body, without the lang-item attribute.
  568 #[inline]
  569 pub fn eq(a: &~str, b: &~str) -> bool {
  570     eq_slice(*a, *b)
  571 }
 572
 573 /*
 574 Section: Searching
 575 */
 576
 577 // Utility used by various searching functions
 578 fn match_at<'a,'b>(haystack: &'a str, needle: &'b str, at: uint) -> bool {
 579     let mut i = at;
 580     for needle.bytes_iter().advance |c| { if haystack[i] != c { return false; } i += 1u; }
 581     return true;
 582 }
 583
 584 /*
 585 Section: Misc
 586 */
 587
  588 /// Determines if a vector of bytes contains valid UTF-8
      ///
      /// Each character is checked for a lead byte with a known width, enough
      /// remaining bytes, and continuation bytes carrying the `10xxxxxx` tag.
      /// The decoded value itself is not examined.
  589 pub fn is_utf8(v: &[u8]) -> bool {
  590     let mut i = 0u;
  591     let total = v.len();
  592     while i < total {
  593         if v[i] < 128u8 {
                  // ASCII: always a complete character
  594             i += 1u;
  595         } else {
  596             let w = utf8_char_width(v[i]);
                  // width 0 marks a byte that cannot start a character
  597             if w == 0u { return false; }
  598
  599             let nexti = i + w;
                  // the sequence would run off the end of the input
  600             if nexti > total { return false; }
  601
                  // every byte after the lead must look like 10xxxxxx
  602             if v[i + 1] & 192u8 != TAG_CONT_U8 { return false; }
  603             if w > 2 {
  604                 if v[i + 2] & 192u8 != TAG_CONT_U8 { return false; }
  605                 if w > 3 && (v[i + 3] & 192u8 != TAG_CONT_U8) { return false; }
  606             }
  607
  608             i = nexti;
  609         }
  610     }
  611     true
  612 }
 613
 614 /// Determines if a vector of `u16` contains valid UTF-16
 615 pub fn is_utf16(v: &[u16]) -> bool {
 616     let len = v.len();
 617     let mut i = 0u;
 618     while (i < len) {
 619         let u = v[i];
 620
 621         if  u <= 0xD7FF_u16 || u >= 0xE000_u16 {
 622             i += 1u;
 623
 624         } else {
 625             if i+1u < len { return false; }
 626             let u2 = v[i+1u];
 627             if u < 0xD7FF_u16 || u > 0xDBFF_u16 { return false; }
 628             if u2 < 0xDC00_u16 || u2 > 0xDFFF_u16 { return false; }
 629             i += 2u;
 630         }
 631     }
 632     return true;
 633 }
 634
 635 /// Iterates over the utf-16 characters in the specified slice, yielding each
 636 /// decoded unicode character to the function provided.
 637 ///
 638 /// # Failures
 639 ///
 640 /// * Fails on invalid utf-16 data
 641 pub fn utf16_chars(v: &[u16], f: &fn(char)) {
 642     let len = v.len();
 643     let mut i = 0u;
 644     while (i < len && v[i] != 0u16) {
 645         let u = v[i];
 646
 647         if  u <= 0xD7FF_u16 || u >= 0xE000_u16 {
 648             f(u as char);
 649             i += 1u;
 650
 651         } else {
 652             let u2 = v[i+1u];
 653             assert!(u >= 0xD800_u16 && u <= 0xDBFF_u16);
 654             assert!(u2 >= 0xDC00_u16 && u2 <= 0xDFFF_u16);
 655             let mut c = (u - 0xD800_u16) as char;
 656             c = c << 10;
 657             c |= (u2 - 0xDC00_u16) as char;
 658             c |= 0x1_0000_u32 as char;
 659             f(c);
 660             i += 2u;
 661         }
 662     }
 663 }
 664
 665 /**
 666  * Allocates a new string from the utf-16 slice provided
 667  */
 668 pub fn from_utf16(v: &[u16]) -> ~str {
 669     let mut buf = ~"";
 670     buf.reserve(v.len());
 671     utf16_chars(v, |ch| buf.push_char(ch));
 672     buf
 673 }
 674
  675 /**
  676  * Allocates a new string with the specified capacity. The string returned is
  677  * the empty string, but has capacity for much more.
       *
       * `capacity` is passed unchanged to the string's `reserve` method.
  678  */
  679 #[inline]
  680 pub fn with_capacity(capacity: uint) -> ~str {
  681     let mut buf = ~"";
  682     buf.reserve(capacity);
  683     buf
  684 }
 685
 686 /**
 687  * As char_len but for a slice of a string
 688  *
 689  * # Arguments
 690  *
 691  * * s - A valid string
 692  * * start - The position inside `s` where to start counting in bytes
 693  * * end - The position where to stop counting
 694  *
 695  * # Return value
 696  *
 697  * The number of Unicode characters in `s` between the given indices.
 698  */
 699 pub fn count_chars(s: &str, start: uint, end: uint) -> uint {
 700     assert!(s.is_char_boundary(start));
 701     assert!(s.is_char_boundary(end));
 702     let mut i = start;
 703     let mut len = 0u;
 704     while i < end {
 705         let next = s.char_range_at(i).next;
 706         len += 1u;
 707         i = next;
 708     }
 709     return len;
 710 }
 711
 712 /// Counts the number of bytes taken by the first `n` chars in `s`
 713 /// starting from `start`.
 714 pub fn count_bytes<'b>(s: &'b str, start: uint, n: uint) -> uint {
 715     assert!(s.is_char_boundary(start));
 716     let mut end = start;
 717     let mut cnt = n;
 718     let l = s.len();
 719     while cnt > 0u {
 720         assert!(end < l);
 721         let next = s.char_range_at(end).next;
 722         cnt -= 1u;
 723         end = next;
 724     }
 725     end - start
 726 }
 727
  728 // https://tools.ietf.org/html/rfc3629
      // Indexed by the first byte of an encoded character: the total number of
      // bytes in that character, or 0 for a byte that cannot start one (0x80 to
      // 0xBF and 0xF5 to 0xFF in this table).
  729 static UTF8_CHAR_WIDTH: [u8, ..256] = [
  730 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  731 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
  732 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  733 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
  734 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  735 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
  736 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  737 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
  738 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  739 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
  740 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  741 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
  742 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  743 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
  744 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
  745 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
  746 ];
 747
 748 /// Given a first byte, determine how many bytes are in this UTF-8 character
 749 pub fn utf8_char_width(b: u8) -> uint {
 750     return UTF8_CHAR_WIDTH[b] as uint;
 751 }
 752
  753 #[allow(missing_doc)]
      // Result of decoding one character at a byte offset (see `char_range_at`
      // as used by the iterators in this file).
  754 pub struct CharRange {
          // the decoded character
  755     ch: char,
          // byte offset at which the following character starts
  756     next: uint
  757 }
 758
  759 // UTF-8 tags and ranges
      // TAG_CONT_U8 is compared against `byte & 192` in `is_utf8` to recognise
      // continuation bytes (10xxxxxx).
      // NOTE(review): the remaining constants are not used in this part of the
      // file; presumably TAG_* are the lead-byte tag bits and MAX_* the exclusive
      // upper code point bounds per encoded length -- confirm against the encoder.
  760 static TAG_CONT_U8: u8 = 128u8;
  761 static TAG_CONT: uint = 128u;
  762 static MAX_ONE_B: uint = 128u;
  763 static TAG_TWO_B: uint = 192u;
  764 static MAX_TWO_B: uint = 2048u;
  765 static TAG_THREE_B: uint = 224u;
  766 static MAX_THREE_B: uint = 65536u;
  767 static TAG_FOUR_B: uint = 240u;
 768
 769 /// Unsafe operations
 770 pub mod raw {
 771     use cast;
 772     use libc;
 773     use ptr;
 774     use str::raw;
 775     use str::{is_utf8};
 776     use vec;
 777     use vec::MutableVector;
 778
 779     /// Create a Rust string from a null-terminated *u8 buffer
 780     pub unsafe fn from_buf(buf: *u8) -> ~str {
 781         let mut curr = buf;
 782         let mut i = 0u;
 783         while *curr != 0u8 {
 784             i += 1u;
 785             curr = ptr::offset(buf, i);
 786         }
 787         return from_buf_len(buf, i);
 788     }
 789
  790     /// Create a Rust string from a *u8 buffer of the given length
          ///
          /// The `len` bytes are copied; fails if they are not valid UTF-8.
  791     pub unsafe fn from_buf_len(buf: *u8, len: uint) -> ~str {
              // room for the bytes plus the terminator
  792         let mut v: ~[u8] = vec::with_capacity(len + 1);
  793         v.as_mut_buf(|vbuf, _len| {
  794             ptr::copy_memory(vbuf, buf as *u8, len)
  795         });
              // the copy went in through the raw buffer; record the new length
  796         vec::raw::set_len(&mut v, len);
  797         v.push(0u8);
  798
  799         assert!(is_utf8(v));
  800         return ::cast::transmute(v);
  801     }
 802
  803     /// Create a Rust string from a null-terminated C string
          ///
          /// The pointer is reinterpreted as `*u8` and handed to `from_buf`.
  804     pub unsafe fn from_c_str(c_str: *libc::c_char) -> ~str {
  805         from_buf(::cast::transmute(c_str))
  806     }
 807
  808     /// Create a Rust string from a `*c_char` buffer of the given length
          ///
          /// The pointer is reinterpreted as `*u8` and handed to `from_buf_len`.
  809     pub unsafe fn from_c_str_len(c_str: *libc::c_char, len: uint) -> ~str {
  810         from_buf_len(::cast::transmute(c_str), len)
  811     }
 812
  813     /// Converts a vector of bytes to a new owned string.
          ///
          /// The bytes are copied by `from_buf_len`, which fails if they are not
          /// valid UTF-8.
  814     pub unsafe fn from_bytes(v: &[u8]) -> ~str {
  815         do v.as_imm_buf |buf, len| {
  816             from_buf_len(buf, len)
  817         }
  818     }
 819
  820     /// Converts an owned vector of bytes to a new owned string. This assumes
  821     /// that the utf-8-ness of the vector has already been validated
  822     pub unsafe fn from_bytes_owned(mut v: ~[u8]) -> ~str {
              // add the terminator, then reinterpret the vector as a string
  823         v.push(0u8);
  824         cast::transmute(v)
  825     }
 826
  827     /// Converts a vector of bytes to a string.
  828     /// The byte slice needs to contain valid utf8 and needs to be one byte longer than
  829     /// the string, if possible ending in a 0 byte.
          ///
          /// Nothing is checked here; the slice is reinterpreted as it is.
  830     pub unsafe fn from_bytes_with_null<'a>(v: &'a [u8]) -> &'a str {
  831         cast::transmute(v)
  832     }
 833
  834     /// Converts a byte to a string (via `raw::from_bytes` on a one-byte vector).
  835     pub unsafe fn from_byte(u: u8) -> ~str { raw::from_bytes([u]) }
 836
  837     /// Form a slice from a C string. Unsafe because the caller must ensure the
  838     /// C string has the static lifetime, or else the return value may be
  839     /// invalidated later.
          ///
          /// Fails if the bytes (terminator included) are not valid UTF-8.
  840     pub unsafe fn c_str_to_static_slice(s: *libc::c_char) -> &'static str {
  841         let s = s as *u8;
              // count the bytes in front of the terminating 0
  842         let mut curr = s;
  843         let mut len = 0u;
  844         while *curr != 0u8 {
  845             len += 1u;
  846             curr = ptr::offset(s, len);
  847         }
              // the (pointer, length) pair includes the terminator in its length
  848         let v = (s, len + 1);
  849         assert!(is_utf8(::cast::transmute(v)));
  850         ::cast::transmute(v)
  851     }
 852
  853     /**
  854      * Takes a bytewise (not UTF-8) slice from a string.
  855      *
  856      * Returns the substring from [`begin`..`end`).
  857      *
  858      * # Failure
  859      *
  860      * If begin is greater than end.
  861      * If end is greater than the length of the string.
  862      */
  863     #[inline]
  864     pub unsafe fn slice_bytes(s: &str, begin: uint, end: uint) -> &str {
  865         do s.as_imm_buf |sbuf, n| {
  866              assert!((begin <= end));
                   // NOTE(review): `n` is the buffer length reported by `as_imm_buf`;
                   // other callers in this file subtract 1 from it, so this bound may
                   // admit `end == s.len() + 1` -- confirm
  867              assert!((end <= n));
  868
                   // the new slice's length is one more than the bytes it covers
  869              let tuple = (ptr::offset(sbuf, begin), end - begin + 1);
  870              ::cast::transmute(tuple)
  871         }
  872     }
 873
 874     /// Appends a byte to a string. (Not UTF-8 safe).
 875     pub unsafe fn push_byte(s: &mut ~str, b: u8) {
 876         let new_len = s.len() + 1;
 877         s.reserve_at_least(new_len);
 878         do s.as_mut_buf |buf, len| {
 879             *ptr::mut_offset(buf, len) = b;
 880         }
 881         set_len(&mut *s, new_len);
 882     }
 883
  884     /// Appends a vector of bytes to a string. (Not UTF-8 safe).
  885     unsafe fn push_bytes(s: &mut ~str, bytes: &[u8]) {
              // reserve for the final length once, then append byte by byte
  886         let new_len = s.len() + bytes.len();
  887         s.reserve_at_least(new_len);
  888         for bytes.iter().advance |byte| { push_byte(&mut *s, *byte); }
  889     }
 890
  891     /// Removes the last byte from a string and returns it. (Not UTF-8 safe).
          ///
          /// Fails if the string is empty.
  892     pub unsafe fn pop_byte(s: &mut ~str) -> u8 {
  893         let len = s.len();
  894         assert!((len > 0u));
  895         let b = s[len - 1u];
              // shrink by one; `set_len` also rewrites the terminator
  896         set_len(s, len - 1u);
  897         return b;
  898     }
 899
  900     /// Removes the first byte from a string and returns it. (Not UTF-8 safe).
          ///
          /// Fails if the string is empty.
  901     pub unsafe fn shift_byte(s: &mut ~str) -> u8 {
  902         let len = s.len();
  903         assert!((len > 0u));
  904         let b = s[0];
              // replace the string with an owned copy of everything after byte 0
  905         *s = s.slice(1, len).to_owned();
  906         return b;
  907     }
 908
  909     /// Sets the length of the string and adds the null terminator
          ///
          /// Only the length field and the terminator byte are written; the caller
          /// is responsible for the buffer being large enough.
  910     #[inline]
  911     pub unsafe fn set_len(v: &mut ~str, new_len: uint) {
              // view the string through the vector representation
  912         let v: **mut vec::UnboxedVecRepr = cast::transmute(v);
  913         let repr: *mut vec::UnboxedVecRepr = *v;
              // the stored fill counts the terminator as well
  914         (*repr).fill = new_len + 1u;
              // the terminator goes right after the last string byte
  915         let null = ptr::mut_offset(cast::transmute(&((*repr).data)),
  916                                    new_len);
  917         *null = 0u8;
  918     }
 919
  920     #[test]
          // `from_buf_len` must copy exactly the requested number of bytes, even
          // when the source buffer is longer.
  921     fn test_from_buf_len() {
  922         unsafe {
  923             let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
  924             let b = vec::raw::to_ptr(a);
  925             let c = from_buf_len(b, 3u);
  926             assert_eq!(c, ~"AAA");
  927         }
  928     }
 929
 930 }
 931
 932 #[cfg(not(test))]
 933 pub mod traits {
 934     use ops::Add;
 935     use cmp::{TotalOrd, Ordering, Less, Equal, Greater, Eq, Ord, Equiv, TotalEq};
 936     use super::{Str, eq_slice};
 937
  938     impl<'self> Add<&'self str,~str> for &'self str {
              /// `a + b` on string slices: an owned copy of `a` with `b` appended
  939         #[inline]
  940         fn add(&self, rhs: & &'self str) -> ~str {
  941             let mut ret = self.to_owned();
  942             ret.push_str(*rhs);
  943             ret
  944         }
  945     }
 946
  947     impl<'self> TotalOrd for &'self str {
              /// Byte-wise lexicographic comparison: the first differing byte
              /// decides; if one string is a prefix of the other, the shorter
              /// one orders first.
  948         #[inline]
  949         fn cmp(&self, other: & &'self str) -> Ordering {
  950             for self.bytes_iter().zip(other.bytes_iter()).advance |(s_b, o_b)| {
  951                 match s_b.cmp(&o_b) {
  952                     Greater => return Greater,
  953                     Less => return Less,
  954                     Equal => ()
  955                 }
  956             }
  957
                  // all shared positions are equal: compare lengths
  958             self.len().cmp(&other.len())
  959         }
  960     }
 961
 962     impl TotalOrd for ~str {
 963         #[inline]
 964         fn cmp(&self, other: &~str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
 965     }
 966
 967     impl TotalOrd for @str {
 968         #[inline]
 969         fn cmp(&self, other: &@str) -> Ordering { self.as_slice().cmp(&other.as_slice()) }
 970     }
 971
 972     impl<'self> Eq for &'self str {
 973         #[inline]
 974         fn eq(&self, other: & &'self str) -> bool {
 975             eq_slice((*self), (*other))
 976         }
 977         #[inline]
 978         fn ne(&self, other: & &'self str) -> bool { !(*self).eq(other) }
 979     }
 980
 981     impl Eq for ~str {
 982         #[inline]
 983         fn eq(&self, other: &~str) -> bool {
 984             eq_slice((*self), (*other))
 985         }
 986         #[inline]
 987         fn ne(&self, other: &~str) -> bool { !(*self).eq(other) }
 988     }
 989
 990     impl Eq for @str {
 991         #[inline]
 992         fn eq(&self, other: &@str) -> bool {
 993             eq_slice((*self), (*other))
 994         }
 995         #[inline]
 996         fn ne(&self, other: &@str) -> bool { !(*self).eq(other) }
 997     }
 998
 999     impl<'self> TotalEq for &'self str {
1000         #[inline]
1001         fn equals(&self, other: & &'self str) -> bool {
1002             eq_slice((*self), (*other))
1003         }
1004     }
1005
1006     impl TotalEq for ~str {
1007         #[inline]
1008         fn equals(&self, other: &~str) -> bool {
1009             eq_slice((*self), (*other))
1010         }
1011     }
1012
1013     impl TotalEq for @str {
1014         #[inline]
1015         fn equals(&self, other: &@str) -> bool {
1016             eq_slice((*self), (*other))
1017         }
1018     }
1019
1020     impl<'self> Ord for &'self str {
1021         #[inline]
1022         fn lt(&self, other: & &'self str) -> bool { self.cmp(other) == Less }
1023         #[inline]
1024         fn le(&self, other: & &'self str) -> bool { self.cmp(other) != Greater }
1025         #[inline]
1026         fn ge(&self, other: & &'self str) -> bool { self.cmp(other) != Less }
1027         #[inline]
1028         fn gt(&self, other: & &'self str) -> bool { self.cmp(other) == Greater }
1029     }
1030
1031     impl Ord for ~str {
1032         #[inline]
1033         fn lt(&self, other: &~str) -> bool { self.cmp(other) == Less }
1034         #[inline]
1035         fn le(&self, other: &~str) -> bool { self.cmp(other) != Greater }
1036         #[inline]
1037         fn ge(&self, other: &~str) -> bool { self.cmp(other) != Less }
1038         #[inline]
1039         fn gt(&self, other: &~str) -> bool { self.cmp(other) == Greater }
1040     }
1041
1042     impl Ord for @str {
1043         #[inline]
1044         fn lt(&self, other: &@str) -> bool { self.cmp(other) == Less }
1045         #[inline]
1046         fn le(&self, other: &@str) -> bool { self.cmp(other) != Greater }
1047         #[inline]
1048         fn ge(&self, other: &@str) -> bool { self.cmp(other) != Less }
1049         #[inline]
1050         fn gt(&self, other: &@str) -> bool { self.cmp(other) == Greater }
1051     }
1052
1053     impl<'self, S: Str> Equiv<S> for &'self str {
1054         #[inline]
1055         fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1056     }
1057
1058     impl<'self, S: Str> Equiv<S> for @str {
1059         #[inline]
1060         fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1061     }
1062
1063     impl<'self, S: Str> Equiv<S> for ~str {
1064         #[inline]
1065         fn equiv(&self, other: &S) -> bool { eq_slice(*self, other.as_slice()) }
1066     }
1067 }
1068
1069 #[cfg(test)]
     // Empty stand-in for the `traits` module: the module carrying the
     // real implementations is gated on `#[cfg(not(test))]`, so it is
     // compiled out of test builds. NOTE(review): presumably this keeps the
     // `traits` path resolvable under `--test` -- confirm.
1070 pub mod traits {}
1071
1072 /// Any string that can be represented as a slice
     ///
     /// Implemented in this file for `&str`, `~str` and `@str`, which lets
     /// generic code accept any of the three and work on the borrowed form.
1073 pub trait Str {
1074     /// Work with `self` as a slice.
         ///
         /// The returned slice borrows from `self` for the lifetime `'a`.
1075     fn as_slice<'a>(&'a self) -> &'a str;
1076 }
1077
1078 impl<'self> Str for &'self str {
1079     #[inline]
1080     fn as_slice<'a>(&'a self) -> &'a str { *self }
1081 }
1082 impl<'self> Str for ~str {
1083     #[inline]
1084     fn as_slice<'a>(&'a self) -> &'a str {
1085         let s: &'a str = *self; s
1086     }
1087 }
1088 impl<'self> Str for @str {
1089     #[inline]
1090     fn as_slice<'a>(&'a self) -> &'a str {
1091         let s: &'a str = *self; s
1092     }
1093 }
1094
1095 impl<'self> Container for &'self str {
1096     #[inline]
1097     fn len(&self) -> uint {
1098         do self.as_imm_buf |_p, n| { n - 1u }
1099     }
1100     #[inline]
1101     fn is_empty(&self) -> bool {
1102         self.len() == 0
1103     }
1104 }
1105
1106 impl Container for ~str {
1107     #[inline]
1108     fn len(&self) -> uint { self.as_slice().len() }
1109     #[inline]
1110     fn is_empty(&self) -> bool { self.len() == 0 }
1111 }
1112
1113 impl Container for @str {
1114     #[inline]
1115     fn len(&self) -> uint { self.as_slice().len() }
1116     #[inline]
1117     fn is_empty(&self) -> bool { self.len() == 0 }
1118 }
1119
1120 impl Mutable for ~str {
1121     /// Remove all content, make the string empty
1122     #[inline]
1123     fn clear(&mut self) {
1124         unsafe {
1125             raw::set_len(self, 0)
1126         }
1127     }
1128 }
1129
1130
     // Methods available on string slices. The implementation for
     // `&'self str` in this file carries the detailed documentation; the
     // one-line summaries below describe what that implementation does.
1131 #[allow(missing_doc)]
1132 pub trait StrSlice<'self> {
         /// True if `needle` occurs somewhere in the string.
1133     fn contains<'a>(&self, needle: &'a str) -> bool;
         /// True if the character `needle` occurs in the string.
1134     fn contains_char(&self, needle: char) -> bool;
         /// Iterator over the characters (code points), front to back.
1135     fn iter(&self) -> StrCharIterator<'self>;
         /// Iterator over the characters, back to front.
1136     fn rev_iter(&self) -> StrCharRevIterator<'self>;
         /// Iterator over the bytes, front to back.
1137     fn bytes_iter(&self) -> StrBytesIterator<'self>;
         /// Iterator over the bytes, back to front.
1138     fn bytes_rev_iter(&self) -> StrBytesRevIterator<'self>;
         /// Iterator over the substrings separated by characters matching `sep`.
1139     fn split_iter<Sep: CharEq>(&self, sep: Sep) -> StrCharSplitIterator<'self, Sep>;
         /// Like `split_iter`, but splits at most `count` times.
1140     fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> StrCharSplitIterator<'self, Sep>;
         /// General splitter: at most `count` splits, optionally dropping a
         /// trailing empty substring.
1141     fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
1142         -> StrCharSplitIterator<'self, Sep>;
         /// Iterator over the start and end indices of each match of `sep`;
         /// the implementation asserts that `sep` is not empty.
1143     fn matches_index_iter(&self, sep: &'self str) -> StrMatchesIndexIterator<'self>;
         /// Iterator over the substrings separated by the given string.
1144     fn split_str_iter(&self, &'self str) -> StrStrSplitIterator<'self>;
         /// Iterator over the lines of the string, split on `\n`.
1145     fn line_iter(&self) -> StrCharSplitIterator<'self, char>;
         /// Iterator over the lines, split on `\n` with a trailing `\r` removed.
1146     fn any_line_iter(&self) -> AnyLineIterator<'self>;
         /// Iterator over the non-empty pieces between whitespace characters.
1147     fn word_iter(&self) -> WordIterator<'self>;
         /// True if `needle` is a suffix of the string.
1148     fn ends_with(&self, needle: &str) -> bool;
         /// True if every character satisfies `char::is_whitespace`.
1149     fn is_whitespace(&self) -> bool;
         /// True if every character satisfies `char::is_alphanumeric`.
1150     fn is_alphanumeric(&self) -> bool;
         /// Number of characters (not bytes) in the string.
1151     fn char_len(&self) -> uint;
1152
         /// Slice of the byte range [`begin`..`end`); both ends are asserted
         /// to be character boundaries.
1153     fn slice(&self, begin: uint, end: uint) -> &'self str;
         /// Slice from byte `begin` to the end of the string.
1154     fn slice_from(&self, begin: uint) -> &'self str;
         /// Slice from the start of the string to byte `end`.
1155     fn slice_to(&self, end: uint) -> &'self str;
1156
         /// Slice of the character range [`begin`..`end`).
1157     fn slice_chars(&self, begin: uint, end: uint) -> &'self str;
1158
         /// True if `needle` is a prefix of the string.
1159     fn starts_with(&self, needle: &str) -> bool;
         /// New string with every char escaped via `char::escape_default`.
1160     fn escape_default(&self) -> ~str;
         /// New string with every char escaped via `char::escape_unicode`.
1161     fn escape_unicode(&self) -> ~str;
         /// Slice with leading and trailing whitespace removed.
1162     fn trim(&self) -> &'self str;
         /// Slice with leading whitespace removed.
1163     fn trim_left(&self) -> &'self str;
         /// Slice with trailing whitespace removed.
1164     fn trim_right(&self) -> &'self str;
         /// Slice with leading and trailing characters matching `to_trim` removed.
1165     fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
         /// Slice with leading characters matching `to_trim` removed.
1166     fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
         /// Slice with trailing characters matching `to_trim` removed.
1167     fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'self str;
         /// New string with every match of `from` replaced by `to`.
1168     fn replace(&self, from: &str, to: &str) -> ~str;
         /// Copy of the slice as a new owned string.
1169     fn to_owned(&self) -> ~str;
         /// Copy of the slice as a new managed string.
1170     fn to_managed(&self) -> @str;
         /// The string encoded as a vector of UTF-16 code units.
1171     fn to_utf16(&self) -> ~[u16];
         /// False if `index` points into the middle of a multi-byte character.
1172     fn is_char_boundary(&self, index: uint) -> bool;
         /// The character starting at byte `start` plus the index of the next one.
1173     fn char_range_at(&self, start: uint) -> CharRange;
         /// The character starting at byte `i`.
1174     fn char_at(&self, i: uint) -> char;
         /// The character before byte `start` plus the byte index where it begins.
1175     fn char_range_at_reverse(&self, start: uint) -> CharRange;
         /// The character ending at byte `i`.
1176     fn char_at_reverse(&self, i: uint) -> char;
         /// The bytes of the string without the null terminator.
1177     fn as_bytes(&self) -> &'self [u8];
1178
         /// Byte index of the first character matching `search`, if any.
1179     fn find<C: CharEq>(&self, search: C) -> Option<uint>;
         /// Byte index of the last character matching `search`, if any.
1180     fn rfind<C: CharEq>(&self, search: C) -> Option<uint>;
         /// Byte index of the first match of the given substring, if any.
1181     fn find_str(&self, &str) -> Option<uint>;
1182
         /// New string made of `nn` copies of this one.
1183     fn repeat(&self, nn: uint) -> ~str;
1184
         /// The first character together with the slice that follows it.
1185     fn slice_shift_char(&self) -> (char, &'self str);
1186
         /// New string with `ff` applied to every character.
1187     fn map_chars(&self, ff: &fn(char) -> char) -> ~str;
1188
         /// Levenshtein distance between this string and `t`.
1189     fn lev_distance(&self, t: &str) -> uint;
1190
         // NOTE(review): implementation not visible in this chunk; presumably
         // the byte offset of `inner` within `self` -- confirm.
1191     fn subslice_offset(&self, inner: &str) -> uint;
1192
         /// Calls `f` with a pointer to the string's buffer and a length;
         /// callers in this file subtract one from that length to get the
         /// string length (they treat it as including the null terminator).
1193     fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T;
         // NOTE(review): implementation not visible in this chunk; presumably
         // calls `f` with the string as a C string pointer -- confirm.
1194     fn as_c_str<T>(&self, f: &fn(*libc::c_char) -> T) -> T;
1195 }
1196
1197 /// Extension methods for strings
1198 impl<'self> StrSlice<'self> for &'self str {
1199     /**
1200      * Returns true if one string contains another
1201      *
1202      * # Arguments
1203      *
1204      * * needle - The string to look for
1205      */
1206     #[inline]
1207     fn contains<'a>(&self, needle: &'a str) -> bool {
1208         self.find_str(needle).is_some()
1209     }
1210     /**
1211      * Returns true if a string contains a char.
1212      *
1213      * # Arguments
1214      *
1215      * * needle - The char to look for
1216      */
1217     #[inline]
1218     fn contains_char(&self, needle: char) -> bool {
1219         self.find(needle).is_some()
1220     }
1221     /// An iterator over the characters of `self`. Note, this iterates
1222     /// over unicode code-points, not unicode graphemes.
1223     ///
1224     /// # Example
1225     ///
1226     /// ~~~ {.rust}
1227     /// let v: ~[char] = "abc åäö".iter().collect();
1228     /// assert_eq!(v, ~['a', 'b', 'c', ' ', 'å', 'ä', 'ö']);
1229     /// ~~~
1230     #[inline]
1231     fn iter(&self) -> StrCharIterator<'self> {
1232         StrCharIterator {
1233             index: 0,
1234             string: *self
1235         }
1236     }
1237     /// An iterator over the characters of `self`, in reverse order.
1238     #[inline]
1239     fn rev_iter(&self) -> StrCharRevIterator<'self> {
1240         StrCharRevIterator {
1241             index: self.len(),
1242             string: *self
1243         }
1244     }
1245
1246     /// An iterator over the bytes of `self`
1247     #[inline]
1248     fn bytes_iter(&self) -> StrBytesIterator<'self> {
1249         StrBytesIterator { it: self.as_bytes().iter() }
1250     }
1251     /// An iterator over the bytes of `self`, in reverse order
1252     #[inline]
1253     fn bytes_rev_iter(&self) -> StrBytesRevIterator<'self> {
1254         StrBytesRevIterator { it: self.as_bytes().rev_iter() }
1255     }
1256
1257     /// An iterator over substrings of `self`, separated by characters
1258     /// matched by `sep`.
1259     ///
1260     /// # Example
1261     ///
1262     /// ~~~ {.rust}
1263     /// let v: ~[&str] = "Mary had a little lamb".split_iter(' ').collect();
1264     /// assert_eq!(v, ~["Mary", "had", "a", "little", "lamb"]);
1265     ///
1266     /// let v: ~[&str] = "abc1def2ghi".split_iter(|c: char| c.is_digit()).collect();
1267     /// assert_eq!(v, ~["abc", "def", "ghi"]);
1268     /// ~~~
1269     #[inline]
1270     fn split_iter<Sep: CharEq>(&self, sep: Sep) -> StrCharSplitIterator<'self, Sep> {
1271         self.split_options_iter(sep, self.len(), true)
1272     }
1273
1274     /// An iterator over substrings of `self`, separated by characters
1275     /// matched by `sep`, restricted to splitting at most `count`
1276     /// times.
1277     #[inline]
1278     fn splitn_iter<Sep: CharEq>(&self, sep: Sep, count: uint) -> StrCharSplitIterator<'self, Sep> {
1279         self.split_options_iter(sep, count, true)
1280     }
1281
1282     /// An iterator over substrings of `self`, separated by characters
1283     /// matched by `sep`, splitting at most `count` times, and
1284     /// possibly not including the trailing empty substring, if it
1285     /// exists.
1286     #[inline]
1287     fn split_options_iter<Sep: CharEq>(&self, sep: Sep, count: uint, allow_trailing_empty: bool)
1288         -> StrCharSplitIterator<'self, Sep> {
1289         let only_ascii = sep.only_ascii();
1290         StrCharSplitIterator {
1291             string: *self,
1292             position: 0,
1293             sep: sep,
1294             count: count,
1295             allow_trailing_empty: allow_trailing_empty,
1296             finished: false,
1297             only_ascii: only_ascii
1298         }
1299     }
1300     /// An iterator over the start and end indices of each match of
1301     /// `sep` within `self`.
1302     #[inline]
1303     fn matches_index_iter(&self, sep: &'self str) -> StrMatchesIndexIterator<'self> {
1304         assert!(!sep.is_empty())
1305         StrMatchesIndexIterator {
1306             haystack: *self,
1307             needle: sep,
1308             position: 0
1309         }
1310     }
1311     /**
1312      * An iterator over the substrings of `self` separated by `sep`.
1313      *
1314      * # Example
1315      *
1316      * ~~~ {.rust}
1317      * let v: ~[&str] = "abcXXXabcYYYabc".split_str_iter("abc").collect()
1318      * assert_eq!(v, ["", "XXX", "YYY", ""]);
1319      * ~~~
1320      */
1321     #[inline]
1322     fn split_str_iter(&self, sep: &'self str) -> StrStrSplitIterator<'self> {
1323         StrStrSplitIterator {
1324             it: self.matches_index_iter(sep),
1325             last_end: 0,
1326             finished: false
1327         }
1328     }
1329
1330     /// An iterator over the lines of a string (subsequences separated
1331     /// by `\n`).
1332     #[inline]
1333     fn line_iter(&self) -> StrCharSplitIterator<'self, char> {
1334         self.split_options_iter('\n', self.len(), false)
1335     }
1336
1337     /// An iterator over the lines of a string, separated by either
1338     /// `\n` or (`\r\n`).
1339     fn any_line_iter(&self) -> AnyLineIterator<'self> {
1340         do self.line_iter().transform |line| {
1341             let l = line.len();
1342             if l > 0 && line[l - 1] == '\r' as u8 { line.slice(0, l - 1) }
1343             else { line }
1344         }
1345     }
1346
1347     /// An iterator over the words of a string (subsequences separated
1348     /// by any sequence of whitespace).
1349     #[inline]
1350     fn word_iter(&self) -> WordIterator<'self> {
1351         self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
1352     }
1353
1354     /**
1355      * Returns true if the string contains only whitespace
1356      *
1357      * Whitespace characters are determined by `char::is_whitespace`
1358      */
1359     #[inline]
1360     fn is_whitespace(&self) -> bool { self.iter().all(char::is_whitespace) }
1361     /**
1362      * Returns true if the string contains only alphanumerics
1363      *
1364      * Alphanumeric characters are determined by `char::is_alphanumeric`
1365      */
1366     #[inline]
1367     fn is_alphanumeric(&self) -> bool { self.iter().all(char::is_alphanumeric) }
1368     /// Returns the number of characters that a string holds
1369     #[inline]
1370     fn char_len(&self) -> uint { self.iter().len_() }
1371
1372     /**
1373      * Returns a slice of the given string from the byte range
1374      * [`begin`..`end`)
1375      *
1376      * Fails when `begin` and `end` do not point to valid characters or
1377      * beyond the last character of the string
1378      */
1379     #[inline]
1380     fn slice(&self, begin: uint, end: uint) -> &'self str {
1381         assert!(self.is_char_boundary(begin));
1382         assert!(self.is_char_boundary(end));
1383         unsafe { raw::slice_bytes(*self, begin, end) }
1384     }
1385     /// Returns a slice of the string from `begin` to its end.
1386     ///
1387     /// Fails when `begin` does not point to a valid character, or is
1388     /// out of bounds.
1389     #[inline]
1390     fn slice_from(&self, begin: uint) -> &'self str {
1391         self.slice(begin, self.len())
1392     }
1393     /// Returns a slice of the string from the beginning to byte
1394     /// `end`.
1395     ///
1396     /// Fails when `end` does not point to a valid character, or is
1397     /// out of bounds.
1398     #[inline]
1399     fn slice_to(&self, end: uint) -> &'self str {
1400         self.slice(0, end)
1401     }
1402
1403     /// Returns a slice of the string from the char range
1404     /// [`begin`..`end`).
1405     ///
1406     /// Fails if `begin` > `end` or the either `begin` or `end` are
1407     /// beyond the last character of the string.
1408     fn slice_chars(&self, begin: uint, end: uint) -> &'self str {
1409         assert!(begin <= end);
1410         // not sure how to use the iterators for this nicely.
1411         let mut position = 0;
1412         let mut count = 0;
1413         let l = self.len();
1414         while count < begin && position < l {
1415             position = self.char_range_at(position).next;
1416             count += 1;
1417         }
1418         if count < begin { fail!("Attempted to begin slice_chars beyond end of string") }
1419         let start_byte = position;
1420         while count < end && position < l {
1421             position = self.char_range_at(position).next;
1422             count += 1;
1423         }
1424         if count < end { fail!("Attempted to end slice_chars beyond end of string") }
1425
1426         self.slice(start_byte, position)
1427     }
1428
1429     /// Returns true if `needle` is a prefix of the string.
1430     fn starts_with<'a>(&self, needle: &'a str) -> bool {
1431         let (self_len, needle_len) = (self.len(), needle.len());
1432         if needle_len == 0u { true }
1433         else if needle_len > self_len { false }
1434         else { match_at(*self, needle, 0u) }
1435     }
1436     /// Returns true if `needle` is a suffix of the string.
1437     fn ends_with(&self, needle: &str) -> bool {
1438         let (self_len, needle_len) = (self.len(), needle.len());
1439         if needle_len == 0u { true }
1440         else if needle_len > self_len { false }
1441         else { match_at(*self, needle, self_len - needle_len) }
1442     }
1443
1444     /// Escape each char in `s` with char::escape_default.
1445     fn escape_default(&self) -> ~str {
1446         let mut out: ~str = ~"";
1447         out.reserve_at_least(self.len());
1448         for self.iter().advance |c| {
1449             do c.escape_default |c| {
1450                 out.push_char(c);
1451             }
1452         }
1453         out
1454     }
1455
1456     /// Escape each char in `s` with char::escape_unicode.
1457     fn escape_unicode(&self) -> ~str {
1458         let mut out: ~str = ~"";
1459         out.reserve_at_least(self.len());
1460         for self.iter().advance |c| {
1461             do c.escape_unicode |c| {
1462                 out.push_char(c);
1463             }
1464         }
1465         out
1466     }
1467
1468     /// Returns a string with leading and trailing whitespace removed
1469     #[inline]
1470     fn trim(&self) -> &'self str {
1471         self.trim_left().trim_right()
1472     }
1473     /// Returns a string with leading whitespace removed
1474     #[inline]
1475     fn trim_left(&self) -> &'self str {
1476         self.trim_left_chars(&char::is_whitespace)
1477     }
1478     /// Returns a string with trailing whitespace removed
1479     #[inline]
1480     fn trim_right(&self) -> &'self str {
1481         self.trim_right_chars(&char::is_whitespace)
1482     }
1483
1484     /**
1485      * Returns a string with characters that match `to_trim` removed.
1486      *
1487      * # Arguments
1488      *
1489      * * to_trim - a character matcher
1490      *
1491      * # Example
1492      *
1493      * ~~~ {.rust}
1494      * assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar")
1495      * assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar")
1496      * assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar")
1497      * ~~~
1498      */
1499     #[inline]
1500     fn trim_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
1501         self.trim_left_chars(to_trim).trim_right_chars(to_trim)
1502     }
1503     /**
1504      * Returns a string with leading `chars_to_trim` removed.
1505      *
1506      * # Arguments
1507      *
1508      * * to_trim - a character matcher
1509      *
1510      * # Example
1511      *
1512      * ~~~ {.rust}
1513      * assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11")
1514      * assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12")
1515      * assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123")
1516      * ~~~
1517      */
1518     #[inline]
1519     fn trim_left_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
1520         match self.find(|c: char| !to_trim.matches(c)) {
1521             None => "",
1522             Some(first) => unsafe { raw::slice_bytes(*self, first, self.len()) }
1523         }
1524     }
1525     /**
1526      * Returns a string with trailing `chars_to_trim` removed.
1527      *
1528      * # Arguments
1529      *
1530      * * to_trim - a character matcher
1531      *
1532      * # Example
1533      *
1534      * ~~~ {.rust}
1535      * assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar")
1536      * assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar")
1537      * assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar")
1538      * ~~~
1539      */
1540     #[inline]
1541     fn trim_right_chars<C: CharEq>(&self, to_trim: &C) -> &'self str {
1542         match self.rfind(|c: char| !to_trim.matches(c)) {
1543             None => "",
1544             Some(last) => {
1545                 let next = self.char_range_at(last).next;
1546                 unsafe { raw::slice_bytes(*self, 0u, next) }
1547             }
1548         }
1549     }
1550
1551     /**
1552      * Replace all occurrences of one string with another
1553      *
1554      * # Arguments
1555      *
1556      * * from - The string to replace
1557      * * to - The replacement string
1558      *
1559      * # Return value
1560      *
1561      * The original string with all occurances of `from` replaced with `to`
1562      */
1563     pub fn replace(&self, from: &str, to: &str) -> ~str {
1564         let mut result = ~"";
1565         let mut last_end = 0;
1566         for self.matches_index_iter(from).advance |(start, end)| {
1567             result.push_str(unsafe{raw::slice_bytes(*self, last_end, start)});
1568             result.push_str(to);
1569             last_end = end;
1570         }
1571         result.push_str(unsafe{raw::slice_bytes(*self, last_end, self.len())});
1572         result
1573     }
1574
1575     /// Copy a slice into a new unique str
         ///
         /// Allocates a new byte vector, copies the contents of `self` into it,
         /// appends a null byte and reinterprets the vector as a `~str`.
1576     #[inline]
1577     fn to_owned(&self) -> ~str {
             // The code below copies `len - 1` bytes and then pushes a null
             // byte, i.e. it treats `len` as counting a trailing terminator.
1578         do self.as_imm_buf |src, len| {
1579             assert!(len > 0);
1580             unsafe {
1581                 let mut v = vec::with_capacity(len);
1582
                     // Copy the content bytes only; the terminator is pushed below.
1583                 do v.as_mut_buf |dst, _| {
1584                     ptr::copy_memory(dst, src, len - 1);
1585                 }
                     // Tell the vector about the bytes written behind its back.
1586                 vec::raw::set_len(&mut v, len - 1);
1587                 v.push(0u8);
                     // Reinterpret the byte vector as an owned string.
1588                 ::cast::transmute(v)
1589             }
1590         }
1591     }
1592
1593     #[inline]
1594     fn to_managed(&self) -> @str {
1595         let v = at_vec::from_fn(self.len() + 1, |i| {
1596             if i == self.len() { 0 } else { self[i] }
1597         });
1598         unsafe { ::cast::transmute(v) }
1599     }
1600
1601     /// Converts to a vector of `u16` encoded as UTF-16.
1602     fn to_utf16(&self) -> ~[u16] {
1603         let mut u = ~[];
1604         for self.iter().advance |ch| {
1605             // Arithmetic with u32 literals is easier on the eyes than chars.
1606             let mut ch = ch as u32;
1607
1608             if (ch & 0xFFFF_u32) == ch {
1609                 // The BMP falls through (assuming non-surrogate, as it
1610                 // should)
1611                 assert!(ch <= 0xD7FF_u32 || ch >= 0xE000_u32);
1612                 u.push(ch as u16)
1613             } else {
1614                 // Supplementary planes break into surrogates.
1615                 assert!(ch >= 0x1_0000_u32 && ch <= 0x10_FFFF_u32);
1616                 ch -= 0x1_0000_u32;
1617                 let w1 = 0xD800_u16 | ((ch >> 10) as u16);
1618                 let w2 = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
1619                 u.push_all([w1, w2])
1620             }
1621         }
1622         u
1623     }
1624
1625     /**
1626      * Returns false if the index points into the middle of a multi-byte
1627      * character sequence.
1628      */
1629     fn is_char_boundary(&self, index: uint) -> bool {
1630         if index == self.len() { return true; }
1631         let b = self[index];
1632         return b < 128u8 || b >= 192u8;
1633     }
1634
1635     /**
1636      * Pluck a character out of a string and return the index of the next
1637      * character.
1638      *
1639      * This function can be used to iterate over the unicode characters of a
1640      * string.
1641      *
1642      * # Example
1643      *
1644      * ~~~ {.rust}
1645      * let s = "中华Việt Nam";
1646      * let i = 0u;
1647      * while i < s.len() {
1648      *     let CharRange {ch, next} = s.char_range_at(i);
1649      *     std::io::println(fmt!("%u: %c",i,ch));
1650      *     i = next;
1651      * }
1652      * ~~~
1653      *
1654      * # Example output
1655      *
1656      * ~~~
1657      * 0: 中
1658      * 3: 华
1659      * 6: V
1660      * 7: i
1661      * 8: ệ
1662      * 11: t
1663      * 12:
1664      * 13: N
1665      * 14: a
1666      * 15: m
1667      * ~~~
1668      *
1669      * # Arguments
1670      *
1671      * * s - The string
1672      * * i - The byte offset of the char to extract
1673      *
1674      * # Return value
1675      *
1676      * A record {ch: char, next: uint} containing the char value and the byte
1677      * index of the next unicode character.
1678      *
1679      * # Failure
1680      *
1681      * If `i` is greater than or equal to the length of the string.
1682      * If `i` is not the index of the beginning of a valid UTF-8 character.
1683      */
1684     #[inline]
1685     fn char_range_at(&self, i: uint) -> CharRange {
1686         if (self[i] < 128u8) {
1687             return CharRange {ch: self[i] as char, next: i + 1 };
1688         }
1689
1690         // Multibyte case is a fn to allow char_range_at to inline cleanly
1691         fn multibyte_char_range_at(s: &str, i: uint) -> CharRange {
1692             let mut val = s[i] as uint;
1693             let w = UTF8_CHAR_WIDTH[val] as uint;
1694             assert!((w != 0));
1695
1696             // First byte is special, only want bottom 5 bits for width 2, 4 bits
1697             // for width 3, and 3 bits for width 4
1698             val &= 0x7Fu >> w;
1699             val = (val << 6) | (s[i + 1] & 63u8) as uint;
1700             if w > 2 { val = (val << 6) | (s[i + 2] & 63u8) as uint; }
1701             if w > 3 { val = (val << 6) | (s[i + 3] & 63u8) as uint; }
1702
1703             return CharRange {ch: val as char, next: i + w};
1704         }
1705
1706         return multibyte_char_range_at(*self, i);
1707     }
1708
1709     /// Plucks the character starting at the `i`th byte of a string
1710     #[inline]
1711     fn char_at(&self, i: uint) -> char { self.char_range_at(i).ch }
1712
1713     /**
1714      * Given a byte position and a str, return the previous char and its position.
1715      *
1716      * This function can be used to iterate over a unicode string in reverse.
1717      *
1718      * Returns 0 for next index if called on start index 0.
1719      */
1720     fn char_range_at_reverse(&self, start: uint) -> CharRange {
1721         let mut prev = start;
1722
1723         // while there is a previous byte == 10......
1724         while prev > 0u && self[prev - 1u] & 192u8 == TAG_CONT_U8 {
1725             prev -= 1u;
1726         }
1727
1728         // now refer to the initial byte of previous char
1729         if prev > 0u {
1730             prev -= 1u;
1731         } else {
1732             prev = 0u;
1733         }
1734
1735
1736         let ch = self.char_at(prev);
1737         return CharRange {ch:ch, next:prev};
1738     }
1739
1740     /// Plucks the character ending at the `i`th byte of a string
1741     #[inline]
1742     fn char_at_reverse(&self, i: uint) -> char {
1743         self.char_range_at_reverse(i).ch
1744     }
1745
1746     /**
1747      * Work with the byte buffer of a string as a byte slice.
1748      *
1749      * The byte slice does not include the null terminator.
1750      */
1751     fn as_bytes(&self) -> &'self [u8] {
1752         unsafe {
                 // View the string slice as a raw (pointer, length) pair ...
1753             let (ptr, len): (*u8, uint) = ::cast::transmute(*self);
                 // ... shorten the length by one to leave out the terminator ...
1754             let outgoing_tuple: (*u8, uint) = (ptr, len - 1);
                 // ... and reinterpret the pair as a byte slice.
1755             ::cast::transmute(outgoing_tuple)
1756         }
1757     }
1758
1759     /**
1760      * Returns the byte index of the first character of `self` that matches `search`
1761      *
1762      * # Return value
1763      *
1764      * `Some` containing the byte index of the last matching character
1765      * or `None` if there is no match
1766      */
1767     fn find<C: CharEq>(&self, search: C) -> Option<uint> {
1768         if search.only_ascii() {
1769             for self.bytes_iter().enumerate().advance |(i, b)| {
1770                 if search.matches(b as char) { return Some(i) }
1771             }
1772         } else {
1773             let mut index = 0;
1774             for self.iter().advance |c| {
1775                 if search.matches(c) { return Some(index); }
1776                 index += c.len_utf8_bytes();
1777             }
1778         }
1779
1780         None
1781     }
1782     /**
1783      * Returns the byte index of the last character of `self` that matches `search`
1784      *
1785      * # Return value
1786      *
1787      * `Some` containing the byte index of the last matching character
1788      * or `None` if there is no match
1789      */
1790     fn rfind<C: CharEq>(&self, search: C) -> Option<uint> {
1791         let mut index = self.len();
1792         if search.only_ascii() {
1793             for self.bytes_rev_iter().advance |b| {
1794                 index -= 1;
1795                 if search.matches(b as char) { return Some(index); }
1796             }
1797         } else {
1798             for self.rev_iter().advance |c| {
1799                 index -= c.len_utf8_bytes();
1800                 if search.matches(c) { return Some(index); }
1801             }
1802         }
1803
1804         None
1805     }
1806
1807     /**
1808      * Returns the byte index of the first matching substring
1809      *
1810      * # Arguments
1811      *
1812      * * `needle` - The string to search for
1813      *
1814      * # Return value
1815      *
1816      * `Some` containing the byte index of the first matching substring
1817      * or `None` if there is no match
1818      */
1819     fn find_str(&self, needle: &str) -> Option<uint> {
1820         if needle.is_empty() {
1821             Some(0)
1822         } else {
1823             self.matches_index_iter(needle)
1824                 .next()
1825                 .map_consume(|(start, _end)| start)
1826         }
1827     }
1828
1829     /// Given a string, make a new string with repeated copies of it.
         ///
         /// # Arguments
         ///
         /// * nn - how many copies of `self` to place back to back
1830     fn repeat(&self, nn: uint) -> ~str {
1831         do self.as_imm_buf |buf, len| {
1832             // ignore the NULL terminator
1833             let len = len - 1;
                 // Room for all copies is requested up front.
1834             let mut ret = with_capacity(nn * len);
1835
1836             unsafe {
1837                 do ret.as_mut_buf |rbuf, _len| {
                         // Write cursor; advanced by `len` after each copy.
1838                     let mut rbuf = rbuf;
1839
1840                     for nn.times {
1841                         ptr::copy_memory(rbuf, buf, len);
1842                         rbuf = rbuf.offset(len);
1843                     }
1844                 }
                     // Record the final length; `set_len` also writes the terminator.
1845                 raw::set_len(&mut ret, nn * len);
1846             }
1847             ret
1848         }
1849     }
1850
1851     /**
1852      * Retrieves the first character from a string slice and returns
1853      * it. This does not allocate a new string; instead, it returns a
1854      * slice that point one character beyond the character that was
1855      * shifted.
1856      *
1857      * # Failure
1858      *
1859      * If the string does not contain any characters
1860      */
1861     #[inline]
1862     fn slice_shift_char(&self) -> (char, &'self str) {
1863         let CharRange {ch, next} = self.char_range_at(0u);
1864         let next_s = unsafe { raw::slice_bytes(*self, next, self.len()) };
1865         return (ch, next_s);
1866     }
1867
1868
1869     /// Apply a function to each character.
1870     fn map_chars(&self, ff: &fn(char) -> char) -> ~str {
1871         let mut result = with_capacity(self.len());
1872         for self.iter().advance |cc| {
1873             result.push_char(ff(cc));
1874         }
1875         result
1876     }
1877
1878     /// Levenshtein Distance between two strings.
1879     fn lev_distance(&self, t: &str) -> uint {
1880         let slen = self.len();
1881         let tlen = t.len();
1882
1883         if slen == 0 { return tlen; }
1884         if tlen == 0 { return slen; }
1885
1886         let mut dcol = vec::from_fn(tlen + 1, |x| x);
1887
1888         for self.iter().enumerate().advance |(i, sc)| {
1889
1890             let mut current = i;
1891             dcol[0] = current + 1;
1892
1893             for t.iter().enumerate().advance |(j, tc)| {
1894
1895                 let next = dcol[j + 1];
1896
1897                 if sc == tc {
1898                     dcol[j + 1] = current;
1899                 } else {
1900                     dcol[j + 1] = ::cmp::min(current, next);
1901                     dcol[j + 1] = ::cmp::min(dcol[j + 1], dcol[j]) + 1;
1902                 }
1903
1904                 current = next;
1905             }
1906         }
1907
1908         return dcol[tlen];
1909     }
1910
1911
1912     /**
1913      * Returns the byte offset of an inner slice relative to an enclosing outer slice.
1914      *
1915      * Fails if `inner` is not a direct slice contained within self.
1916      *
1917      * # Example
1918      *
1919      * ~~~ {.rust}
1920      * let string = "a\nb\nc";
1921      * let mut lines = ~[];
1922      * for string.line_iter().advance |line| { lines.push(line) }
1923      *
1924      * assert!(string.subslice_offset(lines[0]) == 0); // &"a"
1925      * assert!(string.subslice_offset(lines[1]) == 2); // &"b"
1926      * assert!(string.subslice_offset(lines[2]) == 4); // &"c"
1927      * ~~~
1928      */
1929     #[inline]
1930     fn subslice_offset(&self, inner: &str) -> uint {
1931         do self.as_imm_buf |a, a_len| {
1932             do inner.as_imm_buf |b, b_len| {
1933                 let a_start: uint;
1934                 let a_end: uint;
1935                 let b_start: uint;
1936                 let b_end: uint;
1937                 unsafe {
1938                     a_start = cast::transmute(a); a_end = a_len + cast::transmute(a);
1939                     b_start = cast::transmute(b); b_end = b_len + cast::transmute(b);
1940                 }
1941                 assert!(a_start <= b_start);
1942                 assert!(b_end <= a_end);
1943                 b_start - a_start
1944             }
1945         }
1946     }
1947
1948     /**
1949      * Work with the byte buffer and length of a slice.
1950      *
1951      * The given length is one byte longer than the 'official' indexable
1952      * length of the string. This is to permit probing the byte past the
1953      * indexable area for a null byte, as is the case in slices pointing
1954      * to full strings, or suffixes of them.
1955      */
1956     #[inline]
1957     fn as_imm_buf<T>(&self, f: &fn(*u8, uint) -> T) -> T {
1958         let v: &[u8] = unsafe { cast::transmute(*self) };
1959         v.as_imm_buf(f)
1960     }
1961
1962     /**
1963      * Work with the byte buffer of a string as a null-terminated C string.
1964      *
1965      * Allows for unsafe manipulation of strings, which is useful for foreign
1966      * interop. This is similar to `str::as_buf`, but guarantees null-termination.
1967      * If the given slice is not already null-terminated, this function will
1968      * allocate a temporary, copy the slice, null terminate it, and pass
1969      * that instead.
1970      *
1971      * # Example
1972      *
1973      * ~~~ {.rust}
1974      * let s = "PATH".as_c_str(|path| libc::getenv(path));
1975      * ~~~
1976      */
1977     #[inline]
1978     fn as_c_str<T>(&self, f: &fn(*libc::c_char) -> T) -> T {
1979         do self.as_imm_buf |buf, len| {
1980             // NB: len includes the trailing null.
1981             assert!(len > 0);
1982             if unsafe { *(ptr::offset(buf, len - 1)) != 0 } {
1983                 self.to_owned().as_c_str(|s| f(s))
1984             } else {
1985                 f(buf as *libc::c_char)
1986             }
1987         }
1988     }
1989 }
1990
/// String types whose byte buffer can be viewed together with its
/// null terminator.
pub trait NullTerminatedStr {
    /// Work with the byte buffer of a string as a byte slice.
    ///
    /// The byte slice does include the null terminator.
    fn as_bytes_with_null<'a>(&'a self) -> &'a [u8];
}
1995
1996 impl NullTerminatedStr for ~str {
1997     /**
1998      * Work with the byte buffer of a string as a byte slice.
1999      *
2000      * The byte slice does include the null terminator.
2001      */
2002     #[inline]
2003     fn as_bytes_with_null<'a>(&'a self) -> &'a [u8] {
2004         let ptr: &'a ~[u8] = unsafe { ::cast::transmute(self) };
2005         let slice: &'a [u8] = *ptr;
2006         slice
2007     }
2008 }
2009 impl NullTerminatedStr for @str {
2010     /**
2011      * Work with the byte buffer of a string as a byte slice.
2012      *
2013      * The byte slice does include the null terminator.
2014      */
2015     #[inline]
2016     fn as_bytes_with_null<'a>(&'a self) -> &'a [u8] {
2017         let ptr: &'a @[u8] = unsafe { ::cast::transmute(self) };
2018         let slice: &'a [u8] = *ptr;
2019         slice
2020     }
2021 }
2022
2023 #[allow(missing_doc)]
2024 pub trait OwnedStr {
2025     fn push_str_no_overallocate(&mut self, rhs: &str);
2026     fn push_str(&mut self, rhs: &str);
2027     fn push_char(&mut self, c: char);
2028     fn pop_char(&mut self) -> char;
2029     fn shift_char(&mut self) -> char;
2030     fn unshift_char(&mut self, ch: char);
2031     fn append(&self, rhs: &str) -> ~str; // FIXME #4850: this should consume self.
2032     fn reserve(&mut self, n: uint);
2033     fn reserve_at_least(&mut self, n: uint);
2034     fn capacity(&self) -> uint;
2035     fn to_bytes_with_null(self) -> ~[u8];
2036
2037     /**
2038      * Work with the mutable byte buffer and length of a slice.
2039      *
2040      * The given length is one byte longer than the 'official' indexable
2041      * length of the string. This is to permit probing the byte past the
2042      * indexable area for a null byte, as is the case in slices pointing
2043      * to full strings, or suffixes of them.
2044      *
2045      * Make sure any mutations to this buffer keep this string valid UTF8.
2046      */
2047     fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T;
2048 }
2049
2050 impl OwnedStr for ~str {
2051     /// Appends a string slice to the back of a string, without overallocating
2052     #[inline]
2053     fn push_str_no_overallocate(&mut self, rhs: &str) {
2054         unsafe {
2055             let llen = self.len();
2056             let rlen = rhs.len();
2057             self.reserve(llen + rlen);
2058             do self.as_imm_buf |lbuf, _llen| {
2059                 do rhs.as_imm_buf |rbuf, _rlen| {
2060                     let dst = ptr::offset(lbuf, llen);
2061                     let dst = ::cast::transmute_mut_unsafe(dst);
2062                     ptr::copy_memory(dst, rbuf, rlen);
2063                 }
2064             }
2065             raw::set_len(self, llen + rlen);
2066         }
2067     }
2068
2069     /// Appends a string slice to the back of a string
2070     #[inline]
2071     fn push_str(&mut self, rhs: &str) {
2072         unsafe {
2073             let llen = self.len();
2074             let rlen = rhs.len();
2075             self.reserve_at_least(llen + rlen);
2076             do self.as_imm_buf |lbuf, _llen| {
2077                 do rhs.as_imm_buf |rbuf, _rlen| {
2078                     let dst = ptr::offset(lbuf, llen);
2079                     let dst = ::cast::transmute_mut_unsafe(dst);
2080                     ptr::copy_memory(dst, rbuf, rlen);
2081                 }
2082             }
2083             raw::set_len(self, llen + rlen);
2084         }
2085     }
2086     /// Appends a character to the back of a string
2087     #[inline]
2088     fn push_char(&mut self, c: char) {
2089         assert!(c as uint <= 0x10ffff); // FIXME: #7609: should be enforced on all `char`
2090         unsafe {
2091             let code = c as uint;
2092             let nb = if code < MAX_ONE_B { 1u }
2093             else if code < MAX_TWO_B { 2u }
2094             else if code < MAX_THREE_B { 3u }
2095             else { 4u };
2096             let len = self.len();
2097             let new_len = len + nb;
2098             self.reserve_at_least(new_len);
2099             let off = len;
2100             do self.as_mut_buf |buf, _len| {
2101                 match nb {
2102                     1u => {
2103                         *ptr::mut_offset(buf, off) = code as u8;
2104                     }
2105                     2u => {
2106                         *ptr::mut_offset(buf, off) = (code >> 6u & 31u | TAG_TWO_B) as u8;
2107                         *ptr::mut_offset(buf, off + 1u) = (code & 63u | TAG_CONT) as u8;
2108                     }
2109                     3u => {
2110                         *ptr::mut_offset(buf, off) = (code >> 12u & 15u | TAG_THREE_B) as u8;
2111                         *ptr::mut_offset(buf, off + 1u) = (code >> 6u & 63u | TAG_CONT) as u8;
2112                         *ptr::mut_offset(buf, off + 2u) = (code & 63u | TAG_CONT) as u8;
2113                     }
2114                     4u => {
2115                         *ptr::mut_offset(buf, off) = (code >> 18u & 7u | TAG_FOUR_B) as u8;
2116                         *ptr::mut_offset(buf, off + 1u) = (code >> 12u & 63u | TAG_CONT) as u8;
2117                         *ptr::mut_offset(buf, off + 2u) = (code >> 6u & 63u | TAG_CONT) as u8;
2118                         *ptr::mut_offset(buf, off + 3u) = (code & 63u | TAG_CONT) as u8;
2119                     }
2120                     _ => {}
2121                 }
2122             }
2123             raw::set_len(self, new_len);
2124         }
2125     }
2126     /**
2127      * Remove the final character from a string and return it
2128      *
2129      * # Failure
2130      *
2131      * If the string does not contain any characters
2132      */
2133     fn pop_char(&mut self) -> char {
2134         let end = self.len();
2135         assert!(end > 0u);
2136         let CharRange {ch, next} = self.char_range_at_reverse(end);
2137         unsafe { raw::set_len(self, next); }
2138         return ch;
2139     }
2140
2141     /**
2142      * Remove the first character from a string and return it
2143      *
2144      * # Failure
2145      *
2146      * If the string does not contain any characters
2147      */
2148     fn shift_char(&mut self) -> char {
2149         let CharRange {ch, next} = self.char_range_at(0u);
2150         *self = self.slice(next, self.len()).to_owned();
2151         return ch;
2152     }
2153
2154     /// Prepend a char to a string
2155     fn unshift_char(&mut self, ch: char) {
2156         // This could be more efficient.
2157         let mut new_str = ~"";
2158         new_str.push_char(ch);
2159         new_str.push_str(*self);
2160         *self = new_str;
2161     }
2162
2163     /// Concatenate two strings together.
2164     #[inline]
2165     fn append(&self, rhs: &str) -> ~str {
2166         // FIXME #4850: this should consume self, but that causes segfaults
2167         let mut v = self.clone();
2168         v.push_str_no_overallocate(rhs);
2169         v
2170     }
2171
2172     /**
2173      * Reserves capacity for exactly `n` bytes in the given string, not including
2174      * the null terminator.
2175      *
2176      * Assuming single-byte characters, the resulting string will be large
2177      * enough to hold a string of length `n`. To account for the null terminator,
2178      * the underlying buffer will have the size `n` + 1.
2179      *
2180      * If the capacity for `s` is already equal to or greater than the requested
2181      * capacity, then no action is taken.
2182      *
2183      * # Arguments
2184      *
2185      * * s - A string
2186      * * n - The number of bytes to reserve space for
2187      */
2188     #[inline]
2189     pub fn reserve(&mut self, n: uint) {
2190         unsafe {
2191             let v: *mut ~[u8] = cast::transmute(self);
2192             (*v).reserve(n + 1);
2193         }
2194     }
2195
2196     /**
2197      * Reserves capacity for at least `n` bytes in the given string, not including
2198      * the null terminator.
2199      *
2200      * Assuming single-byte characters, the resulting string will be large
2201      * enough to hold a string of length `n`. To account for the null terminator,
2202      * the underlying buffer will have the size `n` + 1.
2203      *
2204      * This function will over-allocate in order to amortize the allocation costs
2205      * in scenarios where the caller may need to repeatedly reserve additional
2206      * space.
2207      *
2208      * If the capacity for `s` is already equal to or greater than the requested
2209      * capacity, then no action is taken.
2210      *
2211      * # Arguments
2212      *
2213      * * s - A string
2214      * * n - The number of bytes to reserve space for
2215      */
2216     #[inline]
2217     fn reserve_at_least(&mut self, n: uint) {
2218         self.reserve(uint::next_power_of_two(n + 1u) - 1u)
2219     }
2220
2221     /**
2222      * Returns the number of single-byte characters the string can hold without
2223      * reallocating
2224      */
2225     fn capacity(&self) -> uint {
2226         let buf: &~[u8] = unsafe { cast::transmute(self) };
2227         let vcap = buf.capacity();
2228         assert!(vcap > 0u);
2229         vcap - 1u
2230     }
2231
2232     /// Convert to a vector of bytes. This does not allocate a new
2233     /// string, and includes the null terminator.
2234     #[inline]
2235     fn to_bytes_with_null(self) -> ~[u8] {
2236         unsafe { ::cast::transmute(self) }
2237     }
2238
2239     #[inline]
2240     fn as_mut_buf<T>(&mut self, f: &fn(*mut u8, uint) -> T) -> T {
2241         let v: &mut ~[u8] = unsafe { cast::transmute(self) };
2242         v.as_mut_buf(f)
2243     }
2244 }
2245
2246 impl Clone for ~str {
2247     #[inline]
2248     fn clone(&self) -> ~str {
2249         self.to_owned()
2250     }
2251 }
2252
2253 impl Clone for @str {
2254     #[inline]
2255     fn clone(&self) -> @str {
2256         *self
2257     }
2258 }
2259
2260 /// External iterator for a string's characters. Use with the `std::iterator`
2261 /// module.
2262 #[deriving(Clone)]
2263 pub struct StrCharIterator<'self> {
2264     priv index: uint,
2265     priv string: &'self str,
2266 }
2267
2268 impl<'self> Iterator<char> for StrCharIterator<'self> {
2269     #[inline]
2270     fn next(&mut self) -> Option<char> {
2271         if self.index < self.string.len() {
2272             let CharRange {ch, next} = self.string.char_range_at(self.index);
2273             self.index = next;
2274             Some(ch)
2275         } else {
2276             None
2277         }
2278     }
2279 }
2280 /// External iterator for a string's characters in reverse order. Use
2281 /// with the `std::iterator` module.
2282 #[deriving(Clone)]
2283 pub struct StrCharRevIterator<'self> {
2284     priv index: uint,
2285     priv string: &'self str,
2286 }
2287
2288 impl<'self> Iterator<char> for StrCharRevIterator<'self> {
2289     #[inline]
2290     fn next(&mut self) -> Option<char> {
2291         if self.index > 0 {
2292             let CharRange {ch, next} = self.string.char_range_at_reverse(self.index);
2293             self.index = next;
2294             Some(ch)
2295         } else {
2296             None
2297         }
2298     }
2299 }
2300
2301 /// External iterator for a string's bytes. Use with the `std::iterator`
2302 /// module.
2303 #[deriving(Clone)]
2304 pub struct StrBytesIterator<'self> {
2305     priv it: vec::VecIterator<'self, u8>
2306 }
2307
2308 impl<'self> Iterator<u8> for StrBytesIterator<'self> {
2309     #[inline]
2310     fn next(&mut self) -> Option<u8> {
2311         self.it.next().map_consume(|&x| x)
2312     }
2313 }
2314
2315 /// External iterator for a string's bytes in reverse order. Use with
2316 /// the `std::iterator` module.
2317 #[deriving(Clone)]
2318 pub struct StrBytesRevIterator<'self> {
2319     priv it: vec::VecRevIterator<'self, u8>
2320 }
2321
2322 impl<'self> Iterator<u8> for StrBytesRevIterator<'self> {
2323     #[inline]
2324     fn next(&mut self) -> Option<u8> {
2325         self.it.next().map_consume(|&x| x)
2326     }
2327 }
2328
2329 // This works because every lifetime is a sub-lifetime of 'static
2330 impl<'self> Zero for &'self str {
2331     fn zero() -> &'self str { "" }
2332     fn is_zero(&self) -> bool { self.is_empty() }
2333 }
2334
2335 impl Zero for ~str {
2336     fn zero() -> ~str { ~"" }
2337     fn is_zero(&self) -> bool { self.len() == 0 }
2338 }
2339
2340 impl Zero for @str {
2341     fn zero() -> @str { @"" }
2342     fn is_zero(&self) -> bool { self.len() == 0 }
2343 }
2344
2345 #[cfg(test)]
2346 mod tests {
2347     use iterator::IteratorUtil;
2348     use container::Container;
2349     use option::Some;
2350     use libc::c_char;
2351     use libc;
2352     use ptr;
2353     use str::*;
2354     use uint;
2355     use vec;
2356     use vec::{ImmutableVector, CopyableVector};
2357     use cmp::{TotalOrd, Less, Equal, Greater};
2358
2359     #[test]
2360     fn test_eq() {
2361         assert!((eq(&~"", &~"")));
2362         assert!((eq(&~"foo", &~"foo")));
2363         assert!((!eq(&~"foo", &~"bar")));
2364     }
2365
2366     #[test]
2367     fn test_eq_slice() {
2368         assert!((eq_slice("foobar".slice(0, 3), "foo")));
2369         assert!((eq_slice("barfoo".slice(3, 6), "foo")));
2370         assert!((!eq_slice("foo1", "foo2")));
2371     }
2372
2373     #[test]
2374     fn test_le() {
2375         assert!("" <= "");
2376         assert!("" <= "foo");
2377         assert!("foo" <= "foo");
2378         assert!("foo" != "bar");
2379     }
2380
2381     #[test]
2382     fn test_len() {
2383         assert_eq!("".len(), 0u);
2384         assert_eq!("hello world".len(), 11u);
2385         assert_eq!("\x63".len(), 1u);
2386         assert_eq!("\xa2".len(), 2u);
2387         assert_eq!("\u03c0".len(), 2u);
2388         assert_eq!("\u2620".len(), 3u);
2389         assert_eq!("\U0001d11e".len(), 4u);
2390
2391         assert_eq!("".char_len(), 0u);
2392         assert_eq!("hello world".char_len(), 11u);
2393         assert_eq!("\x63".char_len(), 1u);
2394         assert_eq!("\xa2".char_len(), 1u);
2395         assert_eq!("\u03c0".char_len(), 1u);
2396         assert_eq!("\u2620".char_len(), 1u);
2397         assert_eq!("\U0001d11e".char_len(), 1u);
2398         assert_eq!("ประเทศไทย中华Việt Nam".char_len(), 19u);
2399     }
2400
2401     #[test]
2402     fn test_find() {
2403         assert_eq!("hello".find('l'), Some(2u));
2404         assert_eq!("hello".find(|c:char| c == 'o'), Some(4u));
2405         assert!("hello".find('x').is_none());
2406         assert!("hello".find(|c:char| c == 'x').is_none());
2407         assert_eq!("ประเทศไทย中华Việt Nam".find('华'), Some(30u));
2408         assert_eq!("ประเทศไทย中华Việt Nam".find(|c: char| c == '华'), Some(30u));
2409     }
2410
2411     #[test]
2412     fn test_rfind() {
2413         assert_eq!("hello".rfind('l'), Some(3u));
2414         assert_eq!("hello".rfind(|c:char| c == 'o'), Some(4u));
2415         assert!("hello".rfind('x').is_none());
2416         assert!("hello".rfind(|c:char| c == 'x').is_none());
2417         assert_eq!("ประเทศไทย中华Việt Nam".rfind('华'), Some(30u));
2418         assert_eq!("ประเทศไทย中华Việt Nam".rfind(|c: char| c == '华'), Some(30u));
2419     }
2420
2421     #[test]
2422     fn test_push_str() {
2423         let mut s = ~"";
2424         s.push_str("");
2425         assert_eq!(s.slice_from(0), "");
2426         s.push_str("abc");
2427         assert_eq!(s.slice_from(0), "abc");
2428         s.push_str("ประเทศไทย中华Việt Nam");
2429         assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
2430     }
2431     #[test]
2432     fn test_append() {
2433         let mut s = ~"";
2434         s = s.append("");
2435         assert_eq!(s.slice_from(0), "");
2436         s = s.append("abc");
2437         assert_eq!(s.slice_from(0), "abc");
2438         s = s.append("ประเทศไทย中华Việt Nam");
2439         assert_eq!(s.slice_from(0), "abcประเทศไทย中华Việt Nam");
2440     }
2441
2442     #[test]
2443     fn test_pop_char() {
2444         let mut data = ~"ประเทศไทย中华";
2445         let cc = data.pop_char();
2446         assert_eq!(~"ประเทศไทย中", data);
2447         assert_eq!('华', cc);
2448     }
2449
2450     #[test]
2451     fn test_pop_char_2() {
2452         let mut data2 = ~"华";
2453         let cc2 = data2.pop_char();
2454         assert_eq!(~"", data2);
2455         assert_eq!('华', cc2);
2456     }
2457
2458     #[test]
2459     #[should_fail]
2460     #[ignore(cfg(windows))]
2461     fn test_pop_char_fail() {
2462         let mut data = ~"";
2463         let _cc3 = data.pop_char();
2464     }
2465
2466     #[test]
2467     fn test_push_char() {
2468         let mut data = ~"ประเทศไทย中";
2469         data.push_char('华');
2470         data.push_char('b'); // 1 byte
2471         data.push_char('¢'); // 2 byte
2472         data.push_char('€'); // 3 byte
2473         data.push_char('𤭢'); // 4 byte
2474         assert_eq!(~"ประเทศไทย中华b¢€𤭢", data);
2475     }
2476
2477     #[test]
2478     fn test_shift_char() {
2479         let mut data = ~"ประเทศไทย中";
2480         let cc = data.shift_char();
2481         assert_eq!(~"ระเทศไทย中", data);
2482         assert_eq!('ป', cc);
2483     }
2484
2485     #[test]
2486     fn test_unshift_char() {
2487         let mut data = ~"ประเทศไทย中";
2488         data.unshift_char('华');
2489         assert_eq!(~"华ประเทศไทย中", data);
2490     }
2491
2492     #[test]
2493     fn test_clear() {
2494         let mut empty = ~"";
2495         empty.clear();
2496         assert_eq!("", empty.as_slice());
2497         let mut data = ~"ประเทศไทย中";
2498         data.clear();
2499         assert_eq!("", data.as_slice());
2500         data.push_char('华');
2501         assert_eq!("华", data.as_slice());
2502     }
2503
2504     #[test]
2505     fn test_split_within() {
2506         fn t(s: &str, i: uint, u: &[~str]) {
2507             let mut v = ~[];
2508             for each_split_within(s, i) |s| { v.push(s.to_owned()) }
2509             assert!(v.iter().zip(u.iter()).all(|(a,b)| a == b));
2510         }
2511         t("", 0, []);
2512         t("", 15, []);
2513         t("hello", 15, [~"hello"]);
2514         t("\nMary had a little lamb\nLittle lamb\n", 15,
2515             [~"Mary had a", ~"little lamb", ~"Little lamb"]);
2516         t("\nMary had a little lamb\nLittle lamb\n", uint::max_value,
2517             [~"Mary had a little lamb\nLittle lamb"]);
2518     }
2519
2520     #[test]
2521     fn test_find_str() {
2522         // byte positions
2523         assert_eq!("".find_str(""), Some(0u));
2524         assert!("banana".find_str("apple pie").is_none());
2525
2526         let data = "abcabc";
2527         assert_eq!(data.slice(0u, 6u).find_str("ab"), Some(0u));
2528         assert_eq!(data.slice(2u, 6u).find_str("ab"), Some(3u - 2u));
2529         assert!(data.slice(2u, 4u).find_str("ab").is_none());
2530
2531         let mut data = ~"ประเทศไทย中华Việt Nam";
2532         data = data + data;
2533         assert!(data.find_str("ไท华").is_none());
2534         assert_eq!(data.slice(0u, 43u).find_str(""), Some(0u));
2535         assert_eq!(data.slice(6u, 43u).find_str(""), Some(6u - 6u));
2536
2537         assert_eq!(data.slice(0u, 43u).find_str("ประ"), Some( 0u));
2538         assert_eq!(data.slice(0u, 43u).find_str("ทศไ"), Some(12u));
2539         assert_eq!(data.slice(0u, 43u).find_str("ย中"), Some(24u));
2540         assert_eq!(data.slice(0u, 43u).find_str("iệt"), Some(34u));
2541         assert_eq!(data.slice(0u, 43u).find_str("Nam"), Some(40u));
2542
2543         assert_eq!(data.slice(43u, 86u).find_str("ประ"), Some(43u - 43u));
2544         assert_eq!(data.slice(43u, 86u).find_str("ทศไ"), Some(55u - 43u));
2545         assert_eq!(data.slice(43u, 86u).find_str("ย中"), Some(67u - 43u));
2546         assert_eq!(data.slice(43u, 86u).find_str("iệt"), Some(77u - 43u));
2547         assert_eq!(data.slice(43u, 86u).find_str("Nam"), Some(83u - 43u));
2548     }
2549
2550     #[test]
2551     fn test_slice_chars() {
2552         fn t(a: &str, b: &str, start: uint) {
2553             assert_eq!(a.slice_chars(start, start + b.char_len()), b);
2554         }
2555         t("hello", "llo", 2);
2556         t("hello", "el", 1);
2557         assert_eq!("ะเทศไท", "ประเทศไทย中华Việt Nam".slice_chars(2, 8));
2558     }
2559
2560     #[test]
2561     fn test_concat() {
2562         fn t(v: &[~str], s: &str) {
2563             assert_eq!(v.concat(), s.to_str());
2564         }
2565         t([~"you", ~"know", ~"I'm", ~"no", ~"good"], "youknowI'mnogood");
2566         let v: &[~str] = [];
2567         t(v, "");
2568         t([~"hi"], "hi");
2569     }
2570
2571     #[test]
2572     fn test_connect() {
2573         fn t(v: &[~str], sep: &str, s: &str) {
2574             assert_eq!(v.connect(sep), s.to_str());
2575         }
2576         t([~"you", ~"know", ~"I'm", ~"no", ~"good"],
2577           " ", "you know I'm no good");
2578         let v: &[~str] = [];
2579         t(v, " ", "");
2580         t([~"hi"], " ", "hi");
2581     }
2582
2583     #[test]
2584     fn test_concat_slices() {
2585         fn t(v: &[&str], s: &str) {
2586             assert_eq!(v.concat(), s.to_str());
2587         }
2588         t(["you", "know", "I'm", "no", "good"], "youknowI'mnogood");
2589         let v: &[&str] = [];
2590         t(v, "");
2591         t(["hi"], "hi");
2592     }
2593
2594     #[test]
2595     fn test_connect_slices() {
2596         fn t(v: &[&str], sep: &str, s: &str) {
2597             assert_eq!(v.connect(sep), s.to_str());
2598         }
2599         t(["you", "know", "I'm", "no", "good"],
2600           " ", "you know I'm no good");
2601         t([], " ", "");
2602         t(["hi"], " ", "hi");
2603     }
2604
2605     #[test]
2606     fn test_repeat() {
2607         assert_eq!("x".repeat(4), ~"xxxx");
2608         assert_eq!("hi".repeat(4), ~"hihihihi");
2609         assert_eq!("ไท华".repeat(3), ~"ไท华ไท华ไท华");
2610         assert_eq!("".repeat(4), ~"");
2611         assert_eq!("hi".repeat(0), ~"");
2612     }
2613
2614     #[test]
2615     fn test_unsafe_slice() {
2616         assert_eq!("ab", unsafe {raw::slice_bytes("abc", 0, 2)});
2617         assert_eq!("bc", unsafe {raw::slice_bytes("abc", 1, 3)});
2618         assert_eq!("", unsafe {raw::slice_bytes("abc", 1, 1)});
2619         fn a_million_letter_a() -> ~str {
2620             let mut i = 0;
2621             let mut rs = ~"";
2622             while i < 100000 { rs.push_str("aaaaaaaaaa"); i += 1; }
2623             rs
2624         }
2625         fn half_a_million_letter_a() -> ~str {
2626             let mut i = 0;
2627             let mut rs = ~"";
2628             while i < 100000 { rs.push_str("aaaaa"); i += 1; }
2629             rs
2630         }
2631         let letters = a_million_letter_a();
2632         assert!(half_a_million_letter_a() ==
2633             unsafe {raw::slice_bytes(letters, 0u, 500000)}.to_owned());
2634     }
2635
2636     #[test]
2637     fn test_starts_with() {
2638         assert!(("".starts_with("")));
2639         assert!(("abc".starts_with("")));
2640         assert!(("abc".starts_with("a")));
2641         assert!((!"a".starts_with("abc")));
2642         assert!((!"".starts_with("abc")));
2643     }
2644
2645     #[test]
2646     fn test_ends_with() {
2647         assert!(("".ends_with("")));
2648         assert!(("abc".ends_with("")));
2649         assert!(("abc".ends_with("c")));
2650         assert!((!"a".ends_with("abc")));
2651         assert!((!"".ends_with("abc")));
2652     }
2653
2654     #[test]
2655     fn test_is_empty() {
2656         assert!("".is_empty());
2657         assert!(!"a".is_empty());
2658     }
2659
2660     #[test]
2661     fn test_replace() {
2662         let a = "a";
2663         assert_eq!("".replace(a, "b"), ~"");
2664         assert_eq!("a".replace(a, "b"), ~"b");
2665         assert_eq!("ab".replace(a, "b"), ~"bb");
2666         let test = "test";
2667         assert!(" test test ".replace(test, "toast") ==
2668             ~" toast toast ");
2669         assert_eq!(" test test ".replace(test, ""), ~"   ");
2670     }
2671
2672     #[test]
2673     fn test_replace_2a() {
2674         let data = ~"ประเทศไทย中华";
2675         let repl = ~"دولة الكويت";
2676
2677         let a = ~"ประเ";
2678         let A = ~"دولة الكويتทศไทย中华";
2679         assert_eq!(data.replace(a, repl), A);
2680     }
2681
2682     #[test]
2683     fn test_replace_2b() {
2684         let data = ~"ประเทศไทย中华";
2685         let repl = ~"دولة الكويت";
2686
2687         let b = ~"ะเ";
2688         let B = ~"ปรدولة الكويتทศไทย中华";
2689         assert_eq!(data.replace(b,   repl), B);
2690     }
2691
2692     #[test]
2693     fn test_replace_2c() {
2694         let data = ~"ประเทศไทย中华";
2695         let repl = ~"دولة الكويت";
2696
2697         let c = ~"中华";
2698         let C = ~"ประเทศไทยدولة الكويت";
2699         assert_eq!(data.replace(c, repl), C);
2700     }
2701
2702     #[test]
2703     fn test_replace_2d() {
2704         let data = ~"ประเทศไทย中华";
2705         let repl = ~"دولة الكويت";
2706
2707         let d = ~"ไท华";
2708         assert_eq!(data.replace(d, repl), data);
2709     }
2710
2711     #[test]
2712     fn test_slice() {
2713         assert_eq!("ab", "abc".slice(0, 2));
2714         assert_eq!("bc", "abc".slice(1, 3));
2715         assert_eq!("", "abc".slice(1, 1));
2716         assert_eq!("\u65e5", "\u65e5\u672c".slice(0, 3));
2717
2718         let data = "ประเทศไทย中华";
2719         assert_eq!("ป", data.slice(0, 3));
2720         assert_eq!("ร", data.slice(3, 6));
2721         assert_eq!("", data.slice(3, 3));
2722         assert_eq!("华", data.slice(30, 33));
2723
2724         fn a_million_letter_X() -> ~str {
2725             let mut i = 0;
2726             let mut rs = ~"";
2727             while i < 100000 {
2728                 push_str(&mut rs, "华华华华华华华华华华");
2729                 i += 1;
2730             }
2731             rs
2732         }
2733         fn half_a_million_letter_X() -> ~str {
2734             let mut i = 0;
2735             let mut rs = ~"";
2736             while i < 100000 { push_str(&mut rs, "华华华华华"); i += 1; }
2737             rs
2738         }
2739         let letters = a_million_letter_X();
2740         assert!(half_a_million_letter_X() ==
2741             letters.slice(0u, 3u * 500000u).to_owned());
2742     }
2743
2744     #[test]
2745     fn test_slice_2() {
2746         let ss = "中华Việt Nam";
2747
2748         assert_eq!("华", ss.slice(3u, 6u));
2749         assert_eq!("Việt Nam", ss.slice(6u, 16u));
2750
2751         assert_eq!("ab", "abc".slice(0u, 2u));
2752         assert_eq!("bc", "abc".slice(1u, 3u));
2753         assert_eq!("", "abc".slice(1u, 1u));
2754
2755         assert_eq!("中", ss.slice(0u, 3u));
2756         assert_eq!("华V", ss.slice(3u, 7u));
2757         assert_eq!("", ss.slice(3u, 3u));
2758         /*0: 中
2759           3: 华
2760           6: V
2761           7: i
2762           8: ệ
2763          11: t
2764          12:
2765          13: N
2766          14: a
2767          15: m */
2768     }
2769
2770     #[test]
2771     #[should_fail]
2772     #[ignore(cfg(windows))]
2773     fn test_slice_fail() {
2774         "中华Việt Nam".slice(0u, 2u);
2775     }
2776
2777     #[test]
2778     fn test_slice_from() {
2779         assert_eq!("abcd".slice_from(0), "abcd");
2780         assert_eq!("abcd".slice_from(2), "cd");
2781         assert_eq!("abcd".slice_from(4), "");
2782     }
2783     #[test]
2784     fn test_slice_to() {
2785         assert_eq!("abcd".slice_to(0), "");
2786         assert_eq!("abcd".slice_to(2), "ab");
2787         assert_eq!("abcd".slice_to(4), "abcd");
2788     }
2789
2790     #[test]
2791     fn test_trim_left_chars() {
2792         let v: &[char] = &[];
2793         assert_eq!(" *** foo *** ".trim_left_chars(&v), " *** foo *** ");
2794         assert_eq!(" *** foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
2795         assert_eq!(" ***  *** ".trim_left_chars(& &['*', ' ']), "");
2796         assert_eq!("foo *** ".trim_left_chars(& &['*', ' ']), "foo *** ");
2797
2798         assert_eq!("11foo1bar11".trim_left_chars(&'1'), "foo1bar11");
2799         assert_eq!("12foo1bar12".trim_left_chars(& &['1', '2']), "foo1bar12");
2800         assert_eq!("123foo1bar123".trim_left_chars(&|c: char| c.is_digit()), "foo1bar123");
2801     }
2802
2803     #[test]
2804     fn test_trim_right_chars() {
2805         let v: &[char] = &[];
2806         assert_eq!(" *** foo *** ".trim_right_chars(&v), " *** foo *** ");
2807         assert_eq!(" *** foo *** ".trim_right_chars(& &['*', ' ']), " *** foo");
2808         assert_eq!(" ***  *** ".trim_right_chars(& &['*', ' ']), "");
2809         assert_eq!(" *** foo".trim_right_chars(& &['*', ' ']), " *** foo");
2810
2811         assert_eq!("11foo1bar11".trim_right_chars(&'1'), "11foo1bar");
2812         assert_eq!("12foo1bar12".trim_right_chars(& &['1', '2']), "12foo1bar");
2813         assert_eq!("123foo1bar123".trim_right_chars(&|c: char| c.is_digit()), "123foo1bar");
2814     }
2815
2816     #[test]
2817     fn test_trim_chars() {
2818         let v: &[char] = &[];
2819         assert_eq!(" *** foo *** ".trim_chars(&v), " *** foo *** ");
2820         assert_eq!(" *** foo *** ".trim_chars(& &['*', ' ']), "foo");
2821         assert_eq!(" ***  *** ".trim_chars(& &['*', ' ']), "");
2822         assert_eq!("foo".trim_chars(& &['*', ' ']), "foo");
2823
2824         assert_eq!("11foo1bar11".trim_chars(&'1'), "foo1bar");
2825         assert_eq!("12foo1bar12".trim_chars(& &['1', '2']), "foo1bar");
2826         assert_eq!("123foo1bar123".trim_chars(&|c: char| c.is_digit()), "foo1bar");
2827     }
2828
2829     #[test]
2830     fn test_trim_left() {
2831         assert_eq!("".trim_left(), "");
2832         assert_eq!("a".trim_left(), "a");
2833         assert_eq!("    ".trim_left(), "");
2834         assert_eq!("     blah".trim_left(), "blah");
2835         assert_eq!("   \u3000  wut".trim_left(), "wut");
2836         assert_eq!("hey ".trim_left(), "hey ");
2837     }
2838
2839     #[test]
2840     fn test_trim_right() {
2841         assert_eq!("".trim_right(), "");
2842         assert_eq!("a".trim_right(), "a");
2843         assert_eq!("    ".trim_right(), "");
2844         assert_eq!("blah     ".trim_right(), "blah");
2845         assert_eq!("wut   \u3000  ".trim_right(), "wut");
2846         assert_eq!(" hey".trim_right(), " hey");
2847     }
2848
2849     #[test]
2850     fn test_trim() {
2851         assert_eq!("".trim(), "");
2852         assert_eq!("a".trim(), "a");
2853         assert_eq!("    ".trim(), "");
2854         assert_eq!("    blah     ".trim(), "blah");
2855         assert_eq!("\nwut   \u3000  ".trim(), "wut");
2856         assert_eq!(" hey dude ".trim(), "hey dude");
2857     }
2858
2859     #[test]
2860     fn test_is_whitespace() {
2861         assert!("".is_whitespace());
2862         assert!(" ".is_whitespace());
2863         assert!("\u2009".is_whitespace()); // Thin space
2864         assert!("  \n\t   ".is_whitespace());
2865         assert!(!"   _   ".is_whitespace());
2866     }
2867
2868     #[test]
2869     fn test_shift_byte() {
2870         let mut s = ~"ABC";
2871         let b = unsafe{raw::shift_byte(&mut s)};
2872         assert_eq!(s, ~"BC");
2873         assert_eq!(b, 65u8);
2874     }
2875
2876     #[test]
2877     fn test_pop_byte() {
2878         let mut s = ~"ABC";
2879         let b = unsafe{raw::pop_byte(&mut s)};
2880         assert_eq!(s, ~"AB");
2881         assert_eq!(b, 67u8);
2882     }
2883
2884     #[test]
2885     fn test_unsafe_from_bytes() {
2886         let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8];
2887         let b = unsafe { raw::from_bytes(a) };
2888         assert_eq!(b, ~"AAAAAAA");
2889     }
2890
2891     #[test]
2892     fn test_from_bytes() {
2893         let ss = ~"ศไทย中华Việt Nam";
2894         let bb = ~[0xe0_u8, 0xb8_u8, 0xa8_u8,
2895                   0xe0_u8, 0xb9_u8, 0x84_u8,
2896                   0xe0_u8, 0xb8_u8, 0x97_u8,
2897                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2898                   0xe4_u8, 0xb8_u8, 0xad_u8,
2899                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2900                   0x56_u8, 0x69_u8, 0xe1_u8,
2901                   0xbb_u8, 0x87_u8, 0x74_u8,
2902                   0x20_u8, 0x4e_u8, 0x61_u8,
2903                   0x6d_u8];
2904
2905         assert_eq!(ss, from_bytes(bb));
2906     }
2907
2908     #[test]
2909     #[ignore(cfg(windows))]
2910     fn test_from_bytes_fail() {
2911         use str::not_utf8::cond;
2912
2913         let bb = ~[0xff_u8, 0xb8_u8, 0xa8_u8,
2914                   0xe0_u8, 0xb9_u8, 0x84_u8,
2915                   0xe0_u8, 0xb8_u8, 0x97_u8,
2916                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2917                   0xe4_u8, 0xb8_u8, 0xad_u8,
2918                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2919                   0x56_u8, 0x69_u8, 0xe1_u8,
2920                   0xbb_u8, 0x87_u8, 0x74_u8,
2921                   0x20_u8, 0x4e_u8, 0x61_u8,
2922                   0x6d_u8];
2923
2924         let mut error_happened = false;
2925         let _x = do cond.trap(|err| {
2926             assert_eq!(err, ~"from_bytes: input is not UTF-8; first bad byte is 255");
2927             error_happened = true;
2928             ~""
2929         }).in {
2930             from_bytes(bb)
2931         };
2932         assert!(error_happened);
2933     }
2934
2935     #[test]
2936     fn test_unsafe_from_bytes_with_null() {
2937         let a = [65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
2938         let b = unsafe { raw::from_bytes_with_null(a) };
2939         assert_eq!(b, "AAAAAAA");
2940     }
2941
2942     #[test]
2943     fn test_from_bytes_with_null() {
2944         let ss = "ศไทย中华Việt Nam";
2945         let bb = [0xe0_u8, 0xb8_u8, 0xa8_u8,
2946                   0xe0_u8, 0xb9_u8, 0x84_u8,
2947                   0xe0_u8, 0xb8_u8, 0x97_u8,
2948                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2949                   0xe4_u8, 0xb8_u8, 0xad_u8,
2950                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2951                   0x56_u8, 0x69_u8, 0xe1_u8,
2952                   0xbb_u8, 0x87_u8, 0x74_u8,
2953                   0x20_u8, 0x4e_u8, 0x61_u8,
2954                   0x6d_u8, 0x0_u8];
2955
2956         assert_eq!(ss, from_bytes_with_null(bb));
2957     }
2958
2959     #[test]
2960     #[should_fail]
2961     #[ignore(cfg(windows))]
2962     fn test_from_bytes_with_null_fail() {
2963         let bb = [0xff_u8, 0xb8_u8, 0xa8_u8,
2964                   0xe0_u8, 0xb9_u8, 0x84_u8,
2965                   0xe0_u8, 0xb8_u8, 0x97_u8,
2966                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2967                   0xe4_u8, 0xb8_u8, 0xad_u8,
2968                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2969                   0x56_u8, 0x69_u8, 0xe1_u8,
2970                   0xbb_u8, 0x87_u8, 0x74_u8,
2971                   0x20_u8, 0x4e_u8, 0x61_u8,
2972                   0x6d_u8, 0x0_u8];
2973
2974          let _x = from_bytes_with_null(bb);
2975     }
2976
2977     #[test]
2978     #[should_fail]
2979     #[ignore(cfg(windows))]
2980     fn test_from_bytes_with_null_fail_2() {
2981         let bb = [0xff_u8, 0xb8_u8, 0xa8_u8,
2982                   0xe0_u8, 0xb9_u8, 0x84_u8,
2983                   0xe0_u8, 0xb8_u8, 0x97_u8,
2984                   0xe0_u8, 0xb8_u8, 0xa2_u8,
2985                   0xe4_u8, 0xb8_u8, 0xad_u8,
2986                   0xe5_u8, 0x8d_u8, 0x8e_u8,
2987                   0x56_u8, 0x69_u8, 0xe1_u8,
2988                   0xbb_u8, 0x87_u8, 0x74_u8,
2989                   0x20_u8, 0x4e_u8, 0x61_u8,
2990                   0x6d_u8, 0x60_u8];
2991
2992          let _x = from_bytes_with_null(bb);
2993     }
2994
2995     #[test]
2996     fn test_from_buf() {
2997         unsafe {
2998             let a = ~[65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 65u8, 0u8];
2999             let b = vec::raw::to_ptr(a);
3000             let c = raw::from_buf(b);
3001             assert_eq!(c, ~"AAAAAAA");
3002         }
3003     }
3004
3005     #[test]
3006     fn test_as_bytes() {
3007         // no null
3008         let v = [
3009             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3010             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3011             109
3012         ];
3013         assert_eq!("".as_bytes(), &[]);
3014         assert_eq!("abc".as_bytes(), &['a' as u8, 'b' as u8, 'c' as u8]);
3015         assert_eq!("ศไทย中华Việt Nam".as_bytes(), v);
3016     }
3017
3018     #[test]
3019     fn test_as_bytes_with_null() {
3020         // has null
3021         let v = [
3022             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3023             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3024             109, 0
3025         ];
3026
3027         let s1 = @"";
3028         let s2 = @"abc";
3029         let s3 = @"ศไทย中华Việt Nam";
3030         assert_eq!(s1.as_bytes_with_null(), &[0]);
3031         assert_eq!(s2.as_bytes_with_null(), &['a' as u8, 'b' as u8, 'c' as u8, 0]);
3032         assert_eq!(s3.as_bytes_with_null(), v);
3033
3034         let s1 = ~"";
3035         let s2 = ~"abc";
3036         let s3 = ~"ศไทย中华Việt Nam";
3037         assert_eq!(s1.as_bytes_with_null(), &[0]);
3038         assert_eq!(s2.as_bytes_with_null(), &['a' as u8, 'b' as u8, 'c' as u8, 0]);
3039         assert_eq!(s3.as_bytes_with_null(), v);
3040     }
3041
3042     #[test]
3043     fn test_to_bytes_with_null() {
3044         let s = ~"ศไทย中华Việt Nam";
3045         let v = ~[
3046             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3047             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3048             109, 0
3049         ];
3050         assert_eq!((~"").to_bytes_with_null(), ~[0]);
3051         assert_eq!((~"abc").to_bytes_with_null(),
3052                    ~['a' as u8, 'b' as u8, 'c' as u8, 0]);
3053         assert_eq!(s.to_bytes_with_null(), v);
3054     }
3055
3056     #[test]
3057     #[ignore(cfg(windows))]
3058     #[should_fail]
3059     fn test_as_bytes_fail() {
3060         // Don't double free. (I'm not sure if this exercises the
3061         // original problem code path anymore.)
3062         let s = ~"";
3063         let _bytes = s.as_bytes_with_null();
3064         fail!();
3065     }
3066
3067     #[test]
3068     fn test_as_imm_buf() {
3069         do "".as_imm_buf |buf, len| {
3070             assert_eq!(len, 1);
3071             unsafe {
3072                 assert_eq!(*ptr::offset(buf, 0), 0);
3073             }
3074         }
3075
3076         do "hello".as_imm_buf |buf, len| {
3077             assert_eq!(len, 6);
3078             unsafe {
3079                 assert_eq!(*ptr::offset(buf, 0), 'h' as u8);
3080                 assert_eq!(*ptr::offset(buf, 1), 'e' as u8);
3081                 assert_eq!(*ptr::offset(buf, 2), 'l' as u8);
3082                 assert_eq!(*ptr::offset(buf, 3), 'l' as u8);
3083                 assert_eq!(*ptr::offset(buf, 4), 'o' as u8);
3084                 assert_eq!(*ptr::offset(buf, 5), 0);
3085             }
3086         }
3087     }
3088
3089     #[test]
3090     fn test_as_c_str() {
3091         let a = ~"";
3092         do a.as_c_str |buf| {
3093             unsafe {
3094                 assert_eq!(*ptr::offset(buf, 0), 0);
3095             }
3096         }
3097
3098         let a = ~"hello";
3099         do a.as_c_str |buf| {
3100             unsafe {
3101                 assert_eq!(*ptr::offset(buf, 0), 'h' as libc::c_char);
3102                 assert_eq!(*ptr::offset(buf, 1), 'e' as libc::c_char);
3103                 assert_eq!(*ptr::offset(buf, 2), 'l' as libc::c_char);
3104                 assert_eq!(*ptr::offset(buf, 3), 'l' as libc::c_char);
3105                 assert_eq!(*ptr::offset(buf, 4), 'o' as libc::c_char);
3106                 assert_eq!(*ptr::offset(buf, 5), 0);
3107             }
3108         }
3109     }
3110
3111     #[test]
3112     fn test_subslice_offset() {
3113         let a = "kernelsprite";
3114         let b = a.slice(7, a.len());
3115         let c = a.slice(0, a.len() - 6);
3116         assert_eq!(a.subslice_offset(b), 7);
3117         assert_eq!(a.subslice_offset(c), 0);
3118
3119         let string = "a\nb\nc";
3120         let mut lines = ~[];
3121         for string.line_iter().advance |line| { lines.push(line) }
3122         assert_eq!(string.subslice_offset(lines[0]), 0);
3123         assert_eq!(string.subslice_offset(lines[1]), 2);
3124         assert_eq!(string.subslice_offset(lines[2]), 4);
3125     }
3126
3127     #[test]
3128     #[should_fail]
3129     fn test_subslice_offset_2() {
3130         let a = "alchemiter";
3131         let b = "cruxtruder";
3132         a.subslice_offset(b);
3133     }
3134
3135     #[test]
3136     fn vec_str_conversions() {
3137         let s1: ~str = ~"All mimsy were the borogoves";
3138
3139         let v: ~[u8] = s1.as_bytes().to_owned();
3140         let s2: ~str = from_bytes(v);
3141         let mut i: uint = 0u;
3142         let n1: uint = s1.len();
3143         let n2: uint = v.len();
3144         assert_eq!(n1, n2);
3145         while i < n1 {
3146             let a: u8 = s1[i];
3147             let b: u8 = s2[i];
3148             debug!(a);
3149             debug!(b);
3150             assert_eq!(a, b);
3151             i += 1u;
3152         }
3153     }
3154
3155     #[test]
3156     fn test_contains() {
3157         assert!("abcde".contains("bcd"));
3158         assert!("abcde".contains("abcd"));
3159         assert!("abcde".contains("bcde"));
3160         assert!("abcde".contains(""));
3161         assert!("".contains(""));
3162         assert!(!"abcde".contains("def"));
3163         assert!(!"".contains("a"));
3164
3165         let data = ~"ประเทศไทย中华Việt Nam";
3166         assert!(data.contains("ประเ"));
3167         assert!(data.contains("ะเ"));
3168         assert!(data.contains("中华"));
3169         assert!(!data.contains("ไท华"));
3170     }
3171
3172     #[test]
3173     fn test_contains_char() {
3174         assert!("abc".contains_char('b'));
3175         assert!("a".contains_char('a'));
3176         assert!(!"abc".contains_char('d'));
3177         assert!(!"".contains_char('a'));
3178     }
3179
3180     #[test]
3181     fn test_map() {
3182         assert_eq!(~"", "".map_chars(|c| unsafe {libc::toupper(c as c_char)} as char));
3183         assert_eq!(~"YMCA", "ymca".map_chars(|c| unsafe {libc::toupper(c as c_char)} as char));
3184     }
3185
3186     #[test]
3187     fn test_utf16() {
3188         let pairs =
3189             [(~"𐍅𐌿𐌻𐍆𐌹𐌻𐌰\n",
3190               ~[0xd800_u16, 0xdf45_u16, 0xd800_u16, 0xdf3f_u16,
3191                 0xd800_u16, 0xdf3b_u16, 0xd800_u16, 0xdf46_u16,
3192                 0xd800_u16, 0xdf39_u16, 0xd800_u16, 0xdf3b_u16,
3193                 0xd800_u16, 0xdf30_u16, 0x000a_u16]),
3194
3195              (~"𐐒𐑉𐐮𐑀𐐲𐑋 𐐏𐐲𐑍\n",
3196               ~[0xd801_u16, 0xdc12_u16, 0xd801_u16,
3197                 0xdc49_u16, 0xd801_u16, 0xdc2e_u16, 0xd801_u16,
3198                 0xdc40_u16, 0xd801_u16, 0xdc32_u16, 0xd801_u16,
3199                 0xdc4b_u16, 0x0020_u16, 0xd801_u16, 0xdc0f_u16,
3200                 0xd801_u16, 0xdc32_u16, 0xd801_u16, 0xdc4d_u16,
3201                 0x000a_u16]),
3202
3203              (~"𐌀𐌖𐌋𐌄𐌑𐌉·𐌌𐌄𐌕𐌄𐌋𐌉𐌑\n",
3204               ~[0xd800_u16, 0xdf00_u16, 0xd800_u16, 0xdf16_u16,
3205                 0xd800_u16, 0xdf0b_u16, 0xd800_u16, 0xdf04_u16,
3206                 0xd800_u16, 0xdf11_u16, 0xd800_u16, 0xdf09_u16,
3207                 0x00b7_u16, 0xd800_u16, 0xdf0c_u16, 0xd800_u16,
3208                 0xdf04_u16, 0xd800_u16, 0xdf15_u16, 0xd800_u16,
3209                 0xdf04_u16, 0xd800_u16, 0xdf0b_u16, 0xd800_u16,
3210                 0xdf09_u16, 0xd800_u16, 0xdf11_u16, 0x000a_u16 ]),
3211
3212              (~"𐒋𐒘𐒈𐒑𐒛𐒒 𐒕𐒓 𐒈𐒚𐒍 𐒏𐒜𐒒𐒖𐒆 𐒕𐒆\n",
3213               ~[0xd801_u16, 0xdc8b_u16, 0xd801_u16, 0xdc98_u16,
3214                 0xd801_u16, 0xdc88_u16, 0xd801_u16, 0xdc91_u16,
3215                 0xd801_u16, 0xdc9b_u16, 0xd801_u16, 0xdc92_u16,
3216                 0x0020_u16, 0xd801_u16, 0xdc95_u16, 0xd801_u16,
3217                 0xdc93_u16, 0x0020_u16, 0xd801_u16, 0xdc88_u16,
3218                 0xd801_u16, 0xdc9a_u16, 0xd801_u16, 0xdc8d_u16,
3219                 0x0020_u16, 0xd801_u16, 0xdc8f_u16, 0xd801_u16,
3220                 0xdc9c_u16, 0xd801_u16, 0xdc92_u16, 0xd801_u16,
3221                 0xdc96_u16, 0xd801_u16, 0xdc86_u16, 0x0020_u16,
3222                 0xd801_u16, 0xdc95_u16, 0xd801_u16, 0xdc86_u16,
3223                 0x000a_u16 ]) ];
3224
3225         for pairs.iter().advance |p| {
3226             let (s, u) = (*p).clone();
3227             assert!(s.to_utf16() == u);
3228             assert!(from_utf16(u) == s);
3229             assert!(from_utf16(s.to_utf16()) == s);
3230             assert!(from_utf16(u).to_utf16() == u);
3231         }
3232     }
3233
3234     #[test]
3235     fn test_char_at() {
3236         let s = ~"ศไทย中华Việt Nam";
3237         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3238         let mut pos = 0;
3239         for v.iter().advance |ch| {
3240             assert!(s.char_at(pos) == *ch);
3241             pos += from_char(*ch).len();
3242         }
3243     }
3244
3245     #[test]
3246     fn test_char_at_reverse() {
3247         let s = ~"ศไทย中华Việt Nam";
3248         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3249         let mut pos = s.len();
3250         for v.rev_iter().advance |ch| {
3251             assert!(s.char_at_reverse(pos) == *ch);
3252             pos -= from_char(*ch).len();
3253         }
3254     }
3255
3256     #[test]
3257     fn test_escape_unicode() {
3258         assert_eq!("abc".escape_unicode(), ~"\\x61\\x62\\x63");
3259         assert_eq!("a c".escape_unicode(), ~"\\x61\\x20\\x63");
3260         assert_eq!("\r\n\t".escape_unicode(), ~"\\x0d\\x0a\\x09");
3261         assert_eq!("'\"\\".escape_unicode(), ~"\\x27\\x22\\x5c");
3262         assert_eq!("\x00\x01\xfe\xff".escape_unicode(), ~"\\x00\\x01\\xfe\\xff");
3263         assert_eq!("\u0100\uffff".escape_unicode(), ~"\\u0100\\uffff");
3264         assert_eq!("\U00010000\U0010ffff".escape_unicode(), ~"\\U00010000\\U0010ffff");
3265         assert_eq!("ab\ufb00".escape_unicode(), ~"\\x61\\x62\\ufb00");
3266         assert_eq!("\U0001d4ea\r".escape_unicode(), ~"\\U0001d4ea\\x0d");
3267     }
3268
3269     #[test]
3270     fn test_escape_default() {
3271         assert_eq!("abc".escape_default(), ~"abc");
3272         assert_eq!("a c".escape_default(), ~"a c");
3273         assert_eq!("\r\n\t".escape_default(), ~"\\r\\n\\t");
3274         assert_eq!("'\"\\".escape_default(), ~"\\'\\\"\\\\");
3275         assert_eq!("\u0100\uffff".escape_default(), ~"\\u0100\\uffff");
3276         assert_eq!("\U00010000\U0010ffff".escape_default(), ~"\\U00010000\\U0010ffff");
3277         assert_eq!("ab\ufb00".escape_default(), ~"ab\\ufb00");
3278         assert_eq!("\U0001d4ea\r".escape_default(), ~"\\U0001d4ea\\r");
3279     }
3280
3281     #[test]
3282     fn test_to_managed() {
3283         assert_eq!("abc".to_managed(), @"abc");
3284         assert_eq!("abcdef".slice(1, 5).to_managed(), @"bcde");
3285     }
3286
3287     #[test]
3288     fn test_total_ord() {
3289         "1234".cmp(& &"123") == Greater;
3290         "123".cmp(& &"1234") == Less;
3291         "1234".cmp(& &"1234") == Equal;
3292         "12345555".cmp(& &"123456") == Less;
3293         "22".cmp(& &"1234") == Greater;
3294     }
3295
3296     #[test]
3297     fn test_char_range_at() {
3298         let data = ~"b¢€𤭢𤭢€¢b";
3299         assert_eq!('b', data.char_range_at(0).ch);
3300         assert_eq!('¢', data.char_range_at(1).ch);
3301         assert_eq!('€', data.char_range_at(3).ch);
3302         assert_eq!('𤭢', data.char_range_at(6).ch);
3303         assert_eq!('𤭢', data.char_range_at(10).ch);
3304         assert_eq!('€', data.char_range_at(14).ch);
3305         assert_eq!('¢', data.char_range_at(17).ch);
3306         assert_eq!('b', data.char_range_at(19).ch);
3307     }
3308
3309     #[test]
3310     fn test_char_range_at_reverse_underflow() {
3311         assert_eq!("abc".char_range_at_reverse(0).next, 0);
3312     }
3313
3314     #[test]
3315     fn test_add() {
3316         #[allow(unnecessary_allocation)];
3317         macro_rules! t (
3318             ($s1:expr, $s2:expr, $e:expr) => {
3319                 assert_eq!($s1 + $s2, $e);
3320                 assert_eq!($s1.to_owned() + $s2, $e);
3321                 assert_eq!($s1.to_managed() + $s2, $e);
3322             }
3323         );
3324
3325         t!("foo",  "bar", ~"foobar");
3326         t!("foo", @"bar", ~"foobar");
3327         t!("foo", ~"bar", ~"foobar");
3328         t!("ศไทย中",  "华Việt Nam", ~"ศไทย中华Việt Nam");
3329         t!("ศไทย中", @"华Việt Nam", ~"ศไทย中华Việt Nam");
3330         t!("ศไทย中", ~"华Việt Nam", ~"ศไทย中华Việt Nam");
3331     }
3332
3333     #[test]
3334     fn test_iterator() {
3335         use iterator::*;
3336         let s = ~"ศไทย中华Việt Nam";
3337         let v = ~['ศ','ไ','ท','ย','中','华','V','i','ệ','t',' ','N','a','m'];
3338
3339         let mut pos = 0;
3340         let mut it = s.iter();
3341
3342         for it.advance |c| {
3343             assert_eq!(c, v[pos]);
3344             pos += 1;
3345         }
3346         assert_eq!(pos, v.len());
3347     }
3348
3349     #[test]
3350     fn test_rev_iterator() {
3351         use iterator::*;
3352         let s = ~"ศไทย中华Việt Nam";
3353         let v = ~['m', 'a', 'N', ' ', 't', 'ệ','i','V','华','中','ย','ท','ไ','ศ'];
3354
3355         let mut pos = 0;
3356         let mut it = s.rev_iter();
3357
3358         for it.advance |c| {
3359             assert_eq!(c, v[pos]);
3360             pos += 1;
3361         }
3362         assert_eq!(pos, v.len());
3363     }
3364
3365     #[test]
3366     fn test_bytes_iterator() {
3367         let s = ~"ศไทย中华Việt Nam";
3368         let v = [
3369             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3370             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3371             109
3372         ];
3373         let mut pos = 0;
3374
3375         for s.bytes_iter().advance |b| {
3376             assert_eq!(b, v[pos]);
3377             pos += 1;
3378         }
3379     }
3380
3381     #[test]
3382     fn test_bytes_rev_iterator() {
3383         let s = ~"ศไทย中华Việt Nam";
3384         let v = [
3385             224, 184, 168, 224, 185, 132, 224, 184, 151, 224, 184, 162, 228,
3386             184, 173, 229, 141, 142, 86, 105, 225, 187, 135, 116, 32, 78, 97,
3387             109
3388         ];
3389         let mut pos = v.len();
3390
3391         for s.bytes_rev_iter().advance |b| {
3392             pos -= 1;
3393             assert_eq!(b, v[pos]);
3394         }
3395     }
3396
3397     #[test]
3398     fn test_split_char_iterator() {
3399         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3400
3401         let split: ~[&str] = data.split_iter(' ').collect();
3402         assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3403
3404         let split: ~[&str] = data.split_iter(|c: char| c == ' ').collect();
3405         assert_eq!(split, ~["\nMäry", "häd", "ä", "little", "lämb\nLittle", "lämb\n"]);
3406
3407         // Unicode
3408         let split: ~[&str] = data.split_iter('ä').collect();
3409         assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3410
3411         let split: ~[&str] = data.split_iter(|c: char| c == 'ä').collect();
3412         assert_eq!(split, ~["\nM", "ry h", "d ", " little l", "mb\nLittle l", "mb\n"]);
3413     }
3414     #[test]
3415     fn test_splitn_char_iterator() {
3416         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3417
3418         let split: ~[&str] = data.splitn_iter(' ', 3).collect();
3419         assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3420
3421         let split: ~[&str] = data.splitn_iter(|c: char| c == ' ', 3).collect();
3422         assert_eq!(split, ~["\nMäry", "häd", "ä", "little lämb\nLittle lämb\n"]);
3423
3424         // Unicode
3425         let split: ~[&str] = data.splitn_iter('ä', 3).collect();
3426         assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3427
3428         let split: ~[&str] = data.splitn_iter(|c: char| c == 'ä', 3).collect();
3429         assert_eq!(split, ~["\nM", "ry h", "d ", " little lämb\nLittle lämb\n"]);
3430     }
3431
3432     #[test]
3433     fn test_split_char_iterator_no_trailing() {
3434         let data = "\nMäry häd ä little lämb\nLittle lämb\n";
3435
3436         let split: ~[&str] = data.split_options_iter('\n', 1000, true).collect();
3437         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb", ""]);
3438
3439         let split: ~[&str] = data.split_options_iter('\n', 1000, false).collect();
3440         assert_eq!(split, ~["", "Märy häd ä little lämb", "Little lämb"]);
3441     }
3442
3443     #[test]
3444     fn test_word_iter() {
3445         let data = "\n \tMäry   häd\tä  little lämb\nLittle lämb\n";
3446         let words: ~[&str] = data.word_iter().collect();
3447         assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
3448     }
3449
3450     #[test]
3451     fn test_line_iter() {
3452         let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";
3453         let lines: ~[&str] = data.line_iter().collect();
3454         assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3455
3456         let data = "\nMäry häd ä little lämb\n\nLittle lämb"; // no trailing \n
3457         let lines: ~[&str] = data.line_iter().collect();
3458         assert_eq!(lines, ~["", "Märy häd ä little lämb", "", "Little lämb"]);
3459     }
3460
3461     #[test]
3462     fn test_split_str_iterator() {
3463         fn t<'a>(s: &str, sep: &'a str, u: ~[&str]) {
3464             let v: ~[&str] = s.split_str_iter(sep).collect();
3465             assert_eq!(v, u);
3466         }
3467         t("--1233345--", "12345", ~["--1233345--"]);
3468         t("abc::hello::there", "::", ~["abc", "hello", "there"]);
3469         t("::hello::there", "::", ~["", "hello", "there"]);
3470         t("hello::there::", "::", ~["hello", "there", ""]);
3471         t("::hello::there::", "::", ~["", "hello", "there", ""]);
3472         t("ประเทศไทย中华Việt Nam", "中华", ~["ประเทศไทย", "Việt Nam"]);
3473         t("zzXXXzzYYYzz", "zz", ~["", "XXX", "YYY", ""]);
3474         t("zzXXXzYYYz", "XXX", ~["zz", "zYYYz"]);
3475         t(".XXX.YYY.", ".", ~["", "XXX", "YYY", ""]);
3476         t("", ".", ~[""]);
3477         t("zz", "zz", ~["",""]);
3478         t("ok", "z", ~["ok"]);
3479         t("zzz", "zz", ~["","z"]);
3480         t("zzzzz", "zz", ~["","","z"]);
3481     }
3482
3483     #[test]
3484     fn test_str_zero() {
3485         use num::Zero;
3486         fn t<S: Zero + Str>() {
3487             let s: S = Zero::zero();
3488             assert_eq!(s.as_slice(), "");
3489             assert!(s.is_zero());
3490         }
3491
3492         t::<&str>();
3493         t::<@str>();
3494         t::<~str>();
3495     }
3496
3497     #[test]
3498     fn test_str_container() {
3499         fn sum_len<S: Container>(v: &[S]) -> uint {
3500             v.iter().transform(|x| x.len()).sum()
3501         }
3502
3503         let s = ~"01234";
3504         assert_eq!(5, sum_len(["012", "", "34"]));
3505         assert_eq!(5, sum_len([@"01", @"2", @"34", @""]));
3506         assert_eq!(5, sum_len([~"01", ~"2", ~"34", ~""]));
3507         assert_eq!(5, sum_len([s.as_slice()]));
3508     }
3509 }