src/libcore/str/mod.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! String manipulation
  14 //!
  15 //! For more details, see std::str
  16
  17 #![doc(primitive = "str")]
  18
  19 use self::OldSearcher::{TwoWay, TwoWayLong};
  20
  21 use char::CharExt;
  22 use clone::Clone;
  23 use cmp::{self, Eq};
  24 use default::Default;
  25 use error::Error;
  26 use fmt;
  27 use iter::ExactSizeIterator;
  28 use iter::{Map, Iterator, IteratorExt, DoubleEndedIterator};
  29 use marker::Sized;
  30 use mem;
  31 use num::Int;
  32 use ops::{Fn, FnMut};
  33 use option::Option::{self, None, Some};
  34 use raw::{Repr, Slice};
  35 use result::Result::{self, Ok, Err};
  36 use slice::{self, SliceExt};
  37 use usize;
  38
  39 pub use self::pattern::Pattern;
  40 pub use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher, SearchStep};
  41
  42 mod pattern;
  43
  44 macro_rules! delegate_iter {
  45     (exact $te:ty : $ti:ty) => {
  46         delegate_iter!{$te : $ti}
  47         impl<'a> ExactSizeIterator for $ti {
  48             #[inline]
  49             fn len(&self) -> usize {
  50                 self.0.len()
  51             }
  52         }
  53     };
  54     ($te:ty : $ti:ty) => {
  55         #[stable(feature = "rust1", since = "1.0.0")]
  56         impl<'a> Iterator for $ti {
  57             type Item = $te;
  58
  59             #[inline]
  60             fn next(&mut self) -> Option<$te> {
  61                 self.0.next()
  62             }
  63             #[inline]
  64             fn size_hint(&self) -> (usize, Option<usize>) {
  65                 self.0.size_hint()
  66             }
  67         }
  68         #[stable(feature = "rust1", since = "1.0.0")]
  69         impl<'a> DoubleEndedIterator for $ti {
  70             #[inline]
  71             fn next_back(&mut self) -> Option<$te> {
  72                 self.0.next_back()
  73             }
  74         }
  75     };
  76     (pattern $te:ty : $ti:ty) => {
  77         #[stable(feature = "rust1", since = "1.0.0")]
  78         impl<'a, P: Pattern<'a>> Iterator for $ti {
  79             type Item = $te;
  80
  81             #[inline]
  82             fn next(&mut self) -> Option<$te> {
  83                 self.0.next()
  84             }
  85             #[inline]
  86             fn size_hint(&self) -> (usize, Option<usize>) {
  87                 self.0.size_hint()
  88             }
  89         }
  90         #[stable(feature = "rust1", since = "1.0.0")]
  91         impl<'a, P: Pattern<'a>> DoubleEndedIterator for $ti
  92         where P::Searcher: DoubleEndedSearcher<'a> {
  93             #[inline]
  94             fn next_back(&mut self) -> Option<$te> {
  95                 self.0.next_back()
  96             }
  97         }
  98     };
  99     (pattern forward $te:ty : $ti:ty) => {
 100         #[stable(feature = "rust1", since = "1.0.0")]
 101         impl<'a, P: Pattern<'a>> Iterator for $ti
 102         where P::Searcher: DoubleEndedSearcher<'a> {
 103             type Item = $te;
 104
 105             #[inline]
 106             fn next(&mut self) -> Option<$te> {
 107                 self.0.next()
 108             }
 109             #[inline]
 110             fn size_hint(&self) -> (usize, Option<usize>) {
 111                 self.0.size_hint()
 112             }
 113         }
 114     };
 115     (pattern reverse $te:ty : $ti:ty) => {
 116         #[stable(feature = "rust1", since = "1.0.0")]
 117         impl<'a, P: Pattern<'a>> Iterator for $ti
 118             where P::Searcher: ReverseSearcher<'a>
 119         {
 120             type Item = $te;
 121
 122             #[inline]
 123             fn next(&mut self) -> Option<$te> {
 124                 self.0.next()
 125             }
 126             #[inline]
 127             fn size_hint(&self) -> (usize, Option<usize>) {
 128                 self.0.size_hint()
 129             }
 130         }
 131     };
 132 }
 133
 134 /// A trait to abstract the idea of creating a new instance of a type from a
 135 /// string.
 136 #[stable(feature = "rust1", since = "1.0.0")]
 137 pub trait FromStr {
 138     /// The associated error which can be returned from parsing.
 139     #[stable(feature = "rust1", since = "1.0.0")]
 140     type Err;
 141
 142     /// Parses a string `s` to return an optional value of this type. If the
 143     /// string is ill-formatted, the None is returned.
 144     #[stable(feature = "rust1", since = "1.0.0")]
 145     fn from_str(s: &str) -> Result<Self, Self::Err>;
 146 }
 147
 148 #[stable(feature = "rust1", since = "1.0.0")]
 149 impl FromStr for bool {
 150     type Err = ParseBoolError;
 151
 152     /// Parse a `bool` from a string.
 153     ///
 154     /// Yields a `Result<bool, ParseBoolError>`, because `s` may or may not
 155     /// actually be parseable.
 156     ///
 157     /// # Examples
 158     ///
 159     /// ```
 160     /// use std::str::FromStr;
 161     ///
 162     /// assert_eq!(FromStr::from_str("true"), Ok(true));
 163     /// assert_eq!(FromStr::from_str("false"), Ok(false));
 164     /// assert!(<bool as FromStr>::from_str("not even a boolean").is_err());
 165     /// ```
 166     ///
 167     /// Note, in many cases, the StrExt::parse() which is based on
 168     /// this FromStr::from_str() is more proper.
 169     ///
 170     /// ```
 171     /// assert_eq!("true".parse(), Ok(true));
 172     /// assert_eq!("false".parse(), Ok(false));
 173     /// assert!("not even a boolean".parse::<bool>().is_err());
 174     /// ```
 175     #[inline]
 176     fn from_str(s: &str) -> Result<bool, ParseBoolError> {
 177         match s {
 178             "true"  => Ok(true),
 179             "false" => Ok(false),
 180             _       => Err(ParseBoolError { _priv: () }),
 181         }
 182     }
 183 }
 184
 185 /// An error returned when parsing a `bool` from a string fails.
 186 #[derive(Debug, Clone, PartialEq)]
 187 #[stable(feature = "rust1", since = "1.0.0")]
 188 pub struct ParseBoolError { _priv: () }
 189
 190 #[stable(feature = "rust1", since = "1.0.0")]
 191 impl fmt::Display for ParseBoolError {
 192     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 193         "provided string was not `true` or `false`".fmt(f)
 194     }
 195 }
 196
 197 #[stable(feature = "rust1", since = "1.0.0")]
 198 impl Error for ParseBoolError {
 199     fn description(&self) -> &str { "failed to parse bool" }
 200 }
 201
 202 /*
 203 Section: Creating a string
 204 */
 205
 206 /// Errors which can occur when attempting to interpret a byte slice as a `str`.
 207 #[derive(Copy, Eq, PartialEq, Clone, Debug)]
 208 #[unstable(feature = "core",
 209            reason = "error enumeration recently added and definitions may be refined")]
 210 pub enum Utf8Error {
 211     /// An invalid byte was detected at the byte offset given.
 212     ///
 213     /// The offset is guaranteed to be in bounds of the slice in question, and
 214     /// the byte at the specified offset was the first invalid byte in the
 215     /// sequence detected.
 216     InvalidByte(usize),
 217
 218     /// The byte slice was invalid because more bytes were needed but no more
 219     /// bytes were available.
 220     TooShort,
 221 }
 222
 223 /// Converts a slice of bytes to a string slice without performing any
 224 /// allocations.
 225 ///
 226 /// Once the slice has been validated as utf-8, it is transmuted in-place and
 227 /// returned as a '&str' instead of a '&[u8]'
 228 ///
 229 /// # Failure
 230 ///
 231 /// Returns `Err` if the slice is not utf-8 with a description as to why the
 232 /// provided slice is not utf-8.
 233 #[stable(feature = "rust1", since = "1.0.0")]
 234 pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> {
 235     try!(run_utf8_validation_iterator(&mut v.iter()));
 236     Ok(unsafe { from_utf8_unchecked(v) })
 237 }
 238
 239 /// Converts a slice of bytes to a string slice without checking
 240 /// that the string contains valid UTF-8.
 241 #[stable(feature = "rust1", since = "1.0.0")]
 242 pub unsafe fn from_utf8_unchecked<'a>(v: &'a [u8]) -> &'a str {
 243     mem::transmute(v)
 244 }
 245
 246 /// Constructs a static string slice from a given raw pointer.
 247 ///
 248 /// This function will read memory starting at `s` until it finds a 0, and then
 249 /// transmute the memory up to that point as a string slice, returning the
 250 /// corresponding `&'static str` value.
 251 ///
 252 /// This function is unsafe because the caller must ensure the C string itself
 253 /// has the static lifetime and that the memory `s` is valid up to and including
 254 /// the first null byte.
 255 ///
 256 /// # Panics
 257 ///
 258 /// This function will panic if the string pointed to by `s` is not valid UTF-8.
 259 #[unstable(feature = "core")]
 260 #[deprecated(since = "1.0.0",
 261              reason = "use std::ffi::c_str_to_bytes + str::from_utf8")]
 262 pub unsafe fn from_c_str(s: *const i8) -> &'static str {
 263     let s = s as *const u8;
 264     let mut len = 0;
 265     while *s.offset(len as isize) != 0 {
 266         len += 1;
 267     }
 268     let v: &'static [u8] = ::mem::transmute(Slice { data: s, len: len });
 269     from_utf8(v).ok().expect("from_c_str passed invalid utf-8 data")
 270 }
 271
 272 /// Something that can be used to compare against a character
 273 #[unstable(feature = "core")]
 274 #[deprecated(since = "1.0.0",
 275              reason = "use `Pattern` instead")]
 276 // NB: Rather than removing it, make it private and move it into self::pattern
 277 pub trait CharEq {
 278     /// Determine if the splitter should split at the given character
 279     fn matches(&mut self, char) -> bool;
 280     /// Indicate if this is only concerned about ASCII characters,
 281     /// which can allow for a faster implementation.
 282     fn only_ascii(&self) -> bool;
 283 }
 284
 285 #[allow(deprecated) /* for CharEq */ ]
 286 impl CharEq for char {
 287     #[inline]
 288     fn matches(&mut self, c: char) -> bool { *self == c }
 289
 290     #[inline]
 291     fn only_ascii(&self) -> bool { (*self as u32) < 128 }
 292 }
 293
 294 #[allow(deprecated) /* for CharEq */ ]
 295 impl<F> CharEq for F where F: FnMut(char) -> bool {
 296     #[inline]
 297     fn matches(&mut self, c: char) -> bool { (*self)(c) }
 298
 299     #[inline]
 300     fn only_ascii(&self) -> bool { false }
 301 }
 302
 303 #[allow(deprecated) /* for CharEq */ ]
 304 impl<'a> CharEq for &'a [char] {
 305     #[inline]
 306     #[allow(deprecated) /* for CharEq */ ]
 307     fn matches(&mut self, c: char) -> bool {
 308         self.iter().any(|&m| { let mut m = m; m.matches(c) })
 309     }
 310
 311     #[inline]
 312     #[allow(deprecated) /* for CharEq */ ]
 313     fn only_ascii(&self) -> bool {
 314         self.iter().all(|m| m.only_ascii())
 315     }
 316 }
 317
 318 #[stable(feature = "rust1", since = "1.0.0")]
 319 impl Error for Utf8Error {
 320     fn description(&self) -> &str {
 321         match *self {
 322             Utf8Error::TooShort => "invalid utf-8: not enough bytes",
 323             Utf8Error::InvalidByte(..) => "invalid utf-8: corrupt contents",
 324         }
 325     }
 326 }
 327
 328 #[stable(feature = "rust1", since = "1.0.0")]
 329 impl fmt::Display for Utf8Error {
 330     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
 331         match *self {
 332             Utf8Error::InvalidByte(n) => {
 333                 write!(f, "invalid utf-8: invalid byte at index {}", n)
 334             }
 335             Utf8Error::TooShort => {
 336                 write!(f, "invalid utf-8: byte slice too short")
 337             }
 338         }
 339     }
 340 }
 341
 342 /*
 343 Section: Iterators
 344 */
 345
 346 /// Iterator for the char (representing *Unicode Scalar Values*) of a string
 347 ///
 348 /// Created with the method `.chars()`.
 349 #[derive(Clone)]
 350 #[stable(feature = "rust1", since = "1.0.0")]
 351 pub struct Chars<'a> {
 352     iter: slice::Iter<'a, u8>
 353 }
 354
 355 /// Return the initial codepoint accumulator for the first byte.
 356 /// The first byte is special, only want bottom 5 bits for width 2, 4 bits
 357 /// for width 3, and 3 bits for width 4.
 358 #[inline]
 359 fn utf8_first_byte(byte: u8, width: u32) -> u32 { (byte & (0x7F >> width)) as u32 }
 360
 361 /// Return the value of `ch` updated with continuation byte `byte`.
 362 #[inline]
 363 fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { (ch << 6) | (byte & CONT_MASK) as u32 }
 364
 365 /// Checks whether the byte is a UTF-8 continuation byte (i.e. starts with the
 366 /// bits `10`).
 367 #[inline]
 368 fn utf8_is_cont_byte(byte: u8) -> bool { (byte & !CONT_MASK) == TAG_CONT_U8 }
 369
 370 #[inline]
 371 fn unwrap_or_0(opt: Option<&u8>) -> u8 {
 372     match opt {
 373         Some(&byte) => byte,
 374         None => 0,
 375     }
 376 }
 377
 378 /// Reads the next code point out of a byte iterator (assuming a
 379 /// UTF-8-like encoding).
 380 #[unstable(feature = "core")]
 381 #[inline]
 382 pub fn next_code_point(bytes: &mut slice::Iter<u8>) -> Option<u32> {
 383     // Decode UTF-8
 384     let x = match bytes.next() {
 385         None => return None,
 386         Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
 387         Some(&next_byte) => next_byte,
 388     };
 389
 390     // Multibyte case follows
 391     // Decode from a byte combination out of: [[[x y] z] w]
 392     // NOTE: Performance is sensitive to the exact formulation here
 393     let init = utf8_first_byte(x, 2);
 394     let y = unwrap_or_0(bytes.next());
 395     let mut ch = utf8_acc_cont_byte(init, y);
 396     if x >= 0xE0 {
 397         // [[x y z] w] case
 398         // 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
 399         let z = unwrap_or_0(bytes.next());
 400         let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
 401         ch = init << 12 | y_z;
 402         if x >= 0xF0 {
 403             // [x y z w] case
 404             // use only the lower 3 bits of `init`
 405             let w = unwrap_or_0(bytes.next());
 406             ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
 407         }
 408     }
 409
 410     Some(ch)
 411 }
 412
 413 /// Reads the last code point out of a byte iterator (assuming a
 414 /// UTF-8-like encoding).
 415 #[unstable(feature = "core")]
 416 #[inline]
 417 pub fn next_code_point_reverse(bytes: &mut slice::Iter<u8>) -> Option<u32> {
 418     // Decode UTF-8
 419     let w = match bytes.next_back() {
 420         None => return None,
 421         Some(&next_byte) if next_byte < 128 => return Some(next_byte as u32),
 422         Some(&back_byte) => back_byte,
 423     };
 424
 425     // Multibyte case follows
 426     // Decode from a byte combination out of: [x [y [z w]]]
 427     let mut ch;
 428     let z = unwrap_or_0(bytes.next_back());
 429     ch = utf8_first_byte(z, 2);
 430     if utf8_is_cont_byte(z) {
 431         let y = unwrap_or_0(bytes.next_back());
 432         ch = utf8_first_byte(y, 3);
 433         if utf8_is_cont_byte(y) {
 434             let x = unwrap_or_0(bytes.next_back());
 435             ch = utf8_first_byte(x, 4);
 436             ch = utf8_acc_cont_byte(ch, y);
 437         }
 438         ch = utf8_acc_cont_byte(ch, z);
 439     }
 440     ch = utf8_acc_cont_byte(ch, w);
 441
 442     Some(ch)
 443 }
 444
 445 #[stable(feature = "rust1", since = "1.0.0")]
 446 impl<'a> Iterator for Chars<'a> {
 447     type Item = char;
 448
 449     #[inline]
 450     fn next(&mut self) -> Option<char> {
 451         next_code_point(&mut self.iter).map(|ch| {
 452             // str invariant says `ch` is a valid Unicode Scalar Value
 453             unsafe {
 454                 mem::transmute(ch)
 455             }
 456         })
 457     }
 458
 459     #[inline]
 460     fn size_hint(&self) -> (usize, Option<usize>) {
 461         let (len, _) = self.iter.size_hint();
 462         // `(len + 3)` can't overflow, because we know that the `slice::Iter`
 463         // belongs to a slice in memory which has a maximum length of
 464         // `isize::MAX` (that's well below `usize::MAX`).
 465         ((len + 3) / 4, Some(len))
 466     }
 467 }
 468
 469 #[stable(feature = "rust1", since = "1.0.0")]
 470 impl<'a> DoubleEndedIterator for Chars<'a> {
 471     #[inline]
 472     fn next_back(&mut self) -> Option<char> {
 473         next_code_point_reverse(&mut self.iter).map(|ch| {
 474             // str invariant says `ch` is a valid Unicode Scalar Value
 475             unsafe {
 476                 mem::transmute(ch)
 477             }
 478         })
 479     }
 480 }
 481
 482 /// External iterator for a string's characters and their byte offsets.
 483 /// Use with the `std::iter` module.
 484 #[derive(Clone)]
 485 #[stable(feature = "rust1", since = "1.0.0")]
 486 pub struct CharIndices<'a> {
 487     front_offset: usize,
 488     iter: Chars<'a>,
 489 }
 490
 491 #[stable(feature = "rust1", since = "1.0.0")]
 492 impl<'a> Iterator for CharIndices<'a> {
 493     type Item = (usize, char);
 494
 495     #[inline]
 496     fn next(&mut self) -> Option<(usize, char)> {
 497         let (pre_len, _) = self.iter.iter.size_hint();
 498         match self.iter.next() {
 499             None => None,
 500             Some(ch) => {
 501                 let index = self.front_offset;
 502                 let (len, _) = self.iter.iter.size_hint();
 503                 self.front_offset += pre_len - len;
 504                 Some((index, ch))
 505             }
 506         }
 507     }
 508
 509     #[inline]
 510     fn size_hint(&self) -> (usize, Option<usize>) {
 511         self.iter.size_hint()
 512     }
 513 }
 514
 515 #[stable(feature = "rust1", since = "1.0.0")]
 516 impl<'a> DoubleEndedIterator for CharIndices<'a> {
 517     #[inline]
 518     fn next_back(&mut self) -> Option<(usize, char)> {
 519         match self.iter.next_back() {
 520             None => None,
 521             Some(ch) => {
 522                 let (len, _) = self.iter.iter.size_hint();
 523                 let index = self.front_offset + len;
 524                 Some((index, ch))
 525             }
 526         }
 527     }
 528 }
 529
 530 /// External iterator for a string's bytes.
 531 /// Use with the `std::iter` module.
 532 ///
 533 /// Created with `StrExt::bytes`
 534 #[stable(feature = "rust1", since = "1.0.0")]
 535 #[derive(Clone)]
 536 pub struct Bytes<'a>(Map<slice::Iter<'a, u8>, BytesDeref>);
 537 delegate_iter!{exact u8 : Bytes<'a>}
 538
 539 /// A temporary fn new type that ensures that the `Bytes` iterator
 540 /// is cloneable.
 541 #[derive(Copy, Clone)]
 542 struct BytesDeref;
 543
 544 impl<'a> Fn<(&'a u8,)> for BytesDeref {
 545     type Output = u8;
 546
 547     #[inline]
 548     extern "rust-call" fn call(&self, (ptr,): (&'a u8,)) -> u8 {
 549         *ptr
 550     }
 551 }
 552
 553 /// An iterator over the substrings of a string, separated by `sep`.
 554 struct CharSplits<'a, P: Pattern<'a>> {
 555     /// The slice remaining to be iterated
 556     start: usize,
 557     end: usize,
 558     matcher: P::Searcher,
 559     /// Whether an empty string at the end is allowed
 560     allow_trailing_empty: bool,
 561     finished: bool,
 562 }
 563
 564 /// An iterator over the substrings of a string, separated by `sep`,
 565 /// splitting at most `count` times.
 566 struct CharSplitsN<'a, P: Pattern<'a>> {
 567     iter: CharSplits<'a, P>,
 568     /// The number of splits remaining
 569     count: usize,
 570     invert: bool,
 571 }
 572
 573 /// An iterator over the substrings of a string, separated by a
 574 /// pattern, in reverse order.
 575 struct RCharSplits<'a, P: Pattern<'a>> {
 576     /// The slice remaining to be iterated
 577     start: usize,
 578     end: usize,
 579     matcher: P::Searcher,
 580     /// Whether an empty string at the end of iteration is allowed
 581     allow_final_empty: bool,
 582     finished: bool,
 583 }
 584
 585
 586 /// An iterator over the lines of a string, separated by `\n`.
 587 #[stable(feature = "rust1", since = "1.0.0")]
 588 pub struct Lines<'a> {
 589     inner: CharSplits<'a, char>,
 590 }
 591
 592 /// An iterator over the lines of a string, separated by either `\n` or (`\r\n`).
 593 #[stable(feature = "rust1", since = "1.0.0")]
 594 pub struct LinesAny<'a> {
 595     inner: Map<Lines<'a>, fn(&str) -> &str>,
 596 }
 597
 598 impl<'a, P: Pattern<'a>> CharSplits<'a, P> {
 599     #[inline]
 600     fn get_end(&mut self) -> Option<&'a str> {
 601         if !self.finished && (self.allow_trailing_empty || self.end - self.start > 0) {
 602             self.finished = true;
 603             unsafe {
 604                 let string = self.matcher.haystack().slice_unchecked(self.start, self.end);
 605                 Some(string)
 606             }
 607         } else {
 608             None
 609         }
 610     }
 611 }
 612
 613 #[stable(feature = "rust1", since = "1.0.0")]
 614 impl<'a, P: Pattern<'a>> Iterator for CharSplits<'a, P> {
 615     type Item = &'a str;
 616
 617     #[inline]
 618     fn next(&mut self) -> Option<&'a str> {
 619         if self.finished { return None }
 620
 621         let haystack = self.matcher.haystack();
 622         match self.matcher.next_match() {
 623             Some((a, b)) => unsafe {
 624                 let elt = haystack.slice_unchecked(self.start, a);
 625                 self.start = b;
 626                 Some(elt)
 627             },
 628             None => self.get_end(),
 629         }
 630     }
 631 }
 632
 633 #[stable(feature = "rust1", since = "1.0.0")]
 634 impl<'a, P: Pattern<'a>> DoubleEndedIterator for CharSplits<'a, P>
 635 where P::Searcher: DoubleEndedSearcher<'a> {
 636     #[inline]
 637     fn next_back(&mut self) -> Option<&'a str> {
 638         if self.finished { return None }
 639
 640         if !self.allow_trailing_empty {
 641             self.allow_trailing_empty = true;
 642             match self.next_back() {
 643                 Some(elt) if !elt.is_empty() => return Some(elt),
 644                 _ => if self.finished { return None }
 645             }
 646         }
 647
 648         let haystack = self.matcher.haystack();
 649         match self.matcher.next_match_back() {
 650             Some((a, b)) => unsafe {
 651                 let elt = haystack.slice_unchecked(b, self.end);
 652                 self.end = a;
 653                 Some(elt)
 654             },
 655             None => unsafe {
 656                 self.finished = true;
 657                 Some(haystack.slice_unchecked(self.start, self.end))
 658             },
 659         }
 660     }
 661 }
 662
 663 #[stable(feature = "rust1", since = "1.0.0")]
 664 impl<'a, P: Pattern<'a>> Iterator for CharSplitsN<'a, P>
 665 where P::Searcher: DoubleEndedSearcher<'a> {
 666     type Item = &'a str;
 667
 668     #[inline]
 669     fn next(&mut self) -> Option<&'a str> {
 670         if self.count != 0 {
 671             self.count -= 1;
 672             if self.invert { self.iter.next_back() } else { self.iter.next() }
 673         } else {
 674             self.iter.get_end()
 675         }
 676     }
 677 }
 678
 679 impl<'a, P: Pattern<'a>> RCharSplits<'a, P> {
 680     #[inline]
 681     fn get_remainder(&mut self) -> Option<&'a str> {
 682         if !self.finished && (self.allow_final_empty || self.end - self.start > 0) {
 683             self.finished = true;
 684             unsafe {
 685                 let string = self.matcher.haystack().slice_unchecked(self.start, self.end);
 686                 Some(string)
 687             }
 688         } else {
 689             None
 690         }
 691     }
 692 }
 693
 694 #[stable(feature = "rust1", since = "1.0.0")]
 695 impl<'a, P: Pattern<'a>> Iterator for RCharSplits<'a, P>
 696     where P::Searcher: ReverseSearcher<'a>
 697 {
 698     type Item = &'a str;
 699
 700     #[inline]
 701     fn next(&mut self) -> Option<&'a str> {
 702         if self.finished { return None }
 703
 704         let haystack = self.matcher.haystack();
 705         match self.matcher.next_match_back() {
 706             Some((a, b)) => unsafe {
 707                 let elt = haystack.slice_unchecked(b, self.end);
 708                 self.end = a;
 709                 Some(elt)
 710             },
 711             None => self.get_remainder(),
 712         }
 713     }
 714 }
 715
 716 /// The internal state of an iterator that searches for matches of a substring
 717 /// within a larger string using two-way search
 718 #[derive(Clone)]
 719 struct TwoWaySearcher {
 720     // constants
 721     crit_pos: usize,
 722     period: usize,
 723     byteset: u64,
 724
 725     // variables
 726     position: usize,
 727     memory: usize
 728 }
 729
 730 /*
 731     This is the Two-Way search algorithm, which was introduced in the paper:
 732     Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
 733
 734     Here's some background information.
 735
 736     A *word* is a string of symbols. The *length* of a word should be a familiar
 737     notion, and here we denote it for any word x by |x|.
 738     (We also allow for the possibility of the *empty word*, a word of length zero).
 739
 740     If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
 741     *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
 742     For example, both 1 and 2 are periods for the string "aa". As another example,
 743     the only period of the string "abcd" is 4.
 744
 745     We denote by period(x) the *smallest* period of x (provided that x is non-empty).
 746     This is always well-defined since every non-empty word x has at least one period,
 747     |x|. We sometimes call this *the period* of x.
 748
 749     If u, v and x are words such that x = uv, where uv is the concatenation of u and
 750     v, then we say that (u, v) is a *factorization* of x.
 751
 752     Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
 753     that both of the following hold
 754
 755       - either w is a suffix of u or u is a suffix of w
 756       - either w is a prefix of v or v is a prefix of w
 757
 758     then w is said to be a *repetition* for the factorization (u, v).
 759
 760     Just to unpack this, there are four possibilities here. Let w = "abc". Then we
 761     might have:
 762
 763       - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
 764       - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
 765       - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
 766       - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
 767
 768     Note that the word vu is a repetition for any factorization (u,v) of x = uv,
 769     so every factorization has at least one repetition.
 770
 771     If x is a string and (u, v) is a factorization for x, then a *local period* for
 772     (u, v) is an integer r such that there is some word w such that |w| = r and w is
 773     a repetition for (u, v).
 774
 775     We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
 776     call this *the local period* of (u, v). Provided that x = uv is non-empty, this
 777     is well-defined (because each non-empty word has at least one factorization, as
 778     noted above).
 779
 780     It can be proven that the following is an equivalent definition of a local period
 781     for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
 782     all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
 783     defined. (i.e. i > 0 and i + r < |x|).
 784
 785     Using the above reformulation, it is easy to prove that
 786
 787         1 <= local_period(u, v) <= period(uv)
 788
 789     A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
 790     *critical factorization*.
 791
 792     The algorithm hinges on the following theorem, which is stated without proof:
 793
 794     **Critical Factorization Theorem** Any word x has at least one critical
 795     factorization (u, v) such that |u| < period(x).
 796
 797     The purpose of maximal_suffix is to find such a critical factorization.
 798
 799 */
 800 impl TwoWaySearcher {
 801     #[allow(dead_code)]
 802     fn new(needle: &[u8]) -> TwoWaySearcher {
 803         let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
 804         let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
 805
 806         let (crit_pos, period) =
 807             if crit_pos_false > crit_pos_true {
 808                 (crit_pos_false, period_false)
 809             } else {
 810                 (crit_pos_true, period_true)
 811             };
 812
 813         // This isn't in the original algorithm, as far as I'm aware.
 814         let byteset = needle.iter()
 815                             .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
 816
 817         // A particularly readable explanation of what's going on here can be found
 818         // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
 819         // see the code for "Algorithm CP" on p. 323.
 820         //
 821         // What's going on is we have some critical factorization (u, v) of the
 822         // needle, and we want to determine whether u is a suffix of
 823         // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
 824         // "Algorithm CP2", which is optimized for when the period of the needle
 825         // is large.
 826         if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
 827             TwoWaySearcher {
 828                 crit_pos: crit_pos,
 829                 period: period,
 830                 byteset: byteset,
 831
 832                 position: 0,
 833                 memory: 0
 834             }
 835         } else {
 836             TwoWaySearcher {
 837                 crit_pos: crit_pos,
 838                 period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
 839                 byteset: byteset,
 840
 841                 position: 0,
 842                 memory: usize::MAX // Dummy value to signify that the period is long
 843             }
 844         }
 845     }
 846
 847     // One of the main ideas of Two-Way is that we factorize the needle into
 848     // two halves, (u, v), and begin trying to find v in the haystack by scanning
 849     // left to right. If v matches, we try to match u by scanning right to left.
 850     // How far we can jump when we encounter a mismatch is all based on the fact
 851     // that (u, v) is a critical factorization for the needle.
 852     #[inline]
 853     fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
 854             -> Option<(usize, usize)> {
 855         'search: loop {
 856             // Check that we have room to search in
 857             if self.position + needle.len() > haystack.len() {
 858                 return None;
 859             }
 860
 861             // Quickly skip by large portions unrelated to our substring
 862             if (self.byteset >>
 863                     ((haystack[self.position + needle.len() - 1] & 0x3f)
 864                      as usize)) & 1 == 0 {
 865                 self.position += needle.len();
 866                 if !long_period {
 867                     self.memory = 0;
 868                 }
 869                 continue 'search;
 870             }
 871
 872             // See if the right part of the needle matches
 873             let start = if long_period { self.crit_pos }
 874                         else { cmp::max(self.crit_pos, self.memory) };
 875             for i in start..needle.len() {
 876                 if needle[i] != haystack[self.position + i] {
 877                     self.position += i - self.crit_pos + 1;
 878                     if !long_period {
 879                         self.memory = 0;
 880                     }
 881                     continue 'search;
 882                 }
 883             }
 884
 885             // See if the left part of the needle matches
 886             let start = if long_period { 0 } else { self.memory };
 887             for i in (start..self.crit_pos).rev() {
 888                 if needle[i] != haystack[self.position + i] {
 889                     self.position += self.period;
 890                     if !long_period {
 891                         self.memory = needle.len() - self.period;
 892                     }
 893                     continue 'search;
 894                 }
 895             }
 896
 897             // We have found a match!
 898             let match_pos = self.position;
 899             self.position += needle.len(); // add self.period for all matches
 900             if !long_period {
 901                 self.memory = 0; // set to needle.len() - self.period for all matches
 902             }
 903             return Some((match_pos, match_pos + needle.len()));
 904         }
 905     }
 906
 907     // Computes a critical factorization (u, v) of `arr`.
 908     // Specifically, returns (i, p), where i is the starting index of v in some
 909     // critical factorization (u, v) and p = period(v)
 910     #[inline]
 911     #[allow(dead_code)]
 912     fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
 913         use num::wrapping::WrappingOps;
 914         let mut left = -1; // Corresponds to i in the paper
 915         let mut right = 0; // Corresponds to j in the paper
 916         let mut offset = 1; // Corresponds to k in the paper
 917         let mut period = 1; // Corresponds to p in the paper
 918
 919         while right + offset < arr.len() {
 920             let a;
 921             let b;
 922             if reversed {
 923                 a = arr[left.wrapping_add(offset)];
 924                 b = arr[right + offset];
 925             } else {
 926                 a = arr[right + offset];
 927                 b = arr[left.wrapping_add(offset)];
 928             }
 929             if a < b {
 930                 // Suffix is smaller, period is entire prefix so far.
 931                 right += offset;
 932                 offset = 1;
 933                 period = right.wrapping_sub(left);
 934             } else if a == b {
 935                 // Advance through repetition of the current period.
 936                 if offset == period {
 937                     right += offset;
 938                     offset = 1;
 939                 } else {
 940                     offset += 1;
 941                 }
 942             } else {
 943                 // Suffix is larger, start over from current location.
 944                 left = right;
 945                 right += 1;
 946                 offset = 1;
 947                 period = 1;
 948             }
 949         }
 950         (left.wrapping_add(1), period)
 951     }
 952 }
 953
 954 /// The internal state of an iterator that searches for matches of a substring
 955 /// within a larger string using a dynamically chosen search algorithm
 956 #[derive(Clone)]
 957 // NB: This is kept around for convenience because
 958 // it is planned to be used again in the future
 959 enum OldSearcher {
 960     TwoWay(TwoWaySearcher),
 961     TwoWayLong(TwoWaySearcher),
 962 }
 963
 964 impl OldSearcher {
 965     #[allow(dead_code)]
 966     fn new(haystack: &[u8], needle: &[u8]) -> OldSearcher {
 967         if needle.len() == 0 {
 968             // Handle specially
 969             unimplemented!()
 970         // FIXME: Tune this.
 971         // FIXME(#16715): This unsigned integer addition will probably not
 972         // overflow because that would mean that the memory almost solely
 973         // consists of the needle. Needs #16715 to be formally fixed.
 974         } else if needle.len() + 20 > haystack.len() {
 975             // Use naive searcher
 976             unimplemented!()
 977         } else {
 978             let searcher = TwoWaySearcher::new(needle);
 979             if searcher.memory == usize::MAX { // If the period is long
 980                 TwoWayLong(searcher)
 981             } else {
 982                 TwoWay(searcher)
 983             }
 984         }
 985     }
 986 }
 987
 988 #[derive(Clone)]
 989 // NB: This is kept around for convenience because
 990 // it is planned to be used again in the future
 991 struct OldMatchIndices<'a, 'b> {
 992     // constants
 993     haystack: &'a str,
 994     needle: &'b str,
 995     searcher: OldSearcher
 996 }
 997
 998 // FIXME: #21637 Prevents a Clone impl
 999 /// An iterator over the start and end indices of the matches of a
1000 /// substring within a larger string
1001 #[unstable(feature = "core", reason = "type may be removed")]
1002 pub struct MatchIndices<'a, P: Pattern<'a>>(P::Searcher);
1003
1004 #[stable(feature = "rust1", since = "1.0.0")]
1005 impl<'a, P: Pattern<'a>> Iterator for MatchIndices<'a, P> {
1006     type Item = (usize, usize);
1007
1008     #[inline]
1009     fn next(&mut self) -> Option<(usize, usize)> {
1010         self.0.next_match()
1011     }
1012 }
1013
1014 /// An iterator over the substrings of a string separated by a given
1015 /// search string
1016 #[unstable(feature = "core")]
1017 #[deprecated(since = "1.0.0", reason = "use `Split` with a `&str`")]
1018 pub struct SplitStr<'a, P: Pattern<'a>>(Split<'a, P>);
1019 #[allow(deprecated)]
1020 impl<'a, P: Pattern<'a>> Iterator for SplitStr<'a, P> {
1021     type Item = &'a str;
1022
1023     #[inline]
1024     #[allow(deprecated)]
1025     fn next(&mut self) -> Option<&'a str> {
1026         Iterator::next(&mut self.0)
1027     }
1028 }
1029
1030 impl<'a, 'b>  OldMatchIndices<'a, 'b> {
1031     #[inline]
1032     #[allow(dead_code)]
1033     fn next(&mut self) -> Option<(usize, usize)> {
1034         match self.searcher {
1035             TwoWay(ref mut searcher)
1036                 => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
1037             TwoWayLong(ref mut searcher)
1038                 => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true),
1039         }
1040     }
1041 }
1042
1043 /*
1044 Section: Comparing strings
1045 */
1046
1047 // share the implementation of the lang-item vs. non-lang-item
1048 // eq_slice.
1049 /// NOTE: This function is (ab)used in rustc::middle::trans::_match
1050 /// to compare &[u8] byte slices that are not necessarily valid UTF-8.
1051 #[inline]
1052 fn eq_slice_(a: &str, b: &str) -> bool {
1053     // NOTE: In theory n should be libc::size_t and not usize, but libc is not available here
1054     #[allow(improper_ctypes)]
1055     extern { fn memcmp(s1: *const i8, s2: *const i8, n: usize) -> i32; }
1056     a.len() == b.len() && unsafe {
1057         memcmp(a.as_ptr() as *const i8,
1058                b.as_ptr() as *const i8,
1059                a.len()) == 0
1060     }
1061 }
1062
1063 /// Bytewise slice equality
1064 /// NOTE: This function is (ab)used in rustc::middle::trans::_match
1065 /// to compare &[u8] byte slices that are not necessarily valid UTF-8.
1066 #[lang="str_eq"]
1067 #[inline]
1068 fn eq_slice(a: &str, b: &str) -> bool {
1069     eq_slice_(a, b)
1070 }
1071
1072 /*
1073 Section: Misc
1074 */
1075
1076 /// Walk through `iter` checking that it's a valid UTF-8 sequence,
1077 /// returning `true` in that case, or, if it is invalid, `false` with
1078 /// `iter` reset such that it is pointing at the first byte in the
1079 /// invalid sequence.
1080 #[inline(always)]
1081 fn run_utf8_validation_iterator(iter: &mut slice::Iter<u8>)
1082                                 -> Result<(), Utf8Error> {
1083     let whole = iter.as_slice();
1084     loop {
1085         // save the current thing we're pointing at.
1086         let old = iter.clone();
1087
1088         // restore the iterator we had at the start of this codepoint.
1089         macro_rules! err { () => {{
1090             *iter = old.clone();
1091             return Err(Utf8Error::InvalidByte(whole.len() - iter.as_slice().len()))
1092         }}}
1093
1094         macro_rules! next { () => {
1095             match iter.next() {
1096                 Some(a) => *a,
1097                 // we needed data, but there was none: error!
1098                 None => return Err(Utf8Error::TooShort),
1099             }
1100         }}
1101
1102         let first = match iter.next() {
1103             Some(&b) => b,
1104             // we're at the end of the iterator and a codepoint
1105             // boundary at the same time, so this string is valid.
1106             None => return Ok(())
1107         };
1108
1109         // ASCII characters are always valid, so only large
1110         // bytes need more examination.
1111         if first >= 128 {
1112             let w = UTF8_CHAR_WIDTH[first as usize];
1113             let second = next!();
1114             // 2-byte encoding is for codepoints  \u{0080} to  \u{07ff}
1115             //        first  C2 80        last DF BF
1116             // 3-byte encoding is for codepoints  \u{0800} to  \u{ffff}
1117             //        first  E0 A0 80     last EF BF BF
1118             //   excluding surrogates codepoints  \u{d800} to  \u{dfff}
1119             //               ED A0 80 to       ED BF BF
1120             // 4-byte encoding is for codepoints \u{1000}0 to \u{10ff}ff
1121             //        first  F0 90 80 80  last F4 8F BF BF
1122             //
1123             // Use the UTF-8 syntax from the RFC
1124             //
1125             // https://tools.ietf.org/html/rfc3629
1126             // UTF8-1      = %x00-7F
1127             // UTF8-2      = %xC2-DF UTF8-tail
1128             // UTF8-3      = %xE0 %xA0-BF UTF8-tail / %xE1-EC 2( UTF8-tail ) /
1129             //               %xED %x80-9F UTF8-tail / %xEE-EF 2( UTF8-tail )
1130             // UTF8-4      = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
1131             //               %xF4 %x80-8F 2( UTF8-tail )
1132             match w {
1133                 2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
1134                 3 => {
1135                     match (first, second, next!() & !CONT_MASK) {
1136                         (0xE0         , 0xA0 ... 0xBF, TAG_CONT_U8) |
1137                         (0xE1 ... 0xEC, 0x80 ... 0xBF, TAG_CONT_U8) |
1138                         (0xED         , 0x80 ... 0x9F, TAG_CONT_U8) |
1139                         (0xEE ... 0xEF, 0x80 ... 0xBF, TAG_CONT_U8) => {}
1140                         _ => err!()
1141                     }
1142                 }
1143                 4 => {
1144                     match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
1145                         (0xF0         , 0x90 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
1146                         (0xF1 ... 0xF3, 0x80 ... 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
1147                         (0xF4         , 0x80 ... 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
1148                         _ => err!()
1149                     }
1150                 }
1151                 _ => err!()
1152             }
1153         }
1154     }
1155 }
1156
1157 // https://tools.ietf.org/html/rfc3629
1158 static UTF8_CHAR_WIDTH: [u8; 256] = [
1159 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1160 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1161 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1162 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1163 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1164 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1165 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1166 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
1167 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1168 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
1169 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1170 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
1171 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
1172 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
1173 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
1174 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
1175 ];
1176
1177 /// Struct that contains a `char` and the index of the first byte of
1178 /// the next `char` in a string.  This can be used as a data structure
1179 /// for iterating over the UTF-8 bytes of a string.
1180 #[derive(Copy)]
1181 #[unstable(feature = "str_char",
1182            reason = "existence of this struct is uncertain as it is frequently \
1183                      able to be replaced with char.len_utf8() and/or \
1184                      char/char_indices iterators")]
1185 pub struct CharRange {
1186     /// Current `char`
1187     pub ch: char,
1188     /// Index of the first byte of the next `char`
1189     pub next: usize,
1190 }
1191
1192 /// Mask of the value bits of a continuation byte
1193 const CONT_MASK: u8 = 0b0011_1111;
1194 /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
1195 const TAG_CONT_U8: u8 = 0b1000_0000;
1196
1197 /*
1198 Section: Trait implementations
1199 */
1200
1201 mod traits {
1202     use cmp::{Ordering, Ord, PartialEq, PartialOrd, Eq};
1203     use cmp::Ordering::{Less, Equal, Greater};
1204     use iter::IteratorExt;
1205     use option::Option;
1206     use option::Option::Some;
1207     use ops;
1208     use str::{StrExt, eq_slice};
1209
1210     #[stable(feature = "rust1", since = "1.0.0")]
1211     impl Ord for str {
1212         #[inline]
1213         fn cmp(&self, other: &str) -> Ordering {
1214             for (s_b, o_b) in self.bytes().zip(other.bytes()) {
1215                 match s_b.cmp(&o_b) {
1216                     Greater => return Greater,
1217                     Less => return Less,
1218                     Equal => ()
1219                 }
1220             }
1221
1222             self.len().cmp(&other.len())
1223         }
1224     }
1225
1226     #[stable(feature = "rust1", since = "1.0.0")]
1227     impl PartialEq for str {
1228         #[inline]
1229         fn eq(&self, other: &str) -> bool {
1230             eq_slice(self, other)
1231         }
1232         #[inline]
1233         fn ne(&self, other: &str) -> bool { !(*self).eq(other) }
1234     }
1235
1236     #[stable(feature = "rust1", since = "1.0.0")]
1237     impl Eq for str {}
1238
1239     #[stable(feature = "rust1", since = "1.0.0")]
1240     impl PartialOrd for str {
1241         #[inline]
1242         fn partial_cmp(&self, other: &str) -> Option<Ordering> {
1243             Some(self.cmp(other))
1244         }
1245     }
1246
1247     /// Returns a slice of the given string from the byte range
1248     /// [`begin`..`end`).
1249     ///
1250     /// This operation is `O(1)`.
1251     ///
1252     /// Panics when `begin` and `end` do not point to valid characters
1253     /// or point beyond the last character of the string.
1254     ///
1255     /// # Examples
1256     ///
1257     /// ```
1258     /// let s = "Löwe 老虎 Léopard";
1259     /// assert_eq!(&s[0 .. 1], "L");
1260     ///
1261     /// assert_eq!(&s[1 .. 9], "öwe 老");
1262     ///
1263     /// // these will panic:
1264     /// // byte 2 lies within `ö`:
1265     /// // &s[2 ..3];
1266     ///
1267     /// // byte 8 lies within `老`
1268     /// // &s[1 .. 8];
1269     ///
1270     /// // byte 100 is outside the string
1271     /// // &s[3 .. 100];
1272     /// ```
1273     #[stable(feature = "rust1", since = "1.0.0")]
1274     impl ops::Index<ops::Range<usize>> for str {
1275         type Output = str;
1276         #[inline]
1277         fn index(&self, index: &ops::Range<usize>) -> &str {
1278             // is_char_boundary checks that the index is in [0, .len()]
1279             if index.start <= index.end &&
1280                self.is_char_boundary(index.start) &&
1281                self.is_char_boundary(index.end) {
1282                 unsafe { self.slice_unchecked(index.start, index.end) }
1283             } else {
1284                 super::slice_error_fail(self, index.start, index.end)
1285             }
1286         }
1287     }
1288
1289     /// Returns a slice of the string from the beginning to byte
1290     /// `end`.
1291     ///
1292     /// Equivalent to `self[0 .. end]`.
1293     ///
1294     /// Panics when `end` does not point to a valid character, or is
1295     /// out of bounds.
1296     #[stable(feature = "rust1", since = "1.0.0")]
1297     impl ops::Index<ops::RangeTo<usize>> for str {
1298         type Output = str;
1299         #[inline]
1300         fn index(&self, index: &ops::RangeTo<usize>) -> &str {
1301             // is_char_boundary checks that the index is in [0, .len()]
1302             if self.is_char_boundary(index.end) {
1303                 unsafe { self.slice_unchecked(0, index.end) }
1304             } else {
1305                 super::slice_error_fail(self, 0, index.end)
1306             }
1307         }
1308     }
1309
1310     /// Returns a slice of the string from `begin` to its end.
1311     ///
1312     /// Equivalent to `self[begin .. self.len()]`.
1313     ///
1314     /// Panics when `begin` does not point to a valid character, or is
1315     /// out of bounds.
1316     #[stable(feature = "rust1", since = "1.0.0")]
1317     impl ops::Index<ops::RangeFrom<usize>> for str {
1318         type Output = str;
1319         #[inline]
1320         fn index(&self, index: &ops::RangeFrom<usize>) -> &str {
1321             // is_char_boundary checks that the index is in [0, .len()]
1322             if self.is_char_boundary(index.start) {
1323                 unsafe { self.slice_unchecked(index.start, self.len()) }
1324             } else {
1325                 super::slice_error_fail(self, index.start, self.len())
1326             }
1327         }
1328     }
1329
1330     #[stable(feature = "rust1", since = "1.0.0")]
1331     impl ops::Index<ops::RangeFull> for str {
1332         type Output = str;
1333         #[inline]
1334         fn index(&self, _index: &ops::RangeFull) -> &str {
1335             self
1336         }
1337     }
1338 }
1339
1340 /// Any string that can be represented as a slice
1341 #[unstable(feature = "core",
1342            reason = "Instead of taking this bound generically, this trait will be \
1343                      replaced with one of slicing syntax (&foo[..]), deref coercions, or \
1344                      a more generic conversion trait")]
1345 pub trait Str {
1346     /// Work with `self` as a slice.
1347     fn as_slice<'a>(&'a self) -> &'a str;
1348 }
1349
1350 impl Str for str {
1351     #[inline]
1352     fn as_slice<'a>(&'a self) -> &'a str { self }
1353 }
1354
1355 impl<'a, S: ?Sized> Str for &'a S where S: Str {
1356     #[inline]
1357     fn as_slice(&self) -> &str { Str::as_slice(*self) }
1358 }
1359
1360 /// Return type of `StrExt::split`
1361 #[stable(feature = "rust1", since = "1.0.0")]
1362 pub struct Split<'a, P: Pattern<'a>>(CharSplits<'a, P>);
1363 #[stable(feature = "rust1", since = "1.0.0")]
1364 impl<'a, P: Pattern<'a>> Iterator for Split<'a, P> {
1365     type Item = &'a str;
1366
1367     #[inline]
1368     fn next(&mut self) -> Option<&'a str> {
1369         self.0.next()
1370     }
1371 }
1372 #[stable(feature = "rust1", since = "1.0.0")]
1373 impl<'a, P: Pattern<'a>> DoubleEndedIterator for Split<'a, P>
1374 where P::Searcher: DoubleEndedSearcher<'a> {
1375     #[inline]
1376     fn next_back(&mut self) -> Option<&'a str> {
1377         self.0.next_back()
1378     }
1379 }
1380
1381 /// Return type of `StrExt::split_terminator`
1382 #[stable(feature = "rust1", since = "1.0.0")]
1383 pub struct SplitTerminator<'a, P: Pattern<'a>>(CharSplits<'a, P>);
1384 delegate_iter!{pattern &'a str : SplitTerminator<'a, P>}
1385
1386 /// Return type of `StrExt::splitn`
1387 #[stable(feature = "rust1", since = "1.0.0")]
1388 pub struct SplitN<'a, P: Pattern<'a>>(CharSplitsN<'a, P>);
1389 delegate_iter!{pattern forward &'a str : SplitN<'a, P>}
1390
1391 /// Return type of `StrExt::rsplit`
1392 #[stable(feature = "rust1", since = "1.0.0")]
1393 pub struct RSplit<'a, P: Pattern<'a>>(RCharSplits<'a, P>);
1394 delegate_iter!{pattern reverse &'a str : RSplit<'a, P>}
1395
1396 /// Return type of `StrExt::rsplitn`
1397 #[stable(feature = "rust1", since = "1.0.0")]
1398 pub struct RSplitN<'a, P: Pattern<'a>>(CharSplitsN<'a, P>);
1399 delegate_iter!{pattern forward &'a str : RSplitN<'a, P>}
1400
1401 /// Methods for string slices
1402 #[allow(missing_docs)]
1403 pub trait StrExt {
1404     // NB there are no docs here are they're all located on the StrExt trait in
1405     // libcollections, not here.
1406
1407     fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool;
1408     fn contains_char<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool;
1409     fn chars<'a>(&'a self) -> Chars<'a>;
1410     fn bytes<'a>(&'a self) -> Bytes<'a>;
1411     fn char_indices<'a>(&'a self) -> CharIndices<'a>;
1412     fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P>;
1413     fn splitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> SplitN<'a, P>;
1414     fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P>;
1415     fn rsplit<'a, P: Pattern<'a>>(&'a self, pat: P) -> RSplit<'a, P>
1416         where P::Searcher: ReverseSearcher<'a>;
1417     fn rsplitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> RSplitN<'a, P>;
1418     fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P>;
1419     #[allow(deprecated) /* for SplitStr */]
1420     fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P>;
1421     fn lines<'a>(&'a self) -> Lines<'a>;
1422     fn lines_any<'a>(&'a self) -> LinesAny<'a>;
1423     fn char_len(&self) -> usize;
1424     fn slice_chars<'a>(&'a self, begin: usize, end: usize) -> &'a str;
1425     unsafe fn slice_unchecked<'a>(&'a self, begin: usize, end: usize) -> &'a str;
1426     fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool;
1427     fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool
1428         where P::Searcher: ReverseSearcher<'a>;
1429     fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str
1430         where P::Searcher: DoubleEndedSearcher<'a>;
1431     fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str;
1432     fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str
1433         where P::Searcher: ReverseSearcher<'a>;
1434     fn is_char_boundary(&self, index: usize) -> bool;
1435     fn char_range_at(&self, start: usize) -> CharRange;
1436     fn char_range_at_reverse(&self, start: usize) -> CharRange;
1437     fn char_at(&self, i: usize) -> char;
1438     fn char_at_reverse(&self, i: usize) -> char;
1439     fn as_bytes<'a>(&'a self) -> &'a [u8];
1440     fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize>;
1441     fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize>
1442         where P::Searcher: ReverseSearcher<'a>;
1443     fn find_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize>;
1444     fn slice_shift_char<'a>(&'a self) -> Option<(char, &'a str)>;
1445     fn subslice_offset(&self, inner: &str) -> usize;
1446     fn as_ptr(&self) -> *const u8;
1447     fn len(&self) -> usize;
1448     fn is_empty(&self) -> bool;
1449     fn parse<T: FromStr>(&self) -> Result<T, T::Err>;
1450 }
1451
1452 #[inline(never)]
1453 fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! {
1454     assert!(begin <= end);
1455     panic!("index {} and/or {} in `{}` do not lie on character boundary",
1456           begin, end, s);
1457 }
1458
1459 impl StrExt for str {
1460     #[inline]
1461     fn contains<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool {
1462         pat.is_contained_in(self)
1463     }
1464
1465     #[inline]
1466     fn contains_char<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool {
1467         pat.is_contained_in(self)
1468     }
1469
1470     #[inline]
1471     fn chars(&self) -> Chars {
1472         Chars{iter: self.as_bytes().iter()}
1473     }
1474
1475     #[inline]
1476     fn bytes(&self) -> Bytes {
1477         Bytes(self.as_bytes().iter().map(BytesDeref))
1478     }
1479
1480     #[inline]
1481     fn char_indices(&self) -> CharIndices {
1482         CharIndices { front_offset: 0, iter: self.chars() }
1483     }
1484
1485     #[inline]
1486     fn split<'a, P: Pattern<'a>>(&'a self, pat: P) -> Split<'a, P> {
1487         Split(CharSplits {
1488             start: 0,
1489             end: self.len(),
1490             matcher: pat.into_searcher(self),
1491             allow_trailing_empty: true,
1492             finished: false,
1493         })
1494     }
1495
1496     #[inline]
1497     fn splitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> SplitN<'a, P> {
1498         SplitN(CharSplitsN {
1499             iter: self.split(pat).0,
1500             count: count,
1501             invert: false,
1502         })
1503     }
1504
1505     #[inline]
1506     fn split_terminator<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitTerminator<'a, P> {
1507         SplitTerminator(CharSplits {
1508             allow_trailing_empty: false,
1509             ..self.split(pat).0
1510         })
1511     }
1512
1513     #[inline]
1514     fn rsplit<'a, P: Pattern<'a>>(&'a self, pat: P) -> RSplit<'a, P>
1515         where P::Searcher: ReverseSearcher<'a>
1516     {
1517         RSplit(RCharSplits {
1518             start: 0,
1519             end: self.len(),
1520             matcher: pat.into_searcher(self),
1521             allow_final_empty: true,
1522             finished: false,
1523         })
1524     }
1525
1526     #[inline]
1527     fn rsplitn<'a, P: Pattern<'a>>(&'a self, count: usize, pat: P) -> RSplitN<'a, P> {
1528         RSplitN(CharSplitsN {
1529             iter: self.split(pat).0,
1530             count: count,
1531             invert: true,
1532         })
1533     }
1534
1535     #[inline]
1536     fn match_indices<'a, P: Pattern<'a>>(&'a self, pat: P) -> MatchIndices<'a, P> {
1537         MatchIndices(pat.into_searcher(self))
1538     }
1539
1540     #[inline]
1541     #[allow(deprecated) /* for SplitStr */ ]
1542     fn split_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> SplitStr<'a, P> {
1543         SplitStr(self.split(pat))
1544     }
1545
1546     #[inline]
1547     fn lines(&self) -> Lines {
1548         Lines { inner: self.split_terminator('\n').0 }
1549     }
1550
1551     fn lines_any(&self) -> LinesAny {
1552         fn f(line: &str) -> &str {
1553             let l = line.len();
1554             if l > 0 && line.as_bytes()[l - 1] == b'\r' { &line[0 .. l - 1] }
1555             else { line }
1556         }
1557
1558         let f: fn(&str) -> &str = f; // coerce to fn pointer
1559         LinesAny { inner: self.lines().map(f) }
1560     }
1561
1562     #[inline]
1563     fn char_len(&self) -> usize { self.chars().count() }
1564
1565     fn slice_chars(&self, begin: usize, end: usize) -> &str {
1566         assert!(begin <= end);
1567         let mut count = 0;
1568         let mut begin_byte = None;
1569         let mut end_byte = None;
1570
1571         // This could be even more efficient by not decoding,
1572         // only finding the char boundaries
1573         for (idx, _) in self.char_indices() {
1574             if count == begin { begin_byte = Some(idx); }
1575             if count == end { end_byte = Some(idx); break; }
1576             count += 1;
1577         }
1578         if begin_byte.is_none() && count == begin { begin_byte = Some(self.len()) }
1579         if end_byte.is_none() && count == end { end_byte = Some(self.len()) }
1580
1581         match (begin_byte, end_byte) {
1582             (None, _) => panic!("slice_chars: `begin` is beyond end of string"),
1583             (_, None) => panic!("slice_chars: `end` is beyond end of string"),
1584             (Some(a), Some(b)) => unsafe { self.slice_unchecked(a, b) }
1585         }
1586     }
1587
1588     #[inline]
1589     unsafe fn slice_unchecked(&self, begin: usize, end: usize) -> &str {
1590         mem::transmute(Slice {
1591             data: self.as_ptr().offset(begin as int),
1592             len: end - begin,
1593         })
1594     }
1595
1596     #[inline]
1597     fn starts_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool {
1598         pat.is_prefix_of(self)
1599     }
1600
1601     #[inline]
1602     fn ends_with<'a, P: Pattern<'a>>(&'a self, pat: P) -> bool
1603         where P::Searcher: ReverseSearcher<'a>
1604     {
1605         pat.is_suffix_of(self)
1606     }
1607
1608     #[inline]
1609     fn trim_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str
1610         where P::Searcher: DoubleEndedSearcher<'a>
1611     {
1612         let mut i = 0;
1613         let mut j = 0;
1614         let mut matcher = pat.into_searcher(self);
1615         if let Some((a, b)) = matcher.next_reject() {
1616             i = a;
1617             j = b; // Rember earliest known match, correct it below if
1618                    // last match is different
1619         }
1620         if let Some((_, b)) = matcher.next_reject_back() {
1621             j = b;
1622         }
1623         unsafe {
1624             // Searcher is known to return valid indices
1625             self.slice_unchecked(i, j)
1626         }
1627     }
1628
1629     #[inline]
1630     fn trim_left_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str {
1631         let mut i = self.len();
1632         let mut matcher = pat.into_searcher(self);
1633         if let Some((a, _)) = matcher.next_reject() {
1634             i = a;
1635         }
1636         unsafe {
1637             // Searcher is known to return valid indices
1638             self.slice_unchecked(i, self.len())
1639         }
1640     }
1641
1642     #[inline]
1643     fn trim_right_matches<'a, P: Pattern<'a>>(&'a self, pat: P) -> &'a str
1644         where P::Searcher: ReverseSearcher<'a>
1645     {
1646         let mut j = 0;
1647         let mut matcher = pat.into_searcher(self);
1648         if let Some((_, b)) = matcher.next_reject_back() {
1649             j = b;
1650         }
1651         unsafe {
1652             // Searcher is known to return valid indices
1653             self.slice_unchecked(0, j)
1654         }
1655     }
1656
1657     #[inline]
1658     fn is_char_boundary(&self, index: usize) -> bool {
1659         if index == self.len() { return true; }
1660         match self.as_bytes().get(index) {
1661             None => false,
1662             Some(&b) => b < 128 || b >= 192,
1663         }
1664     }
1665
1666     #[inline]
1667     fn char_range_at(&self, i: usize) -> CharRange {
1668         let (c, n) = char_range_at_raw(self.as_bytes(), i);
1669         CharRange { ch: unsafe { mem::transmute(c) }, next: n }
1670     }
1671
1672     #[inline]
1673     fn char_range_at_reverse(&self, start: usize) -> CharRange {
1674         let mut prev = start;
1675
1676         prev = prev.saturating_sub(1);
1677         if self.as_bytes()[prev] < 128 {
1678             return CharRange{ch: self.as_bytes()[prev] as char, next: prev}
1679         }
1680
1681         // Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
1682         fn multibyte_char_range_at_reverse(s: &str, mut i: usize) -> CharRange {
1683             // while there is a previous byte == 10......
1684             while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 {
1685                 i -= 1;
1686             }
1687
1688             let first= s.as_bytes()[i];
1689             let w = UTF8_CHAR_WIDTH[first as usize];
1690             assert!(w != 0);
1691
1692             let mut val = utf8_first_byte(first, w as u32);
1693             val = utf8_acc_cont_byte(val, s.as_bytes()[i + 1]);
1694             if w > 2 { val = utf8_acc_cont_byte(val, s.as_bytes()[i + 2]); }
1695             if w > 3 { val = utf8_acc_cont_byte(val, s.as_bytes()[i + 3]); }
1696
1697             return CharRange {ch: unsafe { mem::transmute(val) }, next: i};
1698         }
1699
1700         return multibyte_char_range_at_reverse(self, prev);
1701     }
1702
1703     #[inline]
1704     fn char_at(&self, i: usize) -> char {
1705         self.char_range_at(i).ch
1706     }
1707
1708     #[inline]
1709     fn char_at_reverse(&self, i: usize) -> char {
1710         self.char_range_at_reverse(i).ch
1711     }
1712
1713     #[inline]
1714     fn as_bytes(&self) -> &[u8] {
1715         unsafe { mem::transmute(self) }
1716     }
1717
1718     fn find<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize> {
1719         pat.into_searcher(self).next_match().map(|(i, _)| i)
1720     }
1721
1722     fn rfind<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize>
1723         where P::Searcher: ReverseSearcher<'a>
1724     {
1725         pat.into_searcher(self).next_match_back().map(|(i, _)| i)
1726     }
1727
1728     fn find_str<'a, P: Pattern<'a>>(&'a self, pat: P) -> Option<usize> {
1729         self.find(pat)
1730     }
1731
1732     #[inline]
1733     fn slice_shift_char(&self) -> Option<(char, &str)> {
1734         if self.is_empty() {
1735             None
1736         } else {
1737             let ch = self.char_at(0);
1738             let next_s = unsafe { self.slice_unchecked(ch.len_utf8(), self.len()) };
1739             Some((ch, next_s))
1740         }
1741     }
1742
1743     fn subslice_offset(&self, inner: &str) -> usize {
1744         let a_start = self.as_ptr() as usize;
1745         let a_end = a_start + self.len();
1746         let b_start = inner.as_ptr() as usize;
1747         let b_end = b_start + inner.len();
1748
1749         assert!(a_start <= b_start);
1750         assert!(b_end <= a_end);
1751         b_start - a_start
1752     }
1753
1754     #[inline]
1755     fn as_ptr(&self) -> *const u8 {
1756         self.repr().data
1757     }
1758
1759     #[inline]
1760     fn len(&self) -> usize { self.repr().len }
1761
1762     #[inline]
1763     fn is_empty(&self) -> bool { self.len() == 0 }
1764
1765     #[inline]
1766     fn parse<T: FromStr>(&self) -> Result<T, T::Err> { FromStr::from_str(self) }
1767 }
1768
1769 /// Pluck a code point out of a UTF-8-like byte slice and return the
1770 /// index of the next code point.
1771 #[inline]
1772 #[unstable(feature = "core")]
1773 pub fn char_range_at_raw(bytes: &[u8], i: usize) -> (u32, usize) {
1774     if bytes[i] < 128 {
1775         return (bytes[i] as u32, i + 1);
1776     }
1777
1778     // Multibyte case is a fn to allow char_range_at to inline cleanly
1779     fn multibyte_char_range_at(bytes: &[u8], i: usize) -> (u32, usize) {
1780         let first = bytes[i];
1781         let w = UTF8_CHAR_WIDTH[first as usize];
1782         assert!(w != 0);
1783
1784         let mut val = utf8_first_byte(first, w as u32);
1785         val = utf8_acc_cont_byte(val, bytes[i + 1]);
1786         if w > 2 { val = utf8_acc_cont_byte(val, bytes[i + 2]); }
1787         if w > 3 { val = utf8_acc_cont_byte(val, bytes[i + 3]); }
1788
1789         return (val, i + w as usize);
1790     }
1791
1792     multibyte_char_range_at(bytes, i)
1793 }
1794
1795 #[stable(feature = "rust1", since = "1.0.0")]
1796 impl<'a> Default for &'a str {
1797     #[stable(feature = "rust1", since = "1.0.0")]
1798     fn default() -> &'a str { "" }
1799 }
1800
1801 #[stable(feature = "rust1", since = "1.0.0")]
1802 impl<'a> Iterator for Lines<'a> {
1803     type Item = &'a str;
1804
1805     #[inline]
1806     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
1807     #[inline]
1808     fn size_hint(&self) -> (usize, Option<usize>) { self.inner.size_hint() }
1809 }
1810
1811 #[stable(feature = "rust1", since = "1.0.0")]
1812 impl<'a> DoubleEndedIterator for Lines<'a> {
1813     #[inline]
1814     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
1815 }
1816
1817 #[stable(feature = "rust1", since = "1.0.0")]
1818 impl<'a> Iterator for LinesAny<'a> {
1819     type Item = &'a str;
1820
1821     #[inline]
1822     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
1823     #[inline]
1824     fn size_hint(&self) -> (usize, Option<usize>) { self.inner.size_hint() }
1825 }
1826
1827 #[stable(feature = "rust1", since = "1.0.0")]
1828 impl<'a> DoubleEndedIterator for LinesAny<'a> {
1829     #[inline]
1830     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
1831 }