src/libunicode/u_str.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 /*!
  14  * Unicode-intensive string manipulations.
  15  *
  16  * This module provides functionality to `str` that requires the Unicode
  17  * methods provided by the UnicodeChar trait.
  18  */
  19
  20 use self::GraphemeState::*;
  21 use core::cmp;
  22 use core::slice::SlicePrelude;
  23 use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
  24 use core::iter::{DoubleEndedIterator, DoubleEndedIteratorExt};
  25 use core::kinds::Sized;
  26 use core::option::{Option, None, Some};
  27 use core::str::{CharSplits, StrPrelude};
  28 use u_char::UnicodeChar;
  29 use tables::grapheme::GraphemeCat;
  30
  31 /// An iterator over the words of a string, i.e. the non-empty substrings
     /// separated by runs of whitespace.
     ///
     /// This is the type returned by `UnicodeStrPrelude::words`: a `Filter`
     /// wrapped around a `CharSplits` that splits on a `|char| -> bool` predicate.
     ///
  32 /// FIXME: This should be opaque
  33 pub type Words<'a> =
  34     Filter<'a, &'a str, CharSplits<'a, |char|:'a -> bool>>;
  35
  36 /// Methods for Unicode string slices
     ///
     /// These are the string operations that need Unicode character data:
     /// grapheme-cluster iteration, whitespace-based splitting and trimming,
     /// character-class tests and display width. The trait is implemented
     /// for `str` in this module.
  37 pub trait UnicodeStrPrelude for Sized? {
  38     /// Returns an iterator over the
  39     /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
  40     /// of the string.
  41     ///
  42     /// If `is_extended` is true, the iterator is over the *extended grapheme clusters*;
  43     /// otherwise, the iterator is over the *legacy grapheme clusters*.
  44     /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
  45     /// recommends extended grapheme cluster boundaries for general processing.
  46     ///
  47     /// # Example
  48     ///
  49     /// ```rust
  50     /// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::<Vec<&str>>();
  51     /// let b: &[_] = &["a\u0310", "e\u0301", "o\u0308\u0332"];
  52     /// assert_eq!(gr1.as_slice(), b);
  53     /// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::<Vec<&str>>();
  54     /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺🇸🇹"];
  55     /// assert_eq!(gr2.as_slice(), b);
  56     /// ```
  57     fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
  58
  59     /// Returns an iterator over the grapheme clusters of self and their byte offsets.
  60     /// See `graphemes()` method for more information.
  61     ///
  62     /// # Example
  63     ///
  64     /// ```rust
  65     /// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::<Vec<(uint, &str)>>();
  66     /// let b: &[_] = &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
  67     /// assert_eq!(gr_inds.as_slice(), b);
  68     /// ```
  69     fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
  70
  71     /// An iterator over the words of a string (subsequences separated
  72     /// by any sequence of whitespace). Sequences of whitespace are
  73     /// collapsed, so empty "words" are not included.
  74     ///
  75     /// # Example
  76     ///
  77     /// ```rust
  78     /// let some_words = " Mary   had\ta little  \n\t lamb";
  79     /// let v: Vec<&str> = some_words.words().collect();
  80     /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
  81     /// ```
  82     fn words<'a>(&'a self) -> Words<'a>;
  83
  84     /// Returns true if the string contains only whitespace.
  85     ///
  86     /// Whitespace characters are determined by `char::is_whitespace`.
  87     ///
  88     /// # Example
  89     ///
  90     /// ```rust
  91     /// assert!(" \t\n".is_whitespace());
  92     /// assert!("".is_whitespace());
  93     ///
  94     /// assert!( !"abc".is_whitespace());
  95     /// ```
  96     fn is_whitespace(&self) -> bool;
  97
  98     /// Returns true if the string contains only alphanumeric code
  99     /// points.
 100     ///
 101     /// Alphanumeric characters are determined by `char::is_alphanumeric`.
 102     ///
 103     /// # Example
 104     ///
 105     /// ```rust
 106     /// assert!("Löwe老虎Léopard123".is_alphanumeric());
 107     /// assert!("".is_alphanumeric());
 108     ///
 109     /// assert!( !" &*~".is_alphanumeric());
 110     /// ```
 111     fn is_alphanumeric(&self) -> bool;
 112
 113     /// Returns a string's displayed width in columns, treating control
 114     /// characters as zero-width.
 115     ///
         /// In the `str` implementation the result is the sum of the per-character
         /// widths reported by `char`'s `width` method; a character for which that
         /// method returns `None` contributes zero columns.
         ///
 116     /// `is_cjk` determines behavior for characters in the Ambiguous category:
 117     /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
 118     /// In CJK locales, `is_cjk` should be `true`, else it should be `false`.
 119     /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
 120     /// recommends that these characters be treated as 1 column (i.e.,
 121     /// `is_cjk` = `false`) if the locale is unknown.
 122     fn width(&self, is_cjk: bool) -> uint;
 123
 124     /// Returns a string with leading and trailing whitespace removed.
         ///
         /// Whitespace characters are determined by `char::is_whitespace`.
         /// The result is a slice borrowed from `self`.
         ///
         /// # Example
         ///
         /// ```rust
         /// assert_eq!(" \t hello world\n".trim(), "hello world");
         /// ```
 125     fn trim<'a>(&'a self) -> &'a str;
 126
 127     /// Returns a string with leading whitespace removed.
         ///
         /// Whitespace characters are determined by `char::is_whitespace`.
         /// The result is a slice borrowed from `self`.
         ///
         /// # Example
         ///
         /// ```rust
         /// assert_eq!(" \t hello world\n".trim_left(), "hello world\n");
         /// ```
 128     fn trim_left<'a>(&'a self) -> &'a str;
 129
 130     /// Returns a string with trailing whitespace removed.
         ///
         /// Whitespace characters are determined by `char::is_whitespace`.
         /// The result is a slice borrowed from `self`.
         ///
         /// # Example
         ///
         /// ```rust
         /// assert_eq!(" \t hello world\n".trim_right(), " \t hello world");
         /// ```
 131     fn trim_right<'a>(&'a self) -> &'a str;
 132 }
 133
 134 impl UnicodeStrPrelude for str {
 135     #[inline]
 136     fn graphemes(&self, is_extended: bool) -> Graphemes {
 137         Graphemes { string: self, extended: is_extended, cat: None, catb: None }
 138     }
 139
 140     #[inline]
 141     fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
 142         GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) }
 143     }
 144
 145     #[inline]
 146     fn words(&self) -> Words {
 147         let f = |c: char| c.is_whitespace();
 148         self.split(f).filter(|s| !s.is_empty())
 149     }
 150
 151     #[inline]
 152     fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
 153
 154     #[inline]
 155     fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
 156
 157     #[inline]
 158     fn width(&self, is_cjk: bool) -> uint {
 159         self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
 160     }
 161
 162     #[inline]
 163     fn trim(&self) -> &str {
 164         self.trim_left().trim_right()
 165     }
 166
 167     #[inline]
 168     fn trim_left(&self) -> &str {
 169         self.trim_left_chars(|c: char| c.is_whitespace())
 170     }
 171
 172     #[inline]
 173     fn trim_right(&self) -> &str {
 174         self.trim_right_chars(|c: char| c.is_whitespace())
 175     }
 176 }
 177
 178 /// External iterator for grapheme clusters and byte offsets.
     ///
     /// Created by `UnicodeStrPrelude::grapheme_indices`. Yields
     /// `(byte_offset, cluster)` pairs, where the offset is relative to the start
     /// of the string the iterator was created from.
 179 #[deriving(Clone)]
 180 pub struct GraphemeIndices<'a> {
         // Address of the first byte of the original string, stored as an integer.
         // Subtracting it from a cluster's address gives that cluster's byte offset.
 181     start_offset: uint,
         // The underlying cluster iterator that does the actual segmentation.
 182     iter: Graphemes<'a>,
 183 }
 184
 185 impl<'a> Iterator<(uint, &'a str)> for GraphemeIndices<'a> {
 186     #[inline]
 187     fn next(&mut self) -> Option<(uint, &'a str)> {
 188         self.iter.next().map(|s| (s.as_ptr() as uint - self.start_offset, s))
 189     }
 190
 191     #[inline]
 192     fn size_hint(&self) -> (uint, Option<uint>) {
 193         self.iter.size_hint()
 194     }
 195 }
 196
 197 impl<'a> DoubleEndedIterator<(uint, &'a str)> for GraphemeIndices<'a> {
 198     #[inline]
 199     fn next_back(&mut self) -> Option<(uint, &'a str)> {
 200         self.iter.next_back().map(|s| (s.as_ptr() as uint - self.start_offset, s))
 201     }
 202 }
 203
 204 /// External iterator for a string's
 205 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
     ///
     /// Created by `UnicodeStrPrelude::graphemes`. Clusters can be taken from
     /// either end; each one is a sub-slice of the original string.
 206 #[deriving(Clone)]
 207 pub struct Graphemes<'a> {
         // The part of the input that has not been yielded yet. `next` removes
         // clusters from its front and `next_back` from its back.
 208     string: &'a str,
         // `true`: extended grapheme clusters (SpacingMark also extends a cluster);
         // `false`: legacy grapheme clusters.
 209     extended: bool,
         // Category of the first character of `string`, if `next` already looked it
         // up while deciding where the previous cluster ended.
 210     cat: Option<GraphemeCat>,
         // Category of the last character of `string`, if `next_back` already
         // looked it up while deciding where the previous cluster started.
 211     catb: Option<GraphemeCat>,
 212 }
 213
 214 // state machine for cluster boundary rules
     //
     // `Graphemes::next` runs the machine left to right and `next_back` runs it
     // right to left, so most states mean slightly different things in the two
     // directions; both meanings are given on each variant.
 215 #[deriving(PartialEq,Eq)]
 216 enum GraphemeState {
         // No character of the current cluster has been examined yet.
 217     Start,
         // Forward: the cluster has at least one character and only extending
         // characters may still be appended; anything else ends it.
         // Backward: only extending characters have been collected so far and the
         // character they attach to is still being looked for.
 218     FindExtend,
         // Forward: the last character taken was a Hangul L.
         // Backward: the character to the right is L, LV or LVT, so only an L may
         // still join on the left.
 219     HangulL,
         // Forward: the last character taken was a Hangul LV or V.
         // Backward: the character to the right is a V.
 220     HangulLV,
         // Forward: the last character taken was a Hangul LVT or T.
         // Backward: the character to the right is a T.
 221     HangulLVT,
         // Inside a run of regional indicator characters (either direction).
 222     Regional,
 223 }
 224
     // Forward iteration: each call to `next` splits one grapheme cluster off the
     // front of `self.string` and returns it.
 225 impl<'a> Iterator<&'a str> for Graphemes<'a> {
         // A non-empty string yields at least one cluster, and every cluster is at
         // least one byte long, so the count lies between min(len, 1) and len.
 226     #[inline]
 227     fn size_hint(&self) -> (uint, Option<uint>) {
 228         let slen = self.string.len();
 229         (cmp::min(slen, 1u), Some(slen))
 230     }
 231
         // Returns the first grapheme cluster of the remaining string, or `None`
         // once the string is exhausted.
         //
         // The characters are scanned from the left while a small state machine
         // tracks which boundary rule applies. The scan stops (`break`) at the
         // first character that settles the boundary; if that character belongs
         // to the *next* cluster, its category is stored in `self.cat` so the
         // following call does not have to look it up again.
 232     #[inline]
 233     fn next(&mut self) -> Option<&'a str> {
 234         use tables::grapheme as gr;
 235         if self.string.len() == 0 {
 236             return None;
 237         }
 238
             // take_curr: does the character at `idx` belong to the cluster being
             //            returned (true), or does it start the next one (false)?
             // idx:       byte offset of the most recently examined character;
             //            after the loop it is turned into the cluster's end offset.
             // cat:       category of the most recently examined character.
 239         let mut take_curr = true;
 240         let mut idx = 0;
 241         let mut state = Start;
 242         let mut cat = gr::GC_Any;
 243         for (curr, ch) in self.string.char_indices() {
 244             idx = curr;
 245
 246             // retrieve cached category, if any
 247             // We do this because most of the time we would end up
 248             // looking up each character twice.
                 // (`take()` empties the cache, so it is used for one character only:
                 // the first one of this call.)
 249             cat = match self.cat {
 250                 None => gr::grapheme_category(ch),
 251                 _ => self.cat.take().unwrap()
 252             };
 253
                 // Extending characters (and spacing marks, in extended mode) always
                 // attach to whatever precedes them, regardless of the current state.
 254             if match cat {
 255                 gr::GC_Extend => true,
 256                 gr::GC_SpacingMark if self.extended => true,
 257                 _ => false
 258             } {
 259                     state = FindExtend;     // rule GB9/GB9a
 260                     continue;
 261             }
 262
                 // Arms that evaluate to a state keep scanning; arms that `break`
                 // have found the boundary; `continue` keeps the current state.
 263             state = match state {
 264                 Start if '\r' == ch => {
                         // '\r' is a single byte, so `idx + 1` is the offset of the
                         // next character (if there is one).
 265                     let slen = self.string.len();
 266                     let nidx = idx + 1;
 267                     if nidx != slen && self.string.char_at(nidx) == '\n' {
 268                         idx = nidx;             // rule GB3
 269                     }
 270                     break;                      // rule GB4
 271                 }
                     // First character of the cluster: pick the state from its category.
 272                 Start => match cat {
                         // a control character is a cluster by itself
 273                     gr::GC_Control => break,
 274                     gr::GC_L => HangulL,
 275                     gr::GC_LV | gr::GC_V => HangulLV,
 276                     gr::GC_LVT | gr::GC_T => HangulLVT,
 277                     gr::GC_RegionalIndicator => Regional,
 278                     _ => FindExtend
 279                 },
 280                 FindExtend => {         // found non-extending when looking for extending
 281                     take_curr = false;
 282                     break;
 283                 },
 284                 HangulL => match cat {      // rule GB6: L x (L|V|LV|LVT)
 285                     gr::GC_L => continue,
 286                     gr::GC_LV | gr::GC_V => HangulLV,
 287                     gr::GC_LVT => HangulLVT,
 288                     _ => {
 289                         take_curr = false;
 290                         break;
 291                     }
 292                 },
 293                 HangulLV => match cat {     // rule GB7: (LV|V) x (V|T)
 294                     gr::GC_V => continue,
 295                     gr::GC_T => HangulLVT,
 296                     _ => {
 297                         take_curr = false;
 298                         break;
 299                     }
 300                 },
 301                 HangulLVT => match cat {    // rule GB8: (LVT|T) x T
 302                     gr::GC_T => continue,
 303                     _ => {
 304                         take_curr = false;
 305                         break;
 306                     }
 307                 },
 308                 Regional => match cat {     // rule GB8a
 309                     gr::GC_RegionalIndicator => continue,
 310                     _ => {
 311                         take_curr = false;
 312                         break;
 313                     }
 314                 }
 315             }
 316         }
 317
             // If the character at `idx` is part of this cluster (this includes the
             // case where the loop ran off the end of the string), move `idx` past
             // it. Otherwise `idx` already is the boundary, and the category that
             // was just looked up is kept for the next call.
 318         self.cat = if take_curr {
 319             idx = self.string.char_range_at(idx).next;
 320             None
 321         } else {
 322             Some(cat)
 323         };
 324
             // Split at the boundary: return the front part, keep the rest.
 325         let retstr = self.string.slice_to(idx);
 326         self.string = self.string.slice_from(idx);
 327         Some(retstr)
 328     }
 329 }
 330
     // Backward iteration: each call to `next_back` splits one grapheme cluster
     // off the back of `self.string` and returns it.
 331 impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> {
         // Returns the last grapheme cluster of the remaining string, or `None`
         // once the string is exhausted.
         //
         // The characters are scanned from the right. The scan stops (`break`) at
         // the first character that settles the boundary; if that character
         // belongs to the cluster *before* the one being returned, its category is
         // stored in `self.catb` so the following call can reuse it.
 332     #[inline]
 333     fn next_back(&mut self) -> Option<&'a str> {
 334         use tables::grapheme as gr;
 335         if self.string.len() == 0 {
 336             return None;
 337         }
 338
             // take_curr: does the character at `idx` belong to the cluster being
             //            returned (true), or is it left for the next call (false)?
             // idx:       byte offset of the most recently examined character.
             // previdx:   byte offset of the character examined just before it,
             //            i.e. its right-hand neighbour (initially the string's
             //            length); this is the cluster start when the character
             //            at `idx` is rejected.
 339         let mut take_curr = true;
 340         let mut idx = self.string.len();
 341         let mut previdx = idx;
 342         let mut state = Start;
 343         let mut cat = gr::GC_Any;
 344         for (curr, ch) in self.string.char_indices().rev() {
 345             previdx = idx;
 346             idx = curr;
 347
 348             // cached category, if any
                 // (`take()` empties the cache, so it is used for one character only:
                 // the first one examined in this call, i.e. the last in the string.)
 349             cat = match self.catb {
 350                 None => gr::grapheme_category(ch),
 351                 _ => self.catb.take().unwrap()
 352             };
 353
 354             // a matching state machine that runs *backwards* across an input string
 355             // note that this has some implications for the Hangul matching, since
 356             // we now need to know what the rightward letter is:
 357             //
 358             // Right to left, we have:
 359             //      L x L
 360             //      V x (L|V|LV)
 361             //      T x (V|T|LV|LVT)
 362             // HangulL means the letter to the right is L
 363             // HangulLV means the letter to the right is V
 364             // HangulLVT means the letter to the right is T
                 //
                 // Arms that evaluate to a state keep scanning; arms that `break`
                 // have found the boundary; `continue` keeps the current state.
 365             state = match state {
 366                 Start if '\n' == ch => {
                         // `char_at_reverse(idx)` is the character that ends at `idx`,
                         // i.e. the one just before the '\n'. '\r' is a single byte,
                         // so stepping back by one lands on its start.
 367                     if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
 368                         idx -= 1;       // rule GB3
 369                     }
 370                     break;              // rule GB4
 371                 },
                     // Either nothing has been taken yet, or only extending characters
                     // have; the current character decides what kind of cluster this is.
 372                 Start | FindExtend => match cat {
 373                     gr::GC_Extend => FindExtend,
 374                     gr::GC_SpacingMark if self.extended => FindExtend,
                         // L, LV and LVT can only be preceded by an L
 375                     gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
 376                     gr::GC_V => HangulLV,
 377                     gr::GC_T => HangulLVT,
 378                     gr::GC_RegionalIndicator => Regional,
 379                     gr::GC_Control => {
                             // A control character forms a cluster only when it is the
                             // first character examined. If extending characters were
                             // already collected to its right, they do not attach to
                             // it: they are returned alone and the control character
                             // is left for the next call.
 380                         take_curr = Start == state;
 381                         break;
 382                     },
                         // any other character is the base of the cluster: take it and stop
 383                     _ => break
 384                 },
 385                 HangulL => match cat {      // char to right is an L
 386                     gr::GC_L => continue,               // L x L is the only legal match
 387                     _ => {
 388                         take_curr = false;
 389                         break;
 390                     }
 391                 },
 392                 HangulLV => match cat {     // char to right is a V
 393                     gr::GC_V => continue,               // V x V, right char is still V
 394                     gr::GC_L | gr::GC_LV => HangulL,    // (L|V) x V, right char is now L
 395                     _ => {
 396                         take_curr = false;
 397                         break;
 398                     }
 399                 },
 400                 HangulLVT => match cat {    // char to right is a T
 401                     gr::GC_T => continue,               // T x T, right char is still T
 402                     gr::GC_V => HangulLV,               // V x T, right char is now V
 403                     gr::GC_LV | gr::GC_LVT => HangulL,  // (LV|LVT) x T, right char is now L
 404                     _ => {
 405                         take_curr = false;
 406                         break;
 407                     }
 408                 },
 409                 Regional => match cat {     // rule GB8a
 410                     gr::GC_RegionalIndicator => continue,
 411                     _ => {
 412                         take_curr = false;
 413                         break;
 414                     }
 415                 }
 416             }
 417         }
 418
             // If the character at `idx` is part of this cluster (this includes the
             // case where the loop reached the start of the string), `idx` already
             // is the cluster start. Otherwise the cluster starts at the previously
             // examined character, and the category that was just looked up is kept
             // for the next call.
 419         self.catb = if take_curr {
 420             None
 421         } else  {
 422             idx = previdx;
 423             Some(cat)
 424         };
 425
             // Split at the boundary: return the back part, keep the rest.
 426         let retstr = self.string.slice_from(idx);
 427         self.string = self.string.slice_to(idx);
 428         Some(retstr)
 429     }
 430 }