src/libunicode/u_str.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! Unicode-intensive string manipulations.
  14 //!
  15 //! This module provides functionality to `str` that requires the Unicode methods provided by the
  16 //! UnicodeChar trait.
  17
  18 use self::GraphemeState::*;
  19 use core::prelude::*;
  20
  21 use core::char;
  22 use core::cmp;
  23 use core::iter::{Filter, AdditiveIterator};
  24 use core::mem;
  25 use core::num::Int;
  26 use core::slice;
  27 use core::str::Split;
  28
  29 use u_char::UnicodeChar;
  30 use tables::grapheme::GraphemeCat;
  31
  32 /// An iterator over the words of a string, separated by a sequence of whitespace
  33 #[stable]
  34 pub struct Words<'a> {
  35     inner: Filter<&'a str, Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
  36 }
  37
  38 /// Methods for Unicode string slices
  39 #[allow(missing_docs)] // docs in libcollections
  40 pub trait UnicodeStr for Sized? {
  41     fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
  42     fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
  43     fn words<'a>(&'a self) -> Words<'a>;
  44     fn is_whitespace(&self) -> bool;
  45     fn is_alphanumeric(&self) -> bool;
  46     fn width(&self, is_cjk: bool) -> uint;
  47     fn trim<'a>(&'a self) -> &'a str;
  48     fn trim_left<'a>(&'a self) -> &'a str;
  49     fn trim_right<'a>(&'a self) -> &'a str;
  50 }
  51
  52 impl UnicodeStr for str {
  53     #[inline]
  54     fn graphemes(&self, is_extended: bool) -> Graphemes {
  55         Graphemes { string: self, extended: is_extended, cat: None, catb: None }
  56     }
  57
  58     #[inline]
  59     fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
  60         GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) }
  61     }
  62
  63     #[inline]
  64     fn words(&self) -> Words {
  65         fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
  66         let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
  67
  68         fn is_whitespace(c: char) -> bool { c.is_whitespace() }
  69         let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
  70
  71         Words { inner: self.split(is_whitespace).filter(is_not_empty) }
  72     }
  73
  74     #[inline]
  75     fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
  76
  77     #[inline]
  78     fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
  79
  80     #[inline]
  81     fn width(&self, is_cjk: bool) -> uint {
  82         self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
  83     }
  84
  85     #[inline]
  86     fn trim(&self) -> &str {
  87         self.trim_left().trim_right()
  88     }
  89
  90     #[inline]
  91     fn trim_left(&self) -> &str {
  92         self.trim_left_matches(|&: c: char| c.is_whitespace())
  93     }
  94
  95     #[inline]
  96     fn trim_right(&self) -> &str {
  97         self.trim_right_matches(|&: c: char| c.is_whitespace())
  98     }
  99 }
 100
 101 /// External iterator for grapheme clusters and byte offsets.
 102 #[deriving(Clone)]
 103 pub struct GraphemeIndices<'a> {
 104     start_offset: uint,
 105     iter: Graphemes<'a>,
 106 }
 107
 108 impl<'a> Iterator for GraphemeIndices<'a> {
 109     type Item = (uint, &'a str);
 110
 111     #[inline]
 112     fn next(&mut self) -> Option<(uint, &'a str)> {
 113         self.iter.next().map(|s| (s.as_ptr() as uint - self.start_offset, s))
 114     }
 115
 116     #[inline]
 117     fn size_hint(&self) -> (uint, Option<uint>) {
 118         self.iter.size_hint()
 119     }
 120 }
 121
 122 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
 123     #[inline]
 124     fn next_back(&mut self) -> Option<(uint, &'a str)> {
 125         self.iter.next_back().map(|s| (s.as_ptr() as uint - self.start_offset, s))
 126     }
 127 }
 128
 129 /// External iterator for a string's
 130 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
 131 #[deriving(Clone)]
 132 pub struct Graphemes<'a> {
 133     string: &'a str,
 134     extended: bool,
 135     cat: Option<GraphemeCat>,
 136     catb: Option<GraphemeCat>,
 137 }
 138
 139 // state machine for cluster boundary rules
 140 #[deriving(PartialEq,Eq)]
 141 enum GraphemeState {
 142     Start,
 143     FindExtend,
 144     HangulL,
 145     HangulLV,
 146     HangulLVT,
 147     Regional,
 148 }
 149
 150 impl<'a> Iterator for Graphemes<'a> {
 151     type Item = &'a str;
 152
 153     #[inline]
 154     fn size_hint(&self) -> (uint, Option<uint>) {
 155         let slen = self.string.len();
 156         (cmp::min(slen, 1u), Some(slen))
 157     }
 158
 159     #[inline]
 160     fn next(&mut self) -> Option<&'a str> {
 161         use tables::grapheme as gr;
 162         if self.string.len() == 0 {
 163             return None;
 164         }
 165
 166         let mut take_curr = true;
 167         let mut idx = 0;
 168         let mut state = Start;
 169         let mut cat = gr::GC_Any;
 170         for (curr, ch) in self.string.char_indices() {
 171             idx = curr;
 172
 173             // retrieve cached category, if any
 174             // We do this because most of the time we would end up
 175             // looking up each character twice.
 176             cat = match self.cat {
 177                 None => gr::grapheme_category(ch),
 178                 _ => self.cat.take().unwrap()
 179             };
 180
 181             if match cat {
 182                 gr::GC_Extend => true,
 183                 gr::GC_SpacingMark if self.extended => true,
 184                 _ => false
 185             } {
 186                     state = FindExtend;     // rule GB9/GB9a
 187                     continue;
 188             }
 189
 190             state = match state {
 191                 Start if '\r' == ch => {
 192                     let slen = self.string.len();
 193                     let nidx = idx + 1;
 194                     if nidx != slen && self.string.char_at(nidx) == '\n' {
 195                         idx = nidx;             // rule GB3
 196                     }
 197                     break;                      // rule GB4
 198                 }
 199                 Start => match cat {
 200                     gr::GC_Control => break,
 201                     gr::GC_L => HangulL,
 202                     gr::GC_LV | gr::GC_V => HangulLV,
 203                     gr::GC_LVT | gr::GC_T => HangulLVT,
 204                     gr::GC_RegionalIndicator => Regional,
 205                     _ => FindExtend
 206                 },
 207                 FindExtend => {         // found non-extending when looking for extending
 208                     take_curr = false;
 209                     break;
 210                 },
 211                 HangulL => match cat {      // rule GB6: L x (L|V|LV|LVT)
 212                     gr::GC_L => continue,
 213                     gr::GC_LV | gr::GC_V => HangulLV,
 214                     gr::GC_LVT => HangulLVT,
 215                     _ => {
 216                         take_curr = false;
 217                         break;
 218                     }
 219                 },
 220                 HangulLV => match cat {     // rule GB7: (LV|V) x (V|T)
 221                     gr::GC_V => continue,
 222                     gr::GC_T => HangulLVT,
 223                     _ => {
 224                         take_curr = false;
 225                         break;
 226                     }
 227                 },
 228                 HangulLVT => match cat {    // rule GB8: (LVT|T) x T
 229                     gr::GC_T => continue,
 230                     _ => {
 231                         take_curr = false;
 232                         break;
 233                     }
 234                 },
 235                 Regional => match cat {     // rule GB8a
 236                     gr::GC_RegionalIndicator => continue,
 237                     _ => {
 238                         take_curr = false;
 239                         break;
 240                     }
 241                 }
 242             }
 243         }
 244
 245         self.cat = if take_curr {
 246             idx = self.string.char_range_at(idx).next;
 247             None
 248         } else {
 249             Some(cat)
 250         };
 251
 252         let retstr = self.string.slice_to(idx);
 253         self.string = self.string.slice_from(idx);
 254         Some(retstr)
 255     }
 256 }
 257
 258 impl<'a> DoubleEndedIterator for Graphemes<'a> {
 259     #[inline]
 260     fn next_back(&mut self) -> Option<&'a str> {
 261         use tables::grapheme as gr;
 262         if self.string.len() == 0 {
 263             return None;
 264         }
 265
 266         let mut take_curr = true;
 267         let mut idx = self.string.len();
 268         let mut previdx = idx;
 269         let mut state = Start;
 270         let mut cat = gr::GC_Any;
 271         for (curr, ch) in self.string.char_indices().rev() {
 272             previdx = idx;
 273             idx = curr;
 274
 275             // cached category, if any
 276             cat = match self.catb {
 277                 None => gr::grapheme_category(ch),
 278                 _ => self.catb.take().unwrap()
 279             };
 280
 281             // a matching state machine that runs *backwards* across an input string
 282             // note that this has some implications for the Hangul matching, since
 283             // we now need to know what the rightward letter is:
 284             //
 285             // Right to left, we have:
 286             //      L x L
 287             //      V x (L|V|LV)
 288             //      T x (V|T|LV|LVT)
 289             // HangulL means the letter to the right is L
 290             // HangulLV means the letter to the right is V
 291             // HangulLVT means the letter to the right is T
 292             state = match state {
 293                 Start if '\n' == ch => {
 294                     if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
 295                         idx -= 1;       // rule GB3
 296                     }
 297                     break;              // rule GB4
 298                 },
 299                 Start | FindExtend => match cat {
 300                     gr::GC_Extend => FindExtend,
 301                     gr::GC_SpacingMark if self.extended => FindExtend,
 302                     gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
 303                     gr::GC_V => HangulLV,
 304                     gr::GC_T => HangulLVT,
 305                     gr::GC_RegionalIndicator => Regional,
 306                     gr::GC_Control => {
 307                         take_curr = Start == state;
 308                         break;
 309                     },
 310                     _ => break
 311                 },
 312                 HangulL => match cat {      // char to right is an L
 313                     gr::GC_L => continue,               // L x L is the only legal match
 314                     _ => {
 315                         take_curr = false;
 316                         break;
 317                     }
 318                 },
 319                 HangulLV => match cat {     // char to right is a V
 320                     gr::GC_V => continue,               // V x V, right char is still V
 321                     gr::GC_L | gr::GC_LV => HangulL,    // (L|V) x V, right char is now L
 322                     _ => {
 323                         take_curr = false;
 324                         break;
 325                     }
 326                 },
 327                 HangulLVT => match cat {    // char to right is a T
 328                     gr::GC_T => continue,               // T x T, right char is still T
 329                     gr::GC_V => HangulLV,               // V x T, right char is now V
 330                     gr::GC_LV | gr::GC_LVT => HangulL,  // (LV|LVT) x T, right char is now L
 331                     _ => {
 332                         take_curr = false;
 333                         break;
 334                     }
 335                 },
 336                 Regional => match cat {     // rule GB8a
 337                     gr::GC_RegionalIndicator => continue,
 338                     _ => {
 339                         take_curr = false;
 340                         break;
 341                     }
 342                 }
 343             }
 344         }
 345
 346         self.catb = if take_curr {
 347             None
 348         } else  {
 349             idx = previdx;
 350             Some(cat)
 351         };
 352
 353         let retstr = self.string.slice_from(idx);
 354         self.string = self.string.slice_to(idx);
 355         Some(retstr)
 356     }
 357 }
 358
 359 // https://tools.ietf.org/html/rfc3629
 360 static UTF8_CHAR_WIDTH: [u8; 256] = [
 361 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 362 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
 363 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 364 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
 365 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 366 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
 367 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
 368 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
 369 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 370 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
 371 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 372 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
 373 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 374 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
 375 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
 376 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
 377 ];
 378
 379 /// Given a first byte, determine how many bytes are in this UTF-8 character
 380 #[inline]
 381 pub fn utf8_char_width(b: u8) -> uint {
 382     return UTF8_CHAR_WIDTH[b as uint] as uint;
 383 }
 384
 385 /// Determines if a vector of `u16` contains valid UTF-16
 386 pub fn is_utf16(v: &[u16]) -> bool {
 387     let mut it = v.iter();
 388     macro_rules! next { ($ret:expr) => {
 389             match it.next() { Some(u) => *u, None => return $ret }
 390         }
 391     }
 392     loop {
 393         let u = next!(true);
 394
 395         match char::from_u32(u as u32) {
 396             Some(_) => {}
 397             None => {
 398                 let u2 = next!(false);
 399                 if u < 0xD7FF || u > 0xDBFF ||
 400                     u2 < 0xDC00 || u2 > 0xDFFF { return false; }
 401             }
 402         }
 403     }
 404 }
 405
 406 /// An iterator that decodes UTF-16 encoded codepoints from a vector
 407 /// of `u16`s.
 408 #[deriving(Clone)]
 409 pub struct Utf16Items<'a> {
 410     iter: slice::Iter<'a, u16>
 411 }
 412 /// The possibilities for values decoded from a `u16` stream.
 413 #[deriving(PartialEq, Eq, Clone, Show)]
 414 pub enum Utf16Item {
 415     /// A valid codepoint.
 416     ScalarValue(char),
 417     /// An invalid surrogate without its pair.
 418     LoneSurrogate(u16)
 419 }
 420
 421 impl Copy for Utf16Item {}
 422
 423 impl Utf16Item {
 424     /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
 425     /// replacement character (U+FFFD).
 426     #[inline]
 427     pub fn to_char_lossy(&self) -> char {
 428         match *self {
 429             Utf16Item::ScalarValue(c) => c,
 430             Utf16Item::LoneSurrogate(_) => '\u{FFFD}'
 431         }
 432     }
 433 }
 434
 435 impl<'a> Iterator for Utf16Items<'a> {
 436     type Item = Utf16Item;
 437
 438     fn next(&mut self) -> Option<Utf16Item> {
 439         let u = match self.iter.next() {
 440             Some(u) => *u,
 441             None => return None
 442         };
 443
 444         if u < 0xD800 || 0xDFFF < u {
 445             // not a surrogate
 446             Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
 447         } else if u >= 0xDC00 {
 448             // a trailing surrogate
 449             Some(Utf16Item::LoneSurrogate(u))
 450         } else {
 451             // preserve state for rewinding.
 452             let old = self.iter;
 453
 454             let u2 = match self.iter.next() {
 455                 Some(u2) => *u2,
 456                 // eof
 457                 None => return Some(Utf16Item::LoneSurrogate(u))
 458             };
 459             if u2 < 0xDC00 || u2 > 0xDFFF {
 460                 // not a trailing surrogate so we're not a valid
 461                 // surrogate pair, so rewind to redecode u2 next time.
 462                 self.iter = old;
 463                 return Some(Utf16Item::LoneSurrogate(u))
 464             }
 465
 466             // all ok, so lets decode it.
 467             let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
 468             Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
 469         }
 470     }
 471
 472     #[inline]
 473     fn size_hint(&self) -> (uint, Option<uint>) {
 474         let (low, high) = self.iter.size_hint();
 475         // we could be entirely valid surrogates (2 elements per
 476         // char), or entirely non-surrogates (1 element per char)
 477         (low / 2, high)
 478     }
 479 }
 480
 481 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
 482 /// returning invalid surrogates as `LoneSurrogate`s.
 483 ///
 484 /// # Example
 485 ///
 486 /// ```rust
 487 /// use unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
 488 ///
 489 /// // 𝄞mus<invalid>ic<invalid>
 490 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 491 ///          0x0073, 0xDD1E, 0x0069, 0x0063,
 492 ///          0xD834];
 493 ///
 494 /// assert_eq!(unicode::str::utf16_items(&v).collect::<Vec<_>>(),
 495 ///            vec![ScalarValue('𝄞'),
 496 ///                 ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
 497 ///                 LoneSurrogate(0xDD1E),
 498 ///                 ScalarValue('i'), ScalarValue('c'),
 499 ///                 LoneSurrogate(0xD834)]);
 500 /// ```
 501 pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
 502     Utf16Items { iter : v.iter() }
 503 }
 504
 505 /// Iterator adaptor for encoding `char`s to UTF-16.
 506 #[deriving(Clone)]
 507 pub struct Utf16Encoder<I> {
 508     chars: I,
 509     extra: u16
 510 }
 511
 512 impl<I> Utf16Encoder<I> {
 513     /// Create an UTF-16 encoder from any `char` iterator.
 514     pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<Item=char> {
 515         Utf16Encoder { chars: chars, extra: 0 }
 516     }
 517 }
 518
 519 impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
 520     type Item = u16;
 521
 522     #[inline]
 523     fn next(&mut self) -> Option<u16> {
 524         if self.extra != 0 {
 525             let tmp = self.extra;
 526             self.extra = 0;
 527             return Some(tmp);
 528         }
 529
 530         let mut buf = [0u16; 2];
 531         self.chars.next().map(|ch| {
 532             let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
 533             if n == 2 { self.extra = buf[1]; }
 534             buf[0]
 535         })
 536     }
 537
 538     #[inline]
 539     fn size_hint(&self) -> (uint, Option<uint>) {
 540         let (low, high) = self.chars.size_hint();
 541         // every char gets either one u16 or two u16,
 542         // so this iterator is between 1 or 2 times as
 543         // long as the underlying iterator.
 544         (low, high.and_then(|n| n.checked_mul(2)))
 545     }
 546 }
 547
 548 impl<'a> Iterator for Words<'a> {
 549     type Item = &'a str;
 550
 551     fn next(&mut self) -> Option<&'a str> { self.inner.next() }
 552 }
 553 impl<'a> DoubleEndedIterator for Words<'a> {
 554     fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }
 555 }