1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
//! Unicode-intensive string manipulations.
//!
//! This module provides functionality to `str` that requires the Unicode
//! methods provided by the `UnicodeChar` trait.
20 use self::GraphemeState::*;
22 use core::slice::SlicePrelude;
23 use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
24 use core::iter::{DoubleEndedIterator, DoubleEndedIteratorExt};
25 use core::kinds::Sized;
26 use core::option::{Option, None, Some};
27 use core::str::{CharSplits, StrPrelude};
28 use u_char::UnicodeChar;
29 use tables::grapheme::GraphemeCat;
31 /// An iterator over the words of a string, separated by a sequence of whitespace
32 /// FIXME: This should be opaque
34 Filter<'a, &'a str, CharSplits<'a, |char|:'a -> bool>>;
36 /// Methods for Unicode string slices
37 pub trait UnicodeStrPrelude for Sized? {
38 /// Returns an iterator over the
39 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
42 /// If `is_extended` is true, the iterator is over the *extended grapheme clusters*;
43 /// otherwise, the iterator is over the *legacy grapheme clusters*.
44 /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
45 /// recommends extended grapheme cluster boundaries for general processing.
50 /// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::<Vec<&str>>();
51 /// let b: &[_] = &["a\u0310", "e\u0301", "o\u0308\u0332"];
52 /// assert_eq!(gr1.as_slice(), b);
53 /// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::<Vec<&str>>();
54 /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺🇸🇹"];
55 /// assert_eq!(gr2.as_slice(), b);
57 fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
59 /// Returns an iterator over the grapheme clusters of self and their byte offsets.
60 /// See `graphemes()` method for more information.
65 /// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::<Vec<(uint, &str)>>();
66 /// let b: &[_] = &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
67 /// assert_eq!(gr_inds.as_slice(), b);
69 fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
71 /// An iterator over the words of a string (subsequences separated
72 /// by any sequence of whitespace). Sequences of whitespace are
73 /// collapsed, so empty "words" are not included.
78 /// let some_words = " Mary had\ta little \n\t lamb";
79 /// let v: Vec<&str> = some_words.words().collect();
80 /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
82 fn words<'a>(&'a self) -> Words<'a>;
84 /// Returns true if the string contains only whitespace.
86 /// Whitespace characters are determined by `char::is_whitespace`.
91 /// assert!(" \t\n".is_whitespace());
92 /// assert!("".is_whitespace());
94 /// assert!( !"abc".is_whitespace());
96 fn is_whitespace(&self) -> bool;
98 /// Returns true if the string contains only alphanumeric code
101 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
106 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
107 /// assert!("".is_alphanumeric());
109 /// assert!( !" &*~".is_alphanumeric());
111 fn is_alphanumeric(&self) -> bool;
113 /// Returns a string's displayed width in columns, treating control
114 /// characters as zero-width.
116 /// `is_cjk` determines behavior for characters in the Ambiguous category:
117 /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
118 /// In CJK locales, `is_cjk` should be `true`, else it should be `false`.
119 /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
120 /// recommends that these characters be treated as 1 column (i.e.,
121 /// `is_cjk` = `false`) if the locale is unknown.
122 fn width(&self, is_cjk: bool) -> uint;
124 /// Returns a string with leading and trailing whitespace removed.
125 fn trim<'a>(&'a self) -> &'a str;
127 /// Returns a string with leading whitespace removed.
128 fn trim_left<'a>(&'a self) -> &'a str;
130 /// Returns a string with trailing whitespace removed.
131 fn trim_right<'a>(&'a self) -> &'a str;
134 impl UnicodeStrPrelude for str {
136 fn graphemes(&self, is_extended: bool) -> Graphemes {
137 Graphemes { string: self, extended: is_extended, cat: None, catb: None }
141 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
142 GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) }
146 fn words(&self) -> Words {
147 let f = |c: char| c.is_whitespace();
148 self.split(f).filter(|s| !s.is_empty())
152 fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
155 fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
158 fn width(&self, is_cjk: bool) -> uint {
159 self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
163 fn trim(&self) -> &str {
164 self.trim_left().trim_right()
168 fn trim_left(&self) -> &str {
169 self.trim_left_chars(|c: char| c.is_whitespace())
173 fn trim_right(&self) -> &str {
174 self.trim_right_chars(|c: char| c.is_whitespace())
178 /// External iterator for grapheme clusters and byte offsets.
180 pub struct GraphemeIndices<'a> {
185 impl<'a> Iterator<(uint, &'a str)> for GraphemeIndices<'a> {
187 fn next(&mut self) -> Option<(uint, &'a str)> {
188 self.iter.next().map(|s| (s.as_ptr() as uint - self.start_offset, s))
192 fn size_hint(&self) -> (uint, Option<uint>) {
193 self.iter.size_hint()
197 impl<'a> DoubleEndedIterator<(uint, &'a str)> for GraphemeIndices<'a> {
199 fn next_back(&mut self) -> Option<(uint, &'a str)> {
200 self.iter.next_back().map(|s| (s.as_ptr() as uint - self.start_offset, s))
204 /// External iterator for a string's
205 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
207 pub struct Graphemes<'a> {
210 cat: Option<GraphemeCat>,
211 catb: Option<GraphemeCat>,
214 // state machine for cluster boundary rules
215 #[deriving(PartialEq,Eq)]
225 impl<'a> Iterator<&'a str> for Graphemes<'a> {
227 fn size_hint(&self) -> (uint, Option<uint>) {
228 let slen = self.string.len();
229 (cmp::min(slen, 1u), Some(slen))
233 fn next(&mut self) -> Option<&'a str> {
234 use tables::grapheme as gr;
235 if self.string.len() == 0 {
239 let mut take_curr = true;
241 let mut state = Start;
242 let mut cat = gr::GC_Any;
243 for (curr, ch) in self.string.char_indices() {
246 // retrieve cached category, if any
247 // We do this because most of the time we would end up
248 // looking up each character twice.
249 cat = match self.cat {
250 None => gr::grapheme_category(ch),
251 _ => self.cat.take().unwrap()
255 gr::GC_Extend => true,
256 gr::GC_SpacingMark if self.extended => true,
259 state = FindExtend; // rule GB9/GB9a
263 state = match state {
264 Start if '\r' == ch => {
265 let slen = self.string.len();
267 if nidx != slen && self.string.char_at(nidx) == '\n' {
268 idx = nidx; // rule GB3
273 gr::GC_Control => break,
275 gr::GC_LV | gr::GC_V => HangulLV,
276 gr::GC_LVT | gr::GC_T => HangulLVT,
277 gr::GC_RegionalIndicator => Regional,
280 FindExtend => { // found non-extending when looking for extending
284 HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
285 gr::GC_L => continue,
286 gr::GC_LV | gr::GC_V => HangulLV,
287 gr::GC_LVT => HangulLVT,
293 HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
294 gr::GC_V => continue,
295 gr::GC_T => HangulLVT,
301 HangulLVT => match cat { // rule GB8: (LVT|T) x T
302 gr::GC_T => continue,
308 Regional => match cat { // rule GB8a
309 gr::GC_RegionalIndicator => continue,
318 self.cat = if take_curr {
319 idx = self.string.char_range_at(idx).next;
325 let retstr = self.string.slice_to(idx);
326 self.string = self.string.slice_from(idx);
331 impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> {
333 fn next_back(&mut self) -> Option<&'a str> {
334 use tables::grapheme as gr;
335 if self.string.len() == 0 {
339 let mut take_curr = true;
340 let mut idx = self.string.len();
341 let mut previdx = idx;
342 let mut state = Start;
343 let mut cat = gr::GC_Any;
344 for (curr, ch) in self.string.char_indices().rev() {
348 // cached category, if any
349 cat = match self.catb {
350 None => gr::grapheme_category(ch),
351 _ => self.catb.take().unwrap()
354 // a matching state machine that runs *backwards* across an input string
355 // note that this has some implications for the Hangul matching, since
356 // we now need to know what the rightward letter is:
358 // Right to left, we have:
362 // HangulL means the letter to the right is L
363 // HangulLV means the letter to the right is V
364 // HangulLVT means the letter to the right is T
365 state = match state {
366 Start if '\n' == ch => {
367 if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
368 idx -= 1; // rule GB3
372 Start | FindExtend => match cat {
373 gr::GC_Extend => FindExtend,
374 gr::GC_SpacingMark if self.extended => FindExtend,
375 gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
376 gr::GC_V => HangulLV,
377 gr::GC_T => HangulLVT,
378 gr::GC_RegionalIndicator => Regional,
380 take_curr = Start == state;
385 HangulL => match cat { // char to right is an L
386 gr::GC_L => continue, // L x L is the only legal match
392 HangulLV => match cat { // char to right is a V
393 gr::GC_V => continue, // V x V, right char is still V
394 gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
400 HangulLVT => match cat { // char to right is a T
401 gr::GC_T => continue, // T x T, right char is still T
402 gr::GC_V => HangulLV, // V x T, right char is now V
403 gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
409 Regional => match cat { // rule GB8a
410 gr::GC_RegionalIndicator => continue,
419 self.catb = if take_curr {
426 let retstr = self.string.slice_from(idx);
427 self.string = self.string.slice_to(idx);