1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Unicode-intensive string manipulations.
15 //! This module provides functionality to `str` that requires the Unicode methods provided by the
16 //! UnicodeChar trait.
18 use self::GraphemeState::*;
20 use core::slice::SlicePrelude;
21 use core::iter::{Filter, AdditiveIterator, Iterator, IteratorExt};
22 use core::iter::{DoubleEndedIterator, DoubleEndedIteratorExt};
23 use core::kinds::Sized;
24 use core::option::{Option, None, Some};
25 use core::str::{CharSplits, StrPrelude};
26 use u_char::UnicodeChar;
27 use tables::grapheme::GraphemeCat;
29 /// An iterator over the words of a string, separated by any sequence of whitespace.
30 // FIXME: This should be opaque (the alias currently spells out its adaptor types).
32 Filter<'a, &'a str, CharSplits<'a, |char|:'a -> bool>>; // a `Filter` over `CharSplits` driven by a `char -> bool` closure
34 /// Methods for Unicode string slices.
35 pub trait UnicodeStrPrelude for Sized? { // `Sized?` lets the trait be implemented for the unsized type `str`
36 /// Returns an iterator over the string's
37 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
40 /// If `is_extended` is true, the iterator is over the *extended grapheme clusters*;
41 /// otherwise, the iterator is over the *legacy grapheme clusters*.
42 /// [UAX#29](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries)
43 /// recommends extended grapheme cluster boundaries for general processing.
48 /// let gr1 = "a\u0310e\u0301o\u0308\u0332".graphemes(true).collect::<Vec<&str>>();
49 /// let b: &[_] = &["a\u0310", "e\u0301", "o\u0308\u0332"];
50 /// assert_eq!(gr1.as_slice(), b);
51 /// let gr2 = "a\r\nb🇷🇺🇸🇹".graphemes(true).collect::<Vec<&str>>();
52 /// let b: &[_] = &["a", "\r\n", "b", "🇷🇺🇸🇹"];
53 /// assert_eq!(gr2.as_slice(), b);
55 fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
57 /// Returns an iterator over the grapheme clusters of `self` and their byte offsets.
58 /// See the `graphemes()` method for more information.
63 /// let gr_inds = "a̐éö̲\r\n".grapheme_indices(true).collect::<Vec<(uint, &str)>>();
64 /// let b: &[_] = &[(0u, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")];
65 /// assert_eq!(gr_inds.as_slice(), b);
67 fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
69 /// An iterator over the words of a string (subsequences separated
70 /// by any sequence of whitespace). Sequences of whitespace are
71 /// collapsed, so empty "words" are not included.
76 /// let some_words = " Mary had\ta little \n\t lamb";
77 /// let v: Vec<&str> = some_words.words().collect();
78 /// assert_eq!(v, vec!["Mary", "had", "a", "little", "lamb"]);
80 fn words<'a>(&'a self) -> Words<'a>;
82 /// Returns true if the string contains only whitespace (an empty string also returns true).
84 /// Whitespace characters are determined by `char::is_whitespace`.
89 /// assert!(" \t\n".is_whitespace());
90 /// assert!("".is_whitespace());
92 /// assert!( !"abc".is_whitespace());
94 fn is_whitespace(&self) -> bool;
96 /// Returns true if the string contains only alphanumeric code points (an empty string also returns true).
99 /// Alphanumeric characters are determined by `char::is_alphanumeric`.
104 /// assert!("Löwe老虎Léopard123".is_alphanumeric());
105 /// assert!("".is_alphanumeric());
107 /// assert!( !" &*~".is_alphanumeric());
109 fn is_alphanumeric(&self) -> bool;
111 /// Returns a string's displayed width in columns, treating control
112 /// characters as zero-width.
114 /// `is_cjk` determines behavior for characters in the Ambiguous category:
115 /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
116 /// In CJK locales, `is_cjk` should be `true`, else it should be `false`.
117 /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
118 /// recommends that these characters be treated as 1 column (i.e.,
119 /// `is_cjk` = `false`) if the locale is unknown.
120 fn width(&self, is_cjk: bool) -> uint;
122 /// Returns a string slice with leading and trailing whitespace removed.
123 fn trim<'a>(&'a self) -> &'a str;
125 /// Returns a string slice with leading whitespace removed.
126 fn trim_left<'a>(&'a self) -> &'a str;
128 /// Returns a string slice with trailing whitespace removed.
129 fn trim_right<'a>(&'a self) -> &'a str;
132 impl UnicodeStrPrelude for str { // provides the Unicode-aware methods directly on string slices
134 fn graphemes(&self, is_extended: bool) -> Graphemes {
135 Graphemes { string: self, extended: is_extended, cat: None, catb: None } // both category caches start out empty
139 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
140 GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) } // record the base address; offsets are later derived from each cluster's pointer
144 fn words(&self) -> Words {
145 let f = |c: char| c.is_whitespace(); // every whitespace char is a separator
146 self.split(f).filter(|s| !s.is_empty()) // drop the empty pieces produced between adjacent separators
150 fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) } // `all` over no chars is true, so "" qualifies
153 fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) } // likewise true for ""
156 fn width(&self, is_cjk: bool) -> uint {
157 self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum() // a char whose width is `None` contributes 0 columns
161 fn trim(&self) -> &str {
162 self.trim_left().trim_right() // strip the front first, then the back of what remains
166 fn trim_left(&self) -> &str {
167 self.trim_left_chars(|c: char| c.is_whitespace()) // same whitespace predicate as `words`
171 fn trim_right(&self) -> &str {
172 self.trim_right_chars(|c: char| c.is_whitespace())
176 /// External iterator for grapheme clusters and their byte offsets.
178 pub struct GraphemeIndices<'a> { // built by `grapheme_indices` from a `start_offset` (base address) and an inner `iter`
183 impl<'a> Iterator<(uint, &'a str)> for GraphemeIndices<'a> { // yields `(byte offset, cluster)` pairs from the front
185 fn next(&mut self) -> Option<(uint, &'a str)> {
186 self.iter.next().map(|s| (s.as_ptr() as uint - self.start_offset, s)) // offset = cluster address minus the recorded base address
190 fn size_hint(&self) -> (uint, Option<uint>) {
191 self.iter.size_hint() // exactly one pair per inner cluster, so the inner hint is passed through
195 impl<'a> DoubleEndedIterator<(uint, &'a str)> for GraphemeIndices<'a> { // yields `(byte offset, cluster)` pairs from the back
197 fn next_back(&mut self) -> Option<(uint, &'a str)> {
198 self.iter.next_back().map(|s| (s.as_ptr() as uint - self.start_offset, s)) // same pointer-difference offset as `next`
202 /// External iterator for a string's
203 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
205 pub struct Graphemes<'a> {
208 cat: Option<GraphemeCat>, // cached category, taken by `next` before it falls back to a table lookup
209 catb: Option<GraphemeCat>, // the equivalent cache used by `next_back`
212 // State machine for the cluster boundary rules; the iterators below use the states
// Start, FindExtend, HangulL, HangulLV, HangulLVT and Regional.
213 #[deriving(PartialEq,Eq)]
223 impl<'a> Iterator<&'a str> for Graphemes<'a> { // forward iteration: each call splits one cluster off the front of `string`
225 fn size_hint(&self) -> (uint, Option<uint>) {
226 let slen = self.string.len();
227 (cmp::min(slen, 1u), Some(slen)) // lower bound: 1 if any bytes remain, else 0; upper bound: one cluster per byte. NOTE(review): no `cmp` import is visible among this file's `use` lines -- confirm it is in scope
231 fn next(&mut self) -> Option<&'a str> {
232 use tables::grapheme as gr;
233 if self.string.len() == 0 { // nothing left to scan
237 let mut take_curr = true; // when still true after the scan, the char at `idx` is included in the cluster (see the `take_curr` branch below)
239 let mut state = Start; // every scan begins in `Start`
240 let mut cat = gr::GC_Any;
241 for (curr, ch) in self.string.char_indices() {
244 // retrieve cached category, if any
245 // We do this because most of the time we would end up
246 // looking up each character twice.
247 cat = match self.cat {
248 None => gr::grapheme_category(ch),
249 _ => self.cat.take().unwrap() // `take` empties the cache so the stored value is used only once
253 gr::GC_Extend => true,
254 gr::GC_SpacingMark if self.extended => true, // spacing marks count only when extended clusters were requested
257 state = FindExtend; // rule GB9/GB9a
261 state = match state {
262 Start if '\r' == ch => {
263 let slen = self.string.len();
265 if nidx != slen && self.string.char_at(nidx) == '\n' { // CR immediately followed by LF
266 idx = nidx; // rule GB3
271 gr::GC_Control => break, // stop scanning at a control character
273 gr::GC_LV | gr::GC_V => HangulLV,
274 gr::GC_LVT | gr::GC_T => HangulLVT,
275 gr::GC_RegionalIndicator => Regional,
278 FindExtend => { // found non-extending when looking for extending
282 HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
283 gr::GC_L => continue, // `continue` skips the assignment, so the state stays `HangulL`
284 gr::GC_LV | gr::GC_V => HangulLV,
285 gr::GC_LVT => HangulLVT,
291 HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
292 gr::GC_V => continue,
293 gr::GC_T => HangulLVT,
299 HangulLVT => match cat { // rule GB8: (LVT|T) x T
300 gr::GC_T => continue,
306 Regional => match cat { // rule GB8a
307 gr::GC_RegionalIndicator => continue,
316 self.cat = if take_curr {
317 idx = self.string.char_range_at(idx).next; // advance past the char at `idx` so it is part of this cluster
323 let retstr = self.string.slice_to(idx); // the cluster is the prefix ending at `idx`
324 self.string = self.string.slice_from(idx); // keep only the unconsumed remainder for the next call
329 impl<'a> DoubleEndedIterator<&'a str> for Graphemes<'a> { // backward iteration: each call splits one cluster off the back of `string`
331 fn next_back(&mut self) -> Option<&'a str> {
332 use tables::grapheme as gr;
333 if self.string.len() == 0 { // nothing left to scan
337 let mut take_curr = true;
338 let mut idx = self.string.len(); // start at the end of the remaining string
339 let mut previdx = idx;
340 let mut state = Start; // every scan begins in `Start`
341 let mut cat = gr::GC_Any;
342 for (curr, ch) in self.string.char_indices().rev() { // walk the chars from last to first
346 // cached category, if any
347 cat = match self.catb {
348 None => gr::grapheme_category(ch),
349 _ => self.catb.take().unwrap() // `take` empties the cache so the stored value is used only once
352 // a matching state machine that runs *backwards* across an input string
353 // note that this has some implications for the Hangul matching, since
354 // we now need to know what the rightward letter is:
356 // Right to left, we have:
360 // HangulL means the letter to the right is L
361 // HangulLV means the letter to the right is V
362 // HangulLVT means the letter to the right is T
363 state = match state {
364 Start if '\n' == ch => {
365 if idx > 0 && '\r' == self.string.char_at_reverse(idx) { // LF directly preceded by CR: keep the pair together
366 idx -= 1; // rule GB3
370 Start | FindExtend => match cat {
371 gr::GC_Extend => FindExtend,
372 gr::GC_SpacingMark if self.extended => FindExtend, // spacing marks count only when extended clusters were requested
373 gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
374 gr::GC_V => HangulLV,
375 gr::GC_T => HangulLVT,
376 gr::GC_RegionalIndicator => Regional,
378 take_curr = Start == state; // true only if no extending char had been seen yet (still in `Start`)
383 HangulL => match cat { // char to right is an L
384 gr::GC_L => continue, // L x L is the only legal match
390 HangulLV => match cat { // char to right is a V
391 gr::GC_V => continue, // V x V, right char is still V
392 gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
398 HangulLVT => match cat { // char to right is a T
399 gr::GC_T => continue, // T x T, right char is still T
400 gr::GC_V => HangulLV, // V x T, right char is now V
401 gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
407 Regional => match cat { // rule GB8a
408 gr::GC_RegionalIndicator => continue,
417 self.catb = if take_curr {
424 let retstr = self.string.slice_from(idx); // the cluster is the suffix starting at `idx`
425 self.string = self.string.slice_to(idx); // keep only the unconsumed front part for later calls