1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Unicode-intensive string manipulations.
15 //! This module provides functionality to `str` that requires the Unicode methods provided by the
16 //! UnicodeChar trait.
18 use self::GraphemeState::*;
23 use core::iter::{Filter, AdditiveIterator};
29 use u_char::UnicodeChar;
30 use tables::grapheme::GraphemeCat;
32 /// An iterator over the words of a string, separated by a sequence of whitespace
34 pub struct Words<'a> {
35 inner: Filter<&'a str, Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
38 /// Methods for Unicode string slices
39 #[allow(missing_docs)] // docs in libcollections
40 pub trait UnicodeStr for Sized? {
41 fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
42 fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
43 fn words<'a>(&'a self) -> Words<'a>;
44 fn is_whitespace(&self) -> bool;
45 fn is_alphanumeric(&self) -> bool;
46 fn width(&self, is_cjk: bool) -> uint;
47 fn trim<'a>(&'a self) -> &'a str;
48 fn trim_left<'a>(&'a self) -> &'a str;
49 fn trim_right<'a>(&'a self) -> &'a str;
52 impl UnicodeStr for str {
54 fn graphemes(&self, is_extended: bool) -> Graphemes {
55 Graphemes { string: self, extended: is_extended, cat: None, catb: None }
59 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
60 GraphemeIndices { start_offset: self.as_ptr() as uint, iter: self.graphemes(is_extended) }
64 fn words(&self) -> Words {
65 fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
66 let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
68 fn is_whitespace(c: char) -> bool { c.is_whitespace() }
69 let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
71 Words { inner: self.split(is_whitespace).filter(is_not_empty) }
75 fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
78 fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
81 fn width(&self, is_cjk: bool) -> uint {
82 self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
86 fn trim(&self) -> &str {
87 self.trim_left().trim_right()
91 fn trim_left(&self) -> &str {
92 self.trim_left_matches(|&: c: char| c.is_whitespace())
96 fn trim_right(&self) -> &str {
97 self.trim_right_matches(|&: c: char| c.is_whitespace())
101 /// External iterator for grapheme clusters and byte offsets.
103 pub struct GraphemeIndices<'a> {
108 impl<'a> Iterator for GraphemeIndices<'a> {
109 type Item = (uint, &'a str);
112 fn next(&mut self) -> Option<(uint, &'a str)> {
113 self.iter.next().map(|s| (s.as_ptr() as uint - self.start_offset, s))
117 fn size_hint(&self) -> (uint, Option<uint>) {
118 self.iter.size_hint()
122 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
124 fn next_back(&mut self) -> Option<(uint, &'a str)> {
125 self.iter.next_back().map(|s| (s.as_ptr() as uint - self.start_offset, s))
129 /// External iterator for a string's
130 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
132 pub struct Graphemes<'a> {
135 cat: Option<GraphemeCat>,
136 catb: Option<GraphemeCat>,
139 // state machine for cluster boundary rules
140 #[deriving(PartialEq,Eq)]
150 impl<'a> Iterator for Graphemes<'a> {
154 fn size_hint(&self) -> (uint, Option<uint>) {
155 let slen = self.string.len();
156 (cmp::min(slen, 1u), Some(slen))
160 fn next(&mut self) -> Option<&'a str> {
161 use tables::grapheme as gr;
162 if self.string.len() == 0 {
166 let mut take_curr = true;
168 let mut state = Start;
169 let mut cat = gr::GC_Any;
170 for (curr, ch) in self.string.char_indices() {
173 // retrieve cached category, if any
174 // We do this because most of the time we would end up
175 // looking up each character twice.
176 cat = match self.cat {
177 None => gr::grapheme_category(ch),
178 _ => self.cat.take().unwrap()
182 gr::GC_Extend => true,
183 gr::GC_SpacingMark if self.extended => true,
186 state = FindExtend; // rule GB9/GB9a
190 state = match state {
191 Start if '\r' == ch => {
192 let slen = self.string.len();
194 if nidx != slen && self.string.char_at(nidx) == '\n' {
195 idx = nidx; // rule GB3
200 gr::GC_Control => break,
202 gr::GC_LV | gr::GC_V => HangulLV,
203 gr::GC_LVT | gr::GC_T => HangulLVT,
204 gr::GC_RegionalIndicator => Regional,
207 FindExtend => { // found non-extending when looking for extending
211 HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
212 gr::GC_L => continue,
213 gr::GC_LV | gr::GC_V => HangulLV,
214 gr::GC_LVT => HangulLVT,
220 HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
221 gr::GC_V => continue,
222 gr::GC_T => HangulLVT,
228 HangulLVT => match cat { // rule GB8: (LVT|T) x T
229 gr::GC_T => continue,
235 Regional => match cat { // rule GB8a
236 gr::GC_RegionalIndicator => continue,
245 self.cat = if take_curr {
246 idx = self.string.char_range_at(idx).next;
252 let retstr = self.string.slice_to(idx);
253 self.string = self.string.slice_from(idx);
258 impl<'a> DoubleEndedIterator for Graphemes<'a> {
260 fn next_back(&mut self) -> Option<&'a str> {
261 use tables::grapheme as gr;
262 if self.string.len() == 0 {
266 let mut take_curr = true;
267 let mut idx = self.string.len();
268 let mut previdx = idx;
269 let mut state = Start;
270 let mut cat = gr::GC_Any;
271 for (curr, ch) in self.string.char_indices().rev() {
275 // cached category, if any
276 cat = match self.catb {
277 None => gr::grapheme_category(ch),
278 _ => self.catb.take().unwrap()
281 // a matching state machine that runs *backwards* across an input string
282 // note that this has some implications for the Hangul matching, since
283 // we now need to know what the rightward letter is:
285 // Right to left, we have:
289 // HangulL means the letter to the right is L
290 // HangulLV means the letter to the right is V
291 // HangulLVT means the letter to the right is T
292 state = match state {
293 Start if '\n' == ch => {
294 if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
295 idx -= 1; // rule GB3
299 Start | FindExtend => match cat {
300 gr::GC_Extend => FindExtend,
301 gr::GC_SpacingMark if self.extended => FindExtend,
302 gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
303 gr::GC_V => HangulLV,
304 gr::GC_T => HangulLVT,
305 gr::GC_RegionalIndicator => Regional,
307 take_curr = Start == state;
312 HangulL => match cat { // char to right is an L
313 gr::GC_L => continue, // L x L is the only legal match
319 HangulLV => match cat { // char to right is a V
320 gr::GC_V => continue, // V x V, right char is still V
321 gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
327 HangulLVT => match cat { // char to right is a T
328 gr::GC_T => continue, // T x T, right char is still T
329 gr::GC_V => HangulLV, // V x T, right char is now V
330 gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
336 Regional => match cat { // rule GB8a
337 gr::GC_RegionalIndicator => continue,
346 self.catb = if take_curr {
353 let retstr = self.string.slice_from(idx);
354 self.string = self.string.slice_to(idx);
359 // https://tools.ietf.org/html/rfc3629
360 static UTF8_CHAR_WIDTH: [u8; 256] = [
361 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
362 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
363 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
364 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
365 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
366 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
367 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
368 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
369 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
370 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
371 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
372 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
373 0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
374 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
375 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
376 4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
379 /// Given a first byte, determine how many bytes are in this UTF-8 character
381 pub fn utf8_char_width(b: u8) -> uint {
382 return UTF8_CHAR_WIDTH[b as uint] as uint;
385 /// Determines if a vector of `u16` contains valid UTF-16
386 pub fn is_utf16(v: &[u16]) -> bool {
387 let mut it = v.iter();
388 macro_rules! next { ($ret:expr) => {
389 match it.next() { Some(u) => *u, None => return $ret }
395 match char::from_u32(u as u32) {
398 let u2 = next!(false);
399 if u < 0xD7FF || u > 0xDBFF ||
400 u2 < 0xDC00 || u2 > 0xDFFF { return false; }
406 /// An iterator that decodes UTF-16 encoded codepoints from a vector
409 pub struct Utf16Items<'a> {
410 iter: slice::Iter<'a, u16>
412 /// The possibilities for values decoded from a `u16` stream.
413 #[deriving(PartialEq, Eq, Clone, Show)]
415 /// A valid codepoint.
417 /// An invalid surrogate without its pair.
421 impl Copy for Utf16Item {}
424 /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
425 /// replacement character (U+FFFD).
427 pub fn to_char_lossy(&self) -> char {
429 Utf16Item::ScalarValue(c) => c,
430 Utf16Item::LoneSurrogate(_) => '\u{FFFD}'
435 impl<'a> Iterator for Utf16Items<'a> {
436 type Item = Utf16Item;
438 fn next(&mut self) -> Option<Utf16Item> {
439 let u = match self.iter.next() {
444 if u < 0xD800 || 0xDFFF < u {
446 Some(Utf16Item::ScalarValue(unsafe {mem::transmute(u as u32)}))
447 } else if u >= 0xDC00 {
448 // a trailing surrogate
449 Some(Utf16Item::LoneSurrogate(u))
451 // preserve state for rewinding.
454 let u2 = match self.iter.next() {
457 None => return Some(Utf16Item::LoneSurrogate(u))
459 if u2 < 0xDC00 || u2 > 0xDFFF {
460 // not a trailing surrogate so we're not a valid
461 // surrogate pair, so rewind to redecode u2 next time.
463 return Some(Utf16Item::LoneSurrogate(u))
466 // all ok, so lets decode it.
467 let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
468 Some(Utf16Item::ScalarValue(unsafe {mem::transmute(c)}))
473 fn size_hint(&self) -> (uint, Option<uint>) {
474 let (low, high) = self.iter.size_hint();
475 // we could be entirely valid surrogates (2 elements per
476 // char), or entirely non-surrogates (1 element per char)
481 /// Create an iterator over the UTF-16 encoded codepoints in `v`,
482 /// returning invalid surrogates as `LoneSurrogate`s.
487 /// use unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
489 /// // 𝄞mus<invalid>ic<invalid>
490 /// let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
491 /// 0x0073, 0xDD1E, 0x0069, 0x0063,
494 /// assert_eq!(unicode::str::utf16_items(&v).collect::<Vec<_>>(),
495 /// vec![ScalarValue('𝄞'),
496 /// ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
497 /// LoneSurrogate(0xDD1E),
498 /// ScalarValue('i'), ScalarValue('c'),
499 /// LoneSurrogate(0xD834)]);
501 pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
502 Utf16Items { iter : v.iter() }
505 /// Iterator adaptor for encoding `char`s to UTF-16.
507 pub struct Utf16Encoder<I> {
512 impl<I> Utf16Encoder<I> {
513 /// Create an UTF-16 encoder from any `char` iterator.
514 pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<Item=char> {
515 Utf16Encoder { chars: chars, extra: 0 }
519 impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
523 fn next(&mut self) -> Option<u16> {
525 let tmp = self.extra;
530 let mut buf = [0u16; 2];
531 self.chars.next().map(|ch| {
532 let n = ch.encode_utf16(buf.as_mut_slice()).unwrap_or(0);
533 if n == 2 { self.extra = buf[1]; }
539 fn size_hint(&self) -> (uint, Option<uint>) {
540 let (low, high) = self.chars.size_hint();
541 // every char gets either one u16 or two u16,
542 // so this iterator is between 1 or 2 times as
543 // long as the underlying iterator.
544 (low, high.and_then(|n| n.checked_mul(2)))
548 impl<'a> Iterator for Words<'a> {
551 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
553 impl<'a> DoubleEndedIterator for Words<'a> {
554 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }