1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Unicode-intensive string manipulations.
13 //! This module provides functionality to `str` that requires the Unicode methods provided by the
14 //! unicode parts of the CharExt trait.
16 use self::GraphemeState::*;
21 use core::iter::Filter;
26 use tables::grapheme::GraphemeCat;
// Deprecated spelling of `SplitWhitespace`: the alias names exactly the same iterator type,
// so code written against `Words` keeps working while callers migrate.
28 #[deprecated(reason = "struct Words is being replaced by struct SplitWhitespace",
30 #[unstable(feature = "str_words",
31 reason = "words() will be replaced by split_whitespace() in 1.1.0")]
32 pub type Words<'a> = SplitWhitespace<'a>;
34 /// An iterator over the non-whitespace substrings of a string,
35 /// separated by any amount of whitespace.
///
/// Values of this type are produced by `UnicodeStr::split_whitespace`.
36 #[stable(feature = "split_whitespace", since = "1.1.0")]
37 pub struct SplitWhitespace<'a> {
// A `Split` on a whitespace predicate whose empty pieces are filtered out. Both the
// predicate and the filter are stored as plain `fn` pointers so the full type can be named.
38 inner: Filter<Split<'a, fn(char) -> bool>, fn(&&str) -> bool>,
41 /// Methods for Unicode string slices
42 #[allow(missing_docs)] // docs in libcollections
43 pub trait UnicodeStr {
// Iterator over the grapheme clusters of the string; `is_extended` is stored on the
// iterator and makes spacing marks count as cluster-extending.
44 fn graphemes<'a>(&'a self, is_extended: bool) -> Graphemes<'a>;
// Same clusters as `graphemes`, each paired with its byte offset in the string.
45 fn grapheme_indices<'a>(&'a self, is_extended: bool) -> GraphemeIndices<'a>;
// Deprecated name for `split_whitespace`.
47 fn words<'a>(&'a self) -> Words<'a>;
// Non-empty substrings separated by any amount of whitespace.
48 fn split_whitespace<'a>(&'a self) -> SplitWhitespace<'a>;
// True when every `char` of the string is whitespace.
49 fn is_whitespace(&self) -> bool;
// True when every `char` of the string is alphanumeric.
50 fn is_alphanumeric(&self) -> bool;
// Sum of the per-`char` widths; `is_cjk` is forwarded to the per-`char` width lookup.
51 fn width(&self, is_cjk: bool) -> usize;
// Strip whitespace from both ends / the left end / the right end.
52 fn trim<'a>(&'a self) -> &'a str;
53 fn trim_left<'a>(&'a self) -> &'a str;
54 fn trim_right<'a>(&'a self) -> &'a str;
// Implementation of `UnicodeStr` for string slices. Every method is a thin adapter over
// `chars()`, `split`, the `trim_*_matches` family, or the grapheme iterators in this file.
57 impl UnicodeStr for str {
// Starts with both category caches empty; they are filled lazily while iterating.
59 fn graphemes(&self, is_extended: bool) -> Graphemes {
60 Graphemes { string: self, extended: is_extended, cat: None, catb: None }
// Records the address of the string's first byte so that each yielded cluster's offset
// can later be recovered by pointer subtraction.
64 fn grapheme_indices(&self, is_extended: bool) -> GraphemeIndices {
65 GraphemeIndices { start_offset: self.as_ptr() as usize, iter: self.graphemes(is_extended) }
// Deprecated entry point; simply forwards to `split_whitespace`.
70 fn words(&self) -> Words {
71 self.split_whitespace()
// Splits on whitespace characters and drops the empty pieces that runs of consecutive
// whitespace (or leading/trailing whitespace) would otherwise produce.
75 fn split_whitespace(&self) -> SplitWhitespace {
76 fn is_not_empty(s: &&str) -> bool { !s.is_empty() }
77 let is_not_empty: fn(&&str) -> bool = is_not_empty; // coerce to fn pointer
79 fn is_whitespace(c: char) -> bool { c.is_whitespace() }
80 let is_whitespace: fn(char) -> bool = is_whitespace; // coerce to fn pointer
82 SplitWhitespace { inner: self.split(is_whitespace).filter(is_not_empty) }
// `all` over zero characters is true, so the empty string satisfies both predicates below.
86 fn is_whitespace(&self) -> bool { self.chars().all(|c| c.is_whitespace()) }
89 fn is_alphanumeric(&self) -> bool { self.chars().all(|c| c.is_alphanumeric()) }
// Characters for which the width lookup yields `None` contribute 0 to the total.
93 fn width(&self, is_cjk: bool) -> usize {
94 self.chars().map(|c| c.width(is_cjk).unwrap_or(0)).sum()
// The three trims share one predicate: strip characters for which `is_whitespace` holds.
98 fn trim(&self) -> &str {
99 self.trim_matches(|c: char| c.is_whitespace())
103 fn trim_left(&self) -> &str {
104 self.trim_left_matches(|c: char| c.is_whitespace())
108 fn trim_right(&self) -> &str {
109 self.trim_right_matches(|c: char| c.is_whitespace())
113 /// External iterator for grapheme clusters and byte offsets.
///
/// Yields `(byte offset, cluster)` pairs. Built by `UnicodeStr::grapheme_indices`, which
/// stores the address of the start of the string (`start_offset`) next to an inner
/// `Graphemes` iterator (`iter`).
115 pub struct GraphemeIndices<'a> {
// Forward iteration: wraps the inner `Graphemes` iterator and attaches byte offsets.
120 impl<'a> Iterator for GraphemeIndices<'a> {
121 type Item = (usize, &'a str);
// The offset is the distance between the cluster's first byte and the recorded start of
// the original string, computed from the two addresses.
124 fn next(&mut self) -> Option<(usize, &'a str)> {
125 self.iter.next().map(|s| (s.as_ptr() as usize - self.start_offset, s))
// One item per inner item, so the inner hint is passed through unchanged.
129 fn size_hint(&self) -> (usize, Option<usize>) {
130 self.iter.size_hint()
// Backward iteration: same offset computation as `next`, applied to clusters taken from
// the end of the string.
134 impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
136 fn next_back(&mut self) -> Option<(usize, &'a str)> {
137 self.iter.next_back().map(|s| (s.as_ptr() as usize - self.start_offset, s))
141 /// External iterator for a string's
142 /// [grapheme clusters](http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// The iterator holds the not-yet-consumed part of the string (`string`) and the
/// `extended` flag, and shrinks `string` from the front (`next`) or back (`next_back`).
144 pub struct Graphemes<'a> {
// Category of the first unconsumed character, cached by `next` so that a character which
// ended the previous cluster is not looked up a second time.
147 cat: Option<GraphemeCat>,
// Same cache for the last unconsumed character, used by `next_back`.
148 catb: Option<GraphemeCat>,
151 // state machine for cluster boundary rules
// States referenced by the iterators in this file: `Start`, `FindExtend`, `HangulL`,
// `HangulLV`, `HangulLVT` and `Regional`. `PartialEq` is needed because the backward
// scanner compares the current state against `Start`.
152 #[derive(PartialEq,Eq)]
// Forward cluster iteration. NOTE(review): this listing omits several lines of the
// implementation; the comments below describe only what the visible lines establish.
162 impl<'a> Iterator for Graphemes<'a> {
// A non-empty remainder holds at least one cluster, and a cluster is at least one byte
// long, hence the bounds `min(len, 1)` and `len`.
166 fn size_hint(&self) -> (usize, Option<usize>) {
167 let slen = self.string.len();
168 (cmp::min(slen, 1), Some(slen))
// Splits the next cluster off the front of `self.string` by running the boundary state
// machine over its characters until a boundary is found.
172 fn next(&mut self) -> Option<&'a str> {
173 use tables::grapheme as gr;
174 if self.string.is_empty() {
// `take_curr` records whether the character at the stopping position still belongs to
// the cluster being returned.
178 let mut take_curr = true;
180 let mut state = Start;
181 let mut cat = gr::GC_Any;
182 for (curr, ch) in self.string.char_indices() {
185 // retrieve cached category, if any
186 // We do this because most of the time we would end up
187 // looking up each character twice.
188 cat = match self.cat {
189 None => gr::grapheme_category(ch),
190 _ => self.cat.take().unwrap()
// Extending characters never start a new cluster; spacing marks count as extending only
// for extended clusters.
194 gr::GC_Extend => true,
195 gr::GC_SpacingMark if self.extended => true,
198 state = FindExtend; // rule GB9/GB9a
202 state = match state {
// "\r\n" is a single cluster: peek at the character after '\r'.
203 Start if '\r' == ch => {
204 let slen = self.string.len();
206 if nidx != slen && self.string.char_at(nidx) == '\n' {
207 idx = nidx; // rule GB3
// A control character ends the scan; the other categories select the state that decides
// what may follow.
212 gr::GC_Control => break,
214 gr::GC_LV | gr::GC_V => HangulLV,
215 gr::GC_LVT | gr::GC_T => HangulLVT,
216 gr::GC_Regional_Indicator => Regional,
219 FindExtend => { // found non-extending when looking for extending
// Hangul syllable sequences: `continue` stays in the same state, a state change follows
// the syllable structure named in the rule comments.
223 HangulL => match cat { // rule GB6: L x (L|V|LV|LVT)
224 gr::GC_L => continue,
225 gr::GC_LV | gr::GC_V => HangulLV,
226 gr::GC_LVT => HangulLVT,
232 HangulLV => match cat { // rule GB7: (LV|V) x (V|T)
233 gr::GC_V => continue,
234 gr::GC_T => HangulLVT,
240 HangulLVT => match cat { // rule GB8: (LVT|T) x T
241 gr::GC_T => continue,
// Consecutive regional indicators stay in one cluster.
247 Regional => match cat { // rule GB8a
248 gr::GC_Regional_Indicator => continue,
// When the current character belongs to this cluster, move `idx` past it before slicing.
257 self.cat = if take_curr {
258 idx = idx + self.string.char_at(idx).len_utf8();
// Hand out everything before `idx` and keep the rest for the next call.
264 let retstr = &self.string[..idx];
265 self.string = &self.string[idx..];
// Backward cluster iteration. NOTE(review): this listing omits several lines of the
// implementation; the comments below describe only what the visible lines establish.
270 impl<'a> DoubleEndedIterator for Graphemes<'a> {
// Splits the last cluster off the end of `self.string` by scanning characters in reverse.
272 fn next_back(&mut self) -> Option<&'a str> {
273 use tables::grapheme as gr;
274 if self.string.is_empty() {
278 let mut take_curr = true;
// `idx` starts at the end of the string; the returned cluster is `self.string[idx..]`.
279 let mut idx = self.string.len();
280 let mut previdx = idx;
281 let mut state = Start;
282 let mut cat = gr::GC_Any;
283 for (curr, ch) in self.string.char_indices().rev() {
287 // cached category, if any
288 cat = match self.catb {
289 None => gr::grapheme_category(ch),
290 _ => self.catb.take().unwrap()
293 // a matching state machine that runs *backwards* across an input string
294 // note that this has some implications for the Hangul matching, since
295 // we now need to know what the rightward letter is:
297 // Right to left, we have:
301 // HangulL means the letter to the right is L
302 // HangulLV means the letter to the right is V
303 // HangulLVT means the letter to the right is T
304 state = match state {
// "\r\n" is a single cluster: when the last character is '\n', look one character further
// back for '\r' and include it.
305 Start if '\n' == ch => {
306 if idx > 0 && '\r' == self.string.char_at_reverse(idx) {
307 idx -= 1; // rule GB3
// Extending characters keep the scan going leftwards; Hangul and regional-indicator
// categories select the state that decides what may precede them.
311 Start | FindExtend => match cat {
312 gr::GC_Extend => FindExtend,
313 gr::GC_SpacingMark if self.extended => FindExtend,
314 gr::GC_L | gr::GC_LV | gr::GC_LVT => HangulL,
315 gr::GC_V => HangulLV,
316 gr::GC_T => HangulLVT,
317 gr::GC_Regional_Indicator => Regional,
// The current character is taken only if nothing has been collected yet.
319 take_curr = Start == state;
324 HangulL => match cat { // char to right is an L
325 gr::GC_L => continue, // L x L is the only legal match
331 HangulLV => match cat { // char to right is a V
332 gr::GC_V => continue, // V x V, right char is still V
333 gr::GC_L | gr::GC_LV => HangulL, // (L|V) x V, right char is now L
339 HangulLVT => match cat { // char to right is a T
340 gr::GC_T => continue, // T x T, right char is still T
341 gr::GC_V => HangulLV, // V x T, right char is now V
342 gr::GC_LV | gr::GC_LVT => HangulL, // (LV|LVT) x T, right char is now L
// Consecutive regional indicators stay in one cluster.
348 Regional => match cat { // rule GB8a
349 gr::GC_Regional_Indicator => continue,
358 self.catb = if take_curr {
// Hand out everything from `idx` on and keep the prefix for the next call.
365 let retstr = &self.string[idx..];
366 self.string = &self.string[..idx];
// https://tools.ietf.org/html/rfc3629
/// Length in bytes of the UTF-8 sequence introduced by each possible first byte.
///
/// Entries are 0 for bytes that can never start a sequence: continuation bytes
/// (0x80..=0xBF), the overlong lead bytes 0xC0 and 0xC1, and 0xF5..=0xFF.
static UTF8_CHAR_WIDTH: [u8; 256] = [
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x1F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x3F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x5F
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x7F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x9F
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xBF
0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xDF
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xEF
4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0, // 0xFF
];

/// Given a first byte, determine how many bytes are in this UTF-8 character.
///
/// Returns 0 when `b` cannot be the first byte of a UTF-8 sequence.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    // A `u8` index can never be out of bounds for a 256-entry table.
    UTF8_CHAR_WIDTH[b as usize] as usize
}
/// Determines if a vector of `u16` contains valid UTF-16
///
/// The input is valid when every code unit is either a non-surrogate, or a lead
/// surrogate (0xD800..=0xDBFF) immediately followed by a trail surrogate
/// (0xDC00..=0xDFFF). The empty slice is valid.
pub fn is_utf16(v: &[u16]) -> bool {
    let mut units = v.iter().cloned();
    while let Some(u) = units.next() {
        // Code units outside the surrogate range are complete scalar values.
        if u < 0xD800 || u > 0xDFFF {
            continue;
        }
        // A trail surrogate with no lead in front of it is invalid.
        if u > 0xDBFF {
            return false;
        }
        // `u` is a lead surrogate: it must be followed by a trail surrogate.
        // Running out of input here leaves the pair unfinished.
        match units.next() {
            Some(u2) if u2 >= 0xDC00 && u2 <= 0xDFFF => {}
            _ => return false,
        }
    }
    true
}
/// An iterator that decodes UTF-16 encoded codepoints from a vector
/// of `u16`s.
#[derive(Clone)]
pub struct Utf16Items<'a> {
    iter: slice::Iter<'a, u16>
}

/// The possibilities for values decoded from a `u16` stream.
#[derive(Copy, PartialEq, Eq, Clone, Debug)]
pub enum Utf16Item {
    /// A valid codepoint.
    ScalarValue(char),
    /// An invalid surrogate without its pair.
    LoneSurrogate(u16)
}

impl Utf16Item {
    /// Convert `self` to a `char`, taking `LoneSurrogate`s to the
    /// replacement character (U+FFFD).
    #[inline]
    pub fn to_char_lossy(&self) -> char {
        match *self {
            Utf16Item::ScalarValue(c) => c,
            Utf16Item::LoneSurrogate(_) => '\u{FFFD}'
        }
    }
}

impl<'a> Iterator for Utf16Items<'a> {
    type Item = Utf16Item;

    /// Decodes the next item: a scalar value, or a surrogate that has no partner.
    fn next(&mut self) -> Option<Utf16Item> {
        let u = match self.iter.next() {
            Some(u) => *u,
            None => return None
        };

        // A 16-bit value is a valid scalar value unless it is a surrogate, so the
        // checked conversion fails exactly for 0xD800..=0xDFFF.
        if let Some(c) = char::from_u32(u as u32) {
            return Some(Utf16Item::ScalarValue(c));
        }

        if u >= 0xDC00 {
            // a trailing surrogate
            return Some(Utf16Item::LoneSurrogate(u));
        }

        // `u` is a leading surrogate; preserve state for rewinding.
        let old = self.iter.clone();

        let u2 = match self.iter.next() {
            Some(u2) => *u2,
            // eof
            None => return Some(Utf16Item::LoneSurrogate(u))
        };
        if u2 < 0xDC00 || u2 > 0xDFFF {
            // not a trailing surrogate so we're not a valid
            // surrogate pair, so rewind to redecode u2 next time.
            self.iter = old;
            return Some(Utf16Item::LoneSurrogate(u));
        }

        // all ok, so lets decode it.
        let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
        match char::from_u32(c) {
            Some(c) => Some(Utf16Item::ScalarValue(c)),
            // 10 + 10 payload bits plus 0x1_0000 always lands in 0x1_0000..=0x10_FFFF.
            None => unreachable!("surrogate pair decoded outside the scalar value range"),
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.iter.size_hint();
        // we could be entirely valid surrogates (2 elements per
        // char), or entirely non-surrogates (1 element per char).
        // An item never consumes more than two elements, so the lower
        // bound is the element count halved, rounded up.
        (low / 2 + low % 2, high)
    }
}

/// Create an iterator over the UTF-16 encoded codepoints in `v`,
/// returning invalid surrogates as `LoneSurrogate`s.
///
/// # Examples
///
/// ```
/// #![feature(unicode)]
///
/// extern crate rustc_unicode;
///
/// use rustc_unicode::str::Utf16Item::{ScalarValue, LoneSurrogate};
///
/// fn main() {
///     // 𝄞mus<invalid>ic<invalid>
///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
///              0x0073, 0xDD1E, 0x0069, 0x0063,
///              0xD834];
///
///     assert_eq!(rustc_unicode::str::utf16_items(&v).collect::<Vec<_>>(),
///                vec![ScalarValue('𝄞'),
///                     ScalarValue('m'), ScalarValue('u'), ScalarValue('s'),
///                     LoneSurrogate(0xDD1E),
///                     ScalarValue('i'), ScalarValue('c'),
///                     LoneSurrogate(0xD834)]);
/// }
/// ```
pub fn utf16_items<'a>(v: &'a [u16]) -> Utf16Items<'a> {
    Utf16Items { iter : v.iter() }
}
/// Iterator adaptor for encoding `char`s to UTF-16.
#[derive(Clone)]
pub struct Utf16Encoder<I> {
    chars: I,
    // Trail surrogate waiting to be emitted, or 0 when nothing is pending.
    // 0 is a safe sentinel because a trail surrogate is always >= 0xDC00.
    extra: u16
}

impl<I> Utf16Encoder<I> {
    /// Create a UTF-16 encoder from any `char` iterator.
    pub fn new(chars: I) -> Utf16Encoder<I> where I: Iterator<Item=char> {
        Utf16Encoder { chars: chars, extra: 0 }
    }
}

impl<I> Iterator for Utf16Encoder<I> where I: Iterator<Item=char> {
    type Item = u16;

    /// Yields the next UTF-16 code unit, pulling a new `char` from the underlying
    /// iterator only when no trail surrogate is pending.
    #[inline]
    fn next(&mut self) -> Option<u16> {
        if self.extra != 0 {
            let tmp = self.extra;
            self.extra = 0;
            return Some(tmp);
        }

        let ch = match self.chars.next() {
            Some(ch) => ch,
            None => return None
        };
        let cp = ch as u32;
        if cp < 0x1_0000 {
            // Basic Multilingual Plane: a single code unit (the value fits in 16 bits).
            Some(cp as u16)
        } else {
            // Supplementary plane: split the 20 payload bits into a surrogate pair,
            // emit the lead now and keep the trail for the following call.
            let payload = cp - 0x1_0000;
            self.extra = 0xDC00 | (payload & 0x3FF) as u16;
            Some(0xD800 | (payload >> 10) as u16)
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let (low, high) = self.chars.size_hint();
        // A buffered trail surrogate is one more unit that is certain to be yielded.
        let pending = if self.extra != 0 { 1 } else { 0 };
        // every char gets either one u16 or two u16,
        // so this iterator is between 1 or 2 times as
        // long as the underlying iterator.
        (low.saturating_add(pending),
         high.and_then(|n| n.checked_mul(2)).and_then(|n| n.checked_add(pending)))
    }
}
// Forward iteration simply delegates to the wrapped split-and-filter iterator.
564 impl<'a> Iterator for SplitWhitespace<'a> {
567 fn next(&mut self) -> Option<&'a str> { self.inner.next() }
// Backward iteration likewise delegates to the wrapped iterator, yielding the
// whitespace-separated pieces starting from the end of the string.
569 impl<'a> DoubleEndedIterator for SplitWhitespace<'a> {
570 fn next_back(&mut self) -> Option<&'a str> { self.inner.next_back() }