use cast;
use char;
use char::Char;
-use clone::Clone;
+use clone::{Clone, DeepClone};
use container::{Container, Mutable};
use iter::Times;
use iterator::{Iterator, FromIterator, Extendable};
}
}
-/// An iterator over the start and end indicies of the matches of a
+/// An iterator over the start and end indices of the matches of a
/// substring within a larger string
#[deriving(Clone)]
pub struct MatchesIndexIterator<'self> {
}
}
+// Helper functions used for Unicode normalization
+/// Puts the combining marks in `comb` into canonical order, in place.
+///
+/// Each element is a character paired with its canonical combining class.
+/// Two neighbours are exchanged only when both classes are non-zero and the
+/// left class is strictly greater than the right one. Class-0 characters
+/// therefore never move and act as barriers, and characters with equal
+/// classes keep their relative order.
+fn canonical_sort(comb: &mut [(char, u8)]) {
+    use iterator::range;
+    use tuple::CopyableTuple;
+
+    // Bubble sort over a shrinking prefix: after every pass the last
+    // position of the prefix is settled, so the next pass can stop earlier.
+    let mut unsettled = comb.len();
+    while unsettled > 0 {
+        let mut any_swap = false;
+        for right in range(1, unsettled) {
+            let left = right - 1;
+            let left_class = comb[left].second();
+            let right_class = comb[right].second();
+            let both_marks = left_class != 0 && right_class != 0;
+            if both_marks && left_class > right_class {
+                comb.swap(left, right);
+                any_swap = true;
+            }
+        }
+        // A pass without any exchange means everything is already ordered.
+        if !any_swap { break; }
+        unsettled -= 1;
+    }
+}
+
+// Selects which decomposition a `NormalizationIterator` applies to each
+// character of its source string.
+#[deriving(Clone)]
+enum NormalizationForm {
+ // Canonical decomposition (uses `char::decompose_canonical`).
+ NFD,
+ // Compatibility decomposition (uses `char::decompose_compatible`).
+ NFKD
+}
+
+/// External iterator over the characters of a string's decomposed
+/// normalization (canonical or compatibility, depending on `kind`).
+/// Use with the `std::iterator` module.
+#[deriving(Clone)]
+struct NormalizationIterator<'self> {
+ // Which decomposition is applied to each source character.
+ priv kind: NormalizationForm,
+ // Position in `string` of the next character to decompose.
+ priv index: uint,
+ // The string being normalized.
+ priv string: &'self str,
+ // Decomposed characters that have not been yielded yet, each paired
+ // with its canonical combining class.
+ priv buffer: ~[(char, u8)],
+ // Set once the buffered characters have been canonically sorted, which
+ // allows non-starter characters at the front of `buffer` to be yielded.
+ priv sorted: bool
+}
+
+impl<'self> Iterator<char> for NormalizationIterator<'self> {
+    /// Yields the next character of the decomposed, canonically ordered
+    /// string, or `None` once both the source string and the internal
+    /// buffer are exhausted.
+    #[inline]
+    fn next(&mut self) -> Option<char> {
+        use unicode::decompose::canonical_combining_class;
+
+        // Fast path: serve from the buffer when the front character is
+        // already known to be in its final position.
+        match self.buffer.head_opt() {
+            // A class-0 character at the front is never moved by
+            // `canonical_sort`, so it can go out immediately. Whatever
+            // follows it has to be (re)sorted before being yielded.
+            Some(&(c, 0)) => {
+                self.sorted = false;
+                self.buffer.shift();
+                return Some(c);
+            }
+            // A combining mark may only leave once the buffer was sorted.
+            Some(&(c, _)) if self.sorted => {
+                self.buffer.shift();
+                return Some(c);
+            }
+            _ => self.sorted = false
+        }
+
+        let decomposer = match self.kind {
+            NFD => char::decompose_canonical,
+            NFKD => char::decompose_compatible
+        };
+
+        // Decompose source characters until a class-0 character shows up
+        // (it fixes the order of everything buffered before it) or the
+        // source string runs out.
+        while !self.sorted && self.index < self.string.len() {
+            let CharRange {ch, next} = self.string.char_range_at(self.index);
+            self.index = next;
+            do decomposer(ch) |d| {
+                let class = canonical_combining_class(d);
+                if class == 0 && !self.sorted {
+                    // Sort before pushing so that only the characters
+                    // preceding this class-0 character are reordered.
+                    canonical_sort(self.buffer);
+                    self.sorted = true;
+                }
+                self.buffer.push((d, class));
+            }
+        }
+
+        // The source ended without another class-0 character: sort what is
+        // left in the buffer.
+        if !self.sorted {
+            canonical_sort(self.buffer);
+            self.sorted = true;
+        }
+
+        match self.buffer.shift_opt() {
+            Some((c, 0)) => {
+                self.sorted = false;
+                Some(c)
+            }
+            Some((c, _)) => Some(c),
+            None => None
+        }
+    }
+
+    /// Returns a lower bound on the number of characters still to come.
+    ///
+    /// Only the characters already sitting in the buffer are counted, since
+    /// each of them is guaranteed to be yielded. The byte length of the
+    /// source string is not a valid lower bound: a multi-byte character can
+    /// decompose into fewer characters than it has bytes, and part of the
+    /// string may already have been consumed. No upper bound is given
+    /// because a single character can decompose into several.
+    fn size_hint(&self) -> (uint, Option<uint>) { (self.buffer.len(), None) }
+}
+
/// Replace all occurrences of one string with another
///
/// # Arguments
/// Sets the length of a string
///
/// This will explicitly set the size of the string, without actually
- /// modifing its buffers, so it is up to the caller to ensure that
+ /// modifying its buffers, so it is up to the caller to ensure that
/// the string is actually the specified size.
#[inline]
pub unsafe fn set_len(s: &mut ~str, new_len: uint) {
fn line_iter(&self) -> CharSplitIterator<'self, char>;
fn any_line_iter(&self) -> AnyLineIterator<'self>;
fn word_iter(&self) -> WordIterator<'self>;
+ fn nfd_iter(&self) -> NormalizationIterator<'self>;
+ fn nfkd_iter(&self) -> NormalizationIterator<'self>;
fn ends_with(&self, needle: &str) -> bool;
fn is_whitespace(&self) -> bool;
fn is_alphanumeric(&self) -> bool;
self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
}
+ /// Returns an iterator over the characters of the string in Unicode
+ /// Normalization Form D (canonical decomposition).
+ fn nfd_iter(&self) -> NormalizationIterator<'self> {
+     // Start at the beginning of the string with nothing buffered yet.
+     NormalizationIterator {
+         kind: NFD,
+         index: 0,
+         string: *self,
+         buffer: ~[],
+         sorted: false
+     }
+ }
+
+ /// Returns an iterator over the characters of the string in Unicode
+ /// Normalization Form KD (compatibility decomposition).
+ fn nfkd_iter(&self) -> NormalizationIterator<'self> {
+     // Start at the beginning of the string with nothing buffered yet.
+     NormalizationIterator {
+         kind: NFKD,
+         index: 0,
+         string: *self,
+         buffer: ~[],
+         sorted: false
+     }
+ }
+
/// Returns true if the string contains only whitespace
///
/// Whitespace characters are determined by `char::is_whitespace`
}
}
+impl DeepClone for ~str {
+ /// Returns a new owned string obtained by calling `to_owned()` on this one.
+ #[inline]
+ fn deep_clone(&self) -> ~str {
+ self.to_owned()
+ }
+}
+
impl Clone for @str {
#[inline]
fn clone(&self) -> @str {
}
}
+impl DeepClone for @str {
+ /// Returns `*self`, i.e. a copy of the managed pointer; no call that
+ /// copies the string contents is made here.
+ #[inline]
+ fn deep_clone(&self) -> @str {
+ *self
+ }
+}
+
impl FromIterator<char> for ~str {
#[inline]
fn from_iterator<T: Iterator<char>>(iterator: &mut T) -> ~str {
#[test]
fn test_map() {
+ // NOTE(review): presumably needed because the closures below call the
+ // foreign function `libc::toupper` -- confirm.
+ #[fixed_stack_segment]; #[inline(never)];
assert_eq!(~"", "".map_chars(|c| unsafe {libc::toupper(c as c_char)} as char));
assert_eq!(~"YMCA", "ymca".map_chars(|c| unsafe {libc::toupper(c as c_char)} as char));
}
assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
}
+ #[test]
+ fn test_nfd_iter() {
+     // Collects the canonical decomposition of `s` into an owned string.
+     fn nfd(s: &str) -> ~str {
+         s.nfd_iter().collect::<~str>()
+     }
+
+     // Plain ASCII passes through untouched.
+     assert_eq!(nfd("abc"), ~"abc");
+     // Canonical decomposition leaves compatibility characters alone.
+     assert_eq!(nfd("\u1e0b\u01c4"), ~"d\u0307\u01c4");
+     assert_eq!(nfd("\u2026"), ~"\u2026");
+     assert_eq!(nfd("\u2126"), ~"\u03a9");
+     // Combining marks end up in canonical order.
+     assert_eq!(nfd("\u1e0b\u0323"), ~"d\u0323\u0307");
+     assert_eq!(nfd("\u1e0d\u0307"), ~"d\u0323\u0307");
+     assert_eq!(nfd("a\u0301"), ~"a\u0301");
+     assert_eq!(nfd("\u0301a"), ~"\u0301a");
+     // Hangul syllables.
+     assert_eq!(nfd("\ud4db"), ~"\u1111\u1171\u11b6");
+     assert_eq!(nfd("\uac1c"), ~"\u1100\u1162");
+ }
+
+ #[test]
+ fn test_nfkd_iter() {
+     // Collects the compatibility decomposition of `s` into an owned string.
+     fn nfkd(s: &str) -> ~str {
+         s.nfkd_iter().collect::<~str>()
+     }
+
+     // Plain ASCII passes through untouched.
+     assert_eq!(nfkd("abc"), ~"abc");
+     // Compatibility characters are expanded as well.
+     assert_eq!(nfkd("\u1e0b\u01c4"), ~"d\u0307DZ\u030c");
+     assert_eq!(nfkd("\u2026"), ~"...");
+     assert_eq!(nfkd("\u2126"), ~"\u03a9");
+     // Combining marks end up in canonical order.
+     assert_eq!(nfkd("\u1e0b\u0323"), ~"d\u0323\u0307");
+     assert_eq!(nfkd("\u1e0d\u0307"), ~"d\u0323\u0307");
+     assert_eq!(nfkd("a\u0301"), ~"a\u0301");
+     assert_eq!(nfkd("\u0301a"), ~"\u0301a");
+     // Hangul syllables.
+     assert_eq!(nfkd("\ud4db"), ~"\u1111\u1171\u11b6");
+     assert_eq!(nfkd("\uac1c"), ~"\u1100\u1162");
+ }
+
#[test]
fn test_line_iter() {
let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";