src/libstd/ascii.rs

   1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! Operations on ASCII strings and characters
  14
  15 #![unstable(feature = "std_misc",
  16             reason = "unsure about placement and naming")]
  17
  18 use iter::IteratorExt;
  19 use ops::FnMut;
  20 use slice::SliceExt;
  21 use str::StrExt;
  22 use string::String;
  23 use vec::Vec;
  24
/// Extension methods for ASCII-subset only operations on owned strings
///
/// Both methods consume `self` and hand back a value of the same type.
/// In this file the trait is implemented for `String` and `Vec<u8>`.
#[unstable(feature = "std_misc",
           reason = "would prefer to do this in a more general way")]
pub trait OwnedAsciiExt {
    /// Convert the string to ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    ///
    /// Takes `self` by value and returns the converted value.
    fn into_ascii_uppercase(self) -> Self;

    /// Convert the string to ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    ///
    /// Takes `self` by value and returns the converted value.
    fn into_ascii_lowercase(self) -> Self;
}
  39
/// Extension methods for ASCII-subset only operations on string slices
///
/// The type parameter `T` is the type produced by the case-converting
/// methods and defaults to `Self`. In this file the trait is implemented
/// for `str` (producing `String`), `[u8]` (producing `Vec<u8>`), and for
/// `u8` and `char` (each producing its own type).
#[unstable(feature = "std_misc",
           reason = "would prefer to do this in a more general way")]
pub trait AsciiExt<T = Self> {
    /// Check if within the ASCII range.
    fn is_ascii(&self) -> bool;

    /// Makes a copy of the string in ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    fn to_ascii_uppercase(&self) -> T;

    /// Makes a copy of the string in ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    fn to_ascii_lowercase(&self) -> T;

    /// Check that two strings are an ASCII case-insensitive match.
    /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
    /// but without allocating and copying temporary strings.
    ///
    /// Only ASCII letters are folded; any other difference between the
    /// two values makes them compare unequal.
    fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
}
  62
  63 #[unstable(feature = "std_misc",
  64            reason = "would prefer to do this in a more general way")]
  65 impl AsciiExt<String> for str {
  66     #[inline]
  67     fn is_ascii(&self) -> bool {
  68         self.bytes().all(|b| b.is_ascii())
  69     }
  70
  71     #[inline]
  72     fn to_ascii_uppercase(&self) -> String {
  73         // Vec<u8>::to_ascii_uppercase() preserves the UTF-8 invariant.
  74         unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
  75     }
  76
  77     #[inline]
  78     fn to_ascii_lowercase(&self) -> String {
  79         // Vec<u8>::to_ascii_lowercase() preserves the UTF-8 invariant.
  80         unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
  81     }
  82
  83     #[inline]
  84     fn eq_ignore_ascii_case(&self, other: &str) -> bool {
  85         self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
  86     }
  87 }
  88
  89 #[unstable(feature = "std_misc",
  90            reason = "would prefer to do this in a more general way")]
  91 impl OwnedAsciiExt for String {
  92     #[inline]
  93     fn into_ascii_uppercase(self) -> String {
  94         // Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant.
  95         unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
  96     }
  97
  98     #[inline]
  99     fn into_ascii_lowercase(self) -> String {
 100         // Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant.
 101         unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
 102     }
 103 }
 104
 105 #[unstable(feature = "std_misc",
 106            reason = "would prefer to do this in a more general way")]
 107 impl AsciiExt<Vec<u8>> for [u8] {
 108     #[inline]
 109     fn is_ascii(&self) -> bool {
 110         self.iter().all(|b| b.is_ascii())
 111     }
 112
 113     #[inline]
 114     fn to_ascii_uppercase(&self) -> Vec<u8> {
 115         self.iter().map(|b| b.to_ascii_uppercase()).collect()
 116     }
 117
 118     #[inline]
 119     fn to_ascii_lowercase(&self) -> Vec<u8> {
 120         self.iter().map(|b| b.to_ascii_lowercase()).collect()
 121     }
 122
 123     #[inline]
 124     fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
 125         self.len() == other.len() &&
 126         self.iter().zip(other.iter()).all(|(a, b)| {
 127             a.eq_ignore_ascii_case(b)
 128         })
 129     }
 130 }
 131
 132 #[unstable(feature = "std_misc",
 133            reason = "would prefer to do this in a more general way")]
 134 impl OwnedAsciiExt for Vec<u8> {
 135     #[inline]
 136     fn into_ascii_uppercase(mut self) -> Vec<u8> {
 137         for byte in self.iter_mut() {
 138             *byte = byte.to_ascii_uppercase();
 139         }
 140         self
 141     }
 142
 143     #[inline]
 144     fn into_ascii_lowercase(mut self) -> Vec<u8> {
 145         for byte in self.iter_mut() {
 146             *byte = byte.to_ascii_lowercase();
 147         }
 148         self
 149     }
 150 }
 151
 152 #[unstable(feature = "std_misc",
 153            reason = "would prefer to do this in a more general way")]
 154 impl AsciiExt for u8 {
 155     #[inline]
 156     fn is_ascii(&self) -> bool {
 157         *self & 128 == 0u8
 158     }
 159
 160     #[inline]
 161     fn to_ascii_uppercase(&self) -> u8 {
 162         ASCII_UPPERCASE_MAP[*self as uint]
 163     }
 164
 165     #[inline]
 166     fn to_ascii_lowercase(&self) -> u8 {
 167         ASCII_LOWERCASE_MAP[*self as uint]
 168     }
 169
 170     #[inline]
 171     fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
 172         self.to_ascii_lowercase() == other.to_ascii_lowercase()
 173     }
 174 }
 175
 176 #[unstable(feature = "std_misc",
 177            reason = "would prefer to do this in a more general way")]
 178 impl AsciiExt for char {
 179     #[inline]
 180     fn is_ascii(&self) -> bool {
 181         *self as u32 <= 0x7F
 182     }
 183
 184     #[inline]
 185     fn to_ascii_uppercase(&self) -> char {
 186         if self.is_ascii() {
 187             (*self as u8).to_ascii_uppercase() as char
 188         } else {
 189             *self
 190         }
 191     }
 192
 193     #[inline]
 194     fn to_ascii_lowercase(&self) -> char {
 195         if self.is_ascii() {
 196             (*self as u8).to_ascii_lowercase() as char
 197         } else {
 198             *self
 199         }
 200     }
 201
 202     #[inline]
 203     fn eq_ignore_ascii_case(&self, other: &char) -> bool {
 204         self.to_ascii_lowercase() == other.to_ascii_lowercase()
 205     }
 206 }
 207
 208 /// Returns a 'default' ASCII and C++11-like literal escape of a `u8`
 209 ///
 210 /// The default is chosen with a bias toward producing literals that are
 211 /// legal in a variety of languages, including C++11 and similar C-family
 212 /// languages. The exact rules are:
 213 ///
 214 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 215 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
 216 /// - Any other chars in the range [0x20,0x7e] are not escaped.
 217 /// - Any other chars are given hex escapes.
 218 /// - Unicode escapes are never generated by this function.
 219 #[unstable(feature = "std_misc",
 220            reason = "needs to be updated to use an iterator")]
 221 pub fn escape_default<F>(c: u8, mut f: F) where
 222     F: FnMut(u8),
 223 {
 224     match c {
 225         b'\t' => { f(b'\\'); f(b't'); }
 226         b'\r' => { f(b'\\'); f(b'r'); }
 227         b'\n' => { f(b'\\'); f(b'n'); }
 228         b'\\' => { f(b'\\'); f(b'\\'); }
 229         b'\'' => { f(b'\\'); f(b'\''); }
 230         b'"'  => { f(b'\\'); f(b'"'); }
 231         b'\x20' ... b'\x7e' => { f(c); }
 232         _ => {
 233             f(b'\\');
 234             f(b'x');
 235             for &offset in [4u, 0u].iter() {
 236                 match ((c as i32) >> offset) & 0xf {
 237                     i @ 0 ... 9 => f(b'0' + (i as u8)),
 238                     i => f(b'a' + (i as u8 - 10)),
 239                 }
 240             }
 241         }
 242     }
 243 }
 244
// Lookup table used by `u8::to_ascii_lowercase`: entry `i` is the ASCII
// lower-case form of byte `i`. Every entry equals its own index except the
// 26 slots for b'A'..b'Z', which hold b'a'..b'z'.
static ASCII_LOWERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@',

    // Slots for b'A'..b'Z': the only entries that differ from their index.
          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z',

    // From b'[' onwards every entry is the identity again.
                      b'[', b'\\', b']', b'^', b'_',
    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
 283
// Lookup table used by `u8::to_ascii_uppercase`: entry `i` is the ASCII
// upper-case form of byte `i`. Every entry equals its own index except the
// 26 slots for b'a'..b'z', which hold b'A'..b'Z'.
static ASCII_UPPERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
    b'`',

    // Slots for b'a'..b'z': the only entries that differ from their index.
          b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z',

    // From b'{' onwards every entry is the identity again.
                      b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
 322
 323
 324 #[cfg(test)]
 325 mod tests {
 326     use prelude::v1::*;
 327     use super::*;
 328     use char::from_u32;
 329
 330     #[test]
 331     fn test_ascii() {
 332         assert!("banana".chars().all(|c| c.is_ascii()));
 333         assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
 334     }
 335
 336     #[test]
 337     fn test_ascii_vec() {
 338         assert!("".is_ascii());
 339         assert!("a".is_ascii());
 340         assert!(!"\u{2009}".is_ascii());
 341
 342     }
 343
 344     #[test]
 345     fn test_to_ascii_uppercase() {
 346         assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
 347         assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");
 348
 349         let mut i = 0;
 350         while i <= 500 {
 351             let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
 352                         else { i };
 353             assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
 354                        (from_u32(upper).unwrap()).to_string());
 355             i += 1;
 356         }
 357     }
 358
 359     #[test]
 360     fn test_to_ascii_lowercase() {
 361         assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
 362         // Dotted capital I, Kelvin sign, Sharp S.
 363         assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");
 364
 365         let mut i = 0;
 366         while i <= 500 {
 367             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 368                         else { i };
 369             assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
 370                        (from_u32(lower).unwrap()).to_string());
 371             i += 1;
 372         }
 373     }
 374
 375     #[test]
 376     fn test_into_ascii_uppercase() {
 377         assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
 378                    "URL()URL()URL()üRL".to_string());
 379         assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");
 380
 381         let mut i = 0;
 382         while i <= 500 {
 383             let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
 384                         else { i };
 385             assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
 386                        (from_u32(upper).unwrap()).to_string());
 387             i += 1;
 388         }
 389     }
 390
 391     #[test]
 392     fn test_into_ascii_lowercase() {
 393         assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
 394                    "url()url()url()Ürl");
 395         // Dotted capital I, Kelvin sign, Sharp S.
 396         assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");
 397
 398         let mut i = 0;
 399         while i <= 500 {
 400             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 401                         else { i };
 402             assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
 403                        (from_u32(lower).unwrap()).to_string());
 404             i += 1;
 405         }
 406     }
 407
 408     #[test]
 409     fn test_eq_ignore_ascii_case() {
 410         assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
 411         assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
 412         // Dotted capital I, Kelvin sign, Sharp S.
 413         assert!("HİKß".eq_ignore_ascii_case("hİKß"));
 414         assert!(!"İ".eq_ignore_ascii_case("i"));
 415         assert!(!"K".eq_ignore_ascii_case("k"));
 416         assert!(!"ß".eq_ignore_ascii_case("s"));
 417
 418         let mut i = 0;
 419         while i <= 500 {
 420             let c = i;
 421             let lower = if 'A' as u32 <= c && c <= 'Z' as u32 { c + 'a' as u32 - 'A' as u32 }
 422                         else { c };
 423             assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
 424                     (from_u32(lower).unwrap()).to_string().as_slice()));
 425             i += 1;
 426         }
 427     }
 428 }