src/libstd/ascii.rs

   1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! Operations on ASCII strings and characters
  14
  15 #![unstable(feature = "std_misc",
  16             reason = "unsure about placement and naming")]
  17
  18 use iter::IteratorExt;
  19 use ops::FnMut;
  20 use slice::SliceExt;
  21 use str::StrExt;
  22 use string::String;
  23 use vec::Vec;
  24
/// Extension methods for ASCII-subset only operations on owned strings
/// and owned byte vectors
  26 #[unstable(feature = "std_misc",
  27            reason = "would prefer to do this in a more general way")]
pub trait OwnedAsciiExt {
    /// Consumes `self` and returns it converted to ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    ///
    /// The value is taken by value and handed back as the same type, so the
    /// caller gives up the original.
    fn into_ascii_uppercase(self) -> Self;

    /// Consumes `self` and returns it converted to ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    ///
    /// The value is taken by value and handed back as the same type, so the
    /// caller gives up the original.
    fn into_ascii_lowercase(self) -> Self;
}
  39
/// Extension methods for ASCII-subset only operations on string slices,
/// byte slices, single bytes and characters
  41 #[unstable(feature = "std_misc",
  42            reason = "would prefer to do this in a more general way")]
// `T` is the type produced by the `to_ascii_*` methods. It defaults to
// `Self`, and the unsized implementors in this file name an owned type
// instead (`String` for `str`, `Vec<u8>` for `[u8]`).
pub trait AsciiExt<T = Self> {
    /// Check if within the ASCII range.
    fn is_ascii(&self) -> bool;

    /// Makes a copy of the value in ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    fn to_ascii_uppercase(&self) -> T;

    /// Makes a copy of the value in ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    fn to_ascii_lowercase(&self) -> T;

    /// Check that two values are an ASCII case-insensitive match.
    /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
    /// but without allocating and copying temporary strings.
    fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
}
  62
  63 #[unstable(feature = "std_misc",
  64            reason = "would prefer to do this in a more general way")]
  65 impl AsciiExt<String> for str {
  66     #[inline]
  67     fn is_ascii(&self) -> bool {
  68         self.bytes().all(|b| b.is_ascii())
  69     }
  70
  71     #[inline]
  72     fn to_ascii_uppercase(&self) -> String {
  73         // Vec<u8>::to_ascii_uppercase() preserves the UTF-8 invariant.
  74         unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
  75     }
  76
  77     #[inline]
  78     fn to_ascii_lowercase(&self) -> String {
  79         // Vec<u8>::to_ascii_lowercase() preserves the UTF-8 invariant.
  80         unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
  81     }
  82
  83     #[inline]
  84     fn eq_ignore_ascii_case(&self, other: &str) -> bool {
  85         self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
  86     }
  87 }
  88
  89 #[unstable(feature = "std_misc",
  90            reason = "would prefer to do this in a more general way")]
  91 impl OwnedAsciiExt for String {
  92     #[inline]
  93     fn into_ascii_uppercase(self) -> String {
  94         // Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant.
  95         unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
  96     }
  97
  98     #[inline]
  99     fn into_ascii_lowercase(self) -> String {
 100         // Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant.
 101         unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
 102     }
 103 }
 104
 105 #[unstable(feature = "std_misc",
 106            reason = "would prefer to do this in a more general way")]
 107 impl AsciiExt<Vec<u8>> for [u8] {
 108     #[inline]
 109     fn is_ascii(&self) -> bool {
 110         self.iter().all(|b| b.is_ascii())
 111     }
 112
 113     #[inline]
 114     fn to_ascii_uppercase(&self) -> Vec<u8> {
 115         self.iter().map(|b| b.to_ascii_uppercase()).collect()
 116     }
 117
 118     #[inline]
 119     fn to_ascii_lowercase(&self) -> Vec<u8> {
 120         self.iter().map(|b| b.to_ascii_lowercase()).collect()
 121     }
 122
 123     #[inline]
 124     fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
 125         self.len() == other.len() &&
 126         self.iter().zip(other.iter()).all(|(a, b)| {
 127             a.eq_ignore_ascii_case(b)
 128         })
 129     }
 130 }
 131
 132 #[unstable(feature = "std_misc",
 133            reason = "would prefer to do this in a more general way")]
 134 impl OwnedAsciiExt for Vec<u8> {
 135     #[inline]
 136     fn into_ascii_uppercase(mut self) -> Vec<u8> {
 137         for byte in &mut self {
 138             *byte = byte.to_ascii_uppercase();
 139         }
 140         self
 141     }
 142
 143     #[inline]
 144     fn into_ascii_lowercase(mut self) -> Vec<u8> {
 145         for byte in &mut self {
 146             *byte = byte.to_ascii_lowercase();
 147         }
 148         self
 149     }
 150 }
 151
 152 #[unstable(feature = "std_misc",
 153            reason = "would prefer to do this in a more general way")]
 154 impl AsciiExt for u8 {
 155     #[inline]
 156     fn is_ascii(&self) -> bool {
 157         *self & 128 == 0u8
 158     }
 159
 160     #[inline]
 161     fn to_ascii_uppercase(&self) -> u8 {
 162         ASCII_UPPERCASE_MAP[*self as usize]
 163     }
 164
 165     #[inline]
 166     fn to_ascii_lowercase(&self) -> u8 {
 167         ASCII_LOWERCASE_MAP[*self as usize]
 168     }
 169
 170     #[inline]
 171     fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
 172         self.to_ascii_lowercase() == other.to_ascii_lowercase()
 173     }
 174 }
 175
 176 #[unstable(feature = "std_misc",
 177            reason = "would prefer to do this in a more general way")]
 178 impl AsciiExt for char {
 179     #[inline]
 180     fn is_ascii(&self) -> bool {
 181         *self as u32 <= 0x7F
 182     }
 183
 184     #[inline]
 185     fn to_ascii_uppercase(&self) -> char {
 186         if self.is_ascii() {
 187             (*self as u8).to_ascii_uppercase() as char
 188         } else {
 189             *self
 190         }
 191     }
 192
 193     #[inline]
 194     fn to_ascii_lowercase(&self) -> char {
 195         if self.is_ascii() {
 196             (*self as u8).to_ascii_lowercase() as char
 197         } else {
 198             *self
 199         }
 200     }
 201
 202     #[inline]
 203     fn eq_ignore_ascii_case(&self, other: &char) -> bool {
 204         self.to_ascii_lowercase() == other.to_ascii_lowercase()
 205     }
 206 }
 207
/// Emits a 'default' ASCII and C++11-like literal escape of a `u8`,
/// passing each byte of the escaped form to `f` in order.
 209 ///
 210 /// The default is chosen with a bias toward producing literals that are
 211 /// legal in a variety of languages, including C++11 and similar C-family
 212 /// languages. The exact rules are:
 213 ///
 214 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 215 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
 216 /// - Any other chars in the range [0x20,0x7e] are not escaped.
 217 /// - Any other chars are given hex escapes.
 218 /// - Unicode escapes are never generated by this function.
 219 #[unstable(feature = "std_misc",
 220            reason = "needs to be updated to use an iterator")]
 221 pub fn escape_default<F>(c: u8, mut f: F) where
 222     F: FnMut(u8),
 223 {
 224     match c {
 225         b'\t' => { f(b'\\'); f(b't'); }
 226         b'\r' => { f(b'\\'); f(b'r'); }
 227         b'\n' => { f(b'\\'); f(b'n'); }
 228         b'\\' => { f(b'\\'); f(b'\\'); }
 229         b'\'' => { f(b'\\'); f(b'\''); }
 230         b'"'  => { f(b'\\'); f(b'"'); }
 231         b'\x20' ... b'\x7e' => { f(c); }
 232         _ => {
 233             f(b'\\');
 234             f(b'x');
 235             for &offset in &[4u, 0u] {
 236                 match ((c as i32) >> offset) & 0xf {
 237                     i @ 0 ... 9 => f(b'0' + (i as u8)),
 238                     i => f(b'a' + (i as u8 - 10)),
 239                 }
 240             }
 241         }
 242     }
 243 }
 244
// Lookup table indexed by byte value, giving the ASCII-lower-case form of
// each byte: entries 0x41..0x5a (b'A'..b'Z') hold b'a'..b'z', and every
// other entry holds its own index.
static ASCII_LOWERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@',

    // Positions of b'A'..b'Z': the only entries that differ from their index.
          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z',

                      b'[', b'\\', b']', b'^', b'_',
    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
    // Bytes outside the ASCII range map to themselves.
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
 283
// Lookup table indexed by byte value, giving the ASCII-upper-case form of
// each byte: entries 0x61..0x7a (b'a'..b'z') hold b'A'..b'Z', and every
// other entry holds its own index.
static ASCII_UPPERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
    b'`',

    // Positions of b'a'..b'z': the only entries that differ from their index.
          b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z',

                      b'{', b'|', b'}', b'~', 0x7f,
    // Bytes outside the ASCII range map to themselves.
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
 322
 323
 324 #[cfg(test)]
 325 mod tests {
 326     use prelude::v1::*;
 327     use super::*;
 328     use char::from_u32;
 329
 330     #[test]
 331     fn test_ascii() {
 332         assert!("banana".chars().all(|c| c.is_ascii()));
 333         assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
 334     }
 335
 336     #[test]
 337     fn test_ascii_vec() {
 338         assert!("".is_ascii());
 339         assert!("a".is_ascii());
 340         assert!(!"\u{2009}".is_ascii());
 341     }
 342
 343     #[test]
 344     fn test_to_ascii_uppercase() {
 345         assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
 346         assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");
 347
 348         for i in 0u32..501 {
 349             let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
 350                         else { i };
 351             assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
 352                        (from_u32(upper).unwrap()).to_string());
 353         }
 354     }
 355
 356     #[test]
 357     fn test_to_ascii_lowercase() {
 358         assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
 359         // Dotted capital I, Kelvin sign, Sharp S.
 360         assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");
 361
 362         for i in 0u32..501 {
 363             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 364                         else { i };
 365             assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
 366                        (from_u32(lower).unwrap()).to_string());
 367         }
 368     }
 369
 370     #[test]
 371     fn test_into_ascii_uppercase() {
 372         assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
 373                    "URL()URL()URL()üRL".to_string());
 374         assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");
 375
 376         for i in 0u32..501 {
 377             let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
 378                         else { i };
 379             assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
 380                        (from_u32(upper).unwrap()).to_string());
 381         }
 382     }
 383
 384     #[test]
 385     fn test_into_ascii_lowercase() {
 386         assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
 387                    "url()url()url()Ürl");
 388         // Dotted capital I, Kelvin sign, Sharp S.
 389         assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");
 390
 391         for i in 0u32..501 {
 392             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 393                         else { i };
 394             assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
 395                        (from_u32(lower).unwrap()).to_string());
 396         }
 397     }
 398
 399     #[test]
 400     fn test_eq_ignore_ascii_case() {
 401         assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
 402         assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
 403         // Dotted capital I, Kelvin sign, Sharp S.
 404         assert!("HİKß".eq_ignore_ascii_case("hİKß"));
 405         assert!(!"İ".eq_ignore_ascii_case("i"));
 406         assert!(!"K".eq_ignore_ascii_case("k"));
 407         assert!(!"ß".eq_ignore_ascii_case("s"));
 408
 409         for i in 0u32..501 {
 410             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 411                         else { i };
 412             assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
 413                     &from_u32(lower).unwrap().to_string()));
 414         }
 415     }
 416 }