src/libstd/ascii.rs

   1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10 //
  11 // ignore-lexer-test FIXME #15679
  12
  13 //! Operations on ASCII strings and characters
  14
  15 #![unstable = "unsure about placement and naming"]
  16 #![allow(deprecated)]
  17
  18 use core::kinds::Sized;
  19 use iter::IteratorExt;
  20 use ops::FnMut;
  21 use slice::SliceExt;
  22 use str::StrExt;
  23 use string::String;
  24 use vec::Vec;
  25
/// Extension methods for ASCII-subset only operations on owned strings
///
/// Both methods take `self` by value and hand back a value of the same
/// type. In this file the trait is implemented for `String` and `Vec<u8>`.
#[experimental = "would prefer to do this in a more general way"]
pub trait OwnedAsciiExt {
    /// Consume the string and return it converted to ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    fn into_ascii_uppercase(self) -> Self;

    /// Consume the string and return it converted to ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    fn into_ascii_lowercase(self) -> Self;
}
  39
/// Extension methods for ASCII-subset only operations on string slices
///
/// The type parameter `T` is the type returned by the case-conversion
/// methods and defaults to `Self`. In this file the trait is implemented
/// for `str` (with `T = String`), `[u8]` (with `T = Vec<u8>`), `u8` and
/// `char`.
#[experimental = "would prefer to do this in a more general way"]
pub trait AsciiExt<T = Self> for Sized? {
    /// Check if within the ASCII range.
    fn is_ascii(&self) -> bool;

    /// Makes a copy of the string in ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    fn to_ascii_uppercase(&self) -> T;

    /// Makes a copy of the string in ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    fn to_ascii_lowercase(&self) -> T;

    /// Check that two strings are an ASCII case-insensitive match.
    /// Same as `a.to_ascii_lowercase() == b.to_ascii_lowercase()`,
    /// but without allocating and copying temporary strings.
    fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
}
  61
  62 #[experimental = "would prefer to do this in a more general way"]
  63 impl AsciiExt<String> for str {
  64     #[inline]
  65     fn is_ascii(&self) -> bool {
  66         self.bytes().all(|b| b.is_ascii())
  67     }
  68
  69     #[inline]
  70     fn to_ascii_uppercase(&self) -> String {
  71         // Vec<u8>::to_ascii_uppercase() preserves the UTF-8 invariant.
  72         unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
  73     }
  74
  75     #[inline]
  76     fn to_ascii_lowercase(&self) -> String {
  77         // Vec<u8>::to_ascii_lowercase() preserves the UTF-8 invariant.
  78         unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
  79     }
  80
  81     #[inline]
  82     fn eq_ignore_ascii_case(&self, other: &str) -> bool {
  83         self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
  84     }
  85 }
  86
  87 #[experimental = "would prefer to do this in a more general way"]
  88 impl OwnedAsciiExt for String {
  89     #[inline]
  90     fn into_ascii_uppercase(self) -> String {
  91         // Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant.
  92         unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
  93     }
  94
  95     #[inline]
  96     fn into_ascii_lowercase(self) -> String {
  97         // Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant.
  98         unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
  99     }
 100 }
 101
 102 #[experimental = "would prefer to do this in a more general way"]
 103 impl AsciiExt<Vec<u8>> for [u8] {
 104     #[inline]
 105     fn is_ascii(&self) -> bool {
 106         self.iter().all(|b| b.is_ascii())
 107     }
 108
 109     #[inline]
 110     fn to_ascii_uppercase(&self) -> Vec<u8> {
 111         self.iter().map(|b| b.to_ascii_uppercase()).collect()
 112     }
 113
 114     #[inline]
 115     fn to_ascii_lowercase(&self) -> Vec<u8> {
 116         self.iter().map(|b| b.to_ascii_lowercase()).collect()
 117     }
 118
 119     #[inline]
 120     fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
 121         self.len() == other.len() &&
 122         self.iter().zip(other.iter()).all(|(a, b)| {
 123             a.eq_ignore_ascii_case(b)
 124         })
 125     }
 126 }
 127
 128 #[experimental = "would prefer to do this in a more general way"]
 129 impl OwnedAsciiExt for Vec<u8> {
 130     #[inline]
 131     fn into_ascii_uppercase(mut self) -> Vec<u8> {
 132         for byte in self.iter_mut() {
 133             *byte = byte.to_ascii_uppercase();
 134         }
 135         self
 136     }
 137
 138     #[inline]
 139     fn into_ascii_lowercase(mut self) -> Vec<u8> {
 140         for byte in self.iter_mut() {
 141             *byte = byte.to_ascii_lowercase();
 142         }
 143         self
 144     }
 145 }
 146
 147 #[experimental = "would prefer to do this in a more general way"]
 148 impl AsciiExt for u8 {
 149     #[inline]
 150     fn is_ascii(&self) -> bool {
 151         *self & 128 == 0u8
 152     }
 153
 154     #[inline]
 155     fn to_ascii_uppercase(&self) -> u8 {
 156         ASCII_UPPERCASE_MAP[*self as uint]
 157     }
 158
 159     #[inline]
 160     fn to_ascii_lowercase(&self) -> u8 {
 161         ASCII_LOWERCASE_MAP[*self as uint]
 162     }
 163
 164     #[inline]
 165     fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
 166         self.to_ascii_lowercase() == other.to_ascii_lowercase()
 167     }
 168 }
 169
 170 #[experimental = "would prefer to do this in a more general way"]
 171 impl AsciiExt for char {
 172     #[inline]
 173     fn is_ascii(&self) -> bool {
 174         *self as u32 <= 0x7F
 175     }
 176
 177     #[inline]
 178     fn to_ascii_uppercase(&self) -> char {
 179         if self.is_ascii() {
 180             (*self as u8).to_ascii_uppercase() as char
 181         } else {
 182             *self
 183         }
 184     }
 185
 186     #[inline]
 187     fn to_ascii_lowercase(&self) -> char {
 188         if self.is_ascii() {
 189             (*self as u8).to_ascii_lowercase() as char
 190         } else {
 191             *self
 192         }
 193     }
 194
 195     #[inline]
 196     fn eq_ignore_ascii_case(&self, other: &char) -> bool {
 197         self.to_ascii_lowercase() == other.to_ascii_lowercase()
 198     }
 199 }
 200
 201 /// Returns a 'default' ASCII and C++11-like literal escape of a `u8`
 202 ///
 203 /// The default is chosen with a bias toward producing literals that are
 204 /// legal in a variety of languages, including C++11 and similar C-family
 205 /// languages. The exact rules are:
 206 ///
 207 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 208 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
 209 /// - Any other chars in the range [0x20,0x7e] are not escaped.
 210 /// - Any other chars are given hex escapes.
 211 /// - Unicode escapes are never generated by this function.
 212 #[unstable = "needs to be updated to use an iterator"]
 213 pub fn escape_default<F>(c: u8, mut f: F) where
 214     F: FnMut(u8),
 215 {
 216     match c {
 217         b'\t' => { f(b'\\'); f(b't'); }
 218         b'\r' => { f(b'\\'); f(b'r'); }
 219         b'\n' => { f(b'\\'); f(b'n'); }
 220         b'\\' => { f(b'\\'); f(b'\\'); }
 221         b'\'' => { f(b'\\'); f(b'\''); }
 222         b'"'  => { f(b'\\'); f(b'"'); }
 223         b'\x20' ... b'\x7e' => { f(c); }
 224         _ => {
 225             f(b'\\');
 226             f(b'x');
 227             for &offset in [4u, 0u].iter() {
 228                 match ((c as i32) >> offset) & 0xf {
 229                     i @ 0 ... 9 => f(b'0' + (i as u8)),
 230                     i => f(b'a' + (i as u8 - 10)),
 231                 }
 232             }
 233         }
 234     }
 235 }
 236
// Lookup table for ASCII lower-casing, indexed by byte value.
// The entries for b'A' to b'Z' hold b'a' to b'z'; every other entry is
// equal to its own index, so all other bytes (including everything from
// 0x80 upward) map to themselves.
static ASCII_LOWERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@',

    // Slots for b'A' to b'Z': the only non-identity entries.
          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z',

                      b'[', b'\\', b']', b'^', b'_',
    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
 275
// Lookup table for ASCII upper-casing, indexed by byte value.
// The entries for b'a' to b'z' hold b'A' to b'Z'; every other entry is
// equal to its own index, so all other bytes (including everything from
// 0x80 upward) map to themselves.
static ASCII_UPPERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
    b'`',

    // Slots for b'a' to b'z': the only non-identity entries.
          b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z',

                      b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
 314
 315
 316 #[cfg(test)]
 317 mod tests {
 318     use prelude::v1::*;
 319     use super::*;
 320     use char::from_u32;
 321
 322     #[test]
 323     fn test_ascii() {
 324         assert!("banana".chars().all(|c| c.is_ascii()));
 325         assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
 326     }
 327
 328     #[test]
 329     fn test_ascii_vec() {
 330         assert!("".is_ascii());
 331         assert!("a".is_ascii());
 332         assert!(!"\u{2009}".is_ascii());
 333
 334     }
 335
 336     #[test]
 337     fn test_to_ascii_uppercase() {
 338         assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
 339         assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");
 340
 341         let mut i = 0;
 342         while i <= 500 {
 343             let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
 344                         else { i };
 345             assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
 346                        (from_u32(upper).unwrap()).to_string());
 347             i += 1;
 348         }
 349     }
 350
 351     #[test]
 352     fn test_to_ascii_lowercase() {
 353         assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
 354         // Dotted capital I, Kelvin sign, Sharp S.
 355         assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");
 356
 357         let mut i = 0;
 358         while i <= 500 {
 359             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 360                         else { i };
 361             assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
 362                        (from_u32(lower).unwrap()).to_string());
 363             i += 1;
 364         }
 365     }
 366
 367     #[test]
 368     fn test_into_ascii_uppercase() {
 369         assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
 370                    "URL()URL()URL()üRL".to_string());
 371         assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");
 372
 373         let mut i = 0;
 374         while i <= 500 {
 375             let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
 376                         else { i };
 377             assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
 378                        (from_u32(upper).unwrap()).to_string());
 379             i += 1;
 380         }
 381     }
 382
 383     #[test]
 384     fn test_into_ascii_lowercase() {
 385         assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
 386                    "url()url()url()Ürl");
 387         // Dotted capital I, Kelvin sign, Sharp S.
 388         assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");
 389
 390         let mut i = 0;
 391         while i <= 500 {
 392             let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
 393                         else { i };
 394             assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
 395                        (from_u32(lower).unwrap()).to_string());
 396             i += 1;
 397         }
 398     }
 399
 400     #[test]
 401     fn test_eq_ignore_ascii_case() {
 402         assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
 403         assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
 404         // Dotted capital I, Kelvin sign, Sharp S.
 405         assert!("HİKß".eq_ignore_ascii_case("hİKß"));
 406         assert!(!"İ".eq_ignore_ascii_case("i"));
 407         assert!(!"K".eq_ignore_ascii_case("k"));
 408         assert!(!"ß".eq_ignore_ascii_case("s"));
 409
 410         let mut i = 0;
 411         while i <= 500 {
 412             let c = i;
 413             let lower = if 'A' as u32 <= c && c <= 'Z' as u32 { c + 'a' as u32 - 'A' as u32 }
 414                         else { c };
 415             assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
 416                     (from_u32(lower).unwrap()).to_string().as_slice()));
 417             i += 1;
 418         }
 419     }
 420 }