1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Operations on ASCII strings and characters
15 #![unstable = "unsure about placement and naming"]
17 use iter::IteratorExt;
24 /// Extension methods for ASCII-subset only operations on owned strings
25 #[unstable = "would prefer to do this in a more general way"]
26 pub trait OwnedAsciiExt {
27 /// Convert the string to ASCII upper case:
28 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
29 /// but non-ASCII letters are unchanged.
30 fn into_ascii_uppercase(self) -> Self;
32 /// Convert the string to ASCII lower case:
33 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
34 /// but non-ASCII letters are unchanged.
35 fn into_ascii_lowercase(self) -> Self;
38 /// Extension methods for ASCII-subset only operations on string slices
39 #[unstable = "would prefer to do this in a more general way"]
40 pub trait AsciiExt<T = Self> {
41 /// Check if within the ASCII range.
42 fn is_ascii(&self) -> bool;
44 /// Makes a copy of the string in ASCII upper case:
45 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
46 /// but non-ASCII letters are unchanged.
47 fn to_ascii_uppercase(&self) -> T;
49 /// Makes a copy of the string in ASCII lower case:
50 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
51 /// but non-ASCII letters are unchanged.
52 fn to_ascii_lowercase(&self) -> T;
54 /// Check that two strings are an ASCII case-insensitive match.
55 /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
56 /// but without allocating and copying temporary strings.
57 fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
60 #[unstable = "would prefer to do this in a more general way"]
61 impl AsciiExt<String> for str {
63 fn is_ascii(&self) -> bool {
64 self.bytes().all(|b| b.is_ascii())
68 fn to_ascii_uppercase(&self) -> String {
69 // Vec<u8>::to_ascii_uppercase() preserves the UTF-8 invariant.
70 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
74 fn to_ascii_lowercase(&self) -> String {
75 // Vec<u8>::to_ascii_lowercase() preserves the UTF-8 invariant.
76 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
80 fn eq_ignore_ascii_case(&self, other: &str) -> bool {
81 self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
85 #[unstable = "would prefer to do this in a more general way"]
86 impl OwnedAsciiExt for String {
88 fn into_ascii_uppercase(self) -> String {
89 // Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant.
90 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
94 fn into_ascii_lowercase(self) -> String {
95 // Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant.
96 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
100 #[unstable = "would prefer to do this in a more general way"]
101 impl AsciiExt<Vec<u8>> for [u8] {
103 fn is_ascii(&self) -> bool {
104 self.iter().all(|b| b.is_ascii())
108 fn to_ascii_uppercase(&self) -> Vec<u8> {
109 self.iter().map(|b| b.to_ascii_uppercase()).collect()
113 fn to_ascii_lowercase(&self) -> Vec<u8> {
114 self.iter().map(|b| b.to_ascii_lowercase()).collect()
118 fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
119 self.len() == other.len() &&
120 self.iter().zip(other.iter()).all(|(a, b)| {
121 a.eq_ignore_ascii_case(b)
126 #[unstable = "would prefer to do this in a more general way"]
127 impl OwnedAsciiExt for Vec<u8> {
129 fn into_ascii_uppercase(mut self) -> Vec<u8> {
130 for byte in self.iter_mut() {
131 *byte = byte.to_ascii_uppercase();
137 fn into_ascii_lowercase(mut self) -> Vec<u8> {
138 for byte in self.iter_mut() {
139 *byte = byte.to_ascii_lowercase();
145 #[unstable = "would prefer to do this in a more general way"]
146 impl AsciiExt for u8 {
148 fn is_ascii(&self) -> bool {
153 fn to_ascii_uppercase(&self) -> u8 {
154 ASCII_UPPERCASE_MAP[*self as uint]
158 fn to_ascii_lowercase(&self) -> u8 {
159 ASCII_LOWERCASE_MAP[*self as uint]
163 fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
164 self.to_ascii_lowercase() == other.to_ascii_lowercase()
168 #[unstable = "would prefer to do this in a more general way"]
169 impl AsciiExt for char {
171 fn is_ascii(&self) -> bool {
176 fn to_ascii_uppercase(&self) -> char {
178 (*self as u8).to_ascii_uppercase() as char
185 fn to_ascii_lowercase(&self) -> char {
187 (*self as u8).to_ascii_lowercase() as char
194 fn eq_ignore_ascii_case(&self, other: &char) -> bool {
195 self.to_ascii_lowercase() == other.to_ascii_lowercase()
199 /// Returns a 'default' ASCII and C++11-like literal escape of a `u8`
201 /// The default is chosen with a bias toward producing literals that are
202 /// legal in a variety of languages, including C++11 and similar C-family
203 /// languages. The exact rules are:
205 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
206 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
207 /// - Any other chars in the range [0x20,0x7e] are not escaped.
208 /// - Any other chars are given hex escapes.
209 /// - Unicode escapes are never generated by this function.
210 #[unstable = "needs to be updated to use an iterator"]
211 pub fn escape_default<F>(c: u8, mut f: F) where
215 b'\t' => { f(b'\\'); f(b't'); }
216 b'\r' => { f(b'\\'); f(b'r'); }
217 b'\n' => { f(b'\\'); f(b'n'); }
218 b'\\' => { f(b'\\'); f(b'\\'); }
219 b'\'' => { f(b'\\'); f(b'\''); }
220 b'"' => { f(b'\\'); f(b'"'); }
221 b'\x20' ... b'\x7e' => { f(c); }
225 for &offset in [4u, 0u].iter() {
226 match ((c as i32) >> offset) & 0xf {
227 i @ 0 ... 9 => f(b'0' + (i as u8)),
228 i => f(b'a' + (i as u8 - 10)),
// Lookup table mapping every byte value to its ASCII lower-case form.
// The slots for b'A' through b'Z' (0x41 to 0x5a) hold b'a' through b'z';
// every other slot holds its own index, so non-letters and bytes >= 0x80
// map to themselves.
static ASCII_LOWERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    // 0x40 to 0x5f: '@' is unchanged, 'A' to 'Z' are lower-cased.
    b'@', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'[', b'\\', b']', b'^', b'_',
    // 0x60 to 0x7f: already lower case or not a letter.
    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
    // 0x80 to 0xff: non-ASCII bytes map to themselves.
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
// Lookup table mapping every byte value to its ASCII upper-case form.
// The slots for b'a' through b'z' (0x61 to 0x7a) hold b'A' through b'Z';
// every other slot holds its own index, so non-letters and bytes >= 0x80
// map to themselves.
static ASCII_UPPERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    // 0x40 to 0x5f: already upper case or not a letter.
    b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
    // 0x60 to 0x7f: '`' is unchanged, 'a' to 'z' are upper-cased.
    b'`', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'{', b'|', b'}', b'~', 0x7f,
    // 0x80 to 0xff: non-ASCII bytes map to themselves.
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
322 assert!("banana".chars().all(|c| c.is_ascii()));
323 assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
// Checks `is_ascii` on whole string slices: the empty string and "a" are
// ASCII, while a string holding U+2009 is not.
327 fn test_ascii_vec() {
328 assert!("".is_ascii());
329 assert!("a".is_ascii());
330 assert!(!"\u{2009}".is_ascii());
// Exercises the copying `to_ascii_uppercase` on string slices: ASCII letters
// are upper-cased while the non-ASCII letters in the inputs are unchanged.
335 fn test_to_ascii_uppercase() {
336 assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
337 assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");
// NOTE(review): the statement introducing `i` (presumably a loop over code
// points) and the `else` branch of the `if` below are not visible in this
// listing; confirm against the full file.
// Expected code point: shifted into 'A'..'Z' when `i` is in 'a'..'z'.
341 let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
// Converting the single-char string must give the string of the expected char.
343 assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
344 (from_u32(upper).unwrap()).to_string());
// Exercises the copying `to_ascii_lowercase` on string slices: ASCII letters
// are lower-cased while the non-ASCII letters in the inputs are unchanged.
350 fn test_to_ascii_lowercase() {
351 assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
352 // Dotted capital I, Kelvin sign, Sharp S.
353 assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");
// NOTE(review): the statement introducing `i` (presumably a loop over code
// points) and the `else` branch of the `if` below are not visible in this
// listing; confirm against the full file.
// Expected code point: shifted into 'a'..'z' when `i` is in 'A'..'Z'.
357 let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
// Converting the single-char string must give the string of the expected char.
359 assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
360 (from_u32(lower).unwrap()).to_string());
// Exercises the consuming `into_ascii_uppercase` on owned strings: ASCII
// letters are upper-cased while the non-ASCII letters are unchanged.
366 fn test_into_ascii_uppercase() {
367 assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
368 "URL()URL()URL()üRL".to_string());
369 assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");
// NOTE(review): the statement introducing `i` (presumably a loop over code
// points) and the `else` branch of the `if` below are not visible in this
// listing; confirm against the full file.
// Expected code point: shifted into 'A'..'Z' when `i` is in 'a'..'z'.
373 let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
// Converting the single-char string must give the string of the expected char.
375 assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
376 (from_u32(upper).unwrap()).to_string());
// Exercises the consuming `into_ascii_lowercase` on owned strings: ASCII
// letters are lower-cased while the non-ASCII letters are unchanged.
382 fn test_into_ascii_lowercase() {
383 assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
384 "url()url()url()Ürl");
385 // Dotted capital I, Kelvin sign, Sharp S.
386 assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");
// NOTE(review): the statement introducing `i` (presumably a loop over code
// points) and the `else` branch of the `if` below are not visible in this
// listing; confirm against the full file.
// Expected code point: shifted into 'a'..'z' when `i` is in 'A'..'Z'.
390 let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
// Converting the single-char string must give the string of the expected char.
392 assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
393 (from_u32(lower).unwrap()).to_string());
// Exercises `eq_ignore_ascii_case` on string slices: strings differing only
// in ASCII letter case compare equal, while differences involving non-ASCII
// letters do not.
399 fn test_eq_ignore_ascii_case() {
400 assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
401 assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
402 // Dotted capital I, Kelvin sign, Sharp S.
403 assert!("HİKß".eq_ignore_ascii_case("hİKß"));
404 assert!(!"İ".eq_ignore_ascii_case("i"));
405 assert!(!"K".eq_ignore_ascii_case("k"));
406 assert!(!"ß".eq_ignore_ascii_case("s"));
// NOTE(review): the statements introducing `i` and `c` (presumably a loop
// over code points with `c` bound to the current one) and the `else` branch
// of the `if` below are not visible in this listing; confirm against the
// full file.
// Expected code point: shifted into 'a'..'z' when `c` is in 'A'..'Z'.
411 let lower = if 'A' as u32 <= c && c <= 'Z' as u32 { c + 'a' as u32 - 'A' as u32 }
// The single-char string must match the string of its lower-cased form.
413 assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
414 (from_u32(lower).unwrap()).to_string().as_slice()));