1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Operations on ASCII strings and characters
15 #![unstable(feature = "std_misc",
16 reason = "unsure about placement and naming")]
18 use iter::IteratorExt;
25 /// Extension methods for ASCII-subset only operations on owned strings
///
/// Each method takes `self` by value and returns `Self`, so the caller gives
/// up the original value and receives the converted one.
26 #[unstable(feature = "std_misc",
27 reason = "would prefer to do this in a more general way")]
28 pub trait OwnedAsciiExt {
29 /// Convert the string to ASCII upper case:
30 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
31 /// but non-ASCII letters are unchanged.
///
/// Consumes `self` and returns the converted value.
32 fn into_ascii_uppercase(self) -> Self;
34 /// Convert the string to ASCII lower case:
35 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
36 /// but non-ASCII letters are unchanged.
///
/// Consumes `self` and returns the converted value.
37 fn into_ascii_lowercase(self) -> Self;
40 /// Extension methods for ASCII-subset only operations on string slices
///
/// The type parameter `T` is the type returned by the copying conversions
/// (`to_ascii_uppercase` and `to_ascii_lowercase`); it defaults to `Self`.
41 #[unstable(feature = "std_misc",
42 reason = "would prefer to do this in a more general way")]
43 pub trait AsciiExt<T = Self> {
44 /// Check if within the ASCII range.
45 fn is_ascii(&self) -> bool;
47 /// Makes a copy of the string in ASCII upper case:
48 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
49 /// but non-ASCII letters are unchanged.
///
/// `self` is only borrowed; the converted copy is returned as a `T`.
50 fn to_ascii_uppercase(&self) -> T;
52 /// Makes a copy of the string in ASCII lower case:
53 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
54 /// but non-ASCII letters are unchanged.
///
/// `self` is only borrowed; the converted copy is returned as a `T`.
55 fn to_ascii_lowercase(&self) -> T;
57 /// Check that two strings are an ASCII case-insensitive match.
58 /// Same as `to_ascii_lowercase(a) == to_ascii_lowercase(b)`,
59 /// but without allocating and copying temporary strings.
///
/// Returns `true` when `self` and `other` match under that comparison.
60 fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
// Implementation for string slices: the copying conversions return an owned
// `String`, and every method is expressed in terms of the string's bytes.
63 #[unstable(feature = "std_misc",
64 reason = "would prefer to do this in a more general way")]
65 impl AsciiExt<String> for str {
// The string is ASCII exactly when every one of its bytes is.
67 fn is_ascii(&self) -> bool {
68 self.bytes().all(|b| b.is_ascii())
72 fn to_ascii_uppercase(&self) -> String {
73 // SAFETY: the byte-slice `to_ascii_uppercase()` preserves the UTF-8 invariant,
// so the `Vec<u8>` it returns can be turned back into a `String` unchecked.
74 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
78 fn to_ascii_lowercase(&self) -> String {
79 // SAFETY: the byte-slice `to_ascii_lowercase()` preserves the UTF-8 invariant,
// so the `Vec<u8>` it returns can be turned back into a `String` unchecked.
80 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
84 fn eq_ignore_ascii_case(&self, other: &str) -> bool {
// Compare the underlying bytes directly; no temporary strings are built.
85 self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
// Implementation for owned strings: the string is taken apart into its byte
// vector, converted through the `Vec<u8>` implementation, and rebuilt.
89 #[unstable(feature = "std_misc",
90 reason = "would prefer to do this in a more general way")]
91 impl OwnedAsciiExt for String {
93 fn into_ascii_uppercase(self) -> String {
94 // SAFETY: Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant,
// so rebuilding the `String` without re-validation is sound.
95 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
99 fn into_ascii_lowercase(self) -> String {
100 // SAFETY: Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant,
// so rebuilding the `String` without re-validation is sound.
101 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
// Implementation for byte slices: each method applies the per-byte `u8`
// operation to every element; the copying conversions collect into a new
// `Vec<u8>`.
105 #[unstable(feature = "std_misc",
106 reason = "would prefer to do this in a more general way")]
107 impl AsciiExt<Vec<u8>> for [u8] {
109 fn is_ascii(&self) -> bool {
// True when every byte is ASCII (and therefore also for an empty slice).
110 self.iter().all(|b| b.is_ascii())
114 fn to_ascii_uppercase(&self) -> Vec<u8> {
// Convert byte by byte and gather the results into a fresh vector.
115 self.iter().map(|b| b.to_ascii_uppercase()).collect()
119 fn to_ascii_lowercase(&self) -> Vec<u8> {
// Convert byte by byte and gather the results into a fresh vector.
120 self.iter().map(|b| b.to_ascii_lowercase()).collect()
124 fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
// Lengths are compared first: `zip` stops at the end of the shorter
// slice, so without this check a proper prefix would compare as equal.
125 self.len() == other.len() &&
126 self.iter().zip(other.iter()).all(|(a, b)| {
127 a.eq_ignore_ascii_case(b)
// Implementation for owned byte vectors: the vector is taken by value as
// `mut self` and its bytes are overwritten one by one.
132 #[unstable(feature = "std_misc",
133 reason = "would prefer to do this in a more general way")]
134 impl OwnedAsciiExt for Vec<u8> {
136 fn into_ascii_uppercase(mut self) -> Vec<u8> {
// Replace every byte with its ASCII upper-case form, in place.
137 for byte in self.iter_mut() {
138 *byte = byte.to_ascii_uppercase();
144 fn into_ascii_lowercase(mut self) -> Vec<u8> {
// Replace every byte with its ASCII lower-case form, in place.
145 for byte in self.iter_mut() {
146 *byte = byte.to_ascii_lowercase();
// Implementation for single bytes. `T` is left at its default, so the
// conversions return a `u8`.
152 #[unstable(feature = "std_misc",
153 reason = "would prefer to do this in a more general way")]
154 impl AsciiExt for u8 {
156 fn is_ascii(&self) -> bool {
161 fn to_ascii_uppercase(&self) -> u8 {
// Table lookup; a `u8` index always falls inside the 256-entry map.
162 ASCII_UPPERCASE_MAP[*self as uint]
166 fn to_ascii_lowercase(&self) -> u8 {
// Table lookup; a `u8` index always falls inside the 256-entry map.
167 ASCII_LOWERCASE_MAP[*self as uint]
171 fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
// Two bytes match when their ASCII lower-case forms are equal.
172 self.to_ascii_lowercase() == other.to_ascii_lowercase()
// Implementation for `char`. The conversions go through the `u8`
// implementation by casting the character to a byte and back.
176 #[unstable(feature = "std_misc",
177 reason = "would prefer to do this in a more general way")]
178 impl AsciiExt for char {
180 fn is_ascii(&self) -> bool {
185 fn to_ascii_uppercase(&self) -> char {
// NOTE(review): `as u8` keeps only the low 8 bits of the code point, so this
// expression is only correct for ASCII input — confirm it is reached only
// after an ASCII check.
187 (*self as u8).to_ascii_uppercase() as char
194 fn to_ascii_lowercase(&self) -> char {
// NOTE(review): same truncating cast as in `to_ascii_uppercase` — confirm it
// is reached only after an ASCII check.
196 (*self as u8).to_ascii_lowercase() as char
203 fn eq_ignore_ascii_case(&self, other: &char) -> bool {
// Two characters match when their ASCII lower-case forms are equal.
204 self.to_ascii_lowercase() == other.to_ascii_lowercase()
208 /// Returns a 'default' ASCII and C++11-like literal escape of a `u8`
///
/// The escape is not returned as a value: each byte of the escaped form is
/// passed to the callback `f`, in output order.
///
210 /// The default is chosen with a bias toward producing literals that are
211 /// legal in a variety of languages, including C++11 and similar C-family
212 /// languages. The exact rules are:
214 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
215 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
216 /// - Any other chars in the range [0x20,0x7e] are not escaped.
217 /// - Any other chars are given hex escapes.
218 /// - Unicode escapes are never generated by this function.
219 #[unstable(feature = "std_misc",
220 reason = "needs to be updated to use an iterator")]
221 pub fn escape_default<F>(c: u8, mut f: F) where
// Bytes with a dedicated two-byte escape: a backslash followed by a marker byte.
225 b'\t' => { f(b'\\'); f(b't'); }
226 b'\r' => { f(b'\\'); f(b'r'); }
227 b'\n' => { f(b'\\'); f(b'n'); }
228 b'\\' => { f(b'\\'); f(b'\\'); }
229 b'\'' => { f(b'\\'); f(b'\''); }
230 b'"' => { f(b'\\'); f(b'"'); }
// Any other byte in 0x20..=0x7e is emitted unchanged.
231 b'\x20' ... b'\x7e' => { f(c); }
// Hex digits: the high nibble first (shift by 4), then the low nibble
// (shift by 0); values 10 through 15 are written as lowercase 'a'..'f'.
235 for &offset in [4u, 0u].iter() {
236 match ((c as i32) >> offset) & 0xf {
237 i @ 0 ... 9 => f(b'0' + (i as u8)),
238 i => f(b'a' + (i as u8 - 10)),
// Lookup table for ASCII lower-casing, indexed by byte value; it is read by
// `to_ascii_lowercase` on `u8`. Entries in the uppercase-letter positions hold
// the matching lowercase letter; the other entries listed hold the byte itself.
245 static ASCII_LOWERCASE_MAP: [u8; 256] = [
246 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
247 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
248 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
249 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
250 b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
251 b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
252 b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
253 b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
// Uppercase-letter positions: filled with lowercase letters.
256 b'a', b'b', b'c', b'd', b'e', b'f', b'g',
257 b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
258 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
261 b'[', b'\\', b']', b'^', b'_',
// Lowercase-letter positions: unchanged.
262 b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
263 b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
264 b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
265 b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
// Bytes 0x80 and above are outside ASCII and map to themselves.
266 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
267 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
268 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
269 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
270 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
271 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
272 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
273 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
274 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
275 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
276 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
277 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
278 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
279 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
280 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
281 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
// Lookup table for ASCII upper-casing, indexed by byte value; it is read by
// `to_ascii_uppercase` on `u8`. Entries in the lowercase-letter positions hold
// the matching uppercase letter; the other entries listed hold the byte itself.
284 static ASCII_UPPERCASE_MAP: [u8; 256] = [
285 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
286 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
287 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
288 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
289 b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
290 b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
291 b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
292 b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
// Uppercase-letter positions: unchanged.
293 b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
294 b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
295 b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
296 b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
// Lowercase-letter positions: filled with uppercase letters.
299 b'A', b'B', b'C', b'D', b'E', b'F', b'G',
300 b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
301 b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
304 b'{', b'|', b'}', b'~', 0x7f,
// Bytes 0x80 and above are outside ASCII and map to themselves.
305 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
306 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
307 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
308 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
309 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
310 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
311 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
312 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
313 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
314 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
315 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
316 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
317 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
318 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
319 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
320 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
332 assert!("banana".chars().all(|c| c.is_ascii()));
333 assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
// Checks `is_ascii` on whole string slices.
337 fn test_ascii_vec() {
// The empty string has no bytes, so it counts as ASCII.
338 assert!("".is_ascii());
339 assert!("a".is_ascii());
// U+2009 lies outside the ASCII range.
340 assert!(!"\u{2009}".is_ascii());
// Checks the copying upper-case conversion on string slices.
345 fn test_to_ascii_uppercase() {
// Only the ASCII letters change; 'ü' comes back as it went in.
346 assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
// Non-ASCII characters in the input must come back unchanged.
347 assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");
// Expected value for code point `i`: 'a'..='z' is shifted up to 'A'..='Z'.
351 let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
353 assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
354 (from_u32(upper).unwrap()).to_string());
// Checks the copying lower-case conversion on string slices.
360 fn test_to_ascii_lowercase() {
// Only the ASCII letters change; 'Ü' comes back as it went in.
361 assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
362 // Dotted capital I, Kelvin sign, Sharp S.
363 assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");
// Expected value for code point `i`: 'A'..='Z' is shifted down to 'a'..='z'.
367 let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
369 assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
370 (from_u32(lower).unwrap()).to_string());
// Checks the consuming upper-case conversion on owned strings.
376 fn test_into_ascii_uppercase() {
// Only the ASCII letters change; 'ü' comes back as it went in.
377 assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
378 "URL()URL()URL()üRL".to_string());
// Non-ASCII characters in the input must come back unchanged.
379 assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");
// Expected value for code point `i`: 'a'..='z' is shifted up to 'A'..='Z'.
383 let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
385 assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
386 (from_u32(upper).unwrap()).to_string());
// Checks the consuming lower-case conversion on owned strings.
392 fn test_into_ascii_lowercase() {
// Only the ASCII letters change; 'Ü' comes back as it went in.
393 assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
394 "url()url()url()Ürl");
395 // Dotted capital I, Kelvin sign, Sharp S.
396 assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");
// Expected value for code point `i`: 'A'..='Z' is shifted down to 'a'..='z'.
400 let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
402 assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
403 (from_u32(lower).unwrap()).to_string());
// Checks ASCII case-insensitive comparison on string slices.
409 fn test_eq_ignore_ascii_case() {
// ASCII letters match regardless of case; the identical 'Ü' on both sides matches too.
410 assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
// Case differences between non-ASCII letters are not ignored.
411 assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
412 // Dotted capital I, Kelvin sign, Sharp S.
413 assert!("HİKß".eq_ignore_ascii_case("hİKß"));
// Non-ASCII characters never match an ASCII letter.
414 assert!(!"İ".eq_ignore_ascii_case("i"));
415 assert!(!"K".eq_ignore_ascii_case("k"));
416 assert!(!"ß".eq_ignore_ascii_case("s"));
// Expected partner for code point `c`: 'A'..='Z' is shifted down to 'a'..='z'.
// NOTE(review): `i` and `c` are bound outside the lines shown here — confirm
// they refer to the same code point.
421 let lower = if 'A' as u32 <= c && c <= 'Z' as u32 { c + 'a' as u32 - 'A' as u32 }
423 assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
424 (from_u32(lower).unwrap()).to_string().as_slice()));