1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Operations on ASCII strings and characters
15 #![unstable = "unsure about placement and naming"]
18 use core::kinds::Sized;
19 use iter::IteratorExt;
26 /// Extension methods for ASCII-subset only operations on owned strings
27 #[experimental = "would prefer to do this in a more general way"]
pub trait OwnedAsciiExt {
    /// Converts the value to ASCII upper case, consuming it.
    ///
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    fn into_ascii_uppercase(self) -> Self;

    /// Converts the value to ASCII lower case, consuming it.
    ///
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    fn into_ascii_lowercase(self) -> Self;
}
40 /// Extension methods for ASCII-subset only operations on string slices
41 #[experimental = "would prefer to do this in a more general way"]
42 pub trait AsciiExt<T = Self> for Sized? {
43 /// Check if within the ASCII range.
44 fn is_ascii(&self) -> bool;
46 /// Makes a copy of the string in ASCII upper case:
47 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
48 /// but non-ASCII letters are unchanged.
49 fn to_ascii_uppercase(&self) -> T;
51 /// Makes a copy of the string in ASCII lower case:
52 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
53 /// but non-ASCII letters are unchanged.
54 fn to_ascii_lowercase(&self) -> T;
56 /// Check that two strings are an ASCII case-insensitive match.
57 /// Same as `to_ascii_lowercase(a) == to_ascii_lower(b)`,
58 /// but without allocating and copying temporary strings.
59 fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
62 #[experimental = "would prefer to do this in a more general way"]
63 impl AsciiExt<String> for str {
65 fn is_ascii(&self) -> bool {
66 self.bytes().all(|b| b.is_ascii())
70 fn to_ascii_uppercase(&self) -> String {
71 // Vec<u8>::to_ascii_uppercase() preserves the UTF-8 invariant.
72 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
76 fn to_ascii_lowercase(&self) -> String {
77 // Vec<u8>::to_ascii_lowercase() preserves the UTF-8 invariant.
78 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
82 fn eq_ignore_ascii_case(&self, other: &str) -> bool {
83 self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
87 #[experimental = "would prefer to do this in a more general way"]
88 impl OwnedAsciiExt for String {
90 fn into_ascii_uppercase(self) -> String {
91 // Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant.
92 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
96 fn into_ascii_lowercase(self) -> String {
97 // Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant.
98 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
102 #[experimental = "would prefer to do this in a more general way"]
103 impl AsciiExt<Vec<u8>> for [u8] {
105 fn is_ascii(&self) -> bool {
106 self.iter().all(|b| b.is_ascii())
110 fn to_ascii_uppercase(&self) -> Vec<u8> {
111 self.iter().map(|b| b.to_ascii_uppercase()).collect()
115 fn to_ascii_lowercase(&self) -> Vec<u8> {
116 self.iter().map(|b| b.to_ascii_lowercase()).collect()
120 fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
121 self.len() == other.len() &&
122 self.iter().zip(other.iter()).all(|(a, b)| {
123 a.eq_ignore_ascii_case(b)
128 #[experimental = "would prefer to do this in a more general way"]
129 impl OwnedAsciiExt for Vec<u8> {
131 fn into_ascii_uppercase(mut self) -> Vec<u8> {
132 for byte in self.iter_mut() {
133 *byte = byte.to_ascii_uppercase();
139 fn into_ascii_lowercase(mut self) -> Vec<u8> {
140 for byte in self.iter_mut() {
141 *byte = byte.to_ascii_lowercase();
147 #[experimental = "would prefer to do this in a more general way"]
148 impl AsciiExt for u8 {
150 fn is_ascii(&self) -> bool {
155 fn to_ascii_uppercase(&self) -> u8 {
156 ASCII_UPPERCASE_MAP[*self as uint]
160 fn to_ascii_lowercase(&self) -> u8 {
161 ASCII_LOWERCASE_MAP[*self as uint]
165 fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
166 self.to_ascii_lowercase() == other.to_ascii_lowercase()
170 #[experimental = "would prefer to do this in a more general way"]
171 impl AsciiExt for char {
173 fn is_ascii(&self) -> bool {
178 fn to_ascii_uppercase(&self) -> char {
180 (*self as u8).to_ascii_uppercase() as char
187 fn to_ascii_lowercase(&self) -> char {
189 (*self as u8).to_ascii_lowercase() as char
196 fn eq_ignore_ascii_case(&self, other: &char) -> bool {
197 self.to_ascii_lowercase() == other.to_ascii_lowercase()
201 /// Returns a 'default' ASCII and C++11-like literal escape of a `u8`
203 /// The default is chosen with a bias toward producing literals that are
204 /// legal in a variety of languages, including C++11 and similar C-family
205 /// languages. The exact rules are:
207 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
208 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
209 /// - Any other chars in the range [0x20,0x7e] are not escaped.
210 /// - Any other chars are given hex escapes.
211 /// - Unicode escapes are never generated by this function.
212 #[unstable = "needs to be updated to use an iterator"]
///
/// `c` is the byte to escape. `f` is called once for every byte of the
/// escaped form, in output order.
pub fn escape_default<F>(c: u8, mut f: F) where
    F: FnMut(u8),
{
    match c {
        b'\t' => { f(b'\\'); f(b't'); }
        b'\r' => { f(b'\\'); f(b'r'); }
        b'\n' => { f(b'\\'); f(b'n'); }
        b'\\' => { f(b'\\'); f(b'\\'); }
        b'\'' => { f(b'\\'); f(b'\''); }
        b'"' => { f(b'\\'); f(b'"'); }
        // Remaining printable ASCII passes through unescaped; the quote and
        // backslash bytes in this range were already handled above.
        _ if c >= 0x20 && c <= 0x7e => { f(c); }
        _ => {
            // NOTE(review): the `\x` prefix lines were absent from the
            // reviewed text; restored from the documented "hex escapes" rule.
            f(b'\\');
            f(b'x');
            // High nibble first, then low nibble, as lowercase hex digits.
            for &nibble in [c >> 4, c & 0xf].iter() {
                f(if nibble < 10 { b'0' + nibble } else { b'a' + (nibble - 10) });
            }
        }
    }
}
// Lookup table indexed by byte value. Entry `i` is `i` itself, except that
// the ASCII upper-case letters (0x41-0x5a) hold their lower-case forms.
static ASCII_LOWERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@',

          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z',

                      b'[', b'\\', b']', b'^', b'_',
    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
// Lookup table indexed by byte value. Entry `i` is `i` itself, except that
// the ASCII lower-case letters (0x61-0x7a) hold their upper-case forms.
static ASCII_UPPERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
    b'`',

          b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z',

                      b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
324 assert!("banana".chars().all(|c| c.is_ascii()));
325 assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
// Checks `is_ascii` on whole string slices: the empty string and "a" are
// ASCII, while U+2009 is not.
// NOTE(review): no `#[test]` attribute or closing brace for this function is
// present in the reviewed text — confirm against the complete file.
329 fn test_ascii_vec() {
330 assert!("".is_ascii());
331 assert!("a".is_ascii());
332 assert!(!"\u{2009}".is_ascii());
// Checks `to_ascii_uppercase` on string slices: ASCII letters are
// upper-cased while the non-ASCII letters in the literals stay unchanged.
337 fn test_to_ascii_uppercase() {
338 assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
339 assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");
// NOTE(review): the loop that binds `i` and the `else` branch of this `if`
// are not present in the reviewed text — confirm against the complete file.
// For 'a'..'z' the expected code point is shifted into 'A'..'Z'.
343 let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
345 assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
346 (from_u32(upper).unwrap()).to_string());
// Checks `to_ascii_lowercase` on string slices: ASCII letters are
// lower-cased while the non-ASCII letters in the literals stay unchanged.
352 fn test_to_ascii_lowercase() {
353 assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
354 // Dotted capital I, Kelvin sign, Sharp S.
355 assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");
// NOTE(review): the loop that binds `i` and the `else` branch of this `if`
// are not present in the reviewed text — confirm against the complete file.
// For 'A'..'Z' the expected code point is shifted into 'a'..'z'.
359 let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
361 assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
362 (from_u32(lower).unwrap()).to_string());
// Checks the consuming `into_ascii_uppercase` on owned strings against the
// same inputs used for the borrowing `to_ascii_uppercase` test.
368 fn test_into_ascii_uppercase() {
369 assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
370 "URL()URL()URL()üRL".to_string());
371 assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");
// NOTE(review): the loop that binds `i` and the `else` branch of this `if`
// are not present in the reviewed text — confirm against the complete file.
375 let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
377 assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
378 (from_u32(upper).unwrap()).to_string());
// Checks the consuming `into_ascii_lowercase` on owned strings against the
// same inputs used for the borrowing `to_ascii_lowercase` test.
384 fn test_into_ascii_lowercase() {
385 assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
386 "url()url()url()Ürl");
387 // Dotted capital I, Kelvin sign, Sharp S.
388 assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");
// NOTE(review): the loop that binds `i` and the `else` branch of this `if`
// are not present in the reviewed text — confirm against the complete file.
392 let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
394 assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
395 (from_u32(lower).unwrap()).to_string());
// Checks `eq_ignore_ascii_case` on string slices: strings differing only in
// ASCII case match, while non-ASCII letters are never folded onto another
// letter.
401 fn test_eq_ignore_ascii_case() {
402 assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
403 assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
404 // Dotted capital I, Kelvin sign, Sharp S.
405 assert!("HİKß".eq_ignore_ascii_case("hİKß"));
406 assert!(!"İ".eq_ignore_ascii_case("i"));
407 assert!(!"K".eq_ignore_ascii_case("k"));
408 assert!(!"ß".eq_ignore_ascii_case("s"));
// NOTE(review): the loop that binds `i`, the binding of `c`, and the `else`
// branch of this `if` are not present in the reviewed text — confirm against
// the complete file.
413 let lower = if 'A' as u32 <= c && c <= 'Z' as u32 { c + 'a' as u32 - 'A' as u32 }
415 assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
416 (from_u32(lower).unwrap()).to_string().as_slice()));