1 // Copyright 2013-2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // ignore-lexer-test FIXME #15679
13 //! Operations on ASCII strings and characters
15 #![unstable = "unsure about placement and naming"]
use core::kinds::Sized;
use iter::IteratorExt;
use ops::FnMut;
use slice::SliceExt;
use str::StrExt;
use string::String;
use vec::Vec;
25 /// Extension methods for ASCII-subset only operations on owned strings
26 #[experimental = "would prefer to do this in a more general way"]
pub trait OwnedAsciiExt {
    /// Consumes the string and returns it converted to ASCII upper case:
    /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
    /// but non-ASCII letters are unchanged.
    fn into_ascii_uppercase(self) -> Self;

    /// Consumes the string and returns it converted to ASCII lower case:
    /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
    /// but non-ASCII letters are unchanged.
    fn into_ascii_lowercase(self) -> Self;
}
39 /// Extension methods for ASCII-subset only operations on string slices
40 #[experimental = "would prefer to do this in a more general way"]
41 pub trait AsciiExt<T = Self> for Sized? {
42 /// Check if within the ASCII range.
43 fn is_ascii(&self) -> bool;
45 /// Makes a copy of the string in ASCII upper case:
46 /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z',
47 /// but non-ASCII letters are unchanged.
48 fn to_ascii_uppercase(&self) -> T;
50 /// Makes a copy of the string in ASCII lower case:
51 /// ASCII letters 'A' to 'Z' are mapped to 'a' to 'z',
52 /// but non-ASCII letters are unchanged.
53 fn to_ascii_lowercase(&self) -> T;
55 /// Check that two strings are an ASCII case-insensitive match.
56 /// Same as `to_ascii_lowercase(a) == to_ascii_lower(b)`,
57 /// but without allocating and copying temporary strings.
58 fn eq_ignore_ascii_case(&self, other: &Self) -> bool;
61 #[experimental = "would prefer to do this in a more general way"]
62 impl AsciiExt<String> for str {
64 fn is_ascii(&self) -> bool {
65 self.bytes().all(|b| b.is_ascii())
69 fn to_ascii_uppercase(&self) -> String {
70 // Vec<u8>::to_ascii_uppercase() preserves the UTF-8 invariant.
71 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_uppercase()) }
75 fn to_ascii_lowercase(&self) -> String {
76 // Vec<u8>::to_ascii_lowercase() preserves the UTF-8 invariant.
77 unsafe { String::from_utf8_unchecked(self.as_bytes().to_ascii_lowercase()) }
81 fn eq_ignore_ascii_case(&self, other: &str) -> bool {
82 self.as_bytes().eq_ignore_ascii_case(other.as_bytes())
86 #[experimental = "would prefer to do this in a more general way"]
87 impl OwnedAsciiExt for String {
89 fn into_ascii_uppercase(self) -> String {
90 // Vec<u8>::into_ascii_uppercase() preserves the UTF-8 invariant.
91 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_uppercase()) }
95 fn into_ascii_lowercase(self) -> String {
96 // Vec<u8>::into_ascii_lowercase() preserves the UTF-8 invariant.
97 unsafe { String::from_utf8_unchecked(self.into_bytes().into_ascii_lowercase()) }
101 #[experimental = "would prefer to do this in a more general way"]
102 impl AsciiExt<Vec<u8>> for [u8] {
104 fn is_ascii(&self) -> bool {
105 self.iter().all(|b| b.is_ascii())
109 fn to_ascii_uppercase(&self) -> Vec<u8> {
110 self.iter().map(|b| b.to_ascii_uppercase()).collect()
114 fn to_ascii_lowercase(&self) -> Vec<u8> {
115 self.iter().map(|b| b.to_ascii_lowercase()).collect()
119 fn eq_ignore_ascii_case(&self, other: &[u8]) -> bool {
120 self.len() == other.len() &&
121 self.iter().zip(other.iter()).all(|(a, b)| {
122 a.eq_ignore_ascii_case(b)
127 #[experimental = "would prefer to do this in a more general way"]
128 impl OwnedAsciiExt for Vec<u8> {
130 fn into_ascii_uppercase(mut self) -> Vec<u8> {
131 for byte in self.iter_mut() {
132 *byte = byte.to_ascii_uppercase();
138 fn into_ascii_lowercase(mut self) -> Vec<u8> {
139 for byte in self.iter_mut() {
140 *byte = byte.to_ascii_lowercase();
146 #[experimental = "would prefer to do this in a more general way"]
147 impl AsciiExt for u8 {
149 fn is_ascii(&self) -> bool {
154 fn to_ascii_uppercase(&self) -> u8 {
155 ASCII_UPPERCASE_MAP[*self as uint]
159 fn to_ascii_lowercase(&self) -> u8 {
160 ASCII_LOWERCASE_MAP[*self as uint]
164 fn eq_ignore_ascii_case(&self, other: &u8) -> bool {
165 self.to_ascii_lowercase() == other.to_ascii_lowercase()
169 #[experimental = "would prefer to do this in a more general way"]
170 impl AsciiExt for char {
172 fn is_ascii(&self) -> bool {
177 fn to_ascii_uppercase(&self) -> char {
179 (*self as u8).to_ascii_uppercase() as char
186 fn to_ascii_lowercase(&self) -> char {
188 (*self as u8).to_ascii_lowercase() as char
195 fn eq_ignore_ascii_case(&self, other: &char) -> bool {
196 self.to_ascii_lowercase() == other.to_ascii_lowercase()
/// Returns a 'default' ASCII and C++11-like literal escape of a `u8`.
/// The bytes of the escape are passed, one at a time, to `f`.
///
/// The default is chosen with a bias toward producing literals that are
/// legal in a variety of languages, including C++11 and similar C-family
/// languages. The exact rules are:
///
/// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
/// - Single-quote, double-quote and backslash chars are backslash-escaped.
/// - Any other chars in the range [0x20,0x7e] are not escaped.
/// - Any other chars are given hex escapes.
/// - Unicode escapes are never generated by this function.
211 #[unstable = "needs to be updated to use an iterator"]
pub fn escape_default<F>(c: u8, mut f: F) where
    F: FnMut(u8),
{
    // Each arm feeds the bytes of the escape to `f` in output order.
    match c {
        b'\t' => { f(b'\\'); f(b't'); }
        b'\r' => { f(b'\\'); f(b'r'); }
        b'\n' => { f(b'\\'); f(b'n'); }
        // Backslash and both quote characters are escaped as themselves.
        b'\\' | b'\'' | b'"' => { f(b'\\'); f(c); }
        // Remaining printable ASCII (space through tilde) passes through.
        _ if b' ' <= c && c <= b'~' => { f(c); }
        // Everything else becomes `\x` followed by two lower-case hex digits,
        // high nibble first.
        _ => {
            f(b'\\');
            f(b'x');
            for &nibble in [c >> 4, c & 0xf].iter() {
                f(if nibble < 10 { b'0' + nibble } else { b'a' + (nibble - 10) });
            }
        }
    }
}
// Lookup table indexed by byte value: the upper-case ASCII letters
// (0x41..0x5a) map to their lower-case counterparts, every other byte maps
// to itself. The table must hold exactly 256 entries so that any `u8` is a
// valid index.
static ASCII_LOWERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@',

          // 'A'..'Z' folded to lower case.
          b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z',

                      b'[', b'\\', b']', b'^', b'_',
    b'`', b'a', b'b', b'c', b'd', b'e', b'f', b'g',
    b'h', b'i', b'j', b'k', b'l', b'm', b'n', b'o',
    b'p', b'q', b'r', b's', b't', b'u', b'v', b'w',
    b'x', b'y', b'z', b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
// Lookup table indexed by byte value: the lower-case ASCII letters
// (0x61..0x7a) map to their upper-case counterparts, every other byte maps
// to itself. The table must hold exactly 256 entries so that any `u8` is a
// valid index.
static ASCII_UPPERCASE_MAP: [u8; 256] = [
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
    b' ', b'!', b'"', b'#', b'$', b'%', b'&', b'\'',
    b'(', b')', b'*', b'+', b',', b'-', b'.', b'/',
    b'0', b'1', b'2', b'3', b'4', b'5', b'6', b'7',
    b'8', b'9', b':', b';', b'<', b'=', b'>', b'?',
    b'@', b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z', b'[', b'\\', b']', b'^', b'_',
    b'`',

          // 'a'..'z' folded to upper case.
          b'A', b'B', b'C', b'D', b'E', b'F', b'G',
    b'H', b'I', b'J', b'K', b'L', b'M', b'N', b'O',
    b'P', b'Q', b'R', b'S', b'T', b'U', b'V', b'W',
    b'X', b'Y', b'Z',

                      b'{', b'|', b'}', b'~', 0x7f,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
    0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
    0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
    0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
    0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
    0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
    0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
    0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
    0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
    0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
    0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
    0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
    0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
];
#[cfg(test)]
mod tests {
    use prelude::v1::*;
    use super::*;
    use char::from_u32;

    #[test]
    fn test_ascii() {
        assert!("banana".chars().all(|c| c.is_ascii()));
        assert!(!"ประเทศไทย中华Việt Nam".chars().all(|c| c.is_ascii()));
    }

    #[test]
    fn test_ascii_vec() {
        assert!("".is_ascii());
        assert!("a".is_ascii());
        assert!(!"\u{2009}".is_ascii());
    }

    #[test]
    fn test_to_ascii_uppercase() {
        assert_eq!("url()URL()uRl()ürl".to_ascii_uppercase(), "URL()URL()URL()üRL");
        assert_eq!("hıKß".to_ascii_uppercase(), "HıKß");

        // Code points 0..500 cover all of ASCII plus a stretch of non-ASCII
        // characters that must come back unchanged.
        let mut i = 0;
        while i <= 500 {
            let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
                        else { i };
            assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_uppercase(),
                       (from_u32(upper).unwrap()).to_string());
            i += 1;
        }
    }

    #[test]
    fn test_to_ascii_lowercase() {
        assert_eq!("url()URL()uRl()Ürl".to_ascii_lowercase(), "url()url()url()Ürl");
        // Dotted capital I, Kelvin sign, Sharp S.
        assert_eq!("HİKß".to_ascii_lowercase(), "hİKß");

        let mut i = 0;
        while i <= 500 {
            let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
                        else { i };
            assert_eq!((from_u32(i).unwrap()).to_string().to_ascii_lowercase(),
                       (from_u32(lower).unwrap()).to_string());
            i += 1;
        }
    }

    #[test]
    fn test_into_ascii_uppercase() {
        assert_eq!(("url()URL()uRl()ürl".to_string()).into_ascii_uppercase(),
                   "URL()URL()URL()üRL".to_string());
        assert_eq!(("hıKß".to_string()).into_ascii_uppercase(), "HıKß");

        let mut i = 0;
        while i <= 500 {
            let upper = if 'a' as u32 <= i && i <= 'z' as u32 { i + 'A' as u32 - 'a' as u32 }
                        else { i };
            assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_uppercase(),
                       (from_u32(upper).unwrap()).to_string());
            i += 1;
        }
    }

    #[test]
    fn test_into_ascii_lowercase() {
        assert_eq!(("url()URL()uRl()Ürl".to_string()).into_ascii_lowercase(),
                   "url()url()url()Ürl");
        // Dotted capital I, Kelvin sign, Sharp S.
        assert_eq!(("HİKß".to_string()).into_ascii_lowercase(), "hİKß");

        let mut i = 0;
        while i <= 500 {
            let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
                        else { i };
            assert_eq!((from_u32(i).unwrap()).to_string().into_ascii_lowercase(),
                       (from_u32(lower).unwrap()).to_string());
            i += 1;
        }
    }

    #[test]
    fn test_eq_ignore_ascii_case() {
        assert!("url()URL()uRl()Ürl".eq_ignore_ascii_case("url()url()url()Ürl"));
        assert!(!"Ürl".eq_ignore_ascii_case("ürl"));
        // Dotted capital I, Kelvin sign, Sharp S.
        assert!("HİKß".eq_ignore_ascii_case("hİKß"));
        assert!(!"İ".eq_ignore_ascii_case("i"));
        assert!(!"K".eq_ignore_ascii_case("k"));
        assert!(!"ß".eq_ignore_ascii_case("s"));

        // Every code point must match its own ASCII-lower-cased form.
        let mut i = 0;
        while i <= 500 {
            let lower = if 'A' as u32 <= i && i <= 'Z' as u32 { i + 'a' as u32 - 'A' as u32 }
                        else { i };
            assert!((from_u32(i).unwrap()).to_string().eq_ignore_ascii_case(
                    (from_u32(lower).unwrap()).to_string().as_slice()));
            i += 1;
        }
    }
}