1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Utilities for manipulating the char type
13 use option::{None, Option, Some};
16 use unicode::{derived_property, general_category};
18 #[cfg(test)] use str::OwnedStr;
20 #[cfg(not(test))] use cmp::{Eq, Ord};
21 #[cfg(not(test))] use num::Zero;
/*
    Lu  Uppercase_Letter        an uppercase letter
    Ll  Lowercase_Letter        a lowercase letter
    Lt  Titlecase_Letter        a digraphic character, with first part uppercase
    Lm  Modifier_Letter         a modifier letter
    Lo  Other_Letter            other letters, including syllables and ideographs
    Mn  Nonspacing_Mark         a nonspacing combining mark (zero advance width)
    Mc  Spacing_Mark            a spacing combining mark (positive advance width)
    Me  Enclosing_Mark          an enclosing combining mark
    Nd  Decimal_Number          a decimal digit
    Nl  Letter_Number           a letterlike numeric character
    No  Other_Number            a numeric character of other type
    Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
    Pd  Dash_Punctuation        a dash or hyphen punctuation mark
    Ps  Open_Punctuation        an opening punctuation mark (of a pair)
    Pe  Close_Punctuation       a closing punctuation mark (of a pair)
    Pi  Initial_Punctuation     an initial quotation mark
    Pf  Final_Punctuation       a final quotation mark
    Po  Other_Punctuation       a punctuation mark of other type
    Sm  Math_Symbol             a symbol of primarily mathematical use
    Sc  Currency_Symbol         a currency sign
    Sk  Modifier_Symbol         a non-letterlike modifier symbol
    So  Other_Symbol            a symbol of other type
    Zs  Space_Separator         a space character (of various non-zero widths)
    Zl  Line_Separator          U+2028 LINE SEPARATOR only
    Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
    Cc  Control                 a C0 or C1 control code
    Cf  Format                  a format control character
    Cs  Surrogate               a surrogate code point
    Co  Private_Use             a private-use character
    Cn  Unassigned              a reserved unassigned code point or a noncharacter
*/
56 /// Returns whether the specified character is considered a unicode alphabetic
58 pub fn is_alphabetic(c: char) -> bool { derived_property::Alphabetic(c) }
60 pub fn is_XID_start(c: char) -> bool { derived_property::XID_Start(c) }
62 pub fn is_XID_continue(c: char) -> bool { derived_property::XID_Continue(c) }
65 /// Indicates whether a character is in lower case, defined
66 /// in terms of the Unicode General Category 'Ll'
69 pub fn is_lowercase(c: char) -> bool { general_category::Ll(c) }
72 /// Indicates whether a character is in upper case, defined
73 /// in terms of the Unicode General Category 'Lu'.
76 pub fn is_uppercase(c: char) -> bool { general_category::Lu(c) }
79 /// Indicates whether a character is whitespace. Whitespace is defined in
80 /// terms of the Unicode General Categories 'Zs', 'Zl', 'Zp'
81 /// additional 'Cc'-category control codes in the range [0x09, 0x0d]
84 pub fn is_whitespace(c: char) -> bool {
86 || ('\x09' <= c && c <= '\x0d')
87 || general_category::Zs(c)
88 || general_category::Zl(c)
89 || general_category::Zp(c)
93 /// Indicates whether a character is alphanumeric. Alphanumericness is
94 /// defined in terms of the Unicode General Categories 'Nd', 'Nl', 'No'
95 /// and the Derived Core Property 'Alphabetic'.
98 pub fn is_alphanumeric(c: char) -> bool {
99 derived_property::Alphabetic(c)
100 || general_category::Nd(c)
101 || general_category::Nl(c)
102 || general_category::No(c)
105 /// Indicates whether the character is numeric (Nd, Nl, or No)
107 pub fn is_digit(c: char) -> bool {
108 general_category::Nd(c)
109 || general_category::Nl(c)
110 || general_category::No(c)
114 /// Checks if a character parses as a numeric digit in the given radix.
115 /// Compared to `is_digit()`, this function only recognizes the
116 /// characters `0-9`, `a-z` and `A-Z`.
120 /// Returns `true` if `c` is a valid digit under `radix`, and `false`
125 /// Fails if given a `radix` > 36.
129 /// This just wraps `to_digit()`.
132 pub fn is_digit_radix(c: char, radix: uint) -> bool {
133 match to_digit(c, radix) {
140 /// Convert a char to the corresponding digit.
144 /// If `c` is between '0' and '9', the corresponding value
145 /// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
146 /// 'b' or 'B', 11, etc. Returns none if the char does not
147 /// refer to a digit in the given radix.
151 /// Fails if given a `radix` outside the range `[0..36]`.
154 pub fn to_digit(c: char, radix: uint) -> Option<uint> {
156 fail!("to_digit: radix %? is to high (maximum 36)", radix);
159 '0' .. '9' => c as uint - ('0' as uint),
160 'a' .. 'z' => c as uint + 10u - ('a' as uint),
161 'A' .. 'Z' => c as uint + 10u - ('A' as uint),
164 if val < radix { Some(val) }
169 /// Converts a number to the character representing it.
173 /// Returns `Some(char)` if `num` represents one digit under `radix`,
174 /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
178 /// Fails if given an `radix` > 36.
181 pub fn from_digit(num: uint, radix: uint) -> Option<char> {
183 fail!("from_digit: radix %? is to high (maximum 36)", num);
187 Some(('0' as uint + num) as char)
189 Some(('a' as uint + num - 10u) as char)
197 /// Return the hexadecimal unicode escape of a char.
/// Every character of the escape is delivered by a separate call to `f`.
199 /// The rules are as follows:
201 /// - chars in [0,0xff] get 2-digit escapes: `\\xNN`
202 /// - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
203 /// - chars at 0x10000 and above get 8-digit escapes: `\\UNNNNNNNN`
205 pub fn escape_unicode(c: char, f: &fn(char)) {
206 // avoid calling str::to_str_radix because we don't really need to allocate
// `pad` is the number of hex digits to emit; the escape letter is sent to `f`
// as part of selecting it.
210 (c <= '\xff') { f('x'); 2 }
211 (c <= '\uffff') { f('u'); 4 }
// Walk the code point four bits at a time, starting at bit offset
// 4 * (pad - 1) and stepping down by 4, so the most significant digit
// is emitted first.
214 do int::range_step(4 * (pad - 1), -1, -4) |offset| {
215 match ((c as u32) >> offset) & 0xf {
// Values 0-9 map onto '0'-'9'; the remaining values map onto letters from 'a'.
216 i @ 0 .. 9 => { f('0' + i as char); }
217 i => { f('a' + (i - 10) as char); }
224 /// Return a 'default' ASCII and C++11-like char-literal escape of a char.
226 /// The default is chosen with a bias toward producing literals that are
227 /// legal in a variety of languages, including C++11 and similar C-family
228 /// languages. The exact rules are:
230 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
231 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
232 /// - Any other chars in the range [0x20,0x7e] are not escaped.
233 /// - Any other chars are given hex unicode escapes; see `escape_unicode`.
235 pub fn escape_default(c: char, f: &fn(char)) {
237 '\t' => { f('\\'); f('t'); }
238 '\r' => { f('\\'); f('r'); }
239 '\n' => { f('\\'); f('n'); }
240 '\\' => { f('\\'); f('\\'); }
241 '\'' => { f('\\'); f('\''); }
242 '"' => { f('\\'); f('"'); }
243 '\x20' .. '\x7e' => { f(c); }
244 _ => c.escape_unicode(f),
248 /// Returns the amount of bytes this character would need if encoded in utf8
/// (1 to 4, chosen by comparing the code point against the limits below).
///
/// Fails with "invalid character!" if the code point is not below `MAX_FOUR_B`.
249 pub fn len_utf8_bytes(c: char) -> uint {
// Exclusive upper bounds on the code point for each encoded length:
// 128 = 2^7, 2048 = 2^11, 65536 = 2^16, 2097152 = 2^21.
250 static MAX_ONE_B: uint = 128u;
251 static MAX_TWO_B: uint = 2048u;
252 static MAX_THREE_B: uint = 65536u;
253 static MAX_FOUR_B: uint = 2097152u;
255 let code = c as uint;
// Conditions are listed in ascending order of limit.
257 (code < MAX_ONE_B) { 1u }
258 (code < MAX_TWO_B) { 2u }
259 (code < MAX_THREE_B) { 3u }
260 (code < MAX_FOUR_B) { 4u }
261 _ { fail!("invalid character!") }
265 #[allow(missing_doc)]
// Method-call counterparts of the free functions in this file; the `char`
// implementation forwards each one to the free function of the same name.
/// Is this character alphabetic? See the free function `is_alphabetic`.
267 fn is_alphabetic(&self) -> bool;
/// See the free function `is_XID_start`.
268 fn is_XID_start(&self) -> bool;
/// See the free function `is_XID_continue`.
269 fn is_XID_continue(&self) -> bool;
/// Is this character lower case? See the free function `is_lowercase`.
270 fn is_lowercase(&self) -> bool;
/// Is this character upper case? See the free function `is_uppercase`.
271 fn is_uppercase(&self) -> bool;
/// Is this character whitespace? See the free function `is_whitespace`.
272 fn is_whitespace(&self) -> bool;
/// Is this character alphanumeric? See the free function `is_alphanumeric`.
273 fn is_alphanumeric(&self) -> bool;
/// Is this character numeric? See the free function `is_digit`.
274 fn is_digit(&self) -> bool;
/// Is this character a digit under `radix`? See the free function `is_digit_radix`.
275 fn is_digit_radix(&self, radix: uint) -> bool;
/// Digit value of this character under `radix`, if any. See the free function `to_digit`.
276 fn to_digit(&self, radix: uint) -> Option<uint>;
/// Character for the digit `num` under `radix`, if any; takes no `self`.
/// See the free function `from_digit`.
277 fn from_digit(num: uint, radix: uint) -> Option<char>;
/// Passes the hexadecimal unicode escape to `f`. See the free function `escape_unicode`.
278 fn escape_unicode(&self, f: &fn(char));
/// Passes the default escape to `f`. See the free function `escape_default`.
279 fn escape_default(&self, f: &fn(char));
/// Number of bytes needed in utf8. See the free function `len_utf8_bytes`.
280 fn len_utf8_bytes(&self) -> uint;
284 fn is_alphabetic(&self) -> bool { is_alphabetic(*self) }
286 fn is_XID_start(&self) -> bool { is_XID_start(*self) }
288 fn is_XID_continue(&self) -> bool { is_XID_continue(*self) }
290 fn is_lowercase(&self) -> bool { is_lowercase(*self) }
292 fn is_uppercase(&self) -> bool { is_uppercase(*self) }
294 fn is_whitespace(&self) -> bool { is_whitespace(*self) }
296 fn is_alphanumeric(&self) -> bool { is_alphanumeric(*self) }
298 fn is_digit(&self) -> bool { is_digit(*self) }
300 fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
302 fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
304 fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
306 fn escape_unicode(&self, f: &fn(char)) { escape_unicode(*self, f) }
308 fn escape_default(&self, f: &fn(char)) { escape_default(*self, f) }
310 fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
316 fn eq(&self, other: &char) -> bool { (*self) == (*other) }
318 fn ne(&self, other: &char) -> bool { (*self) != (*other) }
324 fn lt(&self, other: &char) -> bool { *self < *other }
329 fn zero() -> char { 0 as char }
330 fn is_zero(&self) -> bool { *self == 0 as char }
// Checks the `is_lowercase` method on ASCII and non-ASCII letters:
// lowercase letters (including 'ß') are accepted, uppercase ones rejected.
334 fn test_is_lowercase() {
335 assert!('a'.is_lowercase());
336 assert!('ö'.is_lowercase());
337 assert!('ß'.is_lowercase());
338 assert!(!'Ü'.is_lowercase());
339 assert!(!'P'.is_lowercase());
// Checks the `is_uppercase` method on ASCII and non-ASCII letters:
// lowercase letters (including 'ß') are rejected, uppercase ones accepted.
343 fn test_is_uppercase() {
344 assert!(!'h'.is_uppercase());
345 assert!(!'ä'.is_uppercase());
346 assert!(!'ß'.is_uppercase());
347 assert!('Ö'.is_uppercase());
348 assert!('T'.is_uppercase());
// Checks the `is_whitespace` method: space, U+2007, tab and newline are
// whitespace; a letter, an underscore and U+0000 are not.
352 fn test_is_whitespace() {
353 assert!(' '.is_whitespace());
354 assert!('\u2007'.is_whitespace());
355 assert!('\t'.is_whitespace());
356 assert!('\n'.is_whitespace());
357 assert!(!'a'.is_whitespace());
358 assert!(!'_'.is_whitespace());
359 assert!(!'\u0000'.is_whitespace());
// Expected `to_digit` results: decimal digits, digits at the top of small
// radixes, letters in both cases up to radix 36, and non-digit characters.
364 assert_eq!('0'.to_digit(10u), Some(0u));
365 assert_eq!('1'.to_digit(2u), Some(1u));
366 assert_eq!('2'.to_digit(3u), Some(2u));
367 assert_eq!('9'.to_digit(10u), Some(9u));
// Letters are accepted in either case and yield the same value.
368 assert_eq!('a'.to_digit(16u), Some(10u));
369 assert_eq!('A'.to_digit(16u), Some(10u));
370 assert_eq!('b'.to_digit(16u), Some(11u));
371 assert_eq!('B'.to_digit(16u), Some(11u));
372 assert_eq!('z'.to_digit(36u), Some(35u));
373 assert_eq!('Z'.to_digit(36u), Some(35u));
// Characters outside 0-9, a-z and A-Z never parse as digits.
374 assert_eq!(' '.to_digit(10u), None);
375 assert_eq!('$'.to_digit(36u), None);
// Expected `is_digit` results: ASCII decimal digits are numeric, ASCII
// letters of either case are not.
380 assert!('2'.is_digit());
381 assert!('7'.is_digit());
382 assert!(!'c'.is_digit());
383 assert!(!'i'.is_digit());
384 assert!(!'z'.is_digit());
385 assert!(!'Q'.is_digit());
// Checks `escape_default` by collecting the chars it passes to the callback
// into a string and comparing against the expected escape.
389 fn test_escape_default() {
// Helper: pushes every char produced by `escape_default(c)` onto `result`.
390 fn string(c: char) -> ~str {
391 let mut result = ~"";
392 do escape_default(c) |c| { result.push_char(c); }
// Chars with dedicated backslash escapes.
395 assert_eq!(string('\n'), ~"\\n");
396 assert_eq!(string('\r'), ~"\\r");
397 assert_eq!(string('\''), ~"\\'");
398 assert_eq!(string('"'), ~"\\\"");
// Other chars in [0x20,0x7e] pass through unchanged, including both ends.
399 assert_eq!(string(' '), ~" ");
400 assert_eq!(string('a'), ~"a");
401 assert_eq!(string('~'), ~"~");
// Everything else falls back to a hexadecimal escape.
402 assert_eq!(string('\x00'), ~"\\x00");
403 assert_eq!(string('\x1f'), ~"\\x1f");
404 assert_eq!(string('\x7f'), ~"\\x7f");
405 assert_eq!(string('\xff'), ~"\\xff");
406 assert_eq!(string('\u011b'), ~"\\u011b");
407 assert_eq!(string('\U0001d4b6'), ~"\\U0001d4b6");
// Checks `escape_unicode` by collecting the chars it passes to the callback
// into a string: 2-, 4- and 8-digit escapes, always in lowercase hex.
411 fn test_escape_unicode() {
// Helper: pushes every char produced by `escape_unicode(c)` onto `result`.
412 fn string(c: char) -> ~str {
413 let mut result = ~"";
414 do escape_unicode(c) |c| { result.push_char(c); }
// Unlike `escape_default`, printable ASCII is hex-escaped as well.
417 assert_eq!(string('\x00'), ~"\\x00");
418 assert_eq!(string('\n'), ~"\\x0a");
419 assert_eq!(string(' '), ~"\\x20");
420 assert_eq!(string('a'), ~"\\x61");
421 assert_eq!(string('\u011b'), ~"\\u011b");
422 assert_eq!(string('\U0001d4b6'), ~"\\U0001d4b6");