src/libcore/char.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Character manipulation.
  12 //!
  13 //! For more details, see ::unicode::char (a.k.a. std::char)
  14
  15 #![allow(non_snake_case)]
  16 #![doc(primitive = "char")]
  17
  18 use mem::transmute;
  19 use option::{None, Option, Some};
  20 use iter::range_step;
  21 use collections::Collection;
  22
// UTF-8 ranges and tags for encoding characters
//
// The `TAG_*` values are the fixed high bits that `encode_utf8` ORs into the
// bytes it writes: `TAG_CONT` goes into every byte after the first, while
// `TAG_TWO_B`, `TAG_THREE_B` and `TAG_FOUR_B` go into the first byte of a
// two-, three- and four-byte sequence respectively.
static TAG_CONT: u8    = 0b1000_0000u8;
static TAG_TWO_B: u8   = 0b1100_0000u8;
static TAG_THREE_B: u8 = 0b1110_0000u8;
static TAG_FOUR_B: u8  = 0b1111_0000u8;
// The `MAX_*` values are exclusive upper bounds: a code point strictly below
// `MAX_ONE_B` is encoded in one byte, below `MAX_TWO_B` in two bytes, below
// `MAX_THREE_B` in three bytes, and anything else in four bytes.
static MAX_ONE_B: u32   =     0x80u32;
static MAX_TWO_B: u32   =    0x800u32;
static MAX_THREE_B: u32 =  0x10000u32;
  31
  32 /*
  33     Lu  Uppercase_Letter        an uppercase letter
  34     Ll  Lowercase_Letter        a lowercase letter
  35     Lt  Titlecase_Letter        a digraphic character, with first part uppercase
  36     Lm  Modifier_Letter         a modifier letter
  37     Lo  Other_Letter            other letters, including syllables and ideographs
  38     Mn  Nonspacing_Mark         a nonspacing combining mark (zero advance width)
  39     Mc  Spacing_Mark            a spacing combining mark (positive advance width)
  40     Me  Enclosing_Mark          an enclosing combining mark
  41     Nd  Decimal_Number          a decimal digit
  42     Nl  Letter_Number           a letterlike numeric character
  43     No  Other_Number            a numeric character of other type
  44     Pc  Connector_Punctuation   a connecting punctuation mark, like a tie
  45     Pd  Dash_Punctuation        a dash or hyphen punctuation mark
  46     Ps  Open_Punctuation        an opening punctuation mark (of a pair)
  47     Pe  Close_Punctuation       a closing punctuation mark (of a pair)
  48     Pi  Initial_Punctuation     an initial quotation mark
  49     Pf  Final_Punctuation       a final quotation mark
  50     Po  Other_Punctuation       a punctuation mark of other type
  51     Sm  Math_Symbol             a symbol of primarily mathematical use
  52     Sc  Currency_Symbol         a currency sign
  53     Sk  Modifier_Symbol         a non-letterlike modifier symbol
  54     So  Other_Symbol            a symbol of other type
  55     Zs  Space_Separator         a space character (of various non-zero widths)
  56     Zl  Line_Separator          U+2028 LINE SEPARATOR only
  57     Zp  Paragraph_Separator     U+2029 PARAGRAPH SEPARATOR only
  58     Cc  Control                 a C0 or C1 control code
  59     Cf  Format                  a format control character
  60     Cs  Surrogate               a surrogate code point
  61     Co  Private_Use             a private-use character
  62     Cn  Unassigned              a reserved unassigned code point or a noncharacter
  63 */
  64
/// The highest valid code point (`U+10FFFF`).
///
/// `from_u32` returns `None` for any value greater than this.
pub static MAX: char = '\U0010ffff';
  67
  68 /// Converts from `u32` to a `char`
  69 #[inline]
  70 pub fn from_u32(i: u32) -> Option<char> {
  71     // catch out-of-bounds and surrogates
  72     if (i > MAX as u32) || (i >= 0xD800 && i <= 0xDFFF) {
  73         None
  74     } else {
  75         Some(unsafe { transmute(i) })
  76     }
  77 }
  78
  79 ///
  80 /// Checks if a `char` parses as a numeric digit in the given radix
  81 ///
  82 /// Compared to `is_digit()`, this function only recognizes the
  83 /// characters `0-9`, `a-z` and `A-Z`.
  84 ///
  85 /// # Return value
  86 ///
  87 /// Returns `true` if `c` is a valid digit under `radix`, and `false`
  88 /// otherwise.
  89 ///
  90 /// # Failure
  91 ///
  92 /// Fails if given a `radix` > 36.
  93 ///
  94 /// # Note
  95 ///
  96 /// This just wraps `to_digit()`.
  97 ///
  98 #[inline]
  99 pub fn is_digit_radix(c: char, radix: uint) -> bool {
 100     match to_digit(c, radix) {
 101         Some(_) => true,
 102         None    => false,
 103     }
 104 }
 105
 106 ///
 107 /// Converts a `char` to the corresponding digit
 108 ///
 109 /// # Return value
 110 ///
 111 /// If `c` is between '0' and '9', the corresponding value
 112 /// between 0 and 9. If `c` is 'a' or 'A', 10. If `c` is
 113 /// 'b' or 'B', 11, etc. Returns none if the `char` does not
 114 /// refer to a digit in the given radix.
 115 ///
 116 /// # Failure
 117 ///
 118 /// Fails if given a `radix` outside the range `[0..36]`.
 119 ///
 120 #[inline]
 121 pub fn to_digit(c: char, radix: uint) -> Option<uint> {
 122     if radix > 36 {
 123         fail!("to_digit: radix is too high (maximum 36)");
 124     }
 125     let val = match c {
 126       '0' .. '9' => c as uint - ('0' as uint),
 127       'a' .. 'z' => c as uint + 10u - ('a' as uint),
 128       'A' .. 'Z' => c as uint + 10u - ('A' as uint),
 129       _ => return None,
 130     };
 131     if val < radix { Some(val) }
 132     else { None }
 133 }
 134
 135 ///
 136 /// Converts a number to the character representing it
 137 ///
 138 /// # Return value
 139 ///
 140 /// Returns `Some(char)` if `num` represents one digit under `radix`,
 141 /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
 142 ///
 143 /// # Failure
 144 ///
 145 /// Fails if given an `radix` > 36.
 146 ///
 147 #[inline]
 148 pub fn from_digit(num: uint, radix: uint) -> Option<char> {
 149     if radix > 36 {
 150         fail!("from_digit: radix is to high (maximum 36)");
 151     }
 152     if num < radix {
 153         unsafe {
 154             if num < 10 {
 155                 Some(transmute(('0' as uint + num) as u32))
 156             } else {
 157                 Some(transmute(('a' as uint + num - 10u) as u32))
 158             }
 159         }
 160     } else {
 161         None
 162     }
 163 }
 164
 165 ///
 166 /// Returns the hexadecimal Unicode escape of a `char`
 167 ///
 168 /// The rules are as follows:
 169 ///
 170 /// - chars in [0,0xff] get 2-digit escapes: `\\xNN`
 171 /// - chars in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`
 172 /// - chars above 0x10000 get 8-digit escapes: `\\UNNNNNNNN`
 173 ///
 174 pub fn escape_unicode(c: char, f: |char|) {
 175     // avoid calling str::to_str_radix because we don't really need to allocate
 176     // here.
 177     f('\\');
 178     let pad = match () {
 179         _ if c <= '\xff'    => { f('x'); 2 }
 180         _ if c <= '\uffff'  => { f('u'); 4 }
 181         _                   => { f('U'); 8 }
 182     };
 183     for offset in range_step::<i32>(4 * (pad - 1), -1, -4) {
 184         let offset = offset as uint;
 185         unsafe {
 186             match ((c as i32) >> offset) & 0xf {
 187                 i @ 0 .. 9 => { f(transmute('0' as i32 + i)); }
 188                 i => { f(transmute('a' as i32 + (i - 10))); }
 189             }
 190         }
 191     }
 192 }
 193
 194 ///
 195 /// Returns a 'default' ASCII and C++11-like literal escape of a `char`
 196 ///
 197 /// The default is chosen with a bias toward producing literals that are
 198 /// legal in a variety of languages, including C++11 and similar C-family
 199 /// languages. The exact rules are:
 200 ///
 201 /// - Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
 202 /// - Single-quote, double-quote and backslash chars are backslash-escaped.
 203 /// - Any other chars in the range [0x20,0x7e] are not escaped.
 204 /// - Any other chars are given hex Unicode escapes; see `escape_unicode`.
 205 ///
 206 pub fn escape_default(c: char, f: |char|) {
 207     match c {
 208         '\t' => { f('\\'); f('t'); }
 209         '\r' => { f('\\'); f('r'); }
 210         '\n' => { f('\\'); f('n'); }
 211         '\\' => { f('\\'); f('\\'); }
 212         '\'' => { f('\\'); f('\''); }
 213         '"'  => { f('\\'); f('"'); }
 214         '\x20' .. '\x7e' => { f(c); }
 215         _ => c.escape_unicode(f),
 216     }
 217 }
 218
 219 /// Returns the amount of bytes this `char` would need if encoded in UTF-8
 220 #[inline]
 221 pub fn len_utf8_bytes(c: char) -> uint {
 222     let code = c as u32;
 223     match () {
 224         _ if code < MAX_ONE_B   => 1u,
 225         _ if code < MAX_TWO_B   => 2u,
 226         _ if code < MAX_THREE_B => 3u,
 227         _  => 4u,
 228     }
 229 }
 230
/// Basic `char` manipulations.
///
/// The implementation for `char` in this module forwards the digit, escape
/// and length methods to the free functions of the same name.
pub trait Char {
    /// Checks if a `char` parses as a numeric digit in the given radix.
    ///
    /// Compared to `is_digit()`, this function only recognizes the characters
    /// `0-9`, `a-z` and `A-Z`.
    ///
    /// # Return value
    ///
    /// Returns `true` if the character is a valid digit under `radix`, and
    /// `false` otherwise.
    ///
    /// # Failure
    ///
    /// Fails if given a radix > 36.
    fn is_digit_radix(&self, radix: uint) -> bool;

    /// Converts a character to the corresponding digit.
    ///
    /// # Return value
    ///
    /// If the character is between '0' and '9', the corresponding value
    /// between 0 and 9. If it is 'a' or 'A', 10. If it is 'b' or 'B', 11,
    /// etc. Returns `None` if the character does not refer to a digit in the
    /// given radix.
    ///
    /// # Failure
    ///
    /// Fails if given a radix > 36.
    fn to_digit(&self, radix: uint) -> Option<uint>;

    /// Converts a number to the character representing it.
    ///
    /// # Return value
    ///
    /// Returns `Some(char)` if `num` represents one digit under `radix`,
    /// using one character of `0-9` or `a-z`, or `None` if it doesn't.
    ///
    /// # Failure
    ///
    /// Fails if given a radix > 36.
    fn from_digit(num: uint, radix: uint) -> Option<Self>;

    /// Passes the hexadecimal Unicode escape of a character to `f`, one
    /// `char` at a time.
    ///
    /// The rules are as follows:
    ///
    /// * Characters in [0,0xff] get 2-digit escapes: `\\xNN`
    /// * Characters in [0x100,0xffff] get 4-digit escapes: `\\uNNNN`.
    /// * Characters at 0x10000 and above get 8-digit escapes:
    ///   `\\UNNNNNNNN`.
    fn escape_unicode(&self, f: |char|);

    /// Passes a 'default' ASCII and C++11-like literal escape of a
    /// character to `f`, one `char` at a time.
    ///
    /// The default is chosen with a bias toward producing literals that are
    /// legal in a variety of languages, including C++11 and similar C-family
    /// languages. The exact rules are:
    ///
    /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
    /// * Single-quote, double-quote and backslash chars are backslash-
    ///   escaped.
    /// * Any other chars in the range [0x20,0x7e] are not escaped.
    /// * Any other chars are given hex Unicode escapes; see `escape_unicode`.
    fn escape_default(&self, f: |char|);

    /// Returns the amount of bytes this character would need if encoded in
    /// UTF-8 (between 1 and 4).
    fn len_utf8_bytes(&self) -> uint;

    /// Encodes this character as UTF-8 into the provided byte buffer,
    /// and then returns the number of bytes written.
    ///
    /// If the buffer is not large enough, nothing will be written into it
    /// and a `None` will be returned.
    fn encode_utf8(&self, dst: &mut [u8]) -> Option<uint>;

    /// Encodes this character as UTF-16 into the provided `u16` buffer,
    /// and then returns the number of `u16`s written (1 for a character
    /// that fits in 16 bits, 2 for a surrogate pair).
    ///
    /// If the buffer is not large enough, nothing will be written into it
    /// and a `None` will be returned.
    fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint>;
}
 314
 315 impl Char for char {
 316     fn is_digit_radix(&self, radix: uint) -> bool { is_digit_radix(*self, radix) }
 317
 318     fn to_digit(&self, radix: uint) -> Option<uint> { to_digit(*self, radix) }
 319
 320     fn from_digit(num: uint, radix: uint) -> Option<char> { from_digit(num, radix) }
 321
 322     fn escape_unicode(&self, f: |char|) { escape_unicode(*self, f) }
 323
 324     fn escape_default(&self, f: |char|) { escape_default(*self, f) }
 325
 326     #[inline]
 327     fn len_utf8_bytes(&self) -> uint { len_utf8_bytes(*self) }
 328
 329     #[inline]
 330     fn encode_utf8<'a>(&self, dst: &'a mut [u8]) -> Option<uint> {
 331         // Marked #[inline] to allow llvm optimizing it away
 332         let code = *self as u32;
 333         if code < MAX_ONE_B && dst.len() >= 1 {
 334             dst[0] = code as u8;
 335             Some(1)
 336         } else if code < MAX_TWO_B && dst.len() >= 2 {
 337             dst[0] = (code >> 6u & 0x1F_u32) as u8 | TAG_TWO_B;
 338             dst[1] = (code & 0x3F_u32) as u8 | TAG_CONT;
 339             Some(2)
 340         } else if code < MAX_THREE_B && dst.len() >= 3  {
 341             dst[0] = (code >> 12u & 0x0F_u32) as u8 | TAG_THREE_B;
 342             dst[1] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
 343             dst[2] = (code & 0x3F_u32) as u8 | TAG_CONT;
 344             Some(3)
 345         } else if dst.len() >= 4 {
 346             dst[0] = (code >> 18u & 0x07_u32) as u8 | TAG_FOUR_B;
 347             dst[1] = (code >> 12u & 0x3F_u32) as u8 | TAG_CONT;
 348             dst[2] = (code >>  6u & 0x3F_u32) as u8 | TAG_CONT;
 349             dst[3] = (code & 0x3F_u32) as u8 | TAG_CONT;
 350             Some(4)
 351         } else {
 352             None
 353         }
 354     }
 355
 356     #[inline]
 357     fn encode_utf16(&self, dst: &mut [u16]) -> Option<uint> {
 358         // Marked #[inline] to allow llvm optimizing it away
 359         let mut ch = *self as u32;
 360         if (ch & 0xFFFF_u32) == ch  && dst.len() >= 1 {
 361             // The BMP falls through (assuming non-surrogate, as it should)
 362             dst[0] = ch as u16;
 363             Some(1)
 364         } else if dst.len() >= 2 {
 365             // Supplementary planes break into surrogates.
 366             ch -= 0x1_0000_u32;
 367             dst[0] = 0xD800_u16 | ((ch >> 10) as u16);
 368             dst[1] = 0xDC00_u16 | ((ch as u16) & 0x3FF_u16);
 369             Some(2)
 370         } else {
 371             None
 372         }
 373     }
 374 }