src/libunicode/u_char.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! Unicode-intensive `char` methods along with the `core` methods.
  12 //!
  13 //! These methods implement functionality for `char` that requires knowledge of
  14 //! Unicode definitions, including normalization, categorization, and display information.
  15
  16 use core::char;
  17 use core::char::CharExt as C;
  18 use core::option::Option;
  19 use tables::{derived_property, property, general_category, conversions, charwidth};
  20
  21 /// Functionality for manipulating `char`.
  22 #[stable(feature = "rust1", since = "1.0.0")]
  23 pub trait CharExt {
  24     /// Checks if a `char` parses as a numeric digit in the given radix.
  25     ///
  26     /// Compared to `is_numeric()`, this function only recognizes the characters
  27     /// `0-9`, `a-z` and `A-Z`.
  28     ///
  29     /// # Return value
  30     ///
  31     /// Returns `true` if `self` is a valid digit under `radix`, and `false`
  32     /// otherwise.
  33     ///
  34     /// # Panics
  35     ///
  36     /// Panics if given a radix > 36.
  37     #[unstable(feature = "unicode",
  38                reason = "pending integer conventions")]
  39     fn is_digit(self, radix: u32) -> bool;
  40
  41     /// Converts a character to the corresponding digit.
  42     ///
  43     /// # Return value
  44     ///
  45     /// If `self` is between '0' and '9', the corresponding value between 0 and
  46     /// 9. If `self` is 'a' or 'A', 10. If `self` is 'b' or 'B', 11, etc. Returns
  47     /// `None` if the character does not refer to a digit in the given radix.
  48     ///
  49     /// # Panics
  50     ///
  51     /// Panics if given a radix > 36.
  52     #[unstable(feature = "unicode",
  53                reason = "pending integer conventions")]
  54     fn to_digit(self, radix: u32) -> Option<u32>;
  55
  56     /// Returns an iterator that yields the hexadecimal Unicode escape
  57     /// of a character, as `char`s.
  58     ///
  59     /// All characters are escaped with Rust syntax of the form `\u{NNNN}`
  60     /// where `NNNN` is the shortest hexadecimal representation of the code
  61     /// point.
  62     #[stable(feature = "rust1", since = "1.0.0")]
  63     fn escape_unicode(self) -> char::EscapeUnicode;
  64
  65     /// Returns an iterator that yields the 'default' ASCII and
  66     /// C++11-like literal escape of a character, as `char`s.
  67     ///
  68     /// The default is chosen with a bias toward producing literals that are
  69     /// legal in a variety of languages, including C++11 and similar C-family
  70     /// languages. The exact rules are:
  71     ///
  72     /// * Tab, CR and LF are escaped as '\t', '\r' and '\n' respectively.
  73     /// * Single-quote, double-quote and backslash chars are backslash-
  74     ///   escaped.
  75     /// * Any other chars in the range [0x20,0x7e] are not escaped.
  76     /// * Any other chars are given hex Unicode escapes; see `escape_unicode`.
  77     #[stable(feature = "rust1", since = "1.0.0")]
  78     fn escape_default(self) -> char::EscapeDefault;
  79
  80     /// Returns the number of bytes this character would need if encoded in
  81     /// UTF-8.
  82     #[stable(feature = "rust1", since = "1.0.0")]
  83     fn len_utf8(self) -> usize;
  84
  85     /// Returns the number of 16-bit code units (`u16`s) this character would
  86     /// need if encoded in UTF-16.
  87     #[stable(feature = "rust1", since = "1.0.0")]
  88     fn len_utf16(self) -> usize;
  89
  90     /// Encodes this character as UTF-8 into the provided byte buffer,
  91     /// and then returns the number of bytes written.
  92     ///
  93     /// If the buffer is not large enough, nothing will be written into it
  94     /// and a `None` will be returned.
  95     #[unstable(feature = "unicode",
  96                reason = "pending decision about Iterator/Writer/Reader")]
  97     fn encode_utf8(self, dst: &mut [u8]) -> Option<usize>;
  98
  99     /// Encodes this character as UTF-16 into the provided `u16` buffer,
 100     /// and then returns the number of `u16`s written.
 101     ///
 102     /// If the buffer is not large enough, nothing will be written into it
 103     /// and a `None` will be returned.
 104     #[unstable(feature = "unicode",
 105                reason = "pending decision about Iterator/Writer/Reader")]
 106     fn encode_utf16(self, dst: &mut [u16]) -> Option<usize>;
 107
 108     /// Returns whether the specified character is considered a Unicode
 109     /// alphabetic code point.
 110     #[stable(feature = "rust1", since = "1.0.0")]
 111     fn is_alphabetic(self) -> bool;
 112
 113     /// Returns whether the specified character satisfies the 'XID_Start'
 114     /// Unicode property.
 115     ///
 116     /// 'XID_Start' is a Unicode Derived Property specified in
 117     /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 118     /// mostly similar to 'ID_Start' but modified for closure under NFKx.
 119     #[unstable(feature = "unicode",
 120                reason = "mainly needed for compiler internals")]
 121     fn is_xid_start(self) -> bool;
 122
 123     /// Returns whether the specified `char` satisfies the 'XID_Continue'
 124     /// Unicode property.
 125     ///
 126     /// 'XID_Continue' is a Unicode Derived Property specified in
 127     /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
 128     /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
 129     #[unstable(feature = "unicode",
 130                reason = "mainly needed for compiler internals")]
 131     fn is_xid_continue(self) -> bool;
 132
 133     /// Indicates whether a character is in lowercase.
 134     ///
 135     /// This is defined according to the terms of the Unicode Derived Core
 136     /// Property `Lowercase`.
 137     #[stable(feature = "rust1", since = "1.0.0")]
 138     fn is_lowercase(self) -> bool;
 139
 140     /// Indicates whether a character is in uppercase.
 141     ///
 142     /// This is defined according to the terms of the Unicode Derived Core
 143     /// Property `Uppercase`.
 144     #[stable(feature = "rust1", since = "1.0.0")]
 145     fn is_uppercase(self) -> bool;
 146
 147     /// Indicates whether a character is whitespace.
 148     ///
 149     /// Whitespace is defined in terms of the Unicode Property `White_Space`.
 150     #[stable(feature = "rust1", since = "1.0.0")]
 151     fn is_whitespace(self) -> bool;
 152
 153     /// Indicates whether a character is alphanumeric.
 154     ///
 155     /// Alphanumericness is defined in terms of the Unicode General Categories
 156     /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
 157     #[stable(feature = "rust1", since = "1.0.0")]
 158     fn is_alphanumeric(self) -> bool;
 159
 160     /// Indicates whether a character is a control code point.
 161     ///
 162     /// Control code points are defined in terms of the Unicode General
 163     /// Category `Cc`.
 164     #[stable(feature = "rust1", since = "1.0.0")]
 165     fn is_control(self) -> bool;
 166
 167     /// Indicates whether the character is numeric (Nd, Nl, or No).
 168     #[stable(feature = "rust1", since = "1.0.0")]
 169     fn is_numeric(self) -> bool;
 170
 171     /// Converts a character to its lowercase equivalent.
 172     ///
 173     /// The case-folding performed is the common or simple mapping. See
 174     /// `to_uppercase()` for references and more information.
 175     ///
 176     /// # Return value
 177     ///
 178     /// Returns the lowercase equivalent of the character, or the character
 179     /// itself if no conversion is possible.
 180     #[unstable(feature = "unicode",
 181                reason = "pending case transformation decisions")]
 182     fn to_lowercase(self) -> char;
 183
 184     /// Converts a character to its uppercase equivalent.
 185     ///
 186     /// The case-folding performed is the common or simple mapping: it maps
 187     /// one Unicode codepoint (one character in Rust) to its uppercase
 188     /// equivalent according to the Unicode database [1]. The additional
 189     /// [`SpecialCasing.txt`] is not considered here, as it expands to multiple
 190     /// codepoints in some cases.
 191     ///
 192     /// A full reference can be found here [2].
 193     ///
 194     /// # Return value
 195     ///
 196     /// Returns the uppercase equivalent of the character, or the character
 197     /// itself if no conversion was made.
 198     ///
 199     /// [1]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
 200     ///
 201     /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
 202     ///
 203     /// [2]: http://www.unicode.org/versions/Unicode4.0.0/ch03.pdf#G33992
 204     #[unstable(feature = "unicode",
 205                reason = "pending case transformation decisions")]
 206     fn to_uppercase(self) -> char;
 207
 208     /// Returns this character's displayed width in columns, or `None` if it is a
 209     /// control character other than `'\x00'`.
 210     ///
 211     /// `is_cjk` determines behavior for characters in the Ambiguous category:
 212     /// if `is_cjk` is `true`, these are 2 columns wide; otherwise, they are 1.
 213     /// In CJK contexts, `is_cjk` should be `true`, else it should be `false`.
 214     /// [Unicode Standard Annex #11](http://www.unicode.org/reports/tr11/)
 215     /// recommends that these characters be treated as 1 column (i.e.,
 216     /// `is_cjk` = `false`) if the context cannot be reliably determined.
 217     #[unstable(feature = "unicode",
 218                reason = "needs expert opinion. is_cjk flag stands out as ugly")]
 219     fn width(self, is_cjk: bool) -> Option<usize>;
 220 }
 221
 222 #[stable(feature = "rust1", since = "1.0.0")]
 223 impl CharExt for char {
         // The eight methods from `is_digit` through `encode_utf16` forward their
         // arguments unchanged to the method of the same name on
         // `core::char::CharExt`, which this file imports under the alias `C`.
 224     #[unstable(feature = "unicode",
 225                reason = "pending integer conventions")]
 226     fn is_digit(self, radix: u32) -> bool { C::is_digit(self, radix) }
 227     #[unstable(feature = "unicode",
 228                reason = "pending integer conventions")]
 229     fn to_digit(self, radix: u32) -> Option<u32> { C::to_digit(self, radix) }
 230     #[stable(feature = "rust1", since = "1.0.0")]
 231     fn escape_unicode(self) -> char::EscapeUnicode { C::escape_unicode(self) }
 232     #[stable(feature = "rust1", since = "1.0.0")]
 233     fn escape_default(self) -> char::EscapeDefault { C::escape_default(self) }
 234     #[stable(feature = "rust1", since = "1.0.0")]
 235     fn len_utf8(self) -> usize { C::len_utf8(self) }
 236     #[stable(feature = "rust1", since = "1.0.0")]
 237     fn len_utf16(self) -> usize { C::len_utf16(self) }
 238     #[unstable(feature = "unicode",
 239                reason = "pending decision about Iterator/Writer/Reader")]
 240     fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> { C::encode_utf8(self, dst) }
 241     #[unstable(feature = "unicode",
 242                reason = "pending decision about Iterator/Writer/Reader")]
 243     fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> { C::encode_utf16(self, dst) }
 244
         // `is_alphabetic`, `is_lowercase`, `is_uppercase`, `is_whitespace` and
         // `is_numeric` share one three-arm shape: the first arm answers `true`
         // for the listed ASCII characters, the second hands any character above
         // '\x7f' to the matching function in `tables`, and the last arm returns
         // `false` for every remaining (ASCII) character.
 245     #[stable(feature = "rust1", since = "1.0.0")]
 246     fn is_alphabetic(self) -> bool {
 247         match self {
 248             'a' ... 'z' | 'A' ... 'Z' => true,
 249             c if c > '\x7f' => derived_property::Alphabetic(c),
 250             _ => false
 251         }
 252     }
 253
         // No ASCII arm here: every character, ASCII included, is passed to
         // `derived_property::XID_Start`.
 254     #[unstable(feature = "unicode",
 255                reason = "mainly needed for compiler internals")]
 256     fn is_xid_start(self) -> bool { derived_property::XID_Start(self) }
 257
         // Likewise passes every character to `derived_property::XID_Continue`.
 258     #[unstable(feature = "unicode",
 259                reason = "mainly needed for compiler internals")]
 260     fn is_xid_continue(self) -> bool { derived_property::XID_Continue(self) }
 261
         // ASCII 'a'...'z' is lowercase; above '\x7f' the answer comes from
         // `derived_property::Lowercase`.
 262     #[stable(feature = "rust1", since = "1.0.0")]
 263     fn is_lowercase(self) -> bool {
 264         match self {
 265             'a' ... 'z' => true,
 266             c if c > '\x7f' => derived_property::Lowercase(c),
 267             _ => false
 268         }
 269     }
 270
         // ASCII 'A'...'Z' is uppercase; above '\x7f' the answer comes from
         // `derived_property::Uppercase`.
 271     #[stable(feature = "rust1", since = "1.0.0")]
 272     fn is_uppercase(self) -> bool {
 273         match self {
 274             'A' ... 'Z' => true,
 275             c if c > '\x7f' => derived_property::Uppercase(c),
 276             _ => false
 277         }
 278     }
 279
         // The ASCII arm accepts the space character and the five code points
         // '\x09' through '\x0d' (tab, line feed, vertical tab, form feed,
         // carriage return); above '\x7f' `property::White_Space` decides.
 280     #[stable(feature = "rust1", since = "1.0.0")]
 281     fn is_whitespace(self) -> bool {
 282         match self {
 283             ' ' | '\x09' ... '\x0d' => true,
 284             c if c > '\x7f' => property::White_Space(c),
 285             _ => false
 286         }
 287     }
 288
         // True when either `is_alphabetic` or `is_numeric` is true; `||`
         // short-circuits, so `is_numeric` runs only for non-alphabetic input.
 289     #[stable(feature = "rust1", since = "1.0.0")]
 290     fn is_alphanumeric(self) -> bool {
 291         self.is_alphabetic() || self.is_numeric()
 292     }
 293
         // No ASCII arm: every character is passed to `general_category::Cc`.
 294     #[stable(feature = "rust1", since = "1.0.0")]
 295     fn is_control(self) -> bool { general_category::Cc(self) }
 296
         // ASCII '0'...'9' is numeric; above '\x7f' the answer comes from
         // `general_category::N`.
 297     #[stable(feature = "rust1", since = "1.0.0")]
 298     fn is_numeric(self) -> bool {
 299         match self {
 300             '0' ... '9' => true,
 301             c if c > '\x7f' => general_category::N(c),
 302             _ => false
 303         }
 304     }
 305
         // Forwards to `conversions::to_lower`.
 306     #[unstable(feature = "unicode",
 307                reason = "pending case transformation decisions")]
 308     fn to_lowercase(self) -> char { conversions::to_lower(self) }
 309
         // Forwards to `conversions::to_upper`.
 310     #[unstable(feature = "unicode",
 311                reason = "pending case transformation decisions")]
 312     fn to_uppercase(self) -> char { conversions::to_upper(self) }
 313
         // Forwards to `charwidth::width`, passing `is_cjk` through unchanged.
 314     #[unstable(feature = "unicode",
 315                reason = "needs expert opinion. is_cjk flag stands out as ugly")]
 316     fn width(self, is_cjk: bool) -> Option<usize> { charwidth::width(self, is_cjk) }
 317 }