src/librustc_unicode/char.rs

   1 // Copyright 2012-2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! A character type.
  12 //!
  13 //! The `char` type represents a single character. More specifically, since
  14 //! 'character' isn't a well-defined concept in Unicode, `char` is a '[Unicode
  15 //! scalar value]', which is similar to, but not the same as, a '[Unicode code
  16 //! point]'.
  17 //!
  18 //! [Unicode scalar value]: http://www.unicode.org/glossary/#unicode_scalar_value
  19 //! [Unicode code point]: http://www.unicode.org/glossary/#code_point
  20 //!
  21 //! This module exists for technical reasons, the primary documentation for
  22 //! `char` is directly on [the `char` primitive type](../primitive.char.html)
  23 //! itself.
  24 //!
  25 //! This module is the home of the iterator implementations for the iterators
  26 //! implemented on `char`, as well as some useful constants and conversion
  27 //! functions that convert various types to `char`.
  28
  29 #![stable(feature = "rust1", since = "1.0.0")]
  30
  31 use core::char::CharExt as C;
  32 use core::option::Option::{self, Some, None};
  33 use core::iter::Iterator;
  34 use tables::{derived_property, property, general_category, conversions};
  35
  36 // stable reexports
  37 #[stable(feature = "rust1", since = "1.0.0")]
  38 pub use core::char::{MAX, from_u32, from_u32_unchecked, from_digit, EscapeUnicode, EscapeDefault};
  39
  40 // unstable reexports
  41 #[unstable(feature = "unicode", issue = "27783")]
  42 pub use tables::UNICODE_VERSION;
  43
/// An iterator that yields the lowercase equivalent of a `char`.
///
/// This `struct` is created by the [`to_lowercase()`] method on [`char`]. See
/// its documentation for more.
///
/// [`to_lowercase()`]: ../primitive.char.html#method.to_lowercase
/// [`char`]: ../primitive.char.html
#[stable(feature = "rust1", since = "1.0.0")]
pub struct ToLowercase(CaseMappingIter);
  53
  54 #[stable(feature = "rust1", since = "1.0.0")]
  55 impl Iterator for ToLowercase {
  56     type Item = char;
  57     fn next(&mut self) -> Option<char> {
  58         self.0.next()
  59     }
  60 }
  61
/// An iterator that yields the uppercase equivalent of a `char`.
///
/// This `struct` is created by the [`to_uppercase()`] method on [`char`]. See
/// its documentation for more.
///
/// [`to_uppercase()`]: ../primitive.char.html#method.to_uppercase
/// [`char`]: ../primitive.char.html
#[stable(feature = "rust1", since = "1.0.0")]
pub struct ToUppercase(CaseMappingIter);
  71
  72 #[stable(feature = "rust1", since = "1.0.0")]
  73 impl Iterator for ToUppercase {
  74     type Item = char;
  75     fn next(&mut self) -> Option<char> {
  76         self.0.next()
  77     }
  78 }
  79
  80
/// Iterator state wrapped by `ToLowercase` and `ToUppercase`: the `char`s of
/// a case mapping that have not been yielded yet.
///
/// The fields of each variant are stored in the order in which they will be
/// yielded; every call to `next()` steps down to the next smaller variant.
enum CaseMappingIter {
    /// Three `char`s are still to be yielded.
    Three(char, char, char),
    /// Two `char`s are still to be yielded.
    Two(char, char),
    /// One `char` is still to be yielded.
    One(char),
    /// The iterator is exhausted.
    Zero,
}
  87
  88 impl CaseMappingIter {
  89     fn new(chars: [char; 3]) -> CaseMappingIter {
  90         if chars[2] == '\0' {
  91             if chars[1] == '\0' {
  92                 CaseMappingIter::One(chars[0])  // Including if chars[0] == '\0'
  93             } else {
  94                 CaseMappingIter::Two(chars[0], chars[1])
  95             }
  96         } else {
  97             CaseMappingIter::Three(chars[0], chars[1], chars[2])
  98         }
  99     }
 100 }
 101
 102 impl Iterator for CaseMappingIter {
 103     type Item = char;
 104     fn next(&mut self) -> Option<char> {
 105         match *self {
 106             CaseMappingIter::Three(a, b, c) => {
 107                 *self = CaseMappingIter::Two(b, c);
 108                 Some(a)
 109             }
 110             CaseMappingIter::Two(b, c) => {
 111                 *self = CaseMappingIter::One(c);
 112                 Some(b)
 113             }
 114             CaseMappingIter::One(c) => {
 115                 *self = CaseMappingIter::Zero;
 116                 Some(c)
 117             }
 118             CaseMappingIter::Zero => None,
 119         }
 120     }
 121 }
 122
 123 #[lang = "char"]
 124 impl char {
    /// Checks if a `char` is a digit in the given radix.
    ///
    /// A 'radix' here is sometimes also called a 'base'. A radix of two
    /// indicates a binary number, a radix of ten, decimal, and a radix of
    /// sixteen, hexadecimal, to give some common values. Any radix up to and
    /// including 36 is supported.
    ///
    /// 'Digit' is defined to be only the following characters:
    ///
    /// * `0-9`
    /// * `a-z`
    /// * `A-Z`
    ///
    /// For a more comprehensive understanding of 'digit', see [`is_numeric()`][is_numeric].
    ///
    /// [is_numeric]: #method.is_numeric
    ///
    /// # Panics
    ///
    /// Panics if given a radix larger than 36.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let d = '1';
    ///
    /// assert!(d.is_digit(10));
    ///
    /// let d = 'f';
    ///
    /// assert!(d.is_digit(16));
    /// assert!(!d.is_digit(10));
    /// ```
    ///
    /// Passing a large radix, causing a panic:
    ///
    /// ```
    /// use std::thread;
    ///
    /// let result = thread::spawn(|| {
    ///     let d = '1';
    ///
    ///     // this panics
    ///     d.is_digit(37);
    /// }).join();
    ///
    /// assert!(result.is_err());
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn is_digit(self, radix: u32) -> bool {
        // Call the `CharExt` trait method (imported as `C`) explicitly: a plain
        // `self.is_digit(radix)` would resolve to this inherent method again.
        C::is_digit(self, radix)
    }
 183
    /// Converts a `char` to a digit in the given radix.
    ///
    /// A 'radix' here is sometimes also called a 'base'. A radix of two
    /// indicates a binary number, a radix of ten, decimal, and a radix of
    /// sixteen, hexadecimal, to give some common values. Any radix up to and
    /// including 36 is supported.
    ///
    /// 'Digit' is defined to be only the following characters:
    ///
    /// * `0-9`
    /// * `a-z`
    /// * `A-Z`
    ///
    /// # Failure
    ///
    /// Returns `None` if the `char` does not refer to a digit in the given radix.
    ///
    /// # Panics
    ///
    /// Panics if given a radix larger than 36.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let d = '1';
    ///
    /// assert_eq!(d.to_digit(10), Some(1));
    ///
    /// let d = 'f';
    ///
    /// assert_eq!(d.to_digit(16), Some(15));
    /// ```
    ///
    /// Passing a non-digit results in failure:
    ///
    /// ```
    /// let d = 'f';
    ///
    /// assert_eq!(d.to_digit(10), None);
    ///
    /// let d = 'z';
    ///
    /// assert_eq!(d.to_digit(16), None);
    /// ```
    ///
    /// Passing a large radix, causing a panic:
    ///
    /// ```
    /// use std::thread;
    ///
    /// let result = thread::spawn(|| {
    ///     let d = '1';
    ///
    ///     // this panics
    ///     d.to_digit(37);
    /// }).join();
    ///
    /// assert!(result.is_err());
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn to_digit(self, radix: u32) -> Option<u32> {
        // Explicit trait call; `self.to_digit(radix)` would pick this inherent
        // method instead of the `CharExt` one.
        C::to_digit(self, radix)
    }
 249
    /// Returns an iterator that yields the hexadecimal Unicode escape of a
    /// character, as `char`s.
    ///
    /// All characters are escaped with Rust syntax of the form `\u{NNNN}`
    /// where `NNNN` is the shortest hexadecimal representation.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// for c in '❤'.escape_unicode() {
    ///     print!("{}", c);
    /// }
    /// println!("");
    /// ```
    ///
    /// This prints:
    ///
    /// ```text
    /// \u{2764}
    /// ```
    ///
    /// Collecting into a `String`:
    ///
    /// ```
    /// let heart: String = '❤'.escape_unicode().collect();
    ///
    /// assert_eq!(heart, r"\u{2764}");
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn escape_unicode(self) -> EscapeUnicode {
        // Forwards to the `CharExt` trait method from `core`.
        C::escape_unicode(self)
    }
 285
    /// Returns an iterator that yields the literal escape code of a `char`.
    ///
    /// The default is chosen with a bias toward producing literals that are
    /// legal in a variety of languages, including C++11 and similar C-family
    /// languages. The exact rules are:
    ///
    /// * Tab is escaped as `\t`.
    /// * Carriage return is escaped as `\r`.
    /// * Line feed is escaped as `\n`.
    /// * Single quote is escaped as `\'`.
    /// * Double quote is escaped as `\"`.
    /// * Backslash is escaped as `\\`.
    /// * Any other character in the 'printable ASCII' range `0x20` .. `0x7e`
    ///   inclusive is not escaped.
    /// * All other characters are given hexadecimal Unicode escapes; see
    ///   [`escape_unicode`][escape_unicode].
    ///
    /// [escape_unicode]: #method.escape_unicode
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// for i in '"'.escape_default() {
    ///     println!("{}", i);
    /// }
    /// ```
    ///
    /// This prints:
    ///
    /// ```text
    /// \
    /// "
    /// ```
    ///
    /// Collecting into a `String`:
    ///
    /// ```
    /// let quote: String = '"'.escape_default().collect();
    ///
    /// assert_eq!(quote, "\\\"");
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn escape_default(self) -> EscapeDefault {
        // Forwards to the `CharExt` trait method from `core`.
        C::escape_default(self)
    }
 334
    /// Returns the number of bytes this `char` would need if encoded in UTF-8.
    ///
    /// That number of bytes is always between 1 and 4, inclusive.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let len = 'A'.len_utf8();
    /// assert_eq!(len, 1);
    ///
    /// let len = 'ß'.len_utf8();
    /// assert_eq!(len, 2);
    ///
    /// let len = 'ℝ'.len_utf8();
    /// assert_eq!(len, 3);
    ///
    /// let len = '💣'.len_utf8();
    /// assert_eq!(len, 4);
    /// ```
    ///
    /// The `&str` type guarantees that its contents are UTF-8, and so we can compare the length it
    /// would take if each code point was represented as a `char` vs in the `&str` itself:
    ///
    /// ```
    /// // as chars
    /// let eastern = '東';
    /// let capital = '京';
    ///
    /// // both can be represented as three bytes
    /// assert_eq!(3, eastern.len_utf8());
    /// assert_eq!(3, capital.len_utf8());
    ///
    /// // as a &str, these two are encoded in UTF-8
    /// let tokyo = "東京";
    ///
    /// let len = eastern.len_utf8() + capital.len_utf8();
    ///
    /// // we can see that they take six bytes total...
    /// assert_eq!(6, tokyo.len());
    ///
    /// // ... just like the &str
    /// assert_eq!(len, tokyo.len());
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn len_utf8(self) -> usize {
        // Forwards to the `CharExt` trait method from `core`.
        C::len_utf8(self)
    }
 385
    /// Returns the number of 16-bit code units this `char` would need if
    /// encoded in UTF-16.
    ///
    /// See the documentation for [`len_utf8()`] for more explanation of this
    /// concept. This function is a mirror, but for UTF-16 instead of UTF-8.
    ///
    /// [`len_utf8()`]: #method.len_utf8
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let len = 'ß'.len_utf16();
    /// assert_eq!(len, 1);
    ///
    /// let len = '💣'.len_utf16();
    /// assert_eq!(len, 2);
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn len_utf16(self) -> usize {
        // Forwards to the `CharExt` trait method from `core`.
        C::len_utf16(self)
    }
 410
    /// Encodes this character as UTF-8 into the provided byte buffer, and then
    /// returns the number of bytes written.
    ///
    /// If the buffer is not large enough, nothing will be written into it and a
    /// `None` will be returned. A buffer of length four is large enough to
    /// encode any `char`.
    ///
    /// # Examples
    ///
    /// In both of these examples, 'ß' takes two bytes to encode.
    ///
    /// A buffer that's large enough:
    ///
    /// ```
    /// #![feature(unicode)]
    ///
    /// let mut b = [0; 2];
    ///
    /// let result = 'ß'.encode_utf8(&mut b);
    ///
    /// assert_eq!(result, Some(2));
    /// ```
    ///
    /// A buffer that's too small:
    ///
    /// ```
    /// #![feature(unicode)]
    ///
    /// let mut b = [0; 1];
    ///
    /// let result = 'ß'.encode_utf8(&mut b);
    ///
    /// assert_eq!(result, None);
    /// ```
    #[unstable(feature = "unicode",
               reason = "pending decision about Iterator/Writer/Reader",
               issue = "27784")]
    #[inline]
    pub fn encode_utf8(self, dst: &mut [u8]) -> Option<usize> {
        // Forwards to the `CharExt` trait method from `core`.
        C::encode_utf8(self, dst)
    }
 450
    /// Encodes this character as UTF-16 into the provided `u16` buffer, and
    /// then returns the number of `u16`s written.
    ///
    /// If the buffer is not large enough, nothing will be written into it and a
    /// `None` will be returned. A buffer of length 2 is large enough to encode
    /// any `char`.
    ///
    /// # Examples
    ///
    /// In both of these examples, 'ß' takes one `u16` to encode.
    ///
    /// A buffer that's large enough:
    ///
    /// ```
    /// #![feature(unicode)]
    ///
    /// let mut b = [0; 1];
    ///
    /// let result = 'ß'.encode_utf16(&mut b);
    ///
    /// assert_eq!(result, Some(1));
    /// ```
    ///
    /// A buffer that's too small:
    ///
    /// ```
    /// #![feature(unicode)]
    ///
    /// let mut b = [0; 0];
    ///
    /// let result = 'ß'.encode_utf16(&mut b);
    ///
    /// assert_eq!(result, None);
    /// ```
    #[unstable(feature = "unicode",
               reason = "pending decision about Iterator/Writer/Reader",
               issue = "27784")]
    #[inline]
    pub fn encode_utf16(self, dst: &mut [u16]) -> Option<usize> {
        // Forwards to the `CharExt` trait method from `core`.
        C::encode_utf16(self, dst)
    }
 490
 491     /// Returns true if this `char` is an alphabetic code point, and false if not.
 492     ///
 493     /// # Examples
 494     ///
 495     /// Basic usage:
 496     ///
 497     /// ```
 498     /// let c = 'a';
 499     ///
 500     /// assert!(c.is_alphabetic());
 501     ///
 502     /// let c = '京';
 503     /// assert!(c.is_alphabetic());
 504     ///
 505     /// let c = '💝';
 506     /// // love is many things, but it is not alphabetic
 507     /// assert!(!c.is_alphabetic());
 508     /// ```
 509     #[stable(feature = "rust1", since = "1.0.0")]
 510     #[inline]
 511     pub fn is_alphabetic(self) -> bool {
 512         match self {
 513             'a'...'z' | 'A'...'Z' => true,
 514             c if c > '\x7f' => derived_property::Alphabetic(c),
 515             _ => false,
 516         }
 517     }
 518
    /// Returns true if this `char` satisfies the 'XID_Start' Unicode property, and false
    /// otherwise.
    ///
    /// 'XID_Start' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to 'ID_Start' but modified for closure under NFKx.
    #[unstable(feature = "unicode",
               reason = "mainly needed for compiler internals",
               issue = "0")]
    #[inline]
    pub fn is_xid_start(self) -> bool {
        // Direct lookup in the generated `XID_Start` table; no ASCII fast path.
        derived_property::XID_Start(self)
    }
 532
    /// Returns true if this `char` satisfies the 'XID_Continue' Unicode property, and false
    /// otherwise.
    ///
    /// 'XID_Continue' is a Unicode Derived Property specified in
    /// [UAX #31](http://unicode.org/reports/tr31/#NFKC_Modifications),
    /// mostly similar to 'ID_Continue' but modified for closure under NFKx.
    #[unstable(feature = "unicode",
               reason = "mainly needed for compiler internals",
               issue = "0")]
    #[inline]
    pub fn is_xid_continue(self) -> bool {
        // Direct lookup in the generated `XID_Continue` table; no ASCII fast path.
        derived_property::XID_Continue(self)
    }
 546
 547     /// Returns true if this `char` is lowercase, and false otherwise.
 548     ///
 549     /// 'Lowercase' is defined according to the terms of the Unicode Derived Core
 550     /// Property `Lowercase`.
 551     ///
 552     /// # Examples
 553     ///
 554     /// Basic usage:
 555     ///
 556     /// ```
 557     /// let c = 'a';
 558     /// assert!(c.is_lowercase());
 559     ///
 560     /// let c = 'δ';
 561     /// assert!(c.is_lowercase());
 562     ///
 563     /// let c = 'A';
 564     /// assert!(!c.is_lowercase());
 565     ///
 566     /// let c = 'Δ';
 567     /// assert!(!c.is_lowercase());
 568     ///
 569     /// // The various Chinese scripts do not have case, and so:
 570     /// let c = '中';
 571     /// assert!(!c.is_lowercase());
 572     /// ```
 573     #[stable(feature = "rust1", since = "1.0.0")]
 574     #[inline]
 575     pub fn is_lowercase(self) -> bool {
 576         match self {
 577             'a'...'z' => true,
 578             c if c > '\x7f' => derived_property::Lowercase(c),
 579             _ => false,
 580         }
 581     }
 582
 583     /// Returns true if this `char` is uppercase, and false otherwise.
 584     ///
 585     /// 'Uppercase' is defined according to the terms of the Unicode Derived Core
 586     /// Property `Uppercase`.
 587     ///
 588     /// # Examples
 589     ///
 590     /// Basic usage:
 591     ///
 592     /// ```
 593     /// let c = 'a';
 594     /// assert!(!c.is_uppercase());
 595     ///
 596     /// let c = 'δ';
 597     /// assert!(!c.is_uppercase());
 598     ///
 599     /// let c = 'A';
 600     /// assert!(c.is_uppercase());
 601     ///
 602     /// let c = 'Δ';
 603     /// assert!(c.is_uppercase());
 604     ///
 605     /// // The various Chinese scripts do not have case, and so:
 606     /// let c = '中';
 607     /// assert!(!c.is_uppercase());
 608     /// ```
 609     #[stable(feature = "rust1", since = "1.0.0")]
 610     #[inline]
 611     pub fn is_uppercase(self) -> bool {
 612         match self {
 613             'A'...'Z' => true,
 614             c if c > '\x7f' => derived_property::Uppercase(c),
 615             _ => false,
 616         }
 617     }
 618
 619     /// Returns true if this `char` is whitespace, and false otherwise.
 620     ///
 621     /// 'Whitespace' is defined according to the terms of the Unicode Derived Core
 622     /// Property `White_Space`.
 623     ///
 624     /// # Examples
 625     ///
 626     /// Basic usage:
 627     ///
 628     /// ```
 629     /// let c = ' ';
 630     /// assert!(c.is_whitespace());
 631     ///
 632     /// // a non-breaking space
 633     /// let c = '\u{A0}';
 634     /// assert!(c.is_whitespace());
 635     ///
 636     /// let c = '越';
 637     /// assert!(!c.is_whitespace());
 638     /// ```
 639     #[stable(feature = "rust1", since = "1.0.0")]
 640     #[inline]
 641     pub fn is_whitespace(self) -> bool {
 642         match self {
 643             ' ' | '\x09'...'\x0d' => true,
 644             c if c > '\x7f' => property::White_Space(c),
 645             _ => false,
 646         }
 647     }
 648
    /// Returns true if this `char` is alphanumeric, and false otherwise.
    ///
    /// 'Alphanumeric'-ness is defined in terms of the Unicode General Categories
    /// 'Nd', 'Nl', 'No' and the Derived Core Property 'Alphabetic'.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let c = '٣';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = '7';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = '৬';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = 'K';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = 'و';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = '藏';
    /// assert!(c.is_alphanumeric());
    ///
    /// // General Category 'No' counts as numeric, and so:
    /// let c = '¾';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = '①';
    /// assert!(c.is_alphanumeric());
    ///
    /// let c = '!';
    /// assert!(!c.is_alphanumeric());
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn is_alphanumeric(self) -> bool {
        self.is_alphabetic() || self.is_numeric()
    }
 688
    /// Returns true if this `char` is a control code point, and false otherwise.
    ///
    /// 'Control code point' is defined in terms of the Unicode General
    /// Category `Cc`.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// // U+009C, STRING TERMINATOR
    /// let c = '\u{9c}';
    /// assert!(c.is_control());
    ///
    /// let c = 'q';
    /// assert!(!c.is_control());
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn is_control(self) -> bool {
        // Direct lookup in the generated `Cc` table; no ASCII fast path.
        general_category::Cc(self)
    }
 711
    /// Returns true if this `char` is numeric, and false otherwise.
    ///
    /// 'Numeric'-ness is defined in terms of the Unicode General Categories
    /// 'Nd', 'Nl', 'No'.
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let c = '٣';
    /// assert!(c.is_numeric());
    ///
    /// let c = '7';
    /// assert!(c.is_numeric());
    ///
    /// let c = '৬';
    /// assert!(c.is_numeric());
    ///
    /// // General Category 'No' is included, and so:
    /// let c = '¾';
    /// assert!(c.is_numeric());
    ///
    /// let c = '①';
    /// assert!(c.is_numeric());
    ///
    /// let c = 'K';
    /// assert!(!c.is_numeric());
    ///
    /// let c = 'و';
    /// assert!(!c.is_numeric());
    ///
    /// let c = '藏';
    /// assert!(!c.is_numeric());
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn is_numeric(self) -> bool {
        match self {
            // ASCII fast path: only `0-9` are numeric below U+0080.
            '0'...'9' => true,
            c if c > '\x7f' => general_category::N(c),
            _ => false,
        }
    }
 755
    /// Returns an iterator that yields the lowercase equivalent of a `char`.
    ///
    /// If no conversion is possible then an iterator with just the input character is returned.
    ///
    /// This performs complex unconditional mappings with no tailoring: it maps
    /// one Unicode character to its lowercase equivalent according to the
    /// [Unicode database] and the additional complex mappings
    /// [`SpecialCasing.txt`]. Conditional mappings (based on context or
    /// language) are not considered here.
    ///
    /// For a full reference, see [here][reference].
    ///
    /// [Unicode database]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    ///
    /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
    ///
    /// [reference]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let c = 'C';
    ///
    /// assert_eq!(c.to_lowercase().next(), Some('c'));
    ///
    /// // Japanese scripts do not have case, and so:
    /// let c = '山';
    /// assert_eq!(c.to_lowercase().next(), Some('山'));
    /// ```
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn to_lowercase(self) -> ToLowercase {
        // `CaseMappingIter::new` turns the three-slot array into an iterator
        // over the one to three `char`s of the mapping.
        ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
    }
 792
    /// Returns an iterator that yields the uppercase equivalent of a `char`.
    ///
    /// If no conversion is possible then an iterator with just the input character is returned.
    ///
    /// This performs complex unconditional mappings with no tailoring: it maps
    /// one Unicode character to its uppercase equivalent according to the
    /// [Unicode database] and the additional complex mappings
    /// [`SpecialCasing.txt`]. Conditional mappings (based on context or
    /// language) are not considered here.
    ///
    /// For a full reference, see [here][reference].
    ///
    /// [Unicode database]: ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    ///
    /// [`SpecialCasing.txt`]: ftp://ftp.unicode.org/Public/UNIDATA/SpecialCasing.txt
    ///
    /// [reference]: http://www.unicode.org/versions/Unicode7.0.0/ch03.pdf#G33992
    ///
    /// # Examples
    ///
    /// Basic usage:
    ///
    /// ```
    /// let c = 'c';
    /// assert_eq!(c.to_uppercase().next(), Some('C'));
    ///
    /// // Japanese does not have case, and so:
    /// let c = '山';
    /// assert_eq!(c.to_uppercase().next(), Some('山'));
    /// ```
    ///
    /// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
    ///
    /// * 'Dotless': I / ı, sometimes written ï
    /// * 'Dotted': İ / i
    ///
    /// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
    ///
    /// ```
    /// let i = 'i';
    ///
    /// let upper_i = i.to_uppercase().next();
    /// ```
    ///
    /// The value of `upper_i` here relies on the language of the text: if we're
    /// in `en-US`, it should be `Some('I')`, but if we're in `tr-TR`, it should
    /// be `Some('İ')`. `to_uppercase()` does not take this into account, and so:
    ///
    /// ```
    /// let i = 'i';
    ///
    /// let upper_i = i.to_uppercase().next();
    ///
    /// assert_eq!(Some('I'), upper_i);
    /// ```
    ///
    /// holds across languages.
    #[stable(feature = "rust1", since = "1.0.0")]
    #[inline]
    pub fn to_uppercase(self) -> ToUppercase {
        // `CaseMappingIter::new` turns the three-slot array into an iterator
        // over the one to three `char`s of the mapping.
        ToUppercase(CaseMappingIter::new(conversions::to_upper(self)))
    }
 855 }
 856
/// An iterator that decodes UTF-16 encoded code points from an iterator of `u16`s.
///
/// Each item is `Ok(char)` for a successfully decoded code point, or
/// `Err(u16)` carrying an unpaired surrogate.
#[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
#[derive(Clone)]
pub struct DecodeUtf16<I>
    where I: Iterator<Item = u16>
{
    // The underlying source of UTF-16 code units.
    iter: I,
    // A code unit that was read from `iter` while looking for the second half
    // of a surrogate pair but did not complete the pair; it is decoded first
    // on the next call to `next()`.
    buf: Option<u16>,
}
 866
 867 /// Create an iterator over the UTF-16 encoded code points in `iterable`,
 868 /// returning unpaired surrogates as `Err`s.
 869 ///
 870 /// # Examples
 871 ///
 872 /// Basic usage:
 873 ///
 874 /// ```
 875 /// #![feature(decode_utf16)]
 876 ///
 877 /// use std::char::decode_utf16;
 878 ///
 879 /// fn main() {
 880 ///     // 𝄞mus<invalid>ic<invalid>
 881 ///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 882 ///              0x0073, 0xDD1E, 0x0069, 0x0063,
 883 ///              0xD834];
 884 ///
 885 ///     assert_eq!(decode_utf16(v.iter().cloned()).collect::<Vec<_>>(),
 886 ///                vec![Ok('𝄞'),
 887 ///                     Ok('m'), Ok('u'), Ok('s'),
 888 ///                     Err(0xDD1E),
 889 ///                     Ok('i'), Ok('c'),
 890 ///                     Err(0xD834)]);
 891 /// }
 892 /// ```
 893 ///
 894 /// A lossy decoder can be obtained by replacing `Err` results with the replacement character:
 895 ///
 896 /// ```
 897 /// #![feature(decode_utf16)]
 898 ///
 899 /// use std::char::{decode_utf16, REPLACEMENT_CHARACTER};
 900 ///
 901 /// fn main() {
 902 ///     // 𝄞mus<invalid>ic<invalid>
 903 ///     let v = [0xD834, 0xDD1E, 0x006d, 0x0075,
 904 ///              0x0073, 0xDD1E, 0x0069, 0x0063,
 905 ///              0xD834];
 906 ///
 907 ///     assert_eq!(decode_utf16(v.iter().cloned())
 908 ///                    .map(|r| r.unwrap_or(REPLACEMENT_CHARACTER))
 909 ///                    .collect::<String>(),
 910 ///                "𝄞mus�ic�");
 911 /// }
 912 /// ```
 913 #[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
 914 #[inline]
 915 pub fn decode_utf16<I: IntoIterator<Item = u16>>(iterable: I) -> DecodeUtf16<I::IntoIter> {
 916     DecodeUtf16 {
 917         iter: iterable.into_iter(),
 918         buf: None,
 919     }
 920 }
 921
 922 #[unstable(feature = "decode_utf16", reason = "recently exposed", issue = "27830")]
 923 impl<I: Iterator<Item=u16>> Iterator for DecodeUtf16<I> {
 924     type Item = Result<char, u16>;
 925
 926     fn next(&mut self) -> Option<Result<char, u16>> {
 927         let u = match self.buf.take() {
 928             Some(buf) => buf,
 929             None => match self.iter.next() {
 930                 Some(u) => u,
 931                 None => return None,
 932             },
 933         };
 934
 935         if u < 0xD800 || 0xDFFF < u {
 936             // not a surrogate
 937             Some(Ok(unsafe { from_u32_unchecked(u as u32) }))
 938         } else if u >= 0xDC00 {
 939             // a trailing surrogate
 940             Some(Err(u))
 941         } else {
 942             let u2 = match self.iter.next() {
 943                 Some(u2) => u2,
 944                 // eof
 945                 None => return Some(Err(u)),
 946             };
 947             if u2 < 0xDC00 || u2 > 0xDFFF {
 948                 // not a trailing surrogate so we're not a valid
 949                 // surrogate pair, so rewind to redecode u2 next time.
 950                 self.buf = Some(u2);
 951                 return Some(Err(u));
 952             }
 953
 954             // all ok, so lets decode it.
 955             let c = (((u - 0xD800) as u32) << 10 | (u2 - 0xDC00) as u32) + 0x1_0000;
 956             Some(Ok(unsafe { from_u32_unchecked(c) }))
 957         }
 958     }
 959
 960     #[inline]
 961     fn size_hint(&self) -> (usize, Option<usize>) {
 962         let (low, high) = self.iter.size_hint();
 963         // we could be entirely valid surrogates (2 elements per
 964         // char), or entirely non-surrogates (1 element per char)
 965         (low / 2, high)
 966     }
 967 }
 968
/// `U+FFFD REPLACEMENT CHARACTER` (�) is used in Unicode to represent a decoding error.
///
/// It can occur, for example, when giving ill-formed UTF-8 bytes to
/// [`String::from_utf8_lossy`](../string/struct.String.html#method.from_utf8_lossy).
/// It can likewise be substituted for the `Err` items produced by
/// `decode_utf16` to obtain a lossy UTF-16 decoder.
#[unstable(feature = "decode_utf16", reason = "recently added", issue = "27830")]
pub const REPLACEMENT_CHARACTER: char = '\u{FFFD}';