src/libsyntax/parse/lexer/mod.rs

   1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use ast::{self, Ident};
  12 use syntax_pos::{self, BytePos, CharPos, Pos, Span, NO_EXPANSION};
  13 use codemap::{CodeMap, FilePathMapping};
  14 use errors::{FatalError, DiagnosticBuilder};
  15 use parse::{token, ParseSess};
  16 use str::char_at;
  17 use symbol::{Symbol, keywords};
  18 use core::unicode::property::Pattern_White_Space;
  19
  20 use std::borrow::Cow;
  21 use std::char;
  22 use std::mem::replace;
  23 use rustc_data_structures::sync::Lrc;
  24
  25 pub mod comments;
  26 mod tokentrees;
  27 mod unicode_chars;
  28
  29 #[derive(Clone, PartialEq, Eq, Debug)]
  30 pub struct TokenAndSpan {
  31     pub tok: token::Token,
  32     pub sp: Span,
  33 }
  34
  35 impl Default for TokenAndSpan {
  36     fn default() -> Self {
  37         TokenAndSpan { tok: token::Whitespace, sp: syntax_pos::DUMMY_SP }
  38     }
  39 }
  40
/// Lexer state for reading tokens out of the source text of one `FileMap`.
pub struct StringReader<'a> {
    /// The parse session; diagnostics are reported through
    /// `sess.span_diagnostic`.
    pub sess: &'a ParseSess,
    /// The absolute offset within the codemap of the next character to read
    pub next_pos: BytePos,
    /// The absolute offset within the codemap of the current character
    pub pos: BytePos,
    /// The current character (which has been read from self.pos).
    /// `None` is what `is_eof` treats as end of input.
    pub ch: Option<char>,
    /// The file whose source text is being lexed.
    pub filemap: Lrc<syntax_pos::FileMap>,
    /// Stop reading src at this index.
    pub end_src_index: usize,
    /// Whether to record new-lines and multibyte chars in filemap.
    /// This is only necessary the first time a filemap is lexed.
    /// If part of a filemap is being re-lexed, this should be set to false.
    pub save_new_lines_and_multibyte: bool,
    // cached:
    /// The lookahead token, refreshed by `advance_token` and handed out by
    /// `try_next_token` / `peek`.
    pub peek_tok: token::Token,
    /// The span belonging to `peek_tok`.
    pub peek_span: Span,
    /// Fatal diagnostics waiting to be emitted by `emit_fatal_errors`.
    pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
    // cache a direct reference to the source text, so that we don't have to
    // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
    src: Lrc<String>,
    /// The most recent token stored by `try_real_token`.
    token: token::Token,
    /// The span of `token`.
    span: Span,
    /// Stack of open delimiters and their spans. Used for error message.
    open_braces: Vec<(token::DelimToken, Span)>,
    /// When set, `mk_ident` stamps this span on the identifiers it creates;
    /// `mk_sp` consults it as well.
    pub override_span: Option<Span>,
}
  69
  70 impl<'a> StringReader<'a> {
  71     fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
  72         unwrap_or!(self.override_span, Span::new(lo, hi, NO_EXPANSION))
  73     }
  74     fn mk_ident(&self, string: &str) -> Ident {
  75         let mut ident = Ident::from_str(string);
  76         if let Some(span) = self.override_span {
  77             ident.span = span;
  78         }
  79         ident
  80     }
  81
  82     fn next_token(&mut self) -> TokenAndSpan where Self: Sized {
  83         let res = self.try_next_token();
  84         self.unwrap_or_abort(res)
  85     }
  86     fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
  87         match res {
  88             Ok(tok) => tok,
  89             Err(_) => {
  90                 self.emit_fatal_errors();
  91                 FatalError.raise();
  92             }
  93         }
  94     }
  95     fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
  96         let mut t = self.try_next_token()?;
  97         loop {
  98             match t.tok {
  99                 token::Whitespace | token::Comment | token::Shebang(_) => {
 100                     t = self.try_next_token()?;
 101                 }
 102                 _ => break,
 103             }
 104         }
 105         self.token = t.tok.clone();
 106         self.span = t.sp;
 107         Ok(t)
 108     }
 109     pub fn real_token(&mut self) -> TokenAndSpan {
 110         let res = self.try_real_token();
 111         self.unwrap_or_abort(res)
 112     }
 113     fn is_eof(&self) -> bool {
 114         self.ch.is_none()
 115     }
 116     /// Return the next token. EFFECT: advances the string_reader.
 117     pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
 118         assert!(self.fatal_errs.is_empty());
 119         let ret_val = TokenAndSpan {
 120             tok: replace(&mut self.peek_tok, token::Whitespace),
 121             sp: self.peek_span,
 122         };
 123         self.advance_token()?;
 124         Ok(ret_val)
 125     }
 126
 127     fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
 128         let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
 129         err.span_label(self.mk_sp(pos, pos), "unterminated raw string");
 130         if hash_count > 0 {
 131             err.note(&format!("this raw string should be terminated with `\"{}`",
 132                               "#".repeat(hash_count as usize)));
 133         }
 134         err.emit();
 135         FatalError.raise();
 136     }
 137
 138     fn fatal(&self, m: &str) -> FatalError {
 139         self.fatal_span(self.peek_span, m)
 140     }
 141     pub fn emit_fatal_errors(&mut self) {
 142         for err in &mut self.fatal_errs {
 143             err.emit();
 144         }
 145         self.fatal_errs.clear();
 146     }
 147     pub fn peek(&self) -> TokenAndSpan {
 148         // FIXME(pcwalton): Bad copy!
 149         TokenAndSpan {
 150             tok: self.peek_tok.clone(),
 151             sp: self.peek_span,
 152         }
 153     }
 154 }
 155
 156 impl<'a> StringReader<'a> {
 157     /// For comments.rs, which hackily pokes into next_pos and ch
 158     pub fn new_raw(sess: &'a ParseSess, filemap: Lrc<syntax_pos::FileMap>) -> Self {
 159         let mut sr = StringReader::new_raw_internal(sess, filemap);
 160         sr.bump();
 161         sr
 162     }
 163
 164     fn new_raw_internal(sess: &'a ParseSess, filemap: Lrc<syntax_pos::FileMap>) -> Self {
 165         if filemap.src.is_none() {
 166             sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
 167                                               filemap.name));
 168         }
 169
 170         let src = (*filemap.src.as_ref().unwrap()).clone();
 171
 172         StringReader {
 173             sess,
 174             next_pos: filemap.start_pos,
 175             pos: filemap.start_pos,
 176             ch: Some('\n'),
 177             filemap,
 178             end_src_index: src.len(),
 179             save_new_lines_and_multibyte: true,
 180             // dummy values; not read
 181             peek_tok: token::Eof,
 182             peek_span: syntax_pos::DUMMY_SP,
 183             src,
 184             fatal_errs: Vec::new(),
 185             token: token::Eof,
 186             span: syntax_pos::DUMMY_SP,
 187             open_braces: Vec::new(),
 188             override_span: None,
 189         }
 190     }
 191
 192     pub fn new(sess: &'a ParseSess, filemap: Lrc<syntax_pos::FileMap>) -> Self {
 193         let mut sr = StringReader::new_raw(sess, filemap);
 194         if sr.advance_token().is_err() {
 195             sr.emit_fatal_errors();
 196             FatalError.raise();
 197         }
 198         sr
 199     }
 200
 201     pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
 202         let begin = sess.codemap().lookup_byte_offset(span.lo());
 203         let end = sess.codemap().lookup_byte_offset(span.hi());
 204
 205         // Make the range zero-length if the span is invalid.
 206         if span.lo() > span.hi() || begin.fm.start_pos != end.fm.start_pos {
 207             span = span.shrink_to_lo();
 208         }
 209
 210         let mut sr = StringReader::new_raw_internal(sess, begin.fm);
 211
 212         // Seek the lexer to the right byte range.
 213         sr.save_new_lines_and_multibyte = false;
 214         sr.next_pos = span.lo();
 215         sr.end_src_index = sr.src_index(span.hi());
 216
 217         sr.bump();
 218
 219         if sr.advance_token().is_err() {
 220             sr.emit_fatal_errors();
 221             FatalError.raise();
 222         }
 223         sr
 224     }
 225
 226     pub fn ch_is(&self, c: char) -> bool {
 227         self.ch == Some(c)
 228     }
 229
 230     /// Report a fatal lexical error with a given span.
 231     pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
 232         self.sess.span_diagnostic.span_fatal(sp, m)
 233     }
 234
 235     /// Report a lexical error with a given span.
 236     pub fn err_span(&self, sp: Span, m: &str) {
 237         self.sess.span_diagnostic.span_err(sp, m)
 238     }
 239
 240
 241     /// Report a fatal error spanning [`from_pos`, `to_pos`).
 242     fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
 243         self.fatal_span(self.mk_sp(from_pos, to_pos), m)
 244     }
 245
 246     /// Report a lexical error spanning [`from_pos`, `to_pos`).
 247     fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
 248         self.err_span(self.mk_sp(from_pos, to_pos), m)
 249     }
 250
 251     /// Pushes a character to a message string for error reporting
 252     fn push_escaped_char_for_msg(m: &mut String, c: char) {
 253         match c {
 254             '\u{20}'...'\u{7e}' => {
 255                 // Don't escape \, ' or " for user-facing messages
 256                 m.push(c);
 257             }
 258             _ => {
 259                 for c in c.escape_default() {
 260                     m.push(c);
 261                 }
 262             }
 263         }
 264     }
 265
 266     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
 267     /// escaped character to the error message
 268     fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
 269         let mut m = m.to_string();
 270         m.push_str(": ");
 271         Self::push_escaped_char_for_msg(&mut m, c);
 272         self.fatal_span_(from_pos, to_pos, &m[..])
 273     }
 274
 275     fn struct_span_fatal(&self,
 276                          from_pos: BytePos,
 277                          to_pos: BytePos,
 278                          m: &str)
 279                          -> DiagnosticBuilder<'a> {
 280         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m)
 281     }
 282
 283     fn struct_fatal_span_char(&self,
 284                               from_pos: BytePos,
 285                               to_pos: BytePos,
 286                               m: &str,
 287                               c: char)
 288                               -> DiagnosticBuilder<'a> {
 289         let mut m = m.to_string();
 290         m.push_str(": ");
 291         Self::push_escaped_char_for_msg(&mut m, c);
 292         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
 293     }
 294
 295     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
 296     /// escaped character to the error message
 297     fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
 298         let mut m = m.to_string();
 299         m.push_str(": ");
 300         Self::push_escaped_char_for_msg(&mut m, c);
 301         self.err_span_(from_pos, to_pos, &m[..]);
 302     }
 303     fn struct_err_span_char(&self,
 304                             from_pos: BytePos,
 305                             to_pos: BytePos,
 306                             m: &str,
 307                             c: char)
 308                             -> DiagnosticBuilder<'a> {
 309         let mut m = m.to_string();
 310         m.push_str(": ");
 311         Self::push_escaped_char_for_msg(&mut m, c);
 312         self.sess.span_diagnostic.struct_span_err(self.mk_sp(from_pos, to_pos), &m[..])
 313     }
 314
 315     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
 316     /// offending string to the error message
 317     fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
 318         m.push_str(": ");
 319         m.push_str(&self.src[self.src_index(from_pos)..self.src_index(to_pos)]);
 320         self.fatal_span_(from_pos, to_pos, &m[..])
 321     }
 322
 323     /// Advance peek_tok and peek_span to refer to the next token, and
 324     /// possibly update the interner.
 325     fn advance_token(&mut self) -> Result<(), ()> {
 326         match self.scan_whitespace_or_comment() {
 327             Some(comment) => {
 328                 self.peek_span = comment.sp;
 329                 self.peek_tok = comment.tok;
 330             }
 331             None => {
 332                 if self.is_eof() {
 333                     self.peek_tok = token::Eof;
 334                     self.peek_span = self.mk_sp(self.filemap.end_pos, self.filemap.end_pos);
 335                 } else {
 336                     let start_bytepos = self.pos;
 337                     self.peek_tok = self.next_token_inner()?;
 338                     self.peek_span = self.mk_sp(start_bytepos, self.pos);
 339                 };
 340             }
 341         }
 342         Ok(())
 343     }
 344
 345     #[inline]
 346     fn src_index(&self, pos: BytePos) -> usize {
 347         (pos - self.filemap.start_pos).to_usize()
 348     }
 349
 350     /// Calls `f` with a string slice of the source text spanning from `start`
 351     /// up to but excluding `self.pos`, meaning the slice does not include
 352     /// the character `self.ch`.
 353     pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
 354         where F: FnOnce(&str) -> T
 355     {
 356         self.with_str_from_to(start, self.pos, f)
 357     }
 358
 359     /// Create a Name from a given offset to the current offset, each
 360     /// adjusted 1 towards each other (assumes that on either side there is a
 361     /// single-byte delimiter).
 362     pub fn name_from(&self, start: BytePos) -> ast::Name {
 363         debug!("taking an ident from {:?} to {:?}", start, self.pos);
 364         self.with_str_from(start, Symbol::intern)
 365     }
 366
 367     /// As name_from, with an explicit endpoint.
 368     pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
 369         debug!("taking an ident from {:?} to {:?}", start, end);
 370         self.with_str_from_to(start, end, Symbol::intern)
 371     }
 372
 373     /// Calls `f` with a string slice of the source text spanning from `start`
 374     /// up to but excluding `end`.
 375     fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
 376         where F: FnOnce(&str) -> T
 377     {
 378         f(&self.src[self.src_index(start)..self.src_index(end)])
 379     }
 380
    /// Converts CRLF to LF in the given string, raising an error on bare CR.
    ///
    /// `start` is the position that corresponds to byte 0 of `s`; it is only
    /// used to compute the spans of the errors reported with `errmsg`.
    /// The input is returned borrowed when it contains no CRLF pair;
    /// otherwise an owned string is built by the nested `translate_crlf_`.
    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
        // Fast path: walk the string until the first CRLF pair, reporting
        // any bare CR met on the way.
        let mut i = 0;
        while i < s.len() {
            let ch = char_at(s, i);
            let next = i + ch.len_utf8();
            if ch == '\r' {
                if next < s.len() && char_at(s, next) == '\n' {
                    // First CRLF found: hand over to the allocating helper,
                    // which resumes scanning at this index.
                    return translate_crlf_(self, start, s, errmsg, i).into();
                }
                let pos = start + BytePos(i as u32);
                let end_pos = start + BytePos(next as u32);
                self.err_span_(pos, end_pos, errmsg);
            }
            i = next;
        }
        return s.into();

        /// Slow path: copies `s` into a new buffer while dropping every
        /// `'\r'` found at or after index `i`, reporting the bare ones.
        fn translate_crlf_(rdr: &StringReader,
                           start: BytePos,
                           s: &str,
                           errmsg: &str,
                           mut i: usize)
                           -> String {
            let mut buf = String::with_capacity(s.len());
            // `j` marks the start of the text that has not been copied yet.
            let mut j = 0;
            while i < s.len() {
                let ch = char_at(s, i);
                let next = i + ch.len_utf8();
                if ch == '\r' {
                    // Flush everything before the CR, then skip the CR itself.
                    if j < i {
                        buf.push_str(&s[j..i]);
                    }
                    j = next;
                    if next >= s.len() || char_at(s, next) != '\n' {
                        let pos = start + BytePos(i as u32);
                        let end_pos = start + BytePos(next as u32);
                        rdr.err_span_(pos, end_pos, errmsg);
                    }
                }
                i = next;
            }
            // Copy whatever follows the last CR.
            if j < s.len() {
                buf.push_str(&s[j..]);
            }
            buf
        }
    }
 429
 430     /// Advance the StringReader by one character. If a newline is
 431     /// discovered, add it to the FileMap's list of line start offsets.
 432     pub fn bump(&mut self) {
 433         let next_src_index = self.src_index(self.next_pos);
 434         if next_src_index < self.end_src_index {
 435             let next_ch = char_at(&self.src, next_src_index);
 436             let next_ch_len = next_ch.len_utf8();
 437
 438             if self.ch.unwrap() == '\n' {
 439                 if self.save_new_lines_and_multibyte {
 440                     self.filemap.next_line(self.next_pos);
 441                 }
 442             }
 443             if next_ch_len > 1 {
 444                 if self.save_new_lines_and_multibyte {
 445                     self.filemap.record_multibyte_char(self.next_pos, next_ch_len);
 446                 }
 447             }
 448             self.filemap.record_width(self.next_pos, next_ch);
 449
 450             self.ch = Some(next_ch);
 451             self.pos = self.next_pos;
 452             self.next_pos = self.next_pos + Pos::from_usize(next_ch_len);
 453         } else {
 454             self.ch = None;
 455             self.pos = self.next_pos;
 456         }
 457     }
 458
 459     pub fn nextch(&self) -> Option<char> {
 460         let next_src_index = self.src_index(self.next_pos);
 461         if next_src_index < self.end_src_index {
 462             Some(char_at(&self.src, next_src_index))
 463         } else {
 464             None
 465         }
 466     }
 467
 468     pub fn nextch_is(&self, c: char) -> bool {
 469         self.nextch() == Some(c)
 470     }
 471
 472     pub fn nextnextch(&self) -> Option<char> {
 473         let next_src_index = self.src_index(self.next_pos);
 474         if next_src_index < self.end_src_index {
 475             let next_next_src_index =
 476                 next_src_index + char_at(&self.src, next_src_index).len_utf8();
 477             if next_next_src_index < self.end_src_index {
 478                 return Some(char_at(&self.src, next_next_src_index));
 479             }
 480         }
 481         None
 482     }
 483
 484     pub fn nextnextch_is(&self, c: char) -> bool {
 485         self.nextnextch() == Some(c)
 486     }
 487
 488     /// Eats <XID_start><XID_continue>*, if possible.
 489     fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
 490         if !ident_start(self.ch) {
 491             return None;
 492         }
 493         let start = self.pos;
 494         while ident_continue(self.ch) {
 495             self.bump();
 496         }
 497
 498         self.with_str_from(start, |string| {
 499             if string == "_" {
 500                 self.sess.span_diagnostic
 501                     .struct_span_warn(self.mk_sp(start, self.pos),
 502                                       "underscore literal suffix is not allowed")
 503                     .warn("this was previously accepted by the compiler but is \
 504                           being phased out; it will become a hard error in \
 505                           a future release!")
 506                     .note("for more information, see issue #42326 \
 507                           <https://github.com/rust-lang/rust/issues/42326>")
 508                     .emit();
 509                 None
 510             } else {
 511                 Some(Symbol::intern(string))
 512             }
 513         })
 514     }
 515
    /// PRECONDITION: self.ch is not whitespace
    /// Eats any kind of comment.
    ///
    /// Handles `//` line comments, `/* */` block comments (delegated to
    /// `scan_block_comment`) and a `#!` shebang line at the very start of
    /// the file. Returns `None`, consuming nothing, when the current
    /// position starts none of these.
    fn scan_comment(&mut self) -> Option<TokenAndSpan> {
        // The precondition is only reported as an error, not enforced.
        // NOTE(review): the message names `consume_any_line_comment`, which
        // is not a function visible in this part of the file.
        if let Some(c) = self.ch {
            if c.is_whitespace() {
                let msg = "called consume_any_line_comment, but there was whitespace";
                self.sess.span_diagnostic.span_err(self.mk_sp(self.pos, self.pos), msg);
            }
        }

        if self.ch_is('/') {
            match self.nextch() {
                Some('/') => {
                    // Skip the two slashes that open the line comment.
                    self.bump();
                    self.bump();

                    // line comments starting with "///" or "//!" are doc-comments
                    let doc_comment = (self.ch_is('/') && !self.nextch_is('/')) || self.ch_is('!');
                    // Both slashes are single bytes, so the comment began 2 bytes back.
                    let start_bpos = self.pos - BytePos(2);

                    // Consume up to, but not including, the line terminator.
                    while !self.is_eof() {
                        match self.ch.unwrap() {
                            '\n' => break,
                            '\r' => {
                                if self.nextch_is('\n') {
                                    // CRLF
                                    break;
                                } else if doc_comment {
                                    self.err_span_(self.pos,
                                                   self.next_pos,
                                                   "bare CR not allowed in doc-comment");
                                }
                            }
                            _ => (),
                        }
                        self.bump();
                    }

                    if doc_comment {
                        self.with_str_from(start_bpos, |string| {
                            // comments with only more "/"s are not doc comments
                            let tok = if is_doc_comment(string) {
                                token::DocComment(Symbol::intern(string))
                            } else {
                                token::Comment
                            };

                            Some(TokenAndSpan {
                                tok,
                                sp: self.mk_sp(start_bpos, self.pos),
                            })
                        })
                    } else {
                        Some(TokenAndSpan {
                            tok: token::Comment,
                            sp: self.mk_sp(start_bpos, self.pos),
                        })
                    }
                }
                Some('*') => {
                    // Skip the opening "/*"; the rest is handled elsewhere.
                    self.bump();
                    self.bump();
                    self.scan_block_comment()
                }
                _ => None,
            }
        } else if self.ch_is('#') {
            if self.nextch_is('!') {

                // Parse an inner attribute.
                if self.nextnextch_is('[') {
                    return None;
                }

                // I guess this is the only way to figure out if
                // we're at the beginning of the file...
                // A throw-away CodeMap holding only this filemap is built
                // just to translate `self.pos` into a line/column pair.
                let cmap = CodeMap::new(FilePathMapping::empty());
                cmap.files.borrow_mut().file_maps.push(self.filemap.clone());
                let loc = cmap.lookup_char_pos_adj(self.pos);
                debug!("Skipping a shebang");
                if loc.line == 1 && loc.col == CharPos(0) {
                    // FIXME: Add shebang "token", return it
                    let start = self.pos;
                    // The shebang runs to the end of the line (or of input).
                    while !self.ch_is('\n') && !self.is_eof() {
                        self.bump();
                    }
                    return Some(TokenAndSpan {
                        tok: token::Shebang(self.name_from(start)),
                        sp: self.mk_sp(start, self.pos),
                    });
                }
            }
            None
        } else {
            None
        }
    }
 613
 614     /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
 615     /// return None.
 616     fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
 617         match self.ch.unwrap_or('\0') {
 618             // # to handle shebang at start of file -- this is the entry point
 619             // for skipping over all "junk"
 620             '/' | '#' => {
 621                 let c = self.scan_comment();
 622                 debug!("scanning a comment {:?}", c);
 623                 c
 624             },
 625             c if is_pattern_whitespace(Some(c)) => {
 626                 let start_bpos = self.pos;
 627                 while is_pattern_whitespace(self.ch) {
 628                     self.bump();
 629                 }
 630                 let c = Some(TokenAndSpan {
 631                     tok: token::Whitespace,
 632                     sp: self.mk_sp(start_bpos, self.pos),
 633                 });
 634                 debug!("scanning whitespace: {:?}", c);
 635                 c
 636             }
 637             _ => None,
 638         }
 639     }
 640
 641     /// Might return a sugared-doc-attr
 642     fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
 643         // block comments starting with "/**" or "/*!" are doc-comments
 644         let is_doc_comment = self.ch_is('*') || self.ch_is('!');
 645         let start_bpos = self.pos - BytePos(2);
 646
 647         let mut level: isize = 1;
 648         let mut has_cr = false;
 649         while level > 0 {
 650             if self.is_eof() {
 651                 let msg = if is_doc_comment {
 652                     "unterminated block doc-comment"
 653                 } else {
 654                     "unterminated block comment"
 655                 };
 656                 let last_bpos = self.pos;
 657                 self.fatal_span_(start_bpos, last_bpos, msg).raise();
 658             }
 659             let n = self.ch.unwrap();
 660             match n {
 661                 '/' if self.nextch_is('*') => {
 662                     level += 1;
 663                     self.bump();
 664                 }
 665                 '*' if self.nextch_is('/') => {
 666                     level -= 1;
 667                     self.bump();
 668                 }
 669                 '\r' => {
 670                     has_cr = true;
 671                 }
 672                 _ => (),
 673             }
 674             self.bump();
 675         }
 676
 677         self.with_str_from(start_bpos, |string| {
 678             // but comments with only "*"s between two "/"s are not
 679             let tok = if is_block_doc_comment(string) {
 680                 let string = if has_cr {
 681                     self.translate_crlf(start_bpos,
 682                                         string,
 683                                         "bare CR not allowed in block doc-comment")
 684                 } else {
 685                     string.into()
 686                 };
 687                 token::DocComment(Symbol::intern(&string[..]))
 688             } else {
 689                 token::Comment
 690             };
 691
 692             Some(TokenAndSpan {
 693                 tok,
 694                 sp: self.mk_sp(start_bpos, self.pos),
 695             })
 696         })
 697     }
 698
 699     /// Scan through any digits (base `scan_radix`) or underscores,
 700     /// and return how many digits there were.
 701     ///
 702     /// `real_radix` represents the true radix of the number we're
 703     /// interested in, and errors will be emitted for any digits
 704     /// between `real_radix` and `scan_radix`.
 705     fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
 706         assert!(real_radix <= scan_radix);
 707         let mut len = 0;
 708         loop {
 709             let c = self.ch;
 710             if c == Some('_') {
 711                 debug!("skipping a _");
 712                 self.bump();
 713                 continue;
 714             }
 715             match c.and_then(|cc| cc.to_digit(scan_radix)) {
 716                 Some(_) => {
 717                     debug!("{:?} in scan_digits", c);
 718                     // check that the hypothetical digit is actually
 719                     // in range for the true radix
 720                     if c.unwrap().to_digit(real_radix).is_none() {
 721                         self.err_span_(self.pos,
 722                                        self.next_pos,
 723                                        &format!("invalid digit for a base {} literal", real_radix));
 724                     }
 725                     len += 1;
 726                     self.bump();
 727                 }
 728                 _ => return len,
 729             }
 730         }
 731     }
 732
    /// Lex a LIT_INTEGER or a LIT_FLOAT
    ///
    /// `c` is the first character of the literal; presumably it equals
    /// `self.ch` on entry, since it is consumed by the first `bump` below
    /// (confirm against the caller). Returns `token::Integer` or
    /// `token::Float` holding the literal's source text, or the integer
    /// `"0"` after reporting an error when no digits were found.
    fn scan_number(&mut self, c: char) -> token::Lit {
        let num_digits;
        let mut base = 10;
        let start_bpos = self.pos;

        // Consume the first character of the literal.
        self.bump();

        if c == '0' {
            match self.ch.unwrap_or('\0') {
                'b' => {
                    self.bump();
                    base = 2;
                    // Scan with radix 10 so that stray decimal digits are
                    // consumed and reported instead of ending the literal.
                    num_digits = self.scan_digits(2, 10);
                }
                'o' => {
                    self.bump();
                    base = 8;
                    num_digits = self.scan_digits(8, 10);
                }
                'x' => {
                    self.bump();
                    base = 16;
                    num_digits = self.scan_digits(16, 16);
                }
                '0'...'9' | '_' | '.' | 'e' | 'E' => {
                    // `+ 1` counts the leading '0' consumed above.
                    num_digits = self.scan_digits(10, 10) + 1;
                }
                _ => {
                    // just a 0
                    return token::Integer(self.name_from(start_bpos));
                }
            }
        } else if c.is_digit(10) {
            // `+ 1` counts the leading digit consumed above.
            num_digits = self.scan_digits(10, 10) + 1;
        } else {
            num_digits = 0;
        }

        if num_digits == 0 {
            self.err_span_(start_bpos,
                           self.pos,
                           "no valid digits found for number");
            return token::Integer(Symbol::intern("0"));
        }

        // might be a float, but don't be greedy if this is actually an
        // integer literal followed by field/method access or a range pattern
        // (`0..2` and `12.foo()`)
        if self.ch_is('.') && !self.nextch_is('.') &&
           !ident_start(self.nextch()) {
            // might have stuff after the ., and if it does, it needs to start
            // with a number
            self.bump();
            if self.ch.unwrap_or('\0').is_digit(10) {
                self.scan_digits(10, 10);
                self.scan_float_exponent();
            }
            let pos = self.pos;
            // NOTE(review): `check_float_base` is defined outside this view;
            // presumably it rejects float literals whose base is not 10.
            self.check_float_base(start_bpos, pos, base);
            token::Float(self.name_from(start_bpos))
        } else {
            // it might be a float if it has an exponent
            if self.ch_is('e') || self.ch_is('E') {
                self.scan_float_exponent();
                let pos = self.pos;
                self.check_float_base(start_bpos, pos, base);
                return token::Float(self.name_from(start_bpos));
            }
            // but we certainly have an integer!
            token::Integer(self.name_from(start_bpos))
        }
    }
 806
 807     /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
 808     /// error if too many or too few digits are encountered.
 809     fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
 810         debug!("scanning {} digits until {:?}", n_digits, delim);
 811         let start_bpos = self.pos;
 812         let mut accum_int = 0;
 813
 814         let mut valid = true;
 815         for _ in 0..n_digits {
 816             if self.is_eof() {
 817                 let last_bpos = self.pos;
 818                 self.fatal_span_(start_bpos,
 819                                  last_bpos,
 820                                  "unterminated numeric character escape").raise();
 821             }
 822             if self.ch_is(delim) {
 823                 let last_bpos = self.pos;
 824                 self.err_span_(start_bpos,
 825                                last_bpos,
 826                                "numeric character escape is too short");
 827                 valid = false;
 828                 break;
 829             }
 830             let c = self.ch.unwrap_or('\x00');
 831             accum_int *= 16;
 832             accum_int += c.to_digit(16).unwrap_or_else(|| {
 833                 self.err_span_char(self.pos,
 834                                    self.next_pos,
 835                                    "invalid character in numeric character escape",
 836                                    c);
 837
 838                 valid = false;
 839                 0
 840             });
 841             self.bump();
 842         }
 843
 844         if below_0x7f_only && accum_int >= 0x80 {
 845             self.err_span_(start_bpos,
 846                            self.pos,
 847                            "this form of character escape may only be used with characters in \
 848                             the range [\\x00-\\x7f]");
 849             valid = false;
 850         }
 851
 852         match char::from_u32(accum_int) {
 853             Some(_) => valid,
 854             None => {
 855                 let last_bpos = self.pos;
 856                 self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
 857                 false
 858             }
 859         }
 860     }
 861
 862     /// Scan for a single (possibly escaped) byte or char
 863     /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
 864     /// `start` is the position of `first_source_char`, which is already consumed.
 865     ///
 866     /// Returns true if there was a valid char/byte, false otherwise.
 867     fn scan_char_or_byte(&mut self,
 868                          start: BytePos,
 869                          first_source_char: char,
 870                          ascii_only: bool,
 871                          delim: char)
 872                          -> bool {
 873         match first_source_char {
 874             '\\' => {
 875                 // '\X' for some X must be a character constant:
 876                 let escaped = self.ch;
 877                 let escaped_pos = self.pos;
 878                 self.bump();
 879                 match escaped {
 880                     None => {}  // EOF here is an error that will be checked later.
 881                     Some(e) => {
 882                         return match e {
 883                             'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
 884                             'x' => self.scan_byte_escape(delim, !ascii_only),
 885                             'u' => {
 886                                 let valid = if self.ch_is('{') {
 887                                     self.scan_unicode_escape(delim) && !ascii_only
 888                                 } else {
 889                                     let span = self.mk_sp(start, self.pos);
 890                                     self.sess.span_diagnostic
 891                                         .struct_span_err(span, "incorrect unicode escape sequence")
 892                                         .span_help(span,
 893                                                    "format of unicode escape sequences is \
 894                                                     `\\u{…}`")
 895                                         .emit();
 896                                     false
 897                                 };
 898                                 if ascii_only {
 899                                     self.err_span_(start,
 900                                                    self.pos,
 901                                                    "unicode escape sequences cannot be used as a \
 902                                                     byte or in a byte string");
 903                                 }
 904                                 valid
 905
 906                             }
 907                             '\n' if delim == '"' => {
 908                                 self.consume_whitespace();
 909                                 true
 910                             }
 911                             '\r' if delim == '"' && self.ch_is('\n') => {
 912                                 self.consume_whitespace();
 913                                 true
 914                             }
 915                             c => {
 916                                 let pos = self.pos;
 917                                 let mut err = self.struct_err_span_char(escaped_pos,
 918                                                                         pos,
 919                                                                         if ascii_only {
 920                                                                             "unknown byte escape"
 921                                                                         } else {
 922                                                                             "unknown character \
 923                                                                              escape"
 924                                                                         },
 925                                                                         c);
 926                                 if e == '\r' {
 927                                     err.span_help(self.mk_sp(escaped_pos, pos),
 928                                                   "this is an isolated carriage return; consider \
 929                                                    checking your editor and version control \
 930                                                    settings");
 931                                 }
 932                                 if (e == '{' || e == '}') && !ascii_only {
 933                                     err.span_help(self.mk_sp(escaped_pos, pos),
 934                                                   "if used in a formatting string, curly braces \
 935                                                    are escaped with `{{` and `}}`");
 936                                 }
 937                                 err.emit();
 938                                 false
 939                             }
 940                         }
 941                     }
 942                 }
 943             }
 944             '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
 945                 let pos = self.pos;
 946                 self.err_span_char(start,
 947                                    pos,
 948                                    if ascii_only {
 949                                        "byte constant must be escaped"
 950                                    } else {
 951                                        "character constant must be escaped"
 952                                    },
 953                                    first_source_char);
 954                 return false;
 955             }
 956             '\r' => {
 957                 if self.ch_is('\n') {
 958                     self.bump();
 959                     return true;
 960                 } else {
 961                     self.err_span_(start,
 962                                    self.pos,
 963                                    "bare CR not allowed in string, use \\r instead");
 964                     return false;
 965                 }
 966             }
 967             _ => {
 968                 if ascii_only && first_source_char > '\x7F' {
 969                     let pos = self.pos;
 970                     self.err_span_(start,
 971                                    pos,
 972                                    "byte constant must be ASCII. Use a \\xHH escape for a \
 973                                     non-ASCII byte");
 974                     return false;
 975                 }
 976             }
 977         }
 978         true
 979     }
 980
 981     /// Scan over a `\u{...}` escape
 982     ///
 983     /// At this point, we have already seen the `\` and the `u`, the `{` is the current character.
 984     /// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
 985     /// and pass over the `}`.
 986     fn scan_unicode_escape(&mut self, delim: char) -> bool {
 987         self.bump(); // past the {
 988         let start_bpos = self.pos;
 989         let mut valid = true;
 990
 991         if let Some('_') = self.ch {
 992             // disallow leading `_`
 993             self.err_span_(self.pos,
 994                            self.next_pos,
 995                            "invalid start of unicode escape");
 996             valid = false;
 997         }
 998
 999         let count = self.scan_digits(16, 16);
1000
1001         if count > 6 {
1002             self.err_span_(start_bpos,
1003                            self.pos,
1004                            "overlong unicode escape (must have at most 6 hex digits)");
1005             valid = false;
1006         }
1007         loop {
1008             match self.ch {
1009                 Some('}') => {
1010                     if valid && count == 0 {
1011                         self.err_span_(start_bpos,
1012                                        self.pos,
1013                                        "empty unicode escape (must have at least 1 hex digit)");
1014                         valid = false;
1015                     }
1016                     self.bump(); // past the ending `}`
1017                     break;
1018                 },
1019                 Some(c) => {
1020                     if c == delim {
1021                         self.err_span_(self.pos,
1022                                        self.pos,
1023                                        "unterminated unicode escape (needed a `}`)");
1024                         valid = false;
1025                         break;
1026                     } else if valid {
1027                         self.err_span_char(start_bpos,
1028                                            self.pos,
1029                                            "invalid character in unicode escape",
1030                                            c);
1031                         valid = false;
1032                     }
1033                 },
1034                 None => {
1035                     self.fatal_span_(start_bpos,
1036                                      self.pos,
1037                                      "unterminated unicode escape (found EOF)").raise();
1038                 }
1039             }
1040             self.bump();
1041         }
1042         valid
1043     }
1044
1045     /// Scan over a float exponent.
1046     fn scan_float_exponent(&mut self) {
1047         if self.ch_is('e') || self.ch_is('E') {
1048             self.bump();
1049             if self.ch_is('-') || self.ch_is('+') {
1050                 self.bump();
1051             }
1052             if self.scan_digits(10, 10) == 0 {
1053                 self.err_span_(self.pos,
1054                                self.next_pos,
1055                                "expected at least one digit in exponent")
1056             }
1057         }
1058     }
1059
1060     /// Check that a base is valid for a floating literal, emitting a nice
1061     /// error if it isn't.
1062     fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1063         match base {
1064             16 => {
1065                 self.err_span_(start_bpos,
1066                                last_bpos,
1067                                "hexadecimal float literal is not supported")
1068             }
1069             8 => {
1070                 self.err_span_(start_bpos,
1071                                last_bpos,
1072                                "octal float literal is not supported")
1073             }
1074             2 => {
1075                 self.err_span_(start_bpos,
1076                                last_bpos,
1077                                "binary float literal is not supported")
1078             }
1079             _ => (),
1080         }
1081     }
1082
1083     fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1084         self.bump();
1085         if self.ch_is('=') {
1086             self.bump();
1087             token::BinOpEq(op)
1088         } else {
1089             token::BinOp(op)
1090         }
1091     }
1092
1093     /// Return the next token from the string, advances the input past that
1094     /// token, and updates the interner
1095     fn next_token_inner(&mut self) -> Result<token::Token, ()> {
1096         let c = self.ch;
1097
1098         if ident_start(c) {
1099             let (is_ident_start, is_raw_ident) =
1100                 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1101                     // r# followed by an identifier starter is a raw identifier.
1102                     // This is an exception to the r# case below.
1103                     ('r', Some('#'), x) if ident_start(x) => (true, true),
1104                     // r as in r" or r#" is part of a raw string literal.
1105                     // b as in b' is part of a byte literal.
1106                     // They are not identifiers, and are handled further down.
1107                     ('r', Some('"'), _) |
1108                     ('r', Some('#'), _) |
1109                     ('b', Some('"'), _) |
1110                     ('b', Some('\''), _) |
1111                     ('b', Some('r'), Some('"')) |
1112                     ('b', Some('r'), Some('#')) => (false, false),
1113                     _ => (true, false),
1114                 };
1115             if is_ident_start {
1116                 let raw_start = self.pos;
1117                 if is_raw_ident {
1118                     // Consume the 'r#' characters.
1119                     self.bump();
1120                     self.bump();
1121                 }
1122
1123                 let start = self.pos;
1124                 while ident_continue(self.ch) {
1125                     self.bump();
1126                 }
1127
1128                 return Ok(self.with_str_from(start, |string| {
1129                     // FIXME: perform NFKC normalization here. (Issue #2253)
1130                     let ident = self.mk_ident(string);
1131                     if is_raw_ident && (ident.is_path_segment_keyword() ||
1132                                         ident.name == keywords::Underscore.name()) {
1133                         self.fatal_span_(raw_start, self.pos,
1134                             &format!("`r#{}` is not currently supported.", ident.name)
1135                         ).raise();
1136                     }
1137                     if is_raw_ident {
1138                         let span = self.mk_sp(raw_start, self.pos);
1139                         self.sess.raw_identifier_spans.borrow_mut().push(span);
1140                     }
1141                     token::Ident(ident, is_raw_ident)
1142                 }));
1143             }
1144         }
1145
1146         if is_dec_digit(c) {
1147             let num = self.scan_number(c.unwrap());
1148             let suffix = self.scan_optional_raw_name();
1149             debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1150             return Ok(token::Literal(num, suffix));
1151         }
1152
1153         match c.expect("next_token_inner called at EOF") {
1154             // One-byte tokens.
1155             ';' => {
1156                 self.bump();
1157                 Ok(token::Semi)
1158             }
1159             ',' => {
1160                 self.bump();
1161                 Ok(token::Comma)
1162             }
1163             '.' => {
1164                 self.bump();
1165                 if self.ch_is('.') {
1166                     self.bump();
1167                     if self.ch_is('.') {
1168                         self.bump();
1169                         Ok(token::DotDotDot)
1170                     } else if self.ch_is('=') {
1171                         self.bump();
1172                         Ok(token::DotDotEq)
1173                     } else {
1174                         Ok(token::DotDot)
1175                     }
1176                 } else {
1177                     Ok(token::Dot)
1178                 }
1179             }
1180             '(' => {
1181                 self.bump();
1182                 Ok(token::OpenDelim(token::Paren))
1183             }
1184             ')' => {
1185                 self.bump();
1186                 Ok(token::CloseDelim(token::Paren))
1187             }
1188             '{' => {
1189                 self.bump();
1190                 Ok(token::OpenDelim(token::Brace))
1191             }
1192             '}' => {
1193                 self.bump();
1194                 Ok(token::CloseDelim(token::Brace))
1195             }
1196             '[' => {
1197                 self.bump();
1198                 Ok(token::OpenDelim(token::Bracket))
1199             }
1200             ']' => {
1201                 self.bump();
1202                 Ok(token::CloseDelim(token::Bracket))
1203             }
1204             '@' => {
1205                 self.bump();
1206                 Ok(token::At)
1207             }
1208             '#' => {
1209                 self.bump();
1210                 Ok(token::Pound)
1211             }
1212             '~' => {
1213                 self.bump();
1214                 Ok(token::Tilde)
1215             }
1216             '?' => {
1217                 self.bump();
1218                 Ok(token::Question)
1219             }
1220             ':' => {
1221                 self.bump();
1222                 if self.ch_is(':') {
1223                     self.bump();
1224                     Ok(token::ModSep)
1225                 } else {
1226                     Ok(token::Colon)
1227                 }
1228             }
1229
1230             '$' => {
1231                 self.bump();
1232                 Ok(token::Dollar)
1233             }
1234
1235             // Multi-byte tokens.
1236             '=' => {
1237                 self.bump();
1238                 if self.ch_is('=') {
1239                     self.bump();
1240                     Ok(token::EqEq)
1241                 } else if self.ch_is('>') {
1242                     self.bump();
1243                     Ok(token::FatArrow)
1244                 } else {
1245                     Ok(token::Eq)
1246                 }
1247             }
1248             '!' => {
1249                 self.bump();
1250                 if self.ch_is('=') {
1251                     self.bump();
1252                     Ok(token::Ne)
1253                 } else {
1254                     Ok(token::Not)
1255                 }
1256             }
1257             '<' => {
1258                 self.bump();
1259                 match self.ch.unwrap_or('\x00') {
1260                     '=' => {
1261                         self.bump();
1262                         Ok(token::Le)
1263                     }
1264                     '<' => {
1265                         Ok(self.binop(token::Shl))
1266                     }
1267                     '-' => {
1268                         self.bump();
1269                         match self.ch.unwrap_or('\x00') {
1270                             _ => {
1271                                 Ok(token::LArrow)
1272                             }
1273                         }
1274                     }
1275                     _ => {
1276                         Ok(token::Lt)
1277                     }
1278                 }
1279             }
1280             '>' => {
1281                 self.bump();
1282                 match self.ch.unwrap_or('\x00') {
1283                     '=' => {
1284                         self.bump();
1285                         Ok(token::Ge)
1286                     }
1287                     '>' => {
1288                         Ok(self.binop(token::Shr))
1289                     }
1290                     _ => {
1291                         Ok(token::Gt)
1292                     }
1293                 }
1294             }
1295             '\'' => {
1296                 // Either a character constant 'a' OR a lifetime name 'abc
1297                 let start_with_quote = self.pos;
1298                 self.bump();
1299                 let start = self.pos;
1300
1301                 // the eof will be picked up by the final `'` check below
1302                 let c2 = self.ch.unwrap_or('\x00');
1303                 self.bump();
1304
1305                 // If the character is an ident start not followed by another single
1306                 // quote, then this is a lifetime name:
1307                 if ident_start(Some(c2)) && !self.ch_is('\'') {
1308                     while ident_continue(self.ch) {
1309                         self.bump();
1310                     }
1311                     // lifetimes shouldn't end with a single quote
1312                     // if we find one, then this is an invalid character literal
1313                     if self.ch_is('\'') {
1314                         self.fatal_span_verbose(start_with_quote, self.next_pos,
1315                                 String::from("character literal may only contain one codepoint"))
1316                             .raise();
1317
1318                     }
1319
1320                     // Include the leading `'` in the real identifier, for macro
1321                     // expansion purposes. See #12512 for the gory details of why
1322                     // this is necessary.
1323                     let ident = self.with_str_from(start, |lifetime_name| {
1324                         self.mk_ident(&format!("'{}", lifetime_name))
1325                     });
1326
1327                     return Ok(token::Lifetime(ident));
1328                 }
1329
1330                 let valid = self.scan_char_or_byte(start,
1331                                                    c2,
1332                                                    // ascii_only =
1333                                                    false,
1334                                                    '\'');
1335
1336                 if !self.ch_is('\'') {
1337                     let pos = self.pos;
1338                     loop {
1339                         self.bump();
1340                         if self.ch_is('\'') {
1341                             let start = self.src_index(start);
1342                             let end = self.src_index(self.pos);
1343                             self.bump();
1344                             let span = self.mk_sp(start_with_quote, self.pos);
1345                             self.sess.span_diagnostic
1346                                 .struct_span_err(span,
1347                                                  "character literal may only contain one codepoint")
1348                                 .span_suggestion(span,
1349                                                  "if you meant to write a `str` literal, \
1350                                                   use double quotes",
1351                                                  format!("\"{}\"", &self.src[start..end]))
1352                                 .emit();
1353                             return Ok(token::Literal(token::Str_(Symbol::intern("??")), None))
1354                         }
1355                         if self.ch_is('\n') || self.is_eof() || self.ch_is('/') {
1356                             // Only attempt to infer single line string literals. If we encounter
1357                             // a slash, bail out in order to avoid nonsensical suggestion when
1358                             // involving comments.
1359                             break;
1360                         }
1361                     }
1362                     self.fatal_span_verbose(start_with_quote, pos,
1363                         String::from("character literal may only contain one codepoint")).raise();
1364                 }
1365
1366                 let id = if valid {
1367                     self.name_from(start)
1368                 } else {
1369                     Symbol::intern("0")
1370                 };
1371                 self.bump(); // advance ch past token
1372                 let suffix = self.scan_optional_raw_name();
1373                 Ok(token::Literal(token::Char(id), suffix))
1374             }
1375             'b' => {
1376                 self.bump();
1377                 let lit = match self.ch {
1378                     Some('\'') => self.scan_byte(),
1379                     Some('"') => self.scan_byte_string(),
1380                     Some('r') => self.scan_raw_byte_string(),
1381                     _ => unreachable!(),  // Should have been a token::Ident above.
1382                 };
1383                 let suffix = self.scan_optional_raw_name();
1384                 Ok(token::Literal(lit, suffix))
1385             }
1386             '"' => {
1387                 let start_bpos = self.pos;
1388                 let mut valid = true;
1389                 self.bump();
1390                 while !self.ch_is('"') {
1391                     if self.is_eof() {
1392                         let last_bpos = self.pos;
1393                         self.fatal_span_(start_bpos,
1394                                          last_bpos,
1395                                          "unterminated double quote string").raise();
1396                     }
1397
1398                     let ch_start = self.pos;
1399                     let ch = self.ch.unwrap();
1400                     self.bump();
1401                     valid &= self.scan_char_or_byte(ch_start,
1402                                                     ch,
1403                                                     // ascii_only =
1404                                                     false,
1405                                                     '"');
1406                 }
1407                 // adjust for the ASCII " at the start of the literal
1408                 let id = if valid {
1409                     self.name_from(start_bpos + BytePos(1))
1410                 } else {
1411                     Symbol::intern("??")
1412                 };
1413                 self.bump();
1414                 let suffix = self.scan_optional_raw_name();
1415                 Ok(token::Literal(token::Str_(id), suffix))
1416             }
1417             'r' => {
1418                 let start_bpos = self.pos;
1419                 self.bump();
1420                 let mut hash_count: u16 = 0;
1421                 while self.ch_is('#') {
1422                     self.bump();
1423                     hash_count += 1;
1424                 }
1425
1426                 if self.is_eof() {
1427                     self.fail_unterminated_raw_string(start_bpos, hash_count);
1428                 } else if !self.ch_is('"') {
1429                     let last_bpos = self.pos;
1430                     let curr_char = self.ch.unwrap();
1431                     self.fatal_span_char(start_bpos,
1432                                          last_bpos,
1433                                          "found invalid character; only `#` is allowed \
1434                                          in raw string delimitation",
1435                                          curr_char).raise();
1436                 }
1437                 self.bump();
1438                 let content_start_bpos = self.pos;
1439                 let mut content_end_bpos;
1440                 let mut valid = true;
1441                 'outer: loop {
1442                     if self.is_eof() {
1443                         self.fail_unterminated_raw_string(start_bpos, hash_count);
1444                     }
1445                     // if self.ch_is('"') {
1446                     // content_end_bpos = self.pos;
1447                     // for _ in 0..hash_count {
1448                     // self.bump();
1449                     // if !self.ch_is('#') {
1450                     // continue 'outer;
1451                     let c = self.ch.unwrap();
1452                     match c {
1453                         '"' => {
1454                             content_end_bpos = self.pos;
1455                             for _ in 0..hash_count {
1456                                 self.bump();
1457                                 if !self.ch_is('#') {
1458                                     continue 'outer;
1459                                 }
1460                             }
1461                             break;
1462                         }
1463                         '\r' => {
1464                             if !self.nextch_is('\n') {
1465                                 let last_bpos = self.pos;
1466                                 self.err_span_(start_bpos,
1467                                                last_bpos,
1468                                                "bare CR not allowed in raw string, use \\r \
1469                                                 instead");
1470                                 valid = false;
1471                             }
1472                         }
1473                         _ => (),
1474                     }
1475                     self.bump();
1476                 }
1477                 self.bump();
1478                 let id = if valid {
1479                     self.name_from_to(content_start_bpos, content_end_bpos)
1480                 } else {
1481                     Symbol::intern("??")
1482                 };
1483                 let suffix = self.scan_optional_raw_name();
1484                 Ok(token::Literal(token::StrRaw(id, hash_count), suffix))
1485             }
1486             '-' => {
1487                 if self.nextch_is('>') {
1488                     self.bump();
1489                     self.bump();
1490                     Ok(token::RArrow)
1491                 } else {
1492                     Ok(self.binop(token::Minus))
1493                 }
1494             }
1495             '&' => {
1496                 if self.nextch_is('&') {
1497                     self.bump();
1498                     self.bump();
1499                     Ok(token::AndAnd)
1500                 } else {
1501                     Ok(self.binop(token::And))
1502                 }
1503             }
1504             '|' => {
1505                 match self.nextch() {
1506                     Some('|') => {
1507                         self.bump();
1508                         self.bump();
1509                         Ok(token::OrOr)
1510                     }
1511                     _ => {
1512                         Ok(self.binop(token::Or))
1513                     }
1514                 }
1515             }
1516             '+' => {
1517                 Ok(self.binop(token::Plus))
1518             }
1519             '*' => {
1520                 Ok(self.binop(token::Star))
1521             }
1522             '/' => {
1523                 Ok(self.binop(token::Slash))
1524             }
1525             '^' => {
1526                 Ok(self.binop(token::Caret))
1527             }
1528             '%' => {
1529                 Ok(self.binop(token::Percent))
1530             }
1531             c => {
1532                 let last_bpos = self.pos;
1533                 let bpos = self.next_pos;
1534                 let mut err = self.struct_fatal_span_char(last_bpos,
1535                                                           bpos,
1536                                                           "unknown start of token",
1537                                                           c);
1538                 unicode_chars::check_for_substitution(self, c, &mut err);
1539                 self.fatal_errs.push(err);
1540                 Err(())
1541             }
1542         }
1543     }
1544
1545     fn consume_whitespace(&mut self) {
1546         while is_pattern_whitespace(self.ch) && !self.is_eof() {
1547             self.bump();
1548         }
1549     }
1550
1551     fn read_to_eol(&mut self) -> String {
1552         let mut val = String::new();
1553         while !self.ch_is('\n') && !self.is_eof() {
1554             val.push(self.ch.unwrap());
1555             self.bump();
1556         }
1557         if self.ch_is('\n') {
1558             self.bump();
1559         }
1560         val
1561     }
1562
1563     fn read_one_line_comment(&mut self) -> String {
1564         let val = self.read_to_eol();
1565         assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1566                 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1567         val
1568     }
1569
1570     fn consume_non_eol_whitespace(&mut self) {
1571         while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
1572             self.bump();
1573         }
1574     }
1575
1576     fn peeking_at_comment(&self) -> bool {
1577         (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1578         // consider shebangs comments, but not inner attributes
1579         (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1580     }
1581
1582     fn scan_byte(&mut self) -> token::Lit {
1583         self.bump();
1584         let start = self.pos;
1585
1586         // the eof will be picked up by the final `'` check below
1587         let c2 = self.ch.unwrap_or('\x00');
1588         self.bump();
1589
1590         let valid = self.scan_char_or_byte(start,
1591                                            c2,
1592                                            // ascii_only =
1593                                            true,
1594                                            '\'');
1595         if !self.ch_is('\'') {
1596             // Byte offsetting here is okay because the
1597             // character before position `start` are an
1598             // ascii single quote and ascii 'b'.
1599             let pos = self.pos;
1600             self.fatal_span_verbose(start - BytePos(2),
1601                                     pos,
1602                                     "unterminated byte constant".to_string()).raise();
1603         }
1604
1605         let id = if valid {
1606             self.name_from(start)
1607         } else {
1608             Symbol::intern("?")
1609         };
1610         self.bump(); // advance ch past token
1611         token::Byte(id)
1612     }
1613
1614     fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1615         self.scan_hex_digits(2, delim, below_0x7f_only)
1616     }
1617
1618     fn scan_byte_string(&mut self) -> token::Lit {
1619         self.bump();
1620         let start = self.pos;
1621         let mut valid = true;
1622
1623         while !self.ch_is('"') {
1624             if self.is_eof() {
1625                 let pos = self.pos;
1626                 self.fatal_span_(start, pos, "unterminated double quote byte string").raise();
1627             }
1628
1629             let ch_start = self.pos;
1630             let ch = self.ch.unwrap();
1631             self.bump();
1632             valid &= self.scan_char_or_byte(ch_start,
1633                                             ch,
1634                                             // ascii_only =
1635                                             true,
1636                                             '"');
1637         }
1638         let id = if valid {
1639             self.name_from(start)
1640         } else {
1641             Symbol::intern("??")
1642         };
1643         self.bump();
1644         token::ByteStr(id)
1645     }
1646
    /// Scans a raw byte string literal and returns it as `token::ByteStrRaw`,
    /// carrying the interned content and the number of `#` delimiters.
    ///
    /// The literal's start is recorded at the current position, one character
    /// is skipped (presumably the `r` of `br` — confirm against the caller),
    /// the leading `#`s are counted, and an opening `"` is required. The
    /// content ends at the first `"` that is immediately followed by
    /// `hash_count` `#` characters.
    ///
    /// End of input before the opening or closing delimiter is handed to
    /// `fail_unterminated_raw_string`; any other character in place of the
    /// opening `"` raises a fatal error. Non-ASCII content is reported through
    /// `err_span_char`, after which scanning continues.
    fn scan_raw_byte_string(&mut self) -> token::Lit {
        let start_bpos = self.pos;
        self.bump();
        // Count the `#`s in the opening delimiter.
        let mut hash_count = 0;
        while self.ch_is('#') {
            self.bump();
            hash_count += 1;
        }

        if self.is_eof() {
            self.fail_unterminated_raw_string(start_bpos, hash_count);
        } else if !self.ch_is('"') {
            let pos = self.pos;
            let ch = self.ch.unwrap();
            self.fatal_span_char(start_bpos,
                                        pos,
                                        "found invalid character; only `#` is allowed in raw \
                                         string delimitation",
                                        ch).raise();
        }
        // Step over the opening `"`; the content starts right after it.
        self.bump();
        let content_start_bpos = self.pos;
        // Assigned each time a candidate closing `"` is seen; the loop can only
        // be left through `break`, which follows such an assignment.
        let mut content_end_bpos;
        'outer: loop {
            match self.ch {
                None => {
                    // NOTE(review): presumably diverges — confirm; otherwise
                    // the `bump` below would run at end of input.
                    self.fail_unterminated_raw_string(start_bpos, hash_count);
                }
                Some('"') => {
                    content_end_bpos = self.pos;
                    for _ in 0..hash_count {
                        self.bump();
                        if !self.ch_is('#') {
                            // Not the closing delimiter. `continue` skips the
                            // trailing `bump`, so the character that broke the
                            // run of `#`s is examined again by the `match`.
                            continue 'outer;
                        }
                    }
                    break;
                }
                Some(c) => {
                    if c > '\x7F' {
                        let pos = self.pos;
                        self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
                    }
                }
            }
            self.bump();
        }
        // Step over the final character of the closing delimiter.
        self.bump();
        token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
                                 hash_count)
    }
1698 }
1699
1700 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1701 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1702 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1703     c.map_or(false, Pattern_White_Space)
1704 }
1705
/// Returns `true` when `c` holds a character that lies between `lo` and `hi`,
/// both bounds included. `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1712
1713 fn is_dec_digit(c: Option<char>) -> bool {
1714     in_range(c, '0', '9')
1715 }
1716
1717 pub fn is_doc_comment(s: &str) -> bool {
1718     let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1719               s.starts_with("//!");
1720     debug!("is {:?} a doc comment? {}", s, res);
1721     res
1722 }
1723
1724 pub fn is_block_doc_comment(s: &str) -> bool {
1725     // Prevent `/**/` from being parsed as a doc comment
1726     let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1727                s.starts_with("/*!")) && s.len() >= 5;
1728     debug!("is {:?} a doc comment? {}", s, res);
1729     res
1730 }
1731
1732 fn ident_start(c: Option<char>) -> bool {
1733     let c = match c {
1734         Some(c) => c,
1735         None => return false,
1736     };
1737
1738     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1739 }
1740
1741 fn ident_continue(c: Option<char>) -> bool {
1742     let c = match c {
1743         Some(c) => c,
1744         None => return false,
1745     };
1746
1747     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1748     (c > '\x7f' && c.is_xid_continue())
1749 }
1750
1751 // The string is a valid identifier or a lifetime identifier.
1752 pub fn is_valid_ident(s: &str) -> bool {
1753     let mut chars = s.chars();
1754     ident_start(chars.next()) && chars.all(|ch| ident_continue(Some(ch)))
1755 }
1756
1757 #[cfg(test)]
1758 mod tests {
1759     use super::*;
1760
1761     use ast::{Ident, CrateConfig};
1762     use symbol::Symbol;
1763     use syntax_pos::{BytePos, Span, NO_EXPANSION};
1764     use codemap::CodeMap;
1765     use errors;
1766     use feature_gate::UnstableFeatures;
1767     use parse::token;
1768     use std::collections::HashSet;
1769     use std::io;
1770     use std::path::PathBuf;
1771     use diagnostics::plugin::ErrorMap;
1772     use rustc_data_structures::sync::Lock;
1773     use with_globals;
1774     fn mk_sess(cm: Lrc<CodeMap>) -> ParseSess {
1775         let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()),
1776                                                           Some(cm.clone()),
1777                                                           false,
1778                                                           false);
1779         ParseSess {
1780             span_diagnostic: errors::Handler::with_emitter(true, false, Box::new(emitter)),
1781             unstable_features: UnstableFeatures::from_environment(),
1782             config: CrateConfig::new(),
1783             included_mod_stack: Lock::new(Vec::new()),
1784             code_map: cm,
1785             missing_fragment_specifiers: Lock::new(HashSet::new()),
1786             raw_identifier_spans: Lock::new(Vec::new()),
1787             registered_diagnostics: Lock::new(ErrorMap::new()),
1788             non_modrs_mods: Lock::new(vec![]),
1789         }
1790     }
1791
1792     // open a string reader for the given string
1793     fn setup<'a>(cm: &CodeMap,
1794                  sess: &'a ParseSess,
1795                  teststr: String)
1796                  -> StringReader<'a> {
1797         let fm = cm.new_filemap(PathBuf::from("zebra.rs").into(), teststr);
1798         StringReader::new(sess, fm)
1799     }
1800
1801     #[test]
1802     fn t1() {
1803         with_globals(|| {
1804             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1805             let sh = mk_sess(cm.clone());
1806             let mut string_reader = setup(&cm,
1807                                         &sh,
1808                                         "/* my source file */ fn main() { println!(\"zebra\"); }\n"
1809                                             .to_string());
1810             let id = Ident::from_str("fn");
1811             assert_eq!(string_reader.next_token().tok, token::Comment);
1812             assert_eq!(string_reader.next_token().tok, token::Whitespace);
1813             let tok1 = string_reader.next_token();
1814             let tok2 = TokenAndSpan {
1815                 tok: token::Ident(id, false),
1816                 sp: Span::new(BytePos(21), BytePos(23), NO_EXPANSION),
1817             };
1818             assert_eq!(tok1, tok2);
1819             assert_eq!(string_reader.next_token().tok, token::Whitespace);
1820             // the 'main' id is already read:
1821             assert_eq!(string_reader.pos.clone(), BytePos(28));
1822             // read another token:
1823             let tok3 = string_reader.next_token();
1824             let tok4 = TokenAndSpan {
1825                 tok: mk_ident("main"),
1826                 sp: Span::new(BytePos(24), BytePos(28), NO_EXPANSION),
1827             };
1828             assert_eq!(tok3, tok4);
1829             // the lparen is already read:
1830             assert_eq!(string_reader.pos.clone(), BytePos(29))
1831         })
1832     }
1833
1834     // check that the given reader produces the desired stream
1835     // of tokens (stop checking after exhausting the expected vec)
1836     fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
1837         for expected_tok in &expected {
1838             assert_eq!(&string_reader.next_token().tok, expected_tok);
1839         }
1840     }
1841
1842     // make the identifier by looking up the string in the interner
1843     fn mk_ident(id: &str) -> token::Token {
1844         token::Token::from_ast_ident(Ident::from_str(id))
1845     }
1846
1847     #[test]
1848     fn doublecolonparsing() {
1849         with_globals(|| {
1850             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1851             let sh = mk_sess(cm.clone());
1852             check_tokenization(setup(&cm, &sh, "a b".to_string()),
1853                             vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
1854         })
1855     }
1856
1857     #[test]
1858     fn dcparsing_2() {
1859         with_globals(|| {
1860             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1861             let sh = mk_sess(cm.clone());
1862             check_tokenization(setup(&cm, &sh, "a::b".to_string()),
1863                             vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
1864         })
1865     }
1866
1867     #[test]
1868     fn dcparsing_3() {
1869         with_globals(|| {
1870             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1871             let sh = mk_sess(cm.clone());
1872             check_tokenization(setup(&cm, &sh, "a ::b".to_string()),
1873                             vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
1874         })
1875     }
1876
1877     #[test]
1878     fn dcparsing_4() {
1879         with_globals(|| {
1880             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1881             let sh = mk_sess(cm.clone());
1882             check_tokenization(setup(&cm, &sh, "a:: b".to_string()),
1883                             vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
1884         })
1885     }
1886
1887     #[test]
1888     fn character_a() {
1889         with_globals(|| {
1890             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1891             let sh = mk_sess(cm.clone());
1892             assert_eq!(setup(&cm, &sh, "'a'".to_string()).next_token().tok,
1893                     token::Literal(token::Char(Symbol::intern("a")), None));
1894         })
1895     }
1896
1897     #[test]
1898     fn character_space() {
1899         with_globals(|| {
1900             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1901             let sh = mk_sess(cm.clone());
1902             assert_eq!(setup(&cm, &sh, "' '".to_string()).next_token().tok,
1903                     token::Literal(token::Char(Symbol::intern(" ")), None));
1904         })
1905     }
1906
1907     #[test]
1908     fn character_escaped() {
1909         with_globals(|| {
1910             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1911             let sh = mk_sess(cm.clone());
1912             assert_eq!(setup(&cm, &sh, "'\\n'".to_string()).next_token().tok,
1913                     token::Literal(token::Char(Symbol::intern("\\n")), None));
1914         })
1915     }
1916
1917     #[test]
1918     fn lifetime_name() {
1919         with_globals(|| {
1920             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1921             let sh = mk_sess(cm.clone());
1922             assert_eq!(setup(&cm, &sh, "'abc".to_string()).next_token().tok,
1923                     token::Lifetime(Ident::from_str("'abc")));
1924         })
1925     }
1926
1927     #[test]
1928     fn raw_string() {
1929         with_globals(|| {
1930             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1931             let sh = mk_sess(cm.clone());
1932             assert_eq!(setup(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string())
1933                         .next_token()
1934                         .tok,
1935                     token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
1936         })
1937     }
1938
1939     #[test]
1940     fn literal_suffixes() {
1941         with_globals(|| {
1942             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1943             let sh = mk_sess(cm.clone());
1944             macro_rules! test {
1945                 ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
1946                     assert_eq!(setup(&cm, &sh, format!("{}suffix", $input)).next_token().tok,
1947                             token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
1948                                             Some(Symbol::intern("suffix"))));
1949                     // with a whitespace separator:
1950                     assert_eq!(setup(&cm, &sh, format!("{} suffix", $input)).next_token().tok,
1951                             token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
1952                                             None));
1953                 }}
1954             }
1955
1956             test!("'a'", Char, "a");
1957             test!("b'a'", Byte, "a");
1958             test!("\"a\"", Str_, "a");
1959             test!("b\"a\"", ByteStr, "a");
1960             test!("1234", Integer, "1234");
1961             test!("0b101", Integer, "0b101");
1962             test!("0xABC", Integer, "0xABC");
1963             test!("1.0", Float, "1.0");
1964             test!("1.0e10", Float, "1.0e10");
1965
1966             assert_eq!(setup(&cm, &sh, "2us".to_string()).next_token().tok,
1967                     token::Literal(token::Integer(Symbol::intern("2")),
1968                                     Some(Symbol::intern("us"))));
1969             assert_eq!(setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok,
1970                     token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
1971                                     Some(Symbol::intern("suffix"))));
1972             assert_eq!(setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok,
1973                     token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
1974                                     Some(Symbol::intern("suffix"))));
1975         })
1976     }
1977
1978     #[test]
1979     fn line_doc_comments() {
1980         assert!(is_doc_comment("///"));
1981         assert!(is_doc_comment("/// blah"));
1982         assert!(!is_doc_comment("////"));
1983     }
1984
1985     #[test]
1986     fn nested_block_comments() {
1987         with_globals(|| {
1988             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
1989             let sh = mk_sess(cm.clone());
1990             let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
1991             match lexer.next_token().tok {
1992                 token::Comment => {}
1993                 _ => panic!("expected a comment!"),
1994             }
1995             assert_eq!(lexer.next_token().tok,
1996                     token::Literal(token::Char(Symbol::intern("a")), None));
1997         })
1998     }
1999
2000     #[test]
2001     fn crlf_comments() {
2002         with_globals(|| {
2003             let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
2004             let sh = mk_sess(cm.clone());
2005             let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
2006             let comment = lexer.next_token();
2007             assert_eq!(comment.tok, token::Comment);
2008             assert_eq!((comment.sp.lo(), comment.sp.hi()), (BytePos(0), BytePos(7)));
2009             assert_eq!(lexer.next_token().tok, token::Whitespace);
2010             assert_eq!(lexer.next_token().tok,
2011                     token::DocComment(Symbol::intern("/// test")));
2012         })
2013     }
2014 }