src/libsyntax/parse/lexer/mod.rs

   1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use ast::{self, Ident};
  12 use syntax_pos::{self, BytePos, CharPos, Pos, Span};
  13 use codemap::CodeMap;
  14 use errors::{FatalError, DiagnosticBuilder};
  15 use parse::{token, ParseSess};
  16 use str::char_at;
  17 use symbol::{Symbol, keywords};
  18 use std_unicode::property::Pattern_White_Space;
  19
  20 use std::borrow::Cow;
  21 use std::char;
  22 use std::mem::replace;
  23 use std::rc::Rc;
  24
  25 pub mod comments;
  26 mod tokentrees;
  27 mod unicode_chars;
  28
  29 #[derive(Clone, PartialEq, Eq, Debug)]
  30 pub struct TokenAndSpan {
  31     pub tok: token::Token,
  32     pub sp: Span,
  33 }
  34
  35 impl Default for TokenAndSpan {
  36     fn default() -> Self {
  37         TokenAndSpan { tok: token::Underscore, sp: syntax_pos::DUMMY_SP }
  38     }
  39 }
  40
  41 pub struct StringReader<'a> {
  42     pub sess: &'a ParseSess,
  43     /// The absolute offset within the codemap of the next character to read
  44     pub next_pos: BytePos,
  45     /// The absolute offset within the codemap of the current character
  46     pub pos: BytePos,
  47     /// The column of the next character to read
  48     pub col: CharPos,
  49     /// The current character (which has been read from self.pos)
  50     pub ch: Option<char>,
  51     pub filemap: Rc<syntax_pos::FileMap>,
  52     /// If Some, stop reading the source at this position (inclusive).
  53     pub terminator: Option<BytePos>,
  54     /// Whether to record new-lines and multibyte chars in filemap.
  55     /// This is only necessary the first time a filemap is lexed.
  56     /// If part of a filemap is being re-lexed, this should be set to false.
  57     pub save_new_lines_and_multibyte: bool,
  58     // cached:
  59     pub peek_tok: token::Token,
  60     pub peek_span: Span,
  61     pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
  62     // cache a direct reference to the source text, so that we don't have to
  63     // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
  64     source_text: Rc<String>,
  65     /// Stack of open delimiters and their spans. Used for error message.
  66     token: token::Token,
  67     span: Span,
  68     open_braces: Vec<(token::DelimToken, Span)>,
  69 }
  70
  71 impl<'a> StringReader<'a> {
  72     fn next_token(&mut self) -> TokenAndSpan where Self: Sized {
  73         let res = self.try_next_token();
  74         self.unwrap_or_abort(res)
  75     }
  76     fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
  77         match res {
  78             Ok(tok) => tok,
  79             Err(_) => {
  80                 self.emit_fatal_errors();
  81                 panic!(FatalError);
  82             }
  83         }
  84     }
  85     fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
  86         let mut t = self.try_next_token()?;
  87         loop {
  88             match t.tok {
  89                 token::Whitespace | token::Comment | token::Shebang(_) => {
  90                     t = self.try_next_token()?;
  91                 }
  92                 _ => break,
  93             }
  94         }
  95         self.token = t.tok.clone();
  96         self.span = t.sp;
  97         Ok(t)
  98     }
  99     pub fn real_token(&mut self) -> TokenAndSpan {
 100         let res = self.try_real_token();
 101         self.unwrap_or_abort(res)
 102     }
 103     fn is_eof(&self) -> bool {
 104         if self.ch.is_none() {
 105             return true;
 106         }
 107
 108         match self.terminator {
 109             Some(t) => self.next_pos > t,
 110             None => false,
 111         }
 112     }
 113     /// Return the next token. EFFECT: advances the string_reader.
 114     pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
 115         assert!(self.fatal_errs.is_empty());
 116         let ret_val = TokenAndSpan {
 117             tok: replace(&mut self.peek_tok, token::Underscore),
 118             sp: self.peek_span,
 119         };
 120         self.advance_token()?;
 121         Ok(ret_val)
 122     }
 123     fn fatal(&self, m: &str) -> FatalError {
 124         self.fatal_span(self.peek_span, m)
 125     }
 126     pub fn emit_fatal_errors(&mut self) {
 127         for err in &mut self.fatal_errs {
 128             err.emit();
 129         }
 130         self.fatal_errs.clear();
 131     }
 132     pub fn peek(&self) -> TokenAndSpan {
 133         // FIXME(pcwalton): Bad copy!
 134         TokenAndSpan {
 135             tok: self.peek_tok.clone(),
 136             sp: self.peek_span,
 137         }
 138     }
 139 }
 140
 141 impl<'a> StringReader<'a> {
 142     /// For comments.rs, which hackily pokes into next_pos and ch
 143     pub fn new_raw<'b>(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
 144         let mut sr = StringReader::new_raw_internal(sess, filemap);
 145         sr.bump();
 146         sr
 147     }
 148
 149     fn new_raw_internal(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
 150         if filemap.src.is_none() {
 151             sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
 152                                               filemap.name));
 153         }
 154
 155         let source_text = (*filemap.src.as_ref().unwrap()).clone();
 156
 157         StringReader {
 158             sess: sess,
 159             next_pos: filemap.start_pos,
 160             pos: filemap.start_pos,
 161             col: CharPos(0),
 162             ch: Some('\n'),
 163             filemap: filemap,
 164             terminator: None,
 165             save_new_lines_and_multibyte: true,
 166             // dummy values; not read
 167             peek_tok: token::Eof,
 168             peek_span: syntax_pos::DUMMY_SP,
 169             source_text: source_text,
 170             fatal_errs: Vec::new(),
 171             token: token::Eof,
 172             span: syntax_pos::DUMMY_SP,
 173             open_braces: Vec::new(),
 174         }
 175     }
 176
 177     pub fn new(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
 178         let mut sr = StringReader::new_raw(sess, filemap);
 179         if let Err(_) = sr.advance_token() {
 180             sr.emit_fatal_errors();
 181             panic!(FatalError);
 182         }
 183         sr
 184     }
 185
 186     pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
 187         let begin = sess.codemap().lookup_byte_offset(span.lo);
 188         let end = sess.codemap().lookup_byte_offset(span.hi);
 189
 190         // Make the range zero-length if the span is invalid.
 191         if span.lo > span.hi || begin.fm.start_pos != end.fm.start_pos {
 192             span.hi = span.lo;
 193         }
 194
 195         let mut sr = StringReader::new_raw_internal(sess, begin.fm);
 196
 197         // Seek the lexer to the right byte range.
 198         sr.save_new_lines_and_multibyte = false;
 199         sr.next_pos = span.lo;
 200         sr.terminator = Some(span.hi);
 201
 202         sr.bump();
 203
 204         if let Err(_) = sr.advance_token() {
 205             sr.emit_fatal_errors();
 206             panic!(FatalError);
 207         }
 208         sr
 209     }
 210
 211     pub fn ch_is(&self, c: char) -> bool {
 212         self.ch == Some(c)
 213     }
 214
 215     /// Report a fatal lexical error with a given span.
 216     pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
 217         self.sess.span_diagnostic.span_fatal(sp, m)
 218     }
 219
 220     /// Report a lexical error with a given span.
 221     pub fn err_span(&self, sp: Span, m: &str) {
 222         self.sess.span_diagnostic.span_err(sp, m)
 223     }
 224
 225
 226     /// Report a fatal error spanning [`from_pos`, `to_pos`).
 227     fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
 228         self.fatal_span(syntax_pos::mk_sp(from_pos, to_pos), m)
 229     }
 230
 231     /// Report a lexical error spanning [`from_pos`, `to_pos`).
 232     fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
 233         self.err_span(syntax_pos::mk_sp(from_pos, to_pos), m)
 234     }
 235
 236     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
 237     /// escaped character to the error message
 238     fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
 239         let mut m = m.to_string();
 240         m.push_str(": ");
 241         for c in c.escape_default() {
 242             m.push(c)
 243         }
 244         self.fatal_span_(from_pos, to_pos, &m[..])
 245     }
 246     fn struct_fatal_span_char(&self,
 247                               from_pos: BytePos,
 248                               to_pos: BytePos,
 249                               m: &str,
 250                               c: char)
 251                               -> DiagnosticBuilder<'a> {
 252         let mut m = m.to_string();
 253         m.push_str(": ");
 254         for c in c.escape_default() {
 255             m.push(c)
 256         }
 257         self.sess.span_diagnostic.struct_span_fatal(syntax_pos::mk_sp(from_pos, to_pos), &m[..])
 258     }
 259
 260     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
 261     /// escaped character to the error message
 262     fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
 263         let mut m = m.to_string();
 264         m.push_str(": ");
 265         for c in c.escape_default() {
 266             m.push(c)
 267         }
 268         self.err_span_(from_pos, to_pos, &m[..]);
 269     }
 270     fn struct_err_span_char(&self,
 271                             from_pos: BytePos,
 272                             to_pos: BytePos,
 273                             m: &str,
 274                             c: char)
 275                             -> DiagnosticBuilder<'a> {
 276         let mut m = m.to_string();
 277         m.push_str(": ");
 278         for c in c.escape_default() {
 279             m.push(c)
 280         }
 281         self.sess.span_diagnostic.struct_span_err(syntax_pos::mk_sp(from_pos, to_pos), &m[..])
 282     }
 283
 284     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
 285     /// offending string to the error message
 286     fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
 287         m.push_str(": ");
 288         let from = self.byte_offset(from_pos).to_usize();
 289         let to = self.byte_offset(to_pos).to_usize();
 290         m.push_str(&self.source_text[from..to]);
 291         self.fatal_span_(from_pos, to_pos, &m[..])
 292     }
 293
 294     /// Advance peek_tok and peek_span to refer to the next token, and
 295     /// possibly update the interner.
 296     fn advance_token(&mut self) -> Result<(), ()> {
 297         match self.scan_whitespace_or_comment() {
 298             Some(comment) => {
 299                 self.peek_span = comment.sp;
 300                 self.peek_tok = comment.tok;
 301             }
 302             None => {
 303                 if self.is_eof() {
 304                     self.peek_tok = token::Eof;
 305                     self.peek_span = syntax_pos::mk_sp(self.filemap.end_pos, self.filemap.end_pos);
 306                 } else {
 307                     let start_bytepos = self.pos;
 308                     self.peek_tok = self.next_token_inner()?;
 309                     self.peek_span = syntax_pos::mk_sp(start_bytepos, self.pos);
 310                 };
 311             }
 312         }
 313         Ok(())
 314     }
 315
 316     fn byte_offset(&self, pos: BytePos) -> BytePos {
 317         (pos - self.filemap.start_pos)
 318     }
 319
 320     /// Calls `f` with a string slice of the source text spanning from `start`
 321     /// up to but excluding `self.pos`, meaning the slice does not include
 322     /// the character `self.ch`.
 323     pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
 324         where F: FnOnce(&str) -> T
 325     {
 326         self.with_str_from_to(start, self.pos, f)
 327     }
 328
 329     /// Create a Name from a given offset to the current offset, each
 330     /// adjusted 1 towards each other (assumes that on either side there is a
 331     /// single-byte delimiter).
 332     pub fn name_from(&self, start: BytePos) -> ast::Name {
 333         debug!("taking an ident from {:?} to {:?}", start, self.pos);
 334         self.with_str_from(start, Symbol::intern)
 335     }
 336
 337     /// As name_from, with an explicit endpoint.
 338     pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
 339         debug!("taking an ident from {:?} to {:?}", start, end);
 340         self.with_str_from_to(start, end, Symbol::intern)
 341     }
 342
 343     /// Calls `f` with a string slice of the source text spanning from `start`
 344     /// up to but excluding `end`.
 345     fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
 346         where F: FnOnce(&str) -> T
 347     {
 348         f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
 349     }
 350
 351     /// Converts CRLF to LF in the given string, raising an error on bare CR.
 352     fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
 353         let mut i = 0;
 354         while i < s.len() {
 355             let ch = char_at(s, i);
 356             let next = i + ch.len_utf8();
 357             if ch == '\r' {
 358                 if next < s.len() && char_at(s, next) == '\n' {
 359                     return translate_crlf_(self, start, s, errmsg, i).into();
 360                 }
 361                 let pos = start + BytePos(i as u32);
 362                 let end_pos = start + BytePos(next as u32);
 363                 self.err_span_(pos, end_pos, errmsg);
 364             }
 365             i = next;
 366         }
 367         return s.into();
 368
 369         fn translate_crlf_(rdr: &StringReader,
 370                            start: BytePos,
 371                            s: &str,
 372                            errmsg: &str,
 373                            mut i: usize)
 374                            -> String {
 375             let mut buf = String::with_capacity(s.len());
 376             let mut j = 0;
 377             while i < s.len() {
 378                 let ch = char_at(s, i);
 379                 let next = i + ch.len_utf8();
 380                 if ch == '\r' {
 381                     if j < i {
 382                         buf.push_str(&s[j..i]);
 383                     }
 384                     j = next;
 385                     if next >= s.len() || char_at(s, next) != '\n' {
 386                         let pos = start + BytePos(i as u32);
 387                         let end_pos = start + BytePos(next as u32);
 388                         rdr.err_span_(pos, end_pos, errmsg);
 389                     }
 390                 }
 391                 i = next;
 392             }
 393             if j < s.len() {
 394                 buf.push_str(&s[j..]);
 395             }
 396             buf
 397         }
 398     }
 399
 400
 401     /// Advance the StringReader by one character. If a newline is
 402     /// discovered, add it to the FileMap's list of line start offsets.
 403     pub fn bump(&mut self) {
 404         let new_pos = self.next_pos;
 405         let new_byte_offset = self.byte_offset(new_pos).to_usize();
 406         let end = self.terminator.map_or(self.source_text.len(), |t| {
 407             self.byte_offset(t).to_usize()
 408         });
 409         if new_byte_offset < end {
 410             let old_ch_is_newline = self.ch.unwrap() == '\n';
 411             let new_ch = char_at(&self.source_text, new_byte_offset);
 412             let new_ch_len = new_ch.len_utf8();
 413
 414             self.ch = Some(new_ch);
 415             self.pos = new_pos;
 416             self.next_pos = new_pos + Pos::from_usize(new_ch_len);
 417             if old_ch_is_newline {
 418                 if self.save_new_lines_and_multibyte {
 419                     self.filemap.next_line(self.pos);
 420                 }
 421                 self.col = CharPos(0);
 422             } else {
 423                 self.col = self.col + CharPos(1);
 424             }
 425             if new_ch_len > 1 {
 426                 if self.save_new_lines_and_multibyte {
 427                     self.filemap.record_multibyte_char(self.pos, new_ch_len);
 428                 }
 429             }
 430         } else {
 431             self.ch = None;
 432             self.pos = new_pos;
 433         }
 434     }
 435
 436     pub fn nextch(&self) -> Option<char> {
 437         let offset = self.byte_offset(self.next_pos).to_usize();
 438         if offset < self.source_text.len() {
 439             Some(char_at(&self.source_text, offset))
 440         } else {
 441             None
 442         }
 443     }
 444
 445     pub fn nextch_is(&self, c: char) -> bool {
 446         self.nextch() == Some(c)
 447     }
 448
 449     pub fn nextnextch(&self) -> Option<char> {
 450         let offset = self.byte_offset(self.next_pos).to_usize();
 451         let s = &self.source_text[..];
 452         if offset >= s.len() {
 453             return None;
 454         }
 455         let next = offset + char_at(s, offset).len_utf8();
 456         if next < s.len() {
 457             Some(char_at(s, next))
 458         } else {
 459             None
 460         }
 461     }
 462
 463     pub fn nextnextch_is(&self, c: char) -> bool {
 464         self.nextnextch() == Some(c)
 465     }
 466
 467     /// Eats <XID_start><XID_continue>*, if possible.
 468     fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
 469         if !ident_start(self.ch) {
 470             return None;
 471         }
 472         let start = self.pos;
 473         while ident_continue(self.ch) {
 474             self.bump();
 475         }
 476
 477         self.with_str_from(start, |string| {
 478             if string == "_" {
 479                 None
 480             } else {
 481                 Some(Symbol::intern(string))
 482             }
 483         })
 484     }
 485
 486     /// PRECONDITION: self.ch is not whitespace
 487     /// Eats any kind of comment.
 488     fn scan_comment(&mut self) -> Option<TokenAndSpan> {
 489         if let Some(c) = self.ch {
 490             if c.is_whitespace() {
 491                 let msg = "called consume_any_line_comment, but there was whitespace";
 492                 self.sess.span_diagnostic.span_err(syntax_pos::mk_sp(self.pos, self.pos), msg);
 493             }
 494         }
 495
 496         if self.ch_is('/') {
 497             match self.nextch() {
 498                 Some('/') => {
 499                     self.bump();
 500                     self.bump();
 501
 502                     // line comments starting with "///" or "//!" are doc-comments
 503                     let doc_comment = self.ch_is('/') || self.ch_is('!');
 504                     let start_bpos = self.pos - BytePos(2);
 505
 506                     while !self.is_eof() {
 507                         match self.ch.unwrap() {
 508                             '\n' => break,
 509                             '\r' => {
 510                                 if self.nextch_is('\n') {
 511                                     // CRLF
 512                                     break;
 513                                 } else if doc_comment {
 514                                     self.err_span_(self.pos,
 515                                                    self.next_pos,
 516                                                    "bare CR not allowed in doc-comment");
 517                                 }
 518                             }
 519                             _ => (),
 520                         }
 521                         self.bump();
 522                     }
 523
 524                     return if doc_comment {
 525                         self.with_str_from(start_bpos, |string| {
 526                             // comments with only more "/"s are not doc comments
 527                             let tok = if is_doc_comment(string) {
 528                                 token::DocComment(Symbol::intern(string))
 529                             } else {
 530                                 token::Comment
 531                             };
 532
 533                             Some(TokenAndSpan {
 534                                 tok: tok,
 535                                 sp: syntax_pos::mk_sp(start_bpos, self.pos),
 536                             })
 537                         })
 538                     } else {
 539                         Some(TokenAndSpan {
 540                             tok: token::Comment,
 541                             sp: syntax_pos::mk_sp(start_bpos, self.pos),
 542                         })
 543                     };
 544                 }
 545                 Some('*') => {
 546                     self.bump();
 547                     self.bump();
 548                     self.scan_block_comment()
 549                 }
 550                 _ => None,
 551             }
 552         } else if self.ch_is('#') {
 553             if self.nextch_is('!') {
 554
 555                 // Parse an inner attribute.
 556                 if self.nextnextch_is('[') {
 557                     return None;
 558                 }
 559
 560                 // I guess this is the only way to figure out if
 561                 // we're at the beginning of the file...
 562                 let cmap = CodeMap::new();
 563                 cmap.files.borrow_mut().push(self.filemap.clone());
 564                 let loc = cmap.lookup_char_pos_adj(self.pos);
 565                 debug!("Skipping a shebang");
 566                 if loc.line == 1 && loc.col == CharPos(0) {
 567                     // FIXME: Add shebang "token", return it
 568                     let start = self.pos;
 569                     while !self.ch_is('\n') && !self.is_eof() {
 570                         self.bump();
 571                     }
 572                     return Some(TokenAndSpan {
 573                         tok: token::Shebang(self.name_from(start)),
 574                         sp: syntax_pos::mk_sp(start, self.pos),
 575                     });
 576                 }
 577             }
 578             None
 579         } else {
 580             None
 581         }
 582     }
 583
 584     /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
 585     /// return None.
 586     fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
 587         match self.ch.unwrap_or('\0') {
 588             // # to handle shebang at start of file -- this is the entry point
 589             // for skipping over all "junk"
 590             '/' | '#' => {
 591                 let c = self.scan_comment();
 592                 debug!("scanning a comment {:?}", c);
 593                 c
 594             },
 595             c if is_pattern_whitespace(Some(c)) => {
 596                 let start_bpos = self.pos;
 597                 while is_pattern_whitespace(self.ch) {
 598                     self.bump();
 599                 }
 600                 let c = Some(TokenAndSpan {
 601                     tok: token::Whitespace,
 602                     sp: syntax_pos::mk_sp(start_bpos, self.pos),
 603                 });
 604                 debug!("scanning whitespace: {:?}", c);
 605                 c
 606             }
 607             _ => None,
 608         }
 609     }
 610
 611     /// Might return a sugared-doc-attr
 612     fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
 613         // block comments starting with "/**" or "/*!" are doc-comments
 614         let is_doc_comment = self.ch_is('*') || self.ch_is('!');
 615         let start_bpos = self.pos - BytePos(2);
 616
 617         let mut level: isize = 1;
 618         let mut has_cr = false;
 619         while level > 0 {
 620             if self.is_eof() {
 621                 let msg = if is_doc_comment {
 622                     "unterminated block doc-comment"
 623                 } else {
 624                     "unterminated block comment"
 625                 };
 626                 let last_bpos = self.pos;
 627                 panic!(self.fatal_span_(start_bpos, last_bpos, msg));
 628             }
 629             let n = self.ch.unwrap();
 630             match n {
 631                 '/' if self.nextch_is('*') => {
 632                     level += 1;
 633                     self.bump();
 634                 }
 635                 '*' if self.nextch_is('/') => {
 636                     level -= 1;
 637                     self.bump();
 638                 }
 639                 '\r' => {
 640                     has_cr = true;
 641                 }
 642                 _ => (),
 643             }
 644             self.bump();
 645         }
 646
 647         self.with_str_from(start_bpos, |string| {
 648             // but comments with only "*"s between two "/"s are not
 649             let tok = if is_block_doc_comment(string) {
 650                 let string = if has_cr {
 651                     self.translate_crlf(start_bpos,
 652                                         string,
 653                                         "bare CR not allowed in block doc-comment")
 654                 } else {
 655                     string.into()
 656                 };
 657                 token::DocComment(Symbol::intern(&string[..]))
 658             } else {
 659                 token::Comment
 660             };
 661
 662             Some(TokenAndSpan {
 663                 tok: tok,
 664                 sp: syntax_pos::mk_sp(start_bpos, self.pos),
 665             })
 666         })
 667     }
 668
 669     /// Scan through any digits (base `scan_radix`) or underscores,
 670     /// and return how many digits there were.
 671     ///
 672     /// `real_radix` represents the true radix of the number we're
 673     /// interested in, and errors will be emitted for any digits
 674     /// between `real_radix` and `scan_radix`.
 675     fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
 676         assert!(real_radix <= scan_radix);
 677         let mut len = 0;
 678         loop {
 679             let c = self.ch;
 680             if c == Some('_') {
 681                 debug!("skipping a _");
 682                 self.bump();
 683                 continue;
 684             }
 685             match c.and_then(|cc| cc.to_digit(scan_radix)) {
 686                 Some(_) => {
 687                     debug!("{:?} in scan_digits", c);
 688                     // check that the hypothetical digit is actually
 689                     // in range for the true radix
 690                     if c.unwrap().to_digit(real_radix).is_none() {
 691                         self.err_span_(self.pos,
 692                                        self.next_pos,
 693                                        &format!("invalid digit for a base {} literal", real_radix));
 694                     }
 695                     len += 1;
 696                     self.bump();
 697                 }
 698                 _ => return len,
 699             }
 700         }
 701     }
 702
 703     /// Lex a LIT_INTEGER or a LIT_FLOAT
 704     fn scan_number(&mut self, c: char) -> token::Lit {
 705         let num_digits;
 706         let mut base = 10;
 707         let start_bpos = self.pos;
 708
 709         self.bump();
 710
 711         if c == '0' {
 712             match self.ch.unwrap_or('\0') {
 713                 'b' => {
 714                     self.bump();
 715                     base = 2;
 716                     num_digits = self.scan_digits(2, 10);
 717                 }
 718                 'o' => {
 719                     self.bump();
 720                     base = 8;
 721                     num_digits = self.scan_digits(8, 10);
 722                 }
 723                 'x' => {
 724                     self.bump();
 725                     base = 16;
 726                     num_digits = self.scan_digits(16, 16);
 727                 }
 728                 '0'...'9' | '_' | '.' | 'e' | 'E' => {
 729                     num_digits = self.scan_digits(10, 10) + 1;
 730                 }
 731                 _ => {
 732                     // just a 0
 733                     return token::Integer(self.name_from(start_bpos));
 734                 }
 735             }
 736         } else if c.is_digit(10) {
 737             num_digits = self.scan_digits(10, 10) + 1;
 738         } else {
 739             num_digits = 0;
 740         }
 741
 742         if num_digits == 0 {
 743             self.err_span_(start_bpos,
 744                            self.pos,
 745                            "no valid digits found for number");
 746             return token::Integer(Symbol::intern("0"));
 747         }
 748
 749         // might be a float, but don't be greedy if this is actually an
 750         // integer literal followed by field/method access or a range pattern
 751         // (`0..2` and `12.foo()`)
 752         if self.ch_is('.') && !self.nextch_is('.') &&
 753            !self.nextch()
 754                 .unwrap_or('\0')
 755                 .is_xid_start() {
 756             // might have stuff after the ., and if it does, it needs to start
 757             // with a number
 758             self.bump();
 759             if self.ch.unwrap_or('\0').is_digit(10) {
 760                 self.scan_digits(10, 10);
 761                 self.scan_float_exponent();
 762             }
 763             let pos = self.pos;
 764             self.check_float_base(start_bpos, pos, base);
 765             return token::Float(self.name_from(start_bpos));
 766         } else {
 767             // it might be a float if it has an exponent
 768             if self.ch_is('e') || self.ch_is('E') {
 769                 self.scan_float_exponent();
 770                 let pos = self.pos;
 771                 self.check_float_base(start_bpos, pos, base);
 772                 return token::Float(self.name_from(start_bpos));
 773             }
 774             // but we certainly have an integer!
 775             return token::Integer(self.name_from(start_bpos));
 776         }
 777     }
 778
 779     /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
 780     /// error if too many or too few digits are encountered.
 781     fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
 782         debug!("scanning {} digits until {:?}", n_digits, delim);
 783         let start_bpos = self.pos;
 784         let mut accum_int = 0;
 785
 786         let mut valid = true;
 787         for _ in 0..n_digits {
 788             if self.is_eof() {
 789                 let last_bpos = self.pos;
 790                 panic!(self.fatal_span_(start_bpos,
 791                                         last_bpos,
 792                                         "unterminated numeric character escape"));
 793             }
 794             if self.ch_is(delim) {
 795                 let last_bpos = self.pos;
 796                 self.err_span_(start_bpos,
 797                                last_bpos,
 798                                "numeric character escape is too short");
 799                 valid = false;
 800                 break;
 801             }
 802             let c = self.ch.unwrap_or('\x00');
 803             accum_int *= 16;
 804             accum_int += c.to_digit(16).unwrap_or_else(|| {
 805                 self.err_span_char(self.pos,
 806                                    self.next_pos,
 807                                    "invalid character in numeric character escape",
 808                                    c);
 809
 810                 valid = false;
 811                 0
 812             });
 813             self.bump();
 814         }
 815
 816         if below_0x7f_only && accum_int >= 0x80 {
 817             self.err_span_(start_bpos,
 818                            self.pos,
 819                            "this form of character escape may only be used with characters in \
 820                             the range [\\x00-\\x7f]");
 821             valid = false;
 822         }
 823
 824         match char::from_u32(accum_int) {
 825             Some(_) => valid,
 826             None => {
 827                 let last_bpos = self.pos;
 828                 self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
 829                 false
 830             }
 831         }
 832     }
 833
 834     /// Scan for a single (possibly escaped) byte or char
 835     /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
 836     /// `start` is the position of `first_source_char`, which is already consumed.
 837     ///
 838     /// Returns true if there was a valid char/byte, false otherwise.
 839     fn scan_char_or_byte(&mut self,
 840                          start: BytePos,
 841                          first_source_char: char,
 842                          ascii_only: bool,
 843                          delim: char)
 844                          -> bool {
 845         match first_source_char {
 846             '\\' => {
 847                 // '\X' for some X must be a character constant:
 848                 let escaped = self.ch;
 849                 let escaped_pos = self.pos;
 850                 self.bump();
 851                 match escaped {
 852                     None => {}  // EOF here is an error that will be checked later.
 853                     Some(e) => {
 854                         return match e {
 855                             'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
 856                             'x' => self.scan_byte_escape(delim, !ascii_only),
 857                             'u' => {
 858                                 let valid = if self.ch_is('{') {
 859                                     self.scan_unicode_escape(delim) && !ascii_only
 860                                 } else {
 861                                     let span = syntax_pos::mk_sp(start, self.pos);
 862                                     self.sess.span_diagnostic
 863                                         .struct_span_err(span, "incorrect unicode escape sequence")
 864                                         .span_help(span,
 865                                                    "format of unicode escape sequences is \
 866                                                     `\\u{…}`")
 867                                         .emit();
 868                                     false
 869                                 };
 870                                 if ascii_only {
 871                                     self.err_span_(start,
 872                                                    self.pos,
 873                                                    "unicode escape sequences cannot be used as a \
 874                                                     byte or in a byte string");
 875                                 }
 876                                 valid
 877
 878                             }
 879                             '\n' if delim == '"' => {
 880                                 self.consume_whitespace();
 881                                 true
 882                             }
 883                             '\r' if delim == '"' && self.ch_is('\n') => {
 884                                 self.consume_whitespace();
 885                                 true
 886                             }
 887                             c => {
 888                                 let pos = self.pos;
 889                                 let mut err = self.struct_err_span_char(escaped_pos,
 890                                                                         pos,
 891                                                                         if ascii_only {
 892                                                                             "unknown byte escape"
 893                                                                         } else {
 894                                                                             "unknown character \
 895                                                                              escape"
 896                                                                         },
 897                                                                         c);
 898                                 if e == '\r' {
 899                                     err.span_help(syntax_pos::mk_sp(escaped_pos, pos),
 900                                                   "this is an isolated carriage return; consider \
 901                                                    checking your editor and version control \
 902                                                    settings");
 903                                 }
 904                                 if (e == '{' || e == '}') && !ascii_only {
 905                                     err.span_help(syntax_pos::mk_sp(escaped_pos, pos),
 906                                                   "if used in a formatting string, curly braces \
 907                                                    are escaped with `{{` and `}}`");
 908                                 }
 909                                 err.emit();
 910                                 false
 911                             }
 912                         }
 913                     }
 914                 }
 915             }
 916             '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
 917                 let pos = self.pos;
 918                 self.err_span_char(start,
 919                                    pos,
 920                                    if ascii_only {
 921                                        "byte constant must be escaped"
 922                                    } else {
 923                                        "character constant must be escaped"
 924                                    },
 925                                    first_source_char);
 926                 return false;
 927             }
 928             '\r' => {
 929                 if self.ch_is('\n') {
 930                     self.bump();
 931                     return true;
 932                 } else {
 933                     self.err_span_(start,
 934                                    self.pos,
 935                                    "bare CR not allowed in string, use \\r instead");
 936                     return false;
 937                 }
 938             }
 939             _ => {
 940                 if ascii_only && first_source_char > '\x7F' {
 941                     let pos = self.pos;
 942                     self.err_span_(start,
 943                                    pos,
 944                                    "byte constant must be ASCII. Use a \\xHH escape for a \
 945                                     non-ASCII byte");
 946                     return false;
 947                 }
 948             }
 949         }
 950         true
 951     }
 952
 953     /// Scan over a \u{...} escape
 954     ///
 955     /// At this point, we have already seen the \ and the u, the { is the current character. We
 956     /// will read at least one digit, and up to 6, and pass over the }.
 957     fn scan_unicode_escape(&mut self, delim: char) -> bool {
 958         self.bump(); // past the {
 959         let start_bpos = self.pos;
 960         let mut count = 0;
 961         let mut accum_int = 0;
 962         let mut valid = true;
 963
 964         while !self.ch_is('}') && count <= 6 {
 965             let c = match self.ch {
 966                 Some(c) => c,
 967                 None => {
 968                     panic!(self.fatal_span_(start_bpos,
 969                                             self.pos,
 970                                             "unterminated unicode escape (found EOF)"));
 971                 }
 972             };
 973             accum_int *= 16;
 974             accum_int += c.to_digit(16).unwrap_or_else(|| {
 975                 if c == delim {
 976                     panic!(self.fatal_span_(self.pos,
 977                                             self.next_pos,
 978                                             "unterminated unicode escape (needed a `}`)"));
 979                 } else {
 980                     self.err_span_char(self.pos,
 981                                        self.next_pos,
 982                                        "invalid character in unicode escape",
 983                                        c);
 984                 }
 985                 valid = false;
 986                 0
 987             });
 988             self.bump();
 989             count += 1;
 990         }
 991
 992         if count > 6 {
 993             self.err_span_(start_bpos,
 994                            self.pos,
 995                            "overlong unicode escape (can have at most 6 hex digits)");
 996             valid = false;
 997         }
 998
 999         if valid && (char::from_u32(accum_int).is_none() || count == 0) {
1000             self.err_span_(start_bpos,
1001                            self.pos,
1002                            "invalid unicode character escape");
1003             valid = false;
1004         }
1005
1006         self.bump(); // past the ending }
1007         valid
1008     }
1009
1010     /// Scan over a float exponent.
1011     fn scan_float_exponent(&mut self) {
1012         if self.ch_is('e') || self.ch_is('E') {
1013             self.bump();
1014             if self.ch_is('-') || self.ch_is('+') {
1015                 self.bump();
1016             }
1017             if self.scan_digits(10, 10) == 0 {
1018                 self.err_span_(self.pos,
1019                                self.next_pos,
1020                                "expected at least one digit in exponent")
1021             }
1022         }
1023     }
1024
1025     /// Check that a base is valid for a floating literal, emitting a nice
1026     /// error if it isn't.
1027     fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1028         match base {
1029             16 => {
1030                 self.err_span_(start_bpos,
1031                                last_bpos,
1032                                "hexadecimal float literal is not supported")
1033             }
1034             8 => {
1035                 self.err_span_(start_bpos,
1036                                last_bpos,
1037                                "octal float literal is not supported")
1038             }
1039             2 => {
1040                 self.err_span_(start_bpos,
1041                                last_bpos,
1042                                "binary float literal is not supported")
1043             }
1044             _ => (),
1045         }
1046     }
1047
1048     fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1049         self.bump();
1050         if self.ch_is('=') {
1051             self.bump();
1052             return token::BinOpEq(op);
1053         } else {
1054             return token::BinOp(op);
1055         }
1056     }
1057
1058     /// Return the next token from the string, advances the input past that
1059     /// token, and updates the interner
1060     fn next_token_inner(&mut self) -> Result<token::Token, ()> {
1061         let c = self.ch;
1062         if ident_start(c) &&
1063            match (c.unwrap(), self.nextch(), self.nextnextch()) {
1064             // Note: r as in r" or r#" is part of a raw string literal,
1065             // b as in b' is part of a byte literal.
1066             // They are not identifiers, and are handled further down.
1067             ('r', Some('"'), _) |
1068             ('r', Some('#'), _) |
1069             ('b', Some('"'), _) |
1070             ('b', Some('\''), _) |
1071             ('b', Some('r'), Some('"')) |
1072             ('b', Some('r'), Some('#')) => false,
1073             _ => true,
1074         } {
1075             let start = self.pos;
1076             while ident_continue(self.ch) {
1077                 self.bump();
1078             }
1079
1080             return Ok(self.with_str_from(start, |string| {
1081                 if string == "_" {
1082                     token::Underscore
1083                 } else {
1084                     // FIXME: perform NFKC normalization here. (Issue #2253)
1085                     token::Ident(Ident::from_str(string))
1086                 }
1087             }));
1088         }
1089
1090         if is_dec_digit(c) {
1091             let num = self.scan_number(c.unwrap());
1092             let suffix = self.scan_optional_raw_name();
1093             debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1094             return Ok(token::Literal(num, suffix));
1095         }
1096
1097         match c.expect("next_token_inner called at EOF") {
1098             // One-byte tokens.
1099             ';' => {
1100                 self.bump();
1101                 return Ok(token::Semi);
1102             }
1103             ',' => {
1104                 self.bump();
1105                 return Ok(token::Comma);
1106             }
1107             '.' => {
1108                 self.bump();
1109                 return if self.ch_is('.') {
1110                     self.bump();
1111                     if self.ch_is('.') {
1112                         self.bump();
1113                         Ok(token::DotDotDot)
1114                     } else {
1115                         Ok(token::DotDot)
1116                     }
1117                 } else {
1118                     Ok(token::Dot)
1119                 };
1120             }
1121             '(' => {
1122                 self.bump();
1123                 return Ok(token::OpenDelim(token::Paren));
1124             }
1125             ')' => {
1126                 self.bump();
1127                 return Ok(token::CloseDelim(token::Paren));
1128             }
1129             '{' => {
1130                 self.bump();
1131                 return Ok(token::OpenDelim(token::Brace));
1132             }
1133             '}' => {
1134                 self.bump();
1135                 return Ok(token::CloseDelim(token::Brace));
1136             }
1137             '[' => {
1138                 self.bump();
1139                 return Ok(token::OpenDelim(token::Bracket));
1140             }
1141             ']' => {
1142                 self.bump();
1143                 return Ok(token::CloseDelim(token::Bracket));
1144             }
1145             '@' => {
1146                 self.bump();
1147                 return Ok(token::At);
1148             }
1149             '#' => {
1150                 self.bump();
1151                 return Ok(token::Pound);
1152             }
1153             '~' => {
1154                 self.bump();
1155                 return Ok(token::Tilde);
1156             }
1157             '?' => {
1158                 self.bump();
1159                 return Ok(token::Question);
1160             }
1161             ':' => {
1162                 self.bump();
1163                 if self.ch_is(':') {
1164                     self.bump();
1165                     return Ok(token::ModSep);
1166                 } else {
1167                     return Ok(token::Colon);
1168                 }
1169             }
1170
1171             '$' => {
1172                 self.bump();
1173                 return Ok(token::Dollar);
1174             }
1175
1176             // Multi-byte tokens.
1177             '=' => {
1178                 self.bump();
1179                 if self.ch_is('=') {
1180                     self.bump();
1181                     return Ok(token::EqEq);
1182                 } else if self.ch_is('>') {
1183                     self.bump();
1184                     return Ok(token::FatArrow);
1185                 } else {
1186                     return Ok(token::Eq);
1187                 }
1188             }
1189             '!' => {
1190                 self.bump();
1191                 if self.ch_is('=') {
1192                     self.bump();
1193                     return Ok(token::Ne);
1194                 } else {
1195                     return Ok(token::Not);
1196                 }
1197             }
1198             '<' => {
1199                 self.bump();
1200                 match self.ch.unwrap_or('\x00') {
1201                     '=' => {
1202                         self.bump();
1203                         return Ok(token::Le);
1204                     }
1205                     '<' => {
1206                         return Ok(self.binop(token::Shl));
1207                     }
1208                     '-' => {
1209                         self.bump();
1210                         match self.ch.unwrap_or('\x00') {
1211                             _ => {
1212                                 return Ok(token::LArrow);
1213                             }
1214                         }
1215                     }
1216                     _ => {
1217                         return Ok(token::Lt);
1218                     }
1219                 }
1220             }
1221             '>' => {
1222                 self.bump();
1223                 match self.ch.unwrap_or('\x00') {
1224                     '=' => {
1225                         self.bump();
1226                         return Ok(token::Ge);
1227                     }
1228                     '>' => {
1229                         return Ok(self.binop(token::Shr));
1230                     }
1231                     _ => {
1232                         return Ok(token::Gt);
1233                     }
1234                 }
1235             }
1236             '\'' => {
1237                 // Either a character constant 'a' OR a lifetime name 'abc
1238                 let start_with_quote = self.pos;
1239                 self.bump();
1240                 let start = self.pos;
1241
1242                 // the eof will be picked up by the final `'` check below
1243                 let c2 = self.ch.unwrap_or('\x00');
1244                 self.bump();
1245
1246                 // If the character is an ident start not followed by another single
1247                 // quote, then this is a lifetime name:
1248                 if ident_start(Some(c2)) && !self.ch_is('\'') {
1249                     while ident_continue(self.ch) {
1250                         self.bump();
1251                     }
1252                     // lifetimes shouldn't end with a single quote
1253                     // if we find one, then this is an invalid character literal
1254                     if self.ch_is('\'') {
1255                         panic!(self.fatal_span_verbose(
1256                                start_with_quote, self.next_pos,
1257                                String::from("character literal may only contain one codepoint")));
1258
1259                     }
1260
1261                     // Include the leading `'` in the real identifier, for macro
1262                     // expansion purposes. See #12512 for the gory details of why
1263                     // this is necessary.
1264                     let ident = self.with_str_from(start, |lifetime_name| {
1265                         Ident::from_str(&format!("'{}", lifetime_name))
1266                     });
1267
1268                     // Conjure up a "keyword checking ident" to make sure that
1269                     // the lifetime name is not a keyword.
1270                     let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1271                         Ident::from_str(lifetime_name)
1272                     });
1273                     let keyword_checking_token = &token::Ident(keyword_checking_ident);
1274                     let last_bpos = self.pos;
1275                     if keyword_checking_token.is_any_keyword() &&
1276                        !keyword_checking_token.is_keyword(keywords::Static) {
1277                         self.err_span_(start, last_bpos, "lifetimes cannot use keyword names");
1278                     }
1279
1280                     return Ok(token::Lifetime(ident));
1281                 }
1282
1283                 let valid = self.scan_char_or_byte(start,
1284                                                    c2,
1285                                                    // ascii_only =
1286                                                    false,
1287                                                    '\'');
1288
1289                 if !self.ch_is('\'') {
1290                     panic!(self.fatal_span_verbose(
1291                            start_with_quote, self.pos,
1292                            String::from("character literal may only contain one codepoint")));
1293                 }
1294
1295                 let id = if valid {
1296                     self.name_from(start)
1297                 } else {
1298                     Symbol::intern("0")
1299                 };
1300                 self.bump(); // advance ch past token
1301                 let suffix = self.scan_optional_raw_name();
1302                 return Ok(token::Literal(token::Char(id), suffix));
1303             }
1304             'b' => {
1305                 self.bump();
1306                 let lit = match self.ch {
1307                     Some('\'') => self.scan_byte(),
1308                     Some('"') => self.scan_byte_string(),
1309                     Some('r') => self.scan_raw_byte_string(),
1310                     _ => unreachable!(),  // Should have been a token::Ident above.
1311                 };
1312                 let suffix = self.scan_optional_raw_name();
1313                 return Ok(token::Literal(lit, suffix));
1314             }
1315             '"' => {
1316                 let start_bpos = self.pos;
1317                 let mut valid = true;
1318                 self.bump();
1319                 while !self.ch_is('"') {
1320                     if self.is_eof() {
1321                         let last_bpos = self.pos;
1322                         panic!(self.fatal_span_(start_bpos,
1323                                                 last_bpos,
1324                                                 "unterminated double quote string"));
1325                     }
1326
1327                     let ch_start = self.pos;
1328                     let ch = self.ch.unwrap();
1329                     self.bump();
1330                     valid &= self.scan_char_or_byte(ch_start,
1331                                                     ch,
1332                                                     // ascii_only =
1333                                                     false,
1334                                                     '"');
1335                 }
1336                 // adjust for the ASCII " at the start of the literal
1337                 let id = if valid {
1338                     self.name_from(start_bpos + BytePos(1))
1339                 } else {
1340                     Symbol::intern("??")
1341                 };
1342                 self.bump();
1343                 let suffix = self.scan_optional_raw_name();
1344                 return Ok(token::Literal(token::Str_(id), suffix));
1345             }
1346             'r' => {
1347                 let start_bpos = self.pos;
1348                 self.bump();
1349                 let mut hash_count = 0;
1350                 while self.ch_is('#') {
1351                     self.bump();
1352                     hash_count += 1;
1353                 }
1354
1355                 if self.is_eof() {
1356                     let last_bpos = self.pos;
1357                     panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1358                 } else if !self.ch_is('"') {
1359                     let last_bpos = self.pos;
1360                     let curr_char = self.ch.unwrap();
1361                     panic!(self.fatal_span_char(start_bpos,
1362                                                 last_bpos,
1363                                                 "found invalid character; only `#` is allowed \
1364                                                  in raw string delimitation",
1365                                                 curr_char));
1366                 }
1367                 self.bump();
1368                 let content_start_bpos = self.pos;
1369                 let mut content_end_bpos;
1370                 let mut valid = true;
1371                 'outer: loop {
1372                     if self.is_eof() {
1373                         let last_bpos = self.pos;
1374                         panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1375                     }
1376                     // if self.ch_is('"') {
1377                     // content_end_bpos = self.pos;
1378                     // for _ in 0..hash_count {
1379                     // self.bump();
1380                     // if !self.ch_is('#') {
1381                     // continue 'outer;
1382                     let c = self.ch.unwrap();
1383                     match c {
1384                         '"' => {
1385                             content_end_bpos = self.pos;
1386                             for _ in 0..hash_count {
1387                                 self.bump();
1388                                 if !self.ch_is('#') {
1389                                     continue 'outer;
1390                                 }
1391                             }
1392                             break;
1393                         }
1394                         '\r' => {
1395                             if !self.nextch_is('\n') {
1396                                 let last_bpos = self.pos;
1397                                 self.err_span_(start_bpos,
1398                                                last_bpos,
1399                                                "bare CR not allowed in raw string, use \\r \
1400                                                 instead");
1401                                 valid = false;
1402                             }
1403                         }
1404                         _ => (),
1405                     }
1406                     self.bump();
1407                 }
1408                 self.bump();
1409                 let id = if valid {
1410                     self.name_from_to(content_start_bpos, content_end_bpos)
1411                 } else {
1412                     Symbol::intern("??")
1413                 };
1414                 let suffix = self.scan_optional_raw_name();
1415                 return Ok(token::Literal(token::StrRaw(id, hash_count), suffix));
1416             }
1417             '-' => {
1418                 if self.nextch_is('>') {
1419                     self.bump();
1420                     self.bump();
1421                     return Ok(token::RArrow);
1422                 } else {
1423                     return Ok(self.binop(token::Minus));
1424                 }
1425             }
1426             '&' => {
1427                 if self.nextch_is('&') {
1428                     self.bump();
1429                     self.bump();
1430                     return Ok(token::AndAnd);
1431                 } else {
1432                     return Ok(self.binop(token::And));
1433                 }
1434             }
1435             '|' => {
1436                 match self.nextch() {
1437                     Some('|') => {
1438                         self.bump();
1439                         self.bump();
1440                         return Ok(token::OrOr);
1441                     }
1442                     _ => {
1443                         return Ok(self.binop(token::Or));
1444                     }
1445                 }
1446             }
1447             '+' => {
1448                 return Ok(self.binop(token::Plus));
1449             }
1450             '*' => {
1451                 return Ok(self.binop(token::Star));
1452             }
1453             '/' => {
1454                 return Ok(self.binop(token::Slash));
1455             }
1456             '^' => {
1457                 return Ok(self.binop(token::Caret));
1458             }
1459             '%' => {
1460                 return Ok(self.binop(token::Percent));
1461             }
1462             c => {
1463                 let last_bpos = self.pos;
1464                 let bpos = self.next_pos;
1465                 let mut err = self.struct_fatal_span_char(last_bpos,
1466                                                           bpos,
1467                                                           "unknown start of token",
1468                                                           c);
1469                 unicode_chars::check_for_substitution(&self, c, &mut err);
1470                 self.fatal_errs.push(err);
1471                 Err(())
1472             }
1473         }
1474     }
1475
1476     fn consume_whitespace(&mut self) {
1477         while is_pattern_whitespace(self.ch) && !self.is_eof() {
1478             self.bump();
1479         }
1480     }
1481
1482     fn read_to_eol(&mut self) -> String {
1483         let mut val = String::new();
1484         while !self.ch_is('\n') && !self.is_eof() {
1485             val.push(self.ch.unwrap());
1486             self.bump();
1487         }
1488         if self.ch_is('\n') {
1489             self.bump();
1490         }
1491         return val;
1492     }
1493
1494     fn read_one_line_comment(&mut self) -> String {
1495         let val = self.read_to_eol();
1496         assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1497                 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1498         return val;
1499     }
1500
1501     fn consume_non_eol_whitespace(&mut self) {
1502         while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
1503             self.bump();
1504         }
1505     }
1506
1507     fn peeking_at_comment(&self) -> bool {
1508         (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1509         // consider shebangs comments, but not inner attributes
1510         (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1511     }
1512
1513     fn scan_byte(&mut self) -> token::Lit {
1514         self.bump();
1515         let start = self.pos;
1516
1517         // the eof will be picked up by the final `'` check below
1518         let c2 = self.ch.unwrap_or('\x00');
1519         self.bump();
1520
1521         let valid = self.scan_char_or_byte(start,
1522                                            c2,
1523                                            // ascii_only =
1524                                            true,
1525                                            '\'');
1526         if !self.ch_is('\'') {
1527             // Byte offsetting here is okay because the
1528             // character before position `start` are an
1529             // ascii single quote and ascii 'b'.
1530             let pos = self.pos;
1531             panic!(self.fatal_span_verbose(start - BytePos(2),
1532                                            pos,
1533                                            "unterminated byte constant".to_string()));
1534         }
1535
1536         let id = if valid {
1537             self.name_from(start)
1538         } else {
1539             Symbol::intern("?")
1540         };
1541         self.bump(); // advance ch past token
1542         return token::Byte(id);
1543     }
1544
1545     fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1546         self.scan_hex_digits(2, delim, below_0x7f_only)
1547     }
1548
1549     fn scan_byte_string(&mut self) -> token::Lit {
1550         self.bump();
1551         let start = self.pos;
1552         let mut valid = true;
1553
1554         while !self.ch_is('"') {
1555             if self.is_eof() {
1556                 let pos = self.pos;
1557                 panic!(self.fatal_span_(start, pos, "unterminated double quote byte string"));
1558             }
1559
1560             let ch_start = self.pos;
1561             let ch = self.ch.unwrap();
1562             self.bump();
1563             valid &= self.scan_char_or_byte(ch_start,
1564                                             ch,
1565                                             // ascii_only =
1566                                             true,
1567                                             '"');
1568         }
1569         let id = if valid {
1570             self.name_from(start)
1571         } else {
1572             Symbol::intern("??")
1573         };
1574         self.bump();
1575         return token::ByteStr(id);
1576     }
1577
1578     fn scan_raw_byte_string(&mut self) -> token::Lit {
1579         let start_bpos = self.pos;
1580         self.bump();
1581         let mut hash_count = 0;
1582         while self.ch_is('#') {
1583             self.bump();
1584             hash_count += 1;
1585         }
1586
1587         if self.is_eof() {
1588             let pos = self.pos;
1589             panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"));
1590         } else if !self.ch_is('"') {
1591             let pos = self.pos;
1592             let ch = self.ch.unwrap();
1593             panic!(self.fatal_span_char(start_bpos,
1594                                         pos,
1595                                         "found invalid character; only `#` is allowed in raw \
1596                                          string delimitation",
1597                                         ch));
1598         }
1599         self.bump();
1600         let content_start_bpos = self.pos;
1601         let mut content_end_bpos;
1602         'outer: loop {
1603             match self.ch {
1604                 None => {
1605                     let pos = self.pos;
1606                     panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"))
1607                 }
1608                 Some('"') => {
1609                     content_end_bpos = self.pos;
1610                     for _ in 0..hash_count {
1611                         self.bump();
1612                         if !self.ch_is('#') {
1613                             continue 'outer;
1614                         }
1615                     }
1616                     break;
1617                 }
1618                 Some(c) => {
1619                     if c > '\x7F' {
1620                         let pos = self.pos;
1621                         self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
1622                     }
1623                 }
1624             }
1625             self.bump();
1626         }
1627         self.bump();
1628         return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
1629                                  hash_count);
1630     }
1631 }
1632
1633 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1634 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1635 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1636     c.map_or(false, Pattern_White_Space)
1637 }
1638
/// Returns `true` if `c` holds a character in the inclusive range
/// `lo ..= hi`; returns `false` for `None`.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1645
1646 fn is_dec_digit(c: Option<char>) -> bool {
1647     return in_range(c, '0', '9');
1648 }
1649
1650 pub fn is_doc_comment(s: &str) -> bool {
1651     let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1652               s.starts_with("//!");
1653     debug!("is {:?} a doc comment? {}", s, res);
1654     res
1655 }
1656
1657 pub fn is_block_doc_comment(s: &str) -> bool {
1658     // Prevent `/**/` from being parsed as a doc comment
1659     let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1660                s.starts_with("/*!")) && s.len() >= 5;
1661     debug!("is {:?} a doc comment? {}", s, res);
1662     res
1663 }
1664
1665 fn ident_start(c: Option<char>) -> bool {
1666     let c = match c {
1667         Some(c) => c,
1668         None => return false,
1669     };
1670
1671     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1672 }
1673
1674 fn ident_continue(c: Option<char>) -> bool {
1675     let c = match c {
1676         Some(c) => c,
1677         None => return false,
1678     };
1679
1680     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1681     (c > '\x7f' && c.is_xid_continue())
1682 }
1683
#[cfg(test)]
mod tests {
    use super::*;

    use ast::{Ident, CrateConfig};
    use symbol::Symbol;
    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use feature_gate::UnstableFeatures;
    use parse::token;
    use std::cell::RefCell;
    use std::collections::HashSet;
    use std::io;
    use std::rc::Rc;

    // build a parse session over `cm` whose diagnostics go to `io::sink()`
    fn mk_sess(cm: Rc<CodeMap>) -> ParseSess {
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm.clone()));
        let handler = errors::Handler::with_emitter(true, false, Box::new(emitter));
        ParseSess {
            span_diagnostic: handler,
            unstable_features: UnstableFeatures::from_environment(),
            config: CrateConfig::new(),
            included_mod_stack: RefCell::new(Vec::new()),
            code_map: cm,
            missing_fragment_specifiers: RefCell::new(HashSet::new()),
        }
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap, sess: &'a ParseSess, teststr: String) -> StringReader<'a> {
        let filemap = cm.new_filemap("zebra.rs".to_string(), None, teststr);
        StringReader::new(sess, filemap)
    }

    // lex `src` in a fresh session and return the first token it produces
    fn first_token(src: &str) -> token::Token {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        // bind the token so the reader temporary is dropped before `sh`
        let tok = setup(&cm, &sh, src.to_string()).next_token().tok;
        tok
    }

    // lex `src` in a fresh session and compare its leading tokens to `expected`
    fn check_source(src: &str, expected: Vec<token::Token>) {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, src.to_string()), expected);
    }

    #[test]
    fn t1() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let src = "/* my source file */ fn main() { println!(\"zebra\"); }\n".to_string();
        let mut reader = setup(&cm, &sh, src);

        assert_eq!(reader.next_token().tok, token::Comment);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        let expected_fn = TokenAndSpan {
            tok: token::Ident(Ident::from_str("fn")),
            sp: Span {
                lo: BytePos(21),
                hi: BytePos(23),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(reader.next_token(), expected_fn);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        // the 'main' id is already read:
        assert_eq!(reader.pos, BytePos(28));

        // read another token:
        let expected_main = TokenAndSpan {
            tok: token::Ident(Ident::from_str("main")),
            sp: Span {
                lo: BytePos(24),
                hi: BytePos(28),
                expn_id: NO_EXPANSION,
            },
        };
        assert_eq!(reader.next_token(), expected_main);

        // the lparen is already read:
        assert_eq!(reader.pos, BytePos(29))
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for expected_tok in expected {
            let actual_tok = string_reader.next_token().tok;
            assert_eq!(actual_tok, expected_tok);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str) -> token::Token {
        let ident = Ident::from_str(id);
        token::Ident(ident)
    }

    #[test]
    fn doublecolonparsing() {
        check_source("a b",
                     vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_2() {
        check_source("a::b",
                     vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_3() {
        check_source("a ::b",
                     vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_4() {
        check_source("a:: b",
                     vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
    }

    #[test]
    fn character_a() {
        assert_eq!(first_token("'a'"),
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    #[test]
    fn character_space() {
        assert_eq!(first_token("' '"),
                   token::Literal(token::Char(Symbol::intern(" ")), None));
    }

    #[test]
    fn character_escaped() {
        assert_eq!(first_token("'\\n'"),
                   token::Literal(token::Char(Symbol::intern("\\n")), None));
    }

    #[test]
    fn lifetime_name() {
        assert_eq!(first_token("'abc"),
                   token::Lifetime(Ident::from_str("'abc")));
    }

    #[test]
    fn raw_string() {
        assert_eq!(first_token("r###\"\"#a\\b\x00c\"\"###"),
                   token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
    }

    #[test]
    fn literal_suffixes() {
        // one session is shared by every literal lexed in this test
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        macro_rules! check_lit {
            ($src: expr, $variant: ident, $contents: expr) => {{
                // suffix glued directly onto the literal:
                let glued = setup(&cm, &sh, format!("{}suffix", $src)).next_token().tok;
                assert_eq!(glued,
                           token::Literal(token::$variant(Symbol::intern($contents)),
                                          Some(Symbol::intern("suffix"))));
                // with a whitespace separator:
                let spaced = setup(&cm, &sh, format!("{} suffix", $src)).next_token().tok;
                assert_eq!(spaced,
                           token::Literal(token::$variant(Symbol::intern($contents)),
                                          None));
            }}
        }

        check_lit!("'a'", Char, "a");
        check_lit!("b'a'", Byte, "a");
        check_lit!("\"a\"", Str_, "a");
        check_lit!("b\"a\"", ByteStr, "a");
        check_lit!("1234", Integer, "1234");
        check_lit!("0b101", Integer, "0b101");
        check_lit!("0xABC", Integer, "0xABC");
        check_lit!("1.0", Float, "1.0");
        check_lit!("1.0e10", Float, "1.0e10");

        let int_tok = setup(&cm, &sh, "2us".to_string()).next_token().tok;
        assert_eq!(int_tok,
                   token::Literal(token::Integer(Symbol::intern("2")),
                                  Some(Symbol::intern("us"))));

        let raw_tok = setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok;
        assert_eq!(raw_tok,
                   token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));

        let raw_byte_tok = setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok;
        assert_eq!(raw_byte_tok,
                   token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
    }

    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        // four slashes make an ordinary comment, not a doc comment
        assert!(!is_doc_comment("////"));
    }

    #[test]
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        // the whole nested comment must come back as a single token
        if lexer.next_token().tok != token::Comment {
            panic!("expected a comment!");
        }
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    }

    #[test]
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new());
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
        let first = lexer.next_token();
        assert_eq!(first.tok, token::Comment);
        // the span stops before the `\r\n`
        assert_eq!(first.sp, ::syntax_pos::mk_sp(BytePos(0), BytePos(7)));
        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(Symbol::intern("/// test")));
    }
}