src/libsyntax/parse/lexer/mod.rs

   1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use ast::{self, Ident};
  12 use syntax_pos::{self, BytePos, CharPos, Pos, Span, NO_EXPANSION};
  13 use codemap::{CodeMap, FilePathMapping};
  14 use errors::{FatalError, DiagnosticBuilder};
  15 use parse::{token, ParseSess};
  16 use str::char_at;
  17 use symbol::{Symbol, keywords};
  18 use std_unicode::property::Pattern_White_Space;
  19
  20 use std::borrow::Cow;
  21 use std::char;
  22 use std::mem::replace;
  23 use std::rc::Rc;
  24
  25 pub mod comments;
  26 mod tokentrees;
  27 mod unicode_chars;
  28
/// A lexed token together with the span of source text it was produced from.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct TokenAndSpan {
    /// The token itself.
    pub tok: token::Token,
    /// The span associated with `tok`.
    pub sp: Span,
}
  34
  35 impl Default for TokenAndSpan {
  36     fn default() -> Self {
  37         TokenAndSpan { tok: token::Underscore, sp: syntax_pos::DUMMY_SP }
  38     }
  39 }
  40
/// Lexer state for turning the source text of a single `FileMap` into tokens.
pub struct StringReader<'a> {
    /// The parse session; its `span_diagnostic` handler receives lexer errors.
    pub sess: &'a ParseSess,
    /// The absolute offset within the codemap of the next character to read
    pub next_pos: BytePos,
    /// The absolute offset within the codemap of the current character
    pub pos: BytePos,
    /// The column of the next character to read
    pub col: CharPos,
    /// The current character (which has been read from self.pos)
    pub ch: Option<char>,
    /// The file whose source text is being lexed.
    pub filemap: Rc<syntax_pos::FileMap>,
    /// If Some, stop reading the source at this position (inclusive).
    // NOTE(review): `bump` stops yielding characters as soon as the next
    // offset reaches this position, which looks exclusive rather than
    // inclusive -- confirm which is intended.
    pub terminator: Option<BytePos>,
    /// Whether to record new-lines and multibyte chars in filemap.
    /// This is only necessary the first time a filemap is lexed.
    /// If part of a filemap is being re-lexed, this should be set to false.
    pub save_new_lines_and_multibyte: bool,
    // cached:
    /// The token that the next call to `try_next_token` hands out; refreshed
    /// by `advance_token`.
    pub peek_tok: token::Token,
    /// The span belonging to `peek_tok`.
    pub peek_span: Span,
    /// Diagnostics that `emit_fatal_errors` emits and then clears.
    pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
    // cache a direct reference to the source text, so that we don't have to
    // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
    source_text: Rc<String>,
    /// The most recent token returned by `try_real_token`.
    token: token::Token,
    /// The span of `token`.
    span: Span,
    /// Stack of open delimiters and their spans. Used for error message.
    open_braces: Vec<(token::DelimToken, Span)>,
}
  70
  71 fn mk_sp(lo: BytePos, hi: BytePos) -> Span {
  72     Span { lo: lo, hi: hi, ctxt: NO_EXPANSION }
  73 }
  74
  75 impl<'a> StringReader<'a> {
  76     fn next_token(&mut self) -> TokenAndSpan {
  77         let res = self.try_next_token();
  78         self.unwrap_or_abort(res)
  79     }
  80     fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
  81         match res {
  82             Ok(tok) => tok,
  83             Err(_) => {
  84                 self.emit_fatal_errors();
  85                 panic!(FatalError);
  86             }
  87         }
  88     }
  89     fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
  90         let mut t = self.try_next_token()?;
  91         loop {
  92             match t.tok {
  93                 token::Whitespace | token::Comment | token::Shebang(_) => {
  94                     t = self.try_next_token()?;
  95                 }
  96                 _ => break,
  97             }
  98         }
  99         self.token = t.tok.clone();
 100         self.span = t.sp;
 101         Ok(t)
 102     }
 103     pub fn real_token(&mut self) -> TokenAndSpan {
 104         let res = self.try_real_token();
 105         self.unwrap_or_abort(res)
 106     }
 107     fn is_eof(&self) -> bool {
 108         if self.ch.is_none() {
 109             return true;
 110         }
 111
 112         match self.terminator {
 113             Some(t) => self.next_pos > t,
 114             None => false,
 115         }
 116     }
 117     /// Return the next token. EFFECT: advances the string_reader.
 118     pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
 119         assert!(self.fatal_errs.is_empty());
 120         let ret_val = TokenAndSpan {
 121             tok: replace(&mut self.peek_tok, token::Underscore),
 122             sp: self.peek_span,
 123         };
 124         self.advance_token()?;
 125         Ok(ret_val)
 126     }
 127     fn fatal(&self, m: &str) -> FatalError {
 128         self.fatal_span(self.peek_span, m)
 129     }
 130     pub fn emit_fatal_errors(&mut self) {
 131         for err in &mut self.fatal_errs {
 132             err.emit();
 133         }
 134         self.fatal_errs.clear();
 135     }
 136     pub fn peek(&self) -> TokenAndSpan {
 137         // FIXME(pcwalton): Bad copy!
 138         TokenAndSpan {
 139             tok: self.peek_tok.clone(),
 140             sp: self.peek_span,
 141         }
 142     }
 143 }
 144
 145 impl<'a> StringReader<'a> {
 146     /// For comments.rs, which hackily pokes into next_pos and ch
 147     pub fn new_raw<'b>(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
 148         let mut sr = StringReader::new_raw_internal(sess, filemap);
 149         sr.bump();
 150         sr
 151     }
 152
 153     fn new_raw_internal(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
 154         if filemap.src.is_none() {
 155             sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
 156                                               filemap.name));
 157         }
 158
 159         let source_text = (*filemap.src.as_ref().unwrap()).clone();
 160
 161         StringReader {
 162             sess: sess,
 163             next_pos: filemap.start_pos,
 164             pos: filemap.start_pos,
 165             col: CharPos(0),
 166             ch: Some('\n'),
 167             filemap: filemap,
 168             terminator: None,
 169             save_new_lines_and_multibyte: true,
 170             // dummy values; not read
 171             peek_tok: token::Eof,
 172             peek_span: syntax_pos::DUMMY_SP,
 173             source_text: source_text,
 174             fatal_errs: Vec::new(),
 175             token: token::Eof,
 176             span: syntax_pos::DUMMY_SP,
 177             open_braces: Vec::new(),
 178         }
 179     }
 180
 181     pub fn new(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
 182         let mut sr = StringReader::new_raw(sess, filemap);
 183         if let Err(_) = sr.advance_token() {
 184             sr.emit_fatal_errors();
 185             panic!(FatalError);
 186         }
 187         sr
 188     }
 189
 190     pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
 191         let begin = sess.codemap().lookup_byte_offset(span.lo);
 192         let end = sess.codemap().lookup_byte_offset(span.hi);
 193
 194         // Make the range zero-length if the span is invalid.
 195         if span.lo > span.hi || begin.fm.start_pos != end.fm.start_pos {
 196             span.hi = span.lo;
 197         }
 198
 199         let mut sr = StringReader::new_raw_internal(sess, begin.fm);
 200
 201         // Seek the lexer to the right byte range.
 202         sr.save_new_lines_and_multibyte = false;
 203         sr.next_pos = span.lo;
 204         sr.terminator = Some(span.hi);
 205
 206         sr.bump();
 207
 208         if let Err(_) = sr.advance_token() {
 209             sr.emit_fatal_errors();
 210             panic!(FatalError);
 211         }
 212         sr
 213     }
 214
 215     pub fn ch_is(&self, c: char) -> bool {
 216         self.ch == Some(c)
 217     }
 218
 219     /// Report a fatal lexical error with a given span.
 220     pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
 221         self.sess.span_diagnostic.span_fatal(sp, m)
 222     }
 223
 224     /// Report a lexical error with a given span.
 225     pub fn err_span(&self, sp: Span, m: &str) {
 226         self.sess.span_diagnostic.span_err(sp, m)
 227     }
 228
 229
 230     /// Report a fatal error spanning [`from_pos`, `to_pos`).
 231     fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
 232         self.fatal_span(mk_sp(from_pos, to_pos), m)
 233     }
 234
 235     /// Report a lexical error spanning [`from_pos`, `to_pos`).
 236     fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
 237         self.err_span(mk_sp(from_pos, to_pos), m)
 238     }
 239
 240     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
 241     /// escaped character to the error message
 242     fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
 243         let mut m = m.to_string();
 244         m.push_str(": ");
 245         for c in c.escape_default() {
 246             m.push(c)
 247         }
 248         self.fatal_span_(from_pos, to_pos, &m[..])
 249     }
 250     fn struct_fatal_span_char(&self,
 251                               from_pos: BytePos,
 252                               to_pos: BytePos,
 253                               m: &str,
 254                               c: char)
 255                               -> DiagnosticBuilder<'a> {
 256         let mut m = m.to_string();
 257         m.push_str(": ");
 258         for c in c.escape_default() {
 259             m.push(c)
 260         }
 261         self.sess.span_diagnostic.struct_span_fatal(mk_sp(from_pos, to_pos), &m[..])
 262     }
 263
 264     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
 265     /// escaped character to the error message
 266     fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
 267         let mut m = m.to_string();
 268         m.push_str(": ");
 269         for c in c.escape_default() {
 270             m.push(c)
 271         }
 272         self.err_span_(from_pos, to_pos, &m[..]);
 273     }
 274     fn struct_err_span_char(&self,
 275                             from_pos: BytePos,
 276                             to_pos: BytePos,
 277                             m: &str,
 278                             c: char)
 279                             -> DiagnosticBuilder<'a> {
 280         let mut m = m.to_string();
 281         m.push_str(": ");
 282         for c in c.escape_default() {
 283             m.push(c)
 284         }
 285         self.sess.span_diagnostic.struct_span_err(mk_sp(from_pos, to_pos), &m[..])
 286     }
 287
 288     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
 289     /// offending string to the error message
 290     fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
 291         m.push_str(": ");
 292         let from = self.byte_offset(from_pos).to_usize();
 293         let to = self.byte_offset(to_pos).to_usize();
 294         m.push_str(&self.source_text[from..to]);
 295         self.fatal_span_(from_pos, to_pos, &m[..])
 296     }
 297
 298     /// Advance peek_tok and peek_span to refer to the next token, and
 299     /// possibly update the interner.
 300     fn advance_token(&mut self) -> Result<(), ()> {
 301         match self.scan_whitespace_or_comment() {
 302             Some(comment) => {
 303                 self.peek_span = comment.sp;
 304                 self.peek_tok = comment.tok;
 305             }
 306             None => {
 307                 if self.is_eof() {
 308                     self.peek_tok = token::Eof;
 309                     self.peek_span = mk_sp(self.filemap.end_pos, self.filemap.end_pos);
 310                 } else {
 311                     let start_bytepos = self.pos;
 312                     self.peek_tok = self.next_token_inner()?;
 313                     self.peek_span = mk_sp(start_bytepos, self.pos);
 314                 };
 315             }
 316         }
 317         Ok(())
 318     }
 319
 320     fn byte_offset(&self, pos: BytePos) -> BytePos {
 321         (pos - self.filemap.start_pos)
 322     }
 323
 324     /// Calls `f` with a string slice of the source text spanning from `start`
 325     /// up to but excluding `self.pos`, meaning the slice does not include
 326     /// the character `self.ch`.
 327     pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
 328         where F: FnOnce(&str) -> T
 329     {
 330         self.with_str_from_to(start, self.pos, f)
 331     }
 332
 333     /// Create a Name from a given offset to the current offset, each
 334     /// adjusted 1 towards each other (assumes that on either side there is a
 335     /// single-byte delimiter).
 336     pub fn name_from(&self, start: BytePos) -> ast::Name {
 337         debug!("taking an ident from {:?} to {:?}", start, self.pos);
 338         self.with_str_from(start, Symbol::intern)
 339     }
 340
 341     /// As name_from, with an explicit endpoint.
 342     pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
 343         debug!("taking an ident from {:?} to {:?}", start, end);
 344         self.with_str_from_to(start, end, Symbol::intern)
 345     }
 346
 347     /// Calls `f` with a string slice of the source text spanning from `start`
 348     /// up to but excluding `end`.
 349     fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
 350         where F: FnOnce(&str) -> T
 351     {
 352         f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
 353     }
 354
    /// Converts CRLF to LF in the given string, raising an error on bare CR.
    ///
    /// `start` is the absolute position of the first byte of `s`; it is only
    /// used to compute the spans of the reported errors. `errmsg` is the
    /// message reported for each bare CR.
    ///
    /// When `s` contains no CRLF pair it is returned borrowed; otherwise a
    /// new `String` is built by the nested `translate_crlf_` helper.
    fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
        let mut i = 0;
        while i < s.len() {
            let ch = char_at(s, i);
            let next = i + ch.len_utf8();
            if ch == '\r' {
                if next < s.len() && char_at(s, next) == '\n' {
                    // First CRLF found: switch to the allocating slow path,
                    // resuming the scan at this CR.
                    return translate_crlf_(self, start, s, errmsg, i).into();
                }
                // Bare CR: report it and keep scanning.
                let pos = start + BytePos(i as u32);
                let end_pos = start + BytePos(next as u32);
                self.err_span_(pos, end_pos, errmsg);
            }
            i = next;
        }
        return s.into();

        // Slow path. `i` is the byte index at which scanning resumes; `j` is
        // the start of the pending chunk of `s` not yet copied into `buf`.
        fn translate_crlf_(rdr: &StringReader,
                           start: BytePos,
                           s: &str,
                           errmsg: &str,
                           mut i: usize)
                           -> String {
            let mut buf = String::with_capacity(s.len());
            let mut j = 0;
            while i < s.len() {
                let ch = char_at(s, i);
                let next = i + ch.len_utf8();
                if ch == '\r' {
                    // Copy everything up to the CR, then skip the CR itself.
                    if j < i {
                        buf.push_str(&s[j..i]);
                    }
                    j = next;
                    // A CR not followed by LF is reported here; note that it
                    // has already been left out of `buf` by the skip above.
                    if next >= s.len() || char_at(s, next) != '\n' {
                        let pos = start + BytePos(i as u32);
                        let end_pos = start + BytePos(next as u32);
                        rdr.err_span_(pos, end_pos, errmsg);
                    }
                }
                i = next;
            }
            // Copy the remaining tail after the last CR.
            if j < s.len() {
                buf.push_str(&s[j..]);
            }
            buf
        }
    }
 403
 404
 405     /// Advance the StringReader by one character. If a newline is
 406     /// discovered, add it to the FileMap's list of line start offsets.
 407     pub fn bump(&mut self) {
 408         let new_pos = self.next_pos;
 409         let new_byte_offset = self.byte_offset(new_pos).to_usize();
 410         let end = self.terminator.map_or(self.source_text.len(), |t| {
 411             self.byte_offset(t).to_usize()
 412         });
 413         if new_byte_offset < end {
 414             let old_ch_is_newline = self.ch.unwrap() == '\n';
 415             let new_ch = char_at(&self.source_text, new_byte_offset);
 416             let new_ch_len = new_ch.len_utf8();
 417
 418             self.ch = Some(new_ch);
 419             self.pos = new_pos;
 420             self.next_pos = new_pos + Pos::from_usize(new_ch_len);
 421             if old_ch_is_newline {
 422                 if self.save_new_lines_and_multibyte {
 423                     self.filemap.next_line(self.pos);
 424                 }
 425                 self.col = CharPos(0);
 426             } else {
 427                 self.col = self.col + CharPos(1);
 428             }
 429             if new_ch_len > 1 {
 430                 if self.save_new_lines_and_multibyte {
 431                     self.filemap.record_multibyte_char(self.pos, new_ch_len);
 432                 }
 433             }
 434         } else {
 435             self.ch = None;
 436             self.pos = new_pos;
 437         }
 438     }
 439
 440     pub fn nextch(&self) -> Option<char> {
 441         let offset = self.byte_offset(self.next_pos).to_usize();
 442         if offset < self.source_text.len() {
 443             Some(char_at(&self.source_text, offset))
 444         } else {
 445             None
 446         }
 447     }
 448
 449     pub fn nextch_is(&self, c: char) -> bool {
 450         self.nextch() == Some(c)
 451     }
 452
 453     pub fn nextnextch(&self) -> Option<char> {
 454         let offset = self.byte_offset(self.next_pos).to_usize();
 455         let s = &self.source_text[..];
 456         if offset >= s.len() {
 457             return None;
 458         }
 459         let next = offset + char_at(s, offset).len_utf8();
 460         if next < s.len() {
 461             Some(char_at(s, next))
 462         } else {
 463             None
 464         }
 465     }
 466
 467     pub fn nextnextch_is(&self, c: char) -> bool {
 468         self.nextnextch() == Some(c)
 469     }
 470
 471     /// Eats <XID_start><XID_continue>*, if possible.
 472     fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
 473         if !ident_start(self.ch) {
 474             return None;
 475         }
 476         let start = self.pos;
 477         while ident_continue(self.ch) {
 478             self.bump();
 479         }
 480
 481         self.with_str_from(start, |string| {
 482             if string == "_" {
 483                 None
 484             } else {
 485                 Some(Symbol::intern(string))
 486             }
 487         })
 488     }
 489
 490     /// PRECONDITION: self.ch is not whitespace
 491     /// Eats any kind of comment.
 492     fn scan_comment(&mut self) -> Option<TokenAndSpan> {
 493         if let Some(c) = self.ch {
 494             if c.is_whitespace() {
 495                 let msg = "called consume_any_line_comment, but there was whitespace";
 496                 self.sess.span_diagnostic.span_err(mk_sp(self.pos, self.pos), msg);
 497             }
 498         }
 499
 500         if self.ch_is('/') {
 501             match self.nextch() {
 502                 Some('/') => {
 503                     self.bump();
 504                     self.bump();
 505
 506                     // line comments starting with "///" or "//!" are doc-comments
 507                     let doc_comment = (self.ch_is('/') && !self.nextch_is('/')) || self.ch_is('!');
 508                     let start_bpos = self.pos - BytePos(2);
 509
 510                     while !self.is_eof() {
 511                         match self.ch.unwrap() {
 512                             '\n' => break,
 513                             '\r' => {
 514                                 if self.nextch_is('\n') {
 515                                     // CRLF
 516                                     break;
 517                                 } else if doc_comment {
 518                                     self.err_span_(self.pos,
 519                                                    self.next_pos,
 520                                                    "bare CR not allowed in doc-comment");
 521                                 }
 522                             }
 523                             _ => (),
 524                         }
 525                         self.bump();
 526                     }
 527
 528                     return if doc_comment {
 529                         self.with_str_from(start_bpos, |string| {
 530                             // comments with only more "/"s are not doc comments
 531                             let tok = if is_doc_comment(string) {
 532                                 token::DocComment(Symbol::intern(string))
 533                             } else {
 534                                 token::Comment
 535                             };
 536
 537                             Some(TokenAndSpan {
 538                                 tok: tok,
 539                                 sp: mk_sp(start_bpos, self.pos),
 540                             })
 541                         })
 542                     } else {
 543                         Some(TokenAndSpan {
 544                             tok: token::Comment,
 545                             sp: mk_sp(start_bpos, self.pos),
 546                         })
 547                     };
 548                 }
 549                 Some('*') => {
 550                     self.bump();
 551                     self.bump();
 552                     self.scan_block_comment()
 553                 }
 554                 _ => None,
 555             }
 556         } else if self.ch_is('#') {
 557             if self.nextch_is('!') {
 558
 559                 // Parse an inner attribute.
 560                 if self.nextnextch_is('[') {
 561                     return None;
 562                 }
 563
 564                 // I guess this is the only way to figure out if
 565                 // we're at the beginning of the file...
 566                 let cmap = CodeMap::new(FilePathMapping::empty());
 567                 cmap.files.borrow_mut().push(self.filemap.clone());
 568                 let loc = cmap.lookup_char_pos_adj(self.pos);
 569                 debug!("Skipping a shebang");
 570                 if loc.line == 1 && loc.col == CharPos(0) {
 571                     // FIXME: Add shebang "token", return it
 572                     let start = self.pos;
 573                     while !self.ch_is('\n') && !self.is_eof() {
 574                         self.bump();
 575                     }
 576                     return Some(TokenAndSpan {
 577                         tok: token::Shebang(self.name_from(start)),
 578                         sp: mk_sp(start, self.pos),
 579                     });
 580                 }
 581             }
 582             None
 583         } else {
 584             None
 585         }
 586     }
 587
 588     /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
 589     /// return None.
 590     fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
 591         match self.ch.unwrap_or('\0') {
 592             // # to handle shebang at start of file -- this is the entry point
 593             // for skipping over all "junk"
 594             '/' | '#' => {
 595                 let c = self.scan_comment();
 596                 debug!("scanning a comment {:?}", c);
 597                 c
 598             },
 599             c if is_pattern_whitespace(Some(c)) => {
 600                 let start_bpos = self.pos;
 601                 while is_pattern_whitespace(self.ch) {
 602                     self.bump();
 603                 }
 604                 let c = Some(TokenAndSpan {
 605                     tok: token::Whitespace,
 606                     sp: mk_sp(start_bpos, self.pos),
 607                 });
 608                 debug!("scanning whitespace: {:?}", c);
 609                 c
 610             }
 611             _ => None,
 612         }
 613     }
 614
    /// Might return a sugared-doc-attr
    ///
    /// Scans the remainder of a block comment. The caller must already have
    /// consumed the opening `/*`, so `self.pos` is two bytes past the start
    /// of the comment. Nested `/* ... */` pairs are tracked and the scan
    /// ends when the outermost one closes.
    ///
    /// Panics with a fatal error if the input ends before the comment is
    /// closed. In every other case the result is `Some`.
    fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
        // block comments starting with "/**" or "/*!" are doc-comments
        let is_doc_comment = self.ch_is('*') || self.ch_is('!');
        // Step back over the `/*` that was consumed by the caller.
        let start_bpos = self.pos - BytePos(2);

        // Current nesting depth; the already consumed opener counts as 1.
        let mut level: isize = 1;
        // Whether any '\r' was seen, so CRLF translation can be skipped when
        // it is not needed.
        let mut has_cr = false;
        while level > 0 {
            if self.is_eof() {
                let msg = if is_doc_comment {
                    "unterminated block doc-comment"
                } else {
                    "unterminated block comment"
                };
                let last_bpos = self.pos;
                panic!(self.fatal_span_(start_bpos, last_bpos, msg));
            }
            let n = self.ch.unwrap();
            match n {
                // Two-character delimiters: the extra bump here skips the
                // first character, the bump after the match skips the second.
                '/' if self.nextch_is('*') => {
                    level += 1;
                    self.bump();
                }
                '*' if self.nextch_is('/') => {
                    level -= 1;
                    self.bump();
                }
                '\r' => {
                    has_cr = true;
                }
                _ => (),
            }
            self.bump();
        }

        self.with_str_from(start_bpos, |string| {
            // but comments with only "*"s between two "/"s are not
            let tok = if is_block_doc_comment(string) {
                // Doc comments get CRLF normalised to LF; bare CRs are
                // reported by `translate_crlf`.
                let string = if has_cr {
                    self.translate_crlf(start_bpos,
                                        string,
                                        "bare CR not allowed in block doc-comment")
                } else {
                    string.into()
                };
                token::DocComment(Symbol::intern(&string[..]))
            } else {
                token::Comment
            };

            Some(TokenAndSpan {
                tok: tok,
                sp: mk_sp(start_bpos, self.pos),
            })
        })
    }
 672
 673     /// Scan through any digits (base `scan_radix`) or underscores,
 674     /// and return how many digits there were.
 675     ///
 676     /// `real_radix` represents the true radix of the number we're
 677     /// interested in, and errors will be emitted for any digits
 678     /// between `real_radix` and `scan_radix`.
 679     fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
 680         assert!(real_radix <= scan_radix);
 681         let mut len = 0;
 682         loop {
 683             let c = self.ch;
 684             if c == Some('_') {
 685                 debug!("skipping a _");
 686                 self.bump();
 687                 continue;
 688             }
 689             match c.and_then(|cc| cc.to_digit(scan_radix)) {
 690                 Some(_) => {
 691                     debug!("{:?} in scan_digits", c);
 692                     // check that the hypothetical digit is actually
 693                     // in range for the true radix
 694                     if c.unwrap().to_digit(real_radix).is_none() {
 695                         self.err_span_(self.pos,
 696                                        self.next_pos,
 697                                        &format!("invalid digit for a base {} literal", real_radix));
 698                     }
 699                     len += 1;
 700                     self.bump();
 701                 }
 702                 _ => return len,
 703             }
 704         }
 705     }
 706
    /// Lex a LIT_INTEGER or a LIT_FLOAT
    ///
    /// `c` is the first character of the literal; it is consumed by the
    /// initial `bump` below, so on entry it is expected to be the current
    /// character. Returns an `Integer` or `Float` literal whose name is the
    /// source text of the literal, except when no valid digits were found,
    /// in which case an error is reported and the integer `0` is returned.
    fn scan_number(&mut self, c: char) -> token::Lit {
        let num_digits;
        let mut base = 10;
        let start_bpos = self.pos;

        self.bump();

        if c == '0' {
            match self.ch.unwrap_or('\0') {
                // For the binary and octal prefixes all decimal digits are
                // scanned, so that out-of-range digits are reported by
                // `scan_digits` instead of ending the literal early.
                'b' => {
                    self.bump();
                    base = 2;
                    num_digits = self.scan_digits(2, 10);
                }
                'o' => {
                    self.bump();
                    base = 8;
                    num_digits = self.scan_digits(8, 10);
                }
                'x' => {
                    self.bump();
                    base = 16;
                    num_digits = self.scan_digits(16, 16);
                }
                '0'...'9' | '_' | '.' | 'e' | 'E' => {
                    // The `+ 1` accounts for the leading `0` consumed above.
                    num_digits = self.scan_digits(10, 10) + 1;
                }
                _ => {
                    // just a 0
                    return token::Integer(self.name_from(start_bpos));
                }
            }
        } else if c.is_digit(10) {
            // The `+ 1` accounts for the leading digit `c` consumed above.
            num_digits = self.scan_digits(10, 10) + 1;
        } else {
            num_digits = 0;
        }

        if num_digits == 0 {
            self.err_span_(start_bpos,
                           self.pos,
                           "no valid digits found for number");
            return token::Integer(Symbol::intern("0"));
        }

        // might be a float, but don't be greedy if this is actually an
        // integer literal followed by field/method access or a range pattern
        // (`0..2` and `12.foo()`)
        if self.ch_is('.') && !self.nextch_is('.') &&
           !self.nextch()
                .unwrap_or('\0')
                .is_xid_start() {
            // might have stuff after the ., and if it does, it needs to start
            // with a number
            self.bump();
            if self.ch.unwrap_or('\0').is_digit(10) {
                self.scan_digits(10, 10);
                self.scan_float_exponent();
            }
            let pos = self.pos;
            // NOTE(review): `check_float_base` is defined outside this view;
            // presumably it rejects floats written with a non-decimal base
            // -- confirm.
            self.check_float_base(start_bpos, pos, base);
            return token::Float(self.name_from(start_bpos));
        } else {
            // it might be a float if it has an exponent
            if self.ch_is('e') || self.ch_is('E') {
                self.scan_float_exponent();
                let pos = self.pos;
                self.check_float_base(start_bpos, pos, base);
                return token::Float(self.name_from(start_bpos));
            }
            // but we certainly have an integer!
            return token::Integer(self.name_from(start_bpos));
        }
    }
 782
    /// Scan over up to `n_digits` hex digits, stopping early at `delim`.
    ///
    /// Errors are reported when `delim` is reached before `n_digits` digits
    /// were read ("too short"), when a character is not a hex digit, when
    /// `below_0x7f_only` is set and the value is `0x80` or above, and when
    /// the accumulated value is not a valid `char`. Nothing in this function
    /// reports an escape that is too long.
    ///
    /// Panics with a fatal error if the input ends in the middle of the
    /// escape. Returns true if the escape was valid, false otherwise.
    fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
        debug!("scanning {} digits until {:?}", n_digits, delim);
        let start_bpos = self.pos;
        // Numeric value of the digits read so far.
        let mut accum_int = 0;

        let mut valid = true;
        for _ in 0..n_digits {
            if self.is_eof() {
                let last_bpos = self.pos;
                panic!(self.fatal_span_(start_bpos,
                                        last_bpos,
                                        "unterminated numeric character escape"));
            }
            if self.ch_is(delim) {
                let last_bpos = self.pos;
                self.err_span_(start_bpos,
                               last_bpos,
                               "numeric character escape is too short");
                valid = false;
                break;
            }
            let c = self.ch.unwrap_or('\x00');
            accum_int *= 16;
            // An invalid digit is reported, counted as 0, and scanning goes
            // on so that the rest of the escape is still consumed.
            accum_int += c.to_digit(16).unwrap_or_else(|| {
                self.err_span_char(self.pos,
                                   self.next_pos,
                                   "invalid character in numeric character escape",
                                   c);

                valid = false;
                0
            });
            self.bump();
        }

        if below_0x7f_only && accum_int >= 0x80 {
            self.err_span_(start_bpos,
                           self.pos,
                           "this form of character escape may only be used with characters in \
                            the range [\\x00-\\x7f]");
            valid = false;
        }

        // The value must also denote a valid `char`.
        match char::from_u32(accum_int) {
            Some(_) => valid,
            None => {
                let last_bpos = self.pos;
                self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
                false
            }
        }
    }
 837
 838     /// Scan for a single (possibly escaped) byte or char
 839     /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
 840     /// `start` is the position of `first_source_char`, which is already consumed.
 841     ///
 842     /// Returns true if there was a valid char/byte, false otherwise.
 843     fn scan_char_or_byte(&mut self,
 844                          start: BytePos,
 845                          first_source_char: char,
 846                          ascii_only: bool,
 847                          delim: char)
 848                          -> bool {
 849         match first_source_char {
 850             '\\' => {
 851                 // '\X' for some X must be a character constant:
 852                 let escaped = self.ch;
 853                 let escaped_pos = self.pos;
 854                 self.bump();
 855                 match escaped {
 856                     None => {}  // EOF here is an error that will be checked later.
 857                     Some(e) => {
 858                         return match e {
 859                             'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
 860                             'x' => self.scan_byte_escape(delim, !ascii_only),
 861                             'u' => {
 862                                 let valid = if self.ch_is('{') {
 863                                     self.scan_unicode_escape(delim) && !ascii_only
 864                                 } else {
 865                                     let span = mk_sp(start, self.pos);
 866                                     self.sess.span_diagnostic
 867                                         .struct_span_err(span, "incorrect unicode escape sequence")
 868                                         .span_help(span,
 869                                                    "format of unicode escape sequences is \
 870                                                     `\\u{…}`")
 871                                         .emit();
 872                                     false
 873                                 };
 874                                 if ascii_only {
 875                                     self.err_span_(start,
 876                                                    self.pos,
 877                                                    "unicode escape sequences cannot be used as a \
 878                                                     byte or in a byte string");
 879                                 }
 880                                 valid
 881
 882                             }
 883                             '\n' if delim == '"' => {
 884                                 self.consume_whitespace();
 885                                 true
 886                             }
 887                             '\r' if delim == '"' && self.ch_is('\n') => {
 888                                 self.consume_whitespace();
 889                                 true
 890                             }
 891                             c => {
 892                                 let pos = self.pos;
 893                                 let mut err = self.struct_err_span_char(escaped_pos,
 894                                                                         pos,
 895                                                                         if ascii_only {
 896                                                                             "unknown byte escape"
 897                                                                         } else {
 898                                                                             "unknown character \
 899                                                                              escape"
 900                                                                         },
 901                                                                         c);
 902                                 if e == '\r' {
 903                                     err.span_help(mk_sp(escaped_pos, pos),
 904                                                   "this is an isolated carriage return; consider \
 905                                                    checking your editor and version control \
 906                                                    settings");
 907                                 }
 908                                 if (e == '{' || e == '}') && !ascii_only {
 909                                     err.span_help(mk_sp(escaped_pos, pos),
 910                                                   "if used in a formatting string, curly braces \
 911                                                    are escaped with `{{` and `}}`");
 912                                 }
 913                                 err.emit();
 914                                 false
 915                             }
 916                         }
 917                     }
 918                 }
 919             }
 920             '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
 921                 let pos = self.pos;
 922                 self.err_span_char(start,
 923                                    pos,
 924                                    if ascii_only {
 925                                        "byte constant must be escaped"
 926                                    } else {
 927                                        "character constant must be escaped"
 928                                    },
 929                                    first_source_char);
 930                 return false;
 931             }
 932             '\r' => {
 933                 if self.ch_is('\n') {
 934                     self.bump();
 935                     return true;
 936                 } else {
 937                     self.err_span_(start,
 938                                    self.pos,
 939                                    "bare CR not allowed in string, use \\r instead");
 940                     return false;
 941                 }
 942             }
 943             _ => {
 944                 if ascii_only && first_source_char > '\x7F' {
 945                     let pos = self.pos;
 946                     self.err_span_(start,
 947                                    pos,
 948                                    "byte constant must be ASCII. Use a \\xHH escape for a \
 949                                     non-ASCII byte");
 950                     return false;
 951                 }
 952             }
 953         }
 954         true
 955     }
 956
 957     /// Scan over a \u{...} escape
 958     ///
 959     /// At this point, we have already seen the \ and the u, the { is the current character. We
 960     /// will read at least one digit, and up to 6, and pass over the }.
 961     fn scan_unicode_escape(&mut self, delim: char) -> bool {
 962         self.bump(); // past the {
 963         let start_bpos = self.pos;
 964         let mut count = 0;
 965         let mut accum_int = 0;
 966         let mut valid = true;
 967
 968         while !self.ch_is('}') && count <= 6 {
 969             let c = match self.ch {
 970                 Some(c) => c,
 971                 None => {
 972                     panic!(self.fatal_span_(start_bpos,
 973                                             self.pos,
 974                                             "unterminated unicode escape (found EOF)"));
 975                 }
 976             };
 977             accum_int *= 16;
 978             accum_int += c.to_digit(16).unwrap_or_else(|| {
 979                 if c == delim {
 980                     panic!(self.fatal_span_(self.pos,
 981                                             self.next_pos,
 982                                             "unterminated unicode escape (needed a `}`)"));
 983                 } else {
 984                     self.err_span_char(self.pos,
 985                                        self.next_pos,
 986                                        "invalid character in unicode escape",
 987                                        c);
 988                 }
 989                 valid = false;
 990                 0
 991             });
 992             self.bump();
 993             count += 1;
 994         }
 995
 996         if count > 6 {
 997             self.err_span_(start_bpos,
 998                            self.pos,
 999                            "overlong unicode escape (can have at most 6 hex digits)");
1000             valid = false;
1001         }
1002
1003         if valid && (char::from_u32(accum_int).is_none() || count == 0) {
1004             self.err_span_(start_bpos,
1005                            self.pos,
1006                            "invalid unicode character escape");
1007             valid = false;
1008         }
1009
1010         self.bump(); // past the ending }
1011         valid
1012     }
1013
1014     /// Scan over a float exponent.
1015     fn scan_float_exponent(&mut self) {
1016         if self.ch_is('e') || self.ch_is('E') {
1017             self.bump();
1018             if self.ch_is('-') || self.ch_is('+') {
1019                 self.bump();
1020             }
1021             if self.scan_digits(10, 10) == 0 {
1022                 self.err_span_(self.pos,
1023                                self.next_pos,
1024                                "expected at least one digit in exponent")
1025             }
1026         }
1027     }
1028
1029     /// Check that a base is valid for a floating literal, emitting a nice
1030     /// error if it isn't.
1031     fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1032         match base {
1033             16 => {
1034                 self.err_span_(start_bpos,
1035                                last_bpos,
1036                                "hexadecimal float literal is not supported")
1037             }
1038             8 => {
1039                 self.err_span_(start_bpos,
1040                                last_bpos,
1041                                "octal float literal is not supported")
1042             }
1043             2 => {
1044                 self.err_span_(start_bpos,
1045                                last_bpos,
1046                                "binary float literal is not supported")
1047             }
1048             _ => (),
1049         }
1050     }
1051
1052     fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1053         self.bump();
1054         if self.ch_is('=') {
1055             self.bump();
1056             return token::BinOpEq(op);
1057         } else {
1058             return token::BinOp(op);
1059         }
1060     }
1061
1062     /// Return the next token from the string, advances the input past that
1063     /// token, and updates the interner
1064     fn next_token_inner(&mut self) -> Result<token::Token, ()> {
1065         let c = self.ch;
1066         if ident_start(c) &&
1067            match (c.unwrap(), self.nextch(), self.nextnextch()) {
1068             // Note: r as in r" or r#" is part of a raw string literal,
1069             // b as in b' is part of a byte literal.
1070             // They are not identifiers, and are handled further down.
1071             ('r', Some('"'), _) |
1072             ('r', Some('#'), _) |
1073             ('b', Some('"'), _) |
1074             ('b', Some('\''), _) |
1075             ('b', Some('r'), Some('"')) |
1076             ('b', Some('r'), Some('#')) => false,
1077             _ => true,
1078         } {
1079             let start = self.pos;
1080             while ident_continue(self.ch) {
1081                 self.bump();
1082             }
1083
1084             return Ok(self.with_str_from(start, |string| {
1085                 if string == "_" {
1086                     token::Underscore
1087                 } else {
1088                     // FIXME: perform NFKC normalization here. (Issue #2253)
1089                     token::Ident(Ident::from_str(string))
1090                 }
1091             }));
1092         }
1093
1094         if is_dec_digit(c) {
1095             let num = self.scan_number(c.unwrap());
1096             let suffix = self.scan_optional_raw_name();
1097             debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1098             return Ok(token::Literal(num, suffix));
1099         }
1100
1101         match c.expect("next_token_inner called at EOF") {
1102             // One-byte tokens.
1103             ';' => {
1104                 self.bump();
1105                 return Ok(token::Semi);
1106             }
1107             ',' => {
1108                 self.bump();
1109                 return Ok(token::Comma);
1110             }
1111             '.' => {
1112                 self.bump();
1113                 return if self.ch_is('.') {
1114                     self.bump();
1115                     if self.ch_is('.') {
1116                         self.bump();
1117                         Ok(token::DotDotDot)
1118                     } else {
1119                         Ok(token::DotDot)
1120                     }
1121                 } else {
1122                     Ok(token::Dot)
1123                 };
1124             }
1125             '(' => {
1126                 self.bump();
1127                 return Ok(token::OpenDelim(token::Paren));
1128             }
1129             ')' => {
1130                 self.bump();
1131                 return Ok(token::CloseDelim(token::Paren));
1132             }
1133             '{' => {
1134                 self.bump();
1135                 return Ok(token::OpenDelim(token::Brace));
1136             }
1137             '}' => {
1138                 self.bump();
1139                 return Ok(token::CloseDelim(token::Brace));
1140             }
1141             '[' => {
1142                 self.bump();
1143                 return Ok(token::OpenDelim(token::Bracket));
1144             }
1145             ']' => {
1146                 self.bump();
1147                 return Ok(token::CloseDelim(token::Bracket));
1148             }
1149             '@' => {
1150                 self.bump();
1151                 return Ok(token::At);
1152             }
1153             '#' => {
1154                 self.bump();
1155                 return Ok(token::Pound);
1156             }
1157             '~' => {
1158                 self.bump();
1159                 return Ok(token::Tilde);
1160             }
1161             '?' => {
1162                 self.bump();
1163                 return Ok(token::Question);
1164             }
1165             ':' => {
1166                 self.bump();
1167                 if self.ch_is(':') {
1168                     self.bump();
1169                     return Ok(token::ModSep);
1170                 } else {
1171                     return Ok(token::Colon);
1172                 }
1173             }
1174
1175             '$' => {
1176                 self.bump();
1177                 return Ok(token::Dollar);
1178             }
1179
1180             // Multi-byte tokens.
1181             '=' => {
1182                 self.bump();
1183                 if self.ch_is('=') {
1184                     self.bump();
1185                     return Ok(token::EqEq);
1186                 } else if self.ch_is('>') {
1187                     self.bump();
1188                     return Ok(token::FatArrow);
1189                 } else {
1190                     return Ok(token::Eq);
1191                 }
1192             }
1193             '!' => {
1194                 self.bump();
1195                 if self.ch_is('=') {
1196                     self.bump();
1197                     return Ok(token::Ne);
1198                 } else {
1199                     return Ok(token::Not);
1200                 }
1201             }
1202             '<' => {
1203                 self.bump();
1204                 match self.ch.unwrap_or('\x00') {
1205                     '=' => {
1206                         self.bump();
1207                         return Ok(token::Le);
1208                     }
1209                     '<' => {
1210                         return Ok(self.binop(token::Shl));
1211                     }
1212                     '-' => {
1213                         self.bump();
1214                         match self.ch.unwrap_or('\x00') {
1215                             _ => {
1216                                 return Ok(token::LArrow);
1217                             }
1218                         }
1219                     }
1220                     _ => {
1221                         return Ok(token::Lt);
1222                     }
1223                 }
1224             }
1225             '>' => {
1226                 self.bump();
1227                 match self.ch.unwrap_or('\x00') {
1228                     '=' => {
1229                         self.bump();
1230                         return Ok(token::Ge);
1231                     }
1232                     '>' => {
1233                         return Ok(self.binop(token::Shr));
1234                     }
1235                     _ => {
1236                         return Ok(token::Gt);
1237                     }
1238                 }
1239             }
1240             '\'' => {
1241                 // Either a character constant 'a' OR a lifetime name 'abc
1242                 let start_with_quote = self.pos;
1243                 self.bump();
1244                 let start = self.pos;
1245
1246                 // the eof will be picked up by the final `'` check below
1247                 let c2 = self.ch.unwrap_or('\x00');
1248                 self.bump();
1249
1250                 // If the character is an ident start not followed by another single
1251                 // quote, then this is a lifetime name:
1252                 if ident_start(Some(c2)) && !self.ch_is('\'') {
1253                     while ident_continue(self.ch) {
1254                         self.bump();
1255                     }
1256                     // lifetimes shouldn't end with a single quote
1257                     // if we find one, then this is an invalid character literal
1258                     if self.ch_is('\'') {
1259                         panic!(self.fatal_span_verbose(
1260                                start_with_quote, self.next_pos,
1261                                String::from("character literal may only contain one codepoint")));
1262
1263                     }
1264
1265                     // Include the leading `'` in the real identifier, for macro
1266                     // expansion purposes. See #12512 for the gory details of why
1267                     // this is necessary.
1268                     let ident = self.with_str_from(start, |lifetime_name| {
1269                         Ident::from_str(&format!("'{}", lifetime_name))
1270                     });
1271
1272                     // Conjure up a "keyword checking ident" to make sure that
1273                     // the lifetime name is not a keyword.
1274                     let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1275                         Ident::from_str(lifetime_name)
1276                     });
1277                     let keyword_checking_token = &token::Ident(keyword_checking_ident);
1278                     let last_bpos = self.pos;
1279                     if keyword_checking_token.is_any_keyword() &&
1280                        !keyword_checking_token.is_keyword(keywords::Static) {
1281                         self.err_span_(start, last_bpos, "lifetimes cannot use keyword names");
1282                     }
1283
1284                     return Ok(token::Lifetime(ident));
1285                 }
1286
1287                 let valid = self.scan_char_or_byte(start,
1288                                                    c2,
1289                                                    // ascii_only =
1290                                                    false,
1291                                                    '\'');
1292
1293                 if !self.ch_is('\'') {
1294                     panic!(self.fatal_span_verbose(
1295                            start_with_quote, self.pos,
1296                            String::from("character literal may only contain one codepoint")));
1297                 }
1298
1299                 let id = if valid {
1300                     self.name_from(start)
1301                 } else {
1302                     Symbol::intern("0")
1303                 };
1304                 self.bump(); // advance ch past token
1305                 let suffix = self.scan_optional_raw_name();
1306                 return Ok(token::Literal(token::Char(id), suffix));
1307             }
1308             'b' => {
1309                 self.bump();
1310                 let lit = match self.ch {
1311                     Some('\'') => self.scan_byte(),
1312                     Some('"') => self.scan_byte_string(),
1313                     Some('r') => self.scan_raw_byte_string(),
1314                     _ => unreachable!(),  // Should have been a token::Ident above.
1315                 };
1316                 let suffix = self.scan_optional_raw_name();
1317                 return Ok(token::Literal(lit, suffix));
1318             }
1319             '"' => {
1320                 let start_bpos = self.pos;
1321                 let mut valid = true;
1322                 self.bump();
1323                 while !self.ch_is('"') {
1324                     if self.is_eof() {
1325                         let last_bpos = self.pos;
1326                         panic!(self.fatal_span_(start_bpos,
1327                                                 last_bpos,
1328                                                 "unterminated double quote string"));
1329                     }
1330
1331                     let ch_start = self.pos;
1332                     let ch = self.ch.unwrap();
1333                     self.bump();
1334                     valid &= self.scan_char_or_byte(ch_start,
1335                                                     ch,
1336                                                     // ascii_only =
1337                                                     false,
1338                                                     '"');
1339                 }
1340                 // adjust for the ASCII " at the start of the literal
1341                 let id = if valid {
1342                     self.name_from(start_bpos + BytePos(1))
1343                 } else {
1344                     Symbol::intern("??")
1345                 };
1346                 self.bump();
1347                 let suffix = self.scan_optional_raw_name();
1348                 return Ok(token::Literal(token::Str_(id), suffix));
1349             }
1350             'r' => {
1351                 let start_bpos = self.pos;
1352                 self.bump();
1353                 let mut hash_count = 0;
1354                 while self.ch_is('#') {
1355                     self.bump();
1356                     hash_count += 1;
1357                 }
1358
1359                 if self.is_eof() {
1360                     let last_bpos = self.pos;
1361                     panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1362                 } else if !self.ch_is('"') {
1363                     let last_bpos = self.pos;
1364                     let curr_char = self.ch.unwrap();
1365                     panic!(self.fatal_span_char(start_bpos,
1366                                                 last_bpos,
1367                                                 "found invalid character; only `#` is allowed \
1368                                                  in raw string delimitation",
1369                                                 curr_char));
1370                 }
1371                 self.bump();
1372                 let content_start_bpos = self.pos;
1373                 let mut content_end_bpos;
1374                 let mut valid = true;
1375                 'outer: loop {
1376                     if self.is_eof() {
1377                         let last_bpos = self.pos;
1378                         panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1379                     }
1380                     // if self.ch_is('"') {
1381                     // content_end_bpos = self.pos;
1382                     // for _ in 0..hash_count {
1383                     // self.bump();
1384                     // if !self.ch_is('#') {
1385                     // continue 'outer;
1386                     let c = self.ch.unwrap();
1387                     match c {
1388                         '"' => {
1389                             content_end_bpos = self.pos;
1390                             for _ in 0..hash_count {
1391                                 self.bump();
1392                                 if !self.ch_is('#') {
1393                                     continue 'outer;
1394                                 }
1395                             }
1396                             break;
1397                         }
1398                         '\r' => {
1399                             if !self.nextch_is('\n') {
1400                                 let last_bpos = self.pos;
1401                                 self.err_span_(start_bpos,
1402                                                last_bpos,
1403                                                "bare CR not allowed in raw string, use \\r \
1404                                                 instead");
1405                                 valid = false;
1406                             }
1407                         }
1408                         _ => (),
1409                     }
1410                     self.bump();
1411                 }
1412                 self.bump();
1413                 let id = if valid {
1414                     self.name_from_to(content_start_bpos, content_end_bpos)
1415                 } else {
1416                     Symbol::intern("??")
1417                 };
1418                 let suffix = self.scan_optional_raw_name();
1419                 return Ok(token::Literal(token::StrRaw(id, hash_count), suffix));
1420             }
1421             '-' => {
1422                 if self.nextch_is('>') {
1423                     self.bump();
1424                     self.bump();
1425                     return Ok(token::RArrow);
1426                 } else {
1427                     return Ok(self.binop(token::Minus));
1428                 }
1429             }
1430             '&' => {
1431                 if self.nextch_is('&') {
1432                     self.bump();
1433                     self.bump();
1434                     return Ok(token::AndAnd);
1435                 } else {
1436                     return Ok(self.binop(token::And));
1437                 }
1438             }
1439             '|' => {
1440                 match self.nextch() {
1441                     Some('|') => {
1442                         self.bump();
1443                         self.bump();
1444                         return Ok(token::OrOr);
1445                     }
1446                     _ => {
1447                         return Ok(self.binop(token::Or));
1448                     }
1449                 }
1450             }
1451             '+' => {
1452                 return Ok(self.binop(token::Plus));
1453             }
1454             '*' => {
1455                 return Ok(self.binop(token::Star));
1456             }
1457             '/' => {
1458                 return Ok(self.binop(token::Slash));
1459             }
1460             '^' => {
1461                 return Ok(self.binop(token::Caret));
1462             }
1463             '%' => {
1464                 return Ok(self.binop(token::Percent));
1465             }
1466             c => {
1467                 let last_bpos = self.pos;
1468                 let bpos = self.next_pos;
1469                 let mut err = self.struct_fatal_span_char(last_bpos,
1470                                                           bpos,
1471                                                           "unknown start of token",
1472                                                           c);
1473                 unicode_chars::check_for_substitution(&self, c, &mut err);
1474                 self.fatal_errs.push(err);
1475                 Err(())
1476             }
1477         }
1478     }
1479
1480     fn consume_whitespace(&mut self) {
1481         while is_pattern_whitespace(self.ch) && !self.is_eof() {
1482             self.bump();
1483         }
1484     }
1485
1486     fn read_to_eol(&mut self) -> String {
1487         let mut val = String::new();
1488         while !self.ch_is('\n') && !self.is_eof() {
1489             val.push(self.ch.unwrap());
1490             self.bump();
1491         }
1492         if self.ch_is('\n') {
1493             self.bump();
1494         }
1495         return val;
1496     }
1497
1498     fn read_one_line_comment(&mut self) -> String {
1499         let val = self.read_to_eol();
1500         assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1501                 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1502         return val;
1503     }
1504
1505     fn consume_non_eol_whitespace(&mut self) {
1506         while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
1507             self.bump();
1508         }
1509     }
1510
1511     fn peeking_at_comment(&self) -> bool {
1512         (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1513         // consider shebangs comments, but not inner attributes
1514         (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1515     }
1516
1517     fn scan_byte(&mut self) -> token::Lit {
1518         self.bump();
1519         let start = self.pos;
1520
1521         // the eof will be picked up by the final `'` check below
1522         let c2 = self.ch.unwrap_or('\x00');
1523         self.bump();
1524
1525         let valid = self.scan_char_or_byte(start,
1526                                            c2,
1527                                            // ascii_only =
1528                                            true,
1529                                            '\'');
1530         if !self.ch_is('\'') {
1531             // Byte offsetting here is okay because the
1532             // character before position `start` are an
1533             // ascii single quote and ascii 'b'.
1534             let pos = self.pos;
1535             panic!(self.fatal_span_verbose(start - BytePos(2),
1536                                            pos,
1537                                            "unterminated byte constant".to_string()));
1538         }
1539
1540         let id = if valid {
1541             self.name_from(start)
1542         } else {
1543             Symbol::intern("?")
1544         };
1545         self.bump(); // advance ch past token
1546         return token::Byte(id);
1547     }
1548
1549     fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1550         self.scan_hex_digits(2, delim, below_0x7f_only)
1551     }
1552
1553     fn scan_byte_string(&mut self) -> token::Lit {
1554         self.bump();
1555         let start = self.pos;
1556         let mut valid = true;
1557
1558         while !self.ch_is('"') {
1559             if self.is_eof() {
1560                 let pos = self.pos;
1561                 panic!(self.fatal_span_(start, pos, "unterminated double quote byte string"));
1562             }
1563
1564             let ch_start = self.pos;
1565             let ch = self.ch.unwrap();
1566             self.bump();
1567             valid &= self.scan_char_or_byte(ch_start,
1568                                             ch,
1569                                             // ascii_only =
1570                                             true,
1571                                             '"');
1572         }
1573         let id = if valid {
1574             self.name_from(start)
1575         } else {
1576             Symbol::intern("??")
1577         };
1578         self.bump();
1579         return token::ByteStr(id);
1580     }
1581
1582     fn scan_raw_byte_string(&mut self) -> token::Lit {
1583         let start_bpos = self.pos;
1584         self.bump();
1585         let mut hash_count = 0;
1586         while self.ch_is('#') {
1587             self.bump();
1588             hash_count += 1;
1589         }
1590
1591         if self.is_eof() {
1592             let pos = self.pos;
1593             panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"));
1594         } else if !self.ch_is('"') {
1595             let pos = self.pos;
1596             let ch = self.ch.unwrap();
1597             panic!(self.fatal_span_char(start_bpos,
1598                                         pos,
1599                                         "found invalid character; only `#` is allowed in raw \
1600                                          string delimitation",
1601                                         ch));
1602         }
1603         self.bump();
1604         let content_start_bpos = self.pos;
1605         let mut content_end_bpos;
1606         'outer: loop {
1607             match self.ch {
1608                 None => {
1609                     let pos = self.pos;
1610                     panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"))
1611                 }
1612                 Some('"') => {
1613                     content_end_bpos = self.pos;
1614                     for _ in 0..hash_count {
1615                         self.bump();
1616                         if !self.ch_is('#') {
1617                             continue 'outer;
1618                         }
1619                     }
1620                     break;
1621                 }
1622                 Some(c) => {
1623                     if c > '\x7F' {
1624                         let pos = self.pos;
1625                         self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
1626                     }
1627                 }
1628             }
1629             self.bump();
1630         }
1631         self.bump();
1632         return token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
1633                                  hash_count);
1634     }
1635 }
1636
1637 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1638 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1639 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1640     c.map_or(false, Pattern_White_Space)
1641 }
1642
/// Returns `true` if `c` holds a character within the inclusive range
/// `lo..hi`; `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
1649
1650 fn is_dec_digit(c: Option<char>) -> bool {
1651     return in_range(c, '0', '9');
1652 }
1653
1654 pub fn is_doc_comment(s: &str) -> bool {
1655     let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1656               s.starts_with("//!");
1657     debug!("is {:?} a doc comment? {}", s, res);
1658     res
1659 }
1660
1661 pub fn is_block_doc_comment(s: &str) -> bool {
1662     // Prevent `/**/` from being parsed as a doc comment
1663     let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1664                s.starts_with("/*!")) && s.len() >= 5;
1665     debug!("is {:?} a doc comment? {}", s, res);
1666     res
1667 }
1668
1669 fn ident_start(c: Option<char>) -> bool {
1670     let c = match c {
1671         Some(c) => c,
1672         None => return false,
1673     };
1674
1675     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1676 }
1677
1678 fn ident_continue(c: Option<char>) -> bool {
1679     let c = match c {
1680         Some(c) => c,
1681         None => return false,
1682     };
1683
1684     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1685     (c > '\x7f' && c.is_xid_continue())
1686 }
1687
#[cfg(test)]
mod tests {
    use super::*;

    use ast::{Ident, CrateConfig};
    use symbol::Symbol;
    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use feature_gate::UnstableFeatures;
    use parse::token;
    use std::cell::RefCell;
    use std::collections::HashSet;
    use std::io;
    use std::rc::Rc;

    // make a fresh code map with an empty path mapping
    fn mk_cm() -> Rc<CodeMap> {
        Rc::new(CodeMap::new(FilePathMapping::empty()))
    }

    // make a parse session whose diagnostics are written to a sink
    fn mk_sess(cm: Rc<CodeMap>) -> ParseSess {
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm.clone()));
        let handler = errors::Handler::with_emitter(true, false, Box::new(emitter));
        ParseSess {
            span_diagnostic: handler,
            unstable_features: UnstableFeatures::from_environment(),
            config: CrateConfig::new(),
            included_mod_stack: RefCell::new(Vec::new()),
            code_map: cm,
            missing_fragment_specifiers: RefCell::new(HashSet::new()),
        }
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap,
                 sess: &'a ParseSess,
                 teststr: String)
                 -> StringReader<'a> {
        let filemap = cm.new_filemap("zebra.rs".to_string(), teststr);
        StringReader::new(sess, filemap)
    }

    // make an unexpanded span covering `lo..hi`
    fn mk_span(lo: BytePos, hi: BytePos) -> Span {
        Span {
            lo: lo,
            hi: hi,
            ctxt: NO_EXPANSION,
        }
    }

    // make the token for an unsuffixed character literal with the given contents
    fn mk_char_lit(contents: &str) -> token::Token {
        token::Literal(token::Char(Symbol::intern(contents)), None)
    }

    #[test]
    fn t1() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let source = "/* my source file */ fn main() { println!(\"zebra\"); }\n".to_string();
        let mut reader = setup(&cm, &sess, source);

        assert_eq!(reader.next_token().tok, token::Comment);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        let fn_actual = reader.next_token();
        let fn_expected = TokenAndSpan {
            tok: mk_ident("fn"),
            sp: mk_span(BytePos(21), BytePos(23)),
        };
        assert_eq!(fn_actual, fn_expected);
        assert_eq!(reader.next_token().tok, token::Whitespace);

        // the 'main' id is already read:
        assert_eq!(reader.pos, BytePos(28));

        // read another token:
        let main_actual = reader.next_token();
        let main_expected = TokenAndSpan {
            tok: mk_ident("main"),
            sp: mk_span(BytePos(24), BytePos(28)),
        };
        assert_eq!(main_actual, main_expected);

        // the lparen is already read:
        assert_eq!(reader.pos, BytePos(29))
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for wanted in expected.iter() {
            let got = string_reader.next_token().tok;
            assert_eq!(&got, wanted);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str) -> token::Token {
        let ident = Ident::from_str(id);
        token::Ident(ident)
    }

    #[test]
    fn doublecolonparsing() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let reader = setup(&cm, &sess, "a b".to_string());
        check_tokenization(reader,
                           vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_2() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let reader = setup(&cm, &sess, "a::b".to_string());
        check_tokenization(reader,
                           vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_3() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let reader = setup(&cm, &sess, "a ::b".to_string());
        check_tokenization(reader,
                           vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
    }

    #[test]
    fn dcparsing_4() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let reader = setup(&cm, &sess, "a:: b".to_string());
        check_tokenization(reader,
                           vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
    }

    #[test]
    fn character_a() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let tok = setup(&cm, &sess, "'a'".to_string()).next_token().tok;
        assert_eq!(tok, mk_char_lit("a"));
    }

    #[test]
    fn character_space() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let tok = setup(&cm, &sess, "' '".to_string()).next_token().tok;
        assert_eq!(tok, mk_char_lit(" "));
    }

    #[test]
    fn character_escaped() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let tok = setup(&cm, &sess, "'\\n'".to_string()).next_token().tok;
        assert_eq!(tok, mk_char_lit("\\n"));
    }

    #[test]
    fn lifetime_name() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let tok = setup(&cm, &sess, "'abc".to_string()).next_token().tok;
        assert_eq!(tok, token::Lifetime(Ident::from_str("'abc")));
    }

    #[test]
    fn raw_string() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let tok = setup(&cm, &sess, "r###\"\"#a\\b\x00c\"\"###".to_string())
            .next_token()
            .tok;
        let contents = Symbol::intern("\"#a\\b\x00c\"");
        assert_eq!(tok, token::Literal(token::StrRaw(contents, 3), None));
    }

    #[test]
    fn literal_suffixes() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());

        // lex `$src` once with `suffix` glued on and once with a space in
        // between; only the glued form may carry the suffix
        macro_rules! check_lit {
            ($src: expr, $variant: ident, $contents: expr) => {{
                assert_eq!(setup(&cm, &sess, format!("{}suffix", $src)).next_token().tok,
                           token::Literal(token::$variant(Symbol::intern($contents)),
                                          Some(Symbol::intern("suffix"))));
                // with a whitespace separator:
                assert_eq!(setup(&cm, &sess, format!("{} suffix", $src)).next_token().tok,
                           token::Literal(token::$variant(Symbol::intern($contents)),
                                          None));
            }}
        }

        check_lit!("'a'", Char, "a");
        check_lit!("b'a'", Byte, "a");
        check_lit!("\"a\"", Str_, "a");
        check_lit!("b\"a\"", ByteStr, "a");
        check_lit!("1234", Integer, "1234");
        check_lit!("0b101", Integer, "0b101");
        check_lit!("0xABC", Integer, "0xABC");
        check_lit!("1.0", Float, "1.0");
        check_lit!("1.0e10", Float, "1.0e10");

        let int_tok = setup(&cm, &sess, "2us".to_string()).next_token().tok;
        assert_eq!(int_tok,
                   token::Literal(token::Integer(Symbol::intern("2")),
                                  Some(Symbol::intern("us"))));

        let raw_tok = setup(&cm, &sess, "r###\"raw\"###suffix".to_string()).next_token().tok;
        assert_eq!(raw_tok,
                   token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));

        let raw_byte_tok = setup(&cm, &sess, "br###\"raw\"###suffix".to_string())
            .next_token()
            .tok;
        assert_eq!(raw_byte_tok,
                   token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
    }

    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    #[test]
    fn nested_block_comments() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sess, "/* /* */ */'a'".to_string());
        if lexer.next_token().tok != token::Comment {
            panic!("expected a comment!");
        }
        assert_eq!(lexer.next_token().tok, mk_char_lit("a"));
    }

    #[test]
    fn crlf_comments() {
        let cm = mk_cm();
        let sess = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sess, "// test\r\n/// test\r\n".to_string());

        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        // the comment's span must stop in front of the `\r\n`
        assert_eq!(comment.sp.lo, BytePos(0));
        assert_eq!(comment.sp.hi, BytePos(7));

        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(Symbol::intern("/// test")));
    }
}