src/libsyntax/parse/lexer.rs

   1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 use ast;
  12 use codemap::{BytePos, CharPos, CodeMap, Pos, Span};
  13 use codemap;
  14 use diagnostic::SpanHandler;
  15 use ext::tt::transcribe::tt_next_token;
  16 use parse::token;
  17 use parse::token::{str_to_ident};
  18
  19 use std::char;
  20 use std::mem::replace;
  21 use std::num::from_str_radix;
  22 use std::rc::Rc;
  23 use std::str;
  24 use std::strbuf::StrBuf;
  25
  26 pub use ext::tt::transcribe::{TtReader, new_tt_reader};
  27
  28 pub trait Reader {
  29     fn is_eof(&self) -> bool;
  30     fn next_token(&mut self) -> TokenAndSpan;
  31     fn fatal(&self, StrBuf) -> !;
  32     fn span_diag<'a>(&'a self) -> &'a SpanHandler;
  33     fn peek(&self) -> TokenAndSpan;
  34 }
  35
  36 #[deriving(Clone, Eq, Show)]
  37 pub struct TokenAndSpan {
  38     pub tok: token::Token,
  39     pub sp: Span,
  40 }
  41
  42 pub struct StringReader<'a> {
  43     pub span_diagnostic: &'a SpanHandler,
  44     // The absolute offset within the codemap of the next character to read
  45     pub pos: BytePos,
  46     // The absolute offset within the codemap of the last character read(curr)
  47     pub last_pos: BytePos,
  48     // The column of the next character to read
  49     pub col: CharPos,
  50     // The last character to be read
  51     pub curr: Option<char>,
  52     pub filemap: Rc<codemap::FileMap>,
  53     /* cached: */
  54     pub peek_tok: token::Token,
  55     pub peek_span: Span,
  56 }
  57
  58 impl<'a> StringReader<'a> {
  59     pub fn curr_is(&self, c: char) -> bool {
  60         self.curr == Some(c)
  61     }
  62 }
  63
  64 pub fn new_string_reader<'a>(span_diagnostic: &'a SpanHandler,
  65                              filemap: Rc<codemap::FileMap>)
  66                              -> StringReader<'a> {
  67     let mut r = new_low_level_string_reader(span_diagnostic, filemap);
  68     string_advance_token(&mut r); /* fill in peek_* */
  69     r
  70 }
  71
  72 /* For comments.rs, which hackily pokes into 'pos' and 'curr' */
  73 pub fn new_low_level_string_reader<'a>(span_diagnostic: &'a SpanHandler,
  74                                        filemap: Rc<codemap::FileMap>)
  75                                        -> StringReader<'a> {
  76     // Force the initial reader bump to start on a fresh line
  77     let initial_char = '\n';
  78     let mut r = StringReader {
  79         span_diagnostic: span_diagnostic,
  80         pos: filemap.start_pos,
  81         last_pos: filemap.start_pos,
  82         col: CharPos(0),
  83         curr: Some(initial_char),
  84         filemap: filemap,
  85         /* dummy values; not read */
  86         peek_tok: token::EOF,
  87         peek_span: codemap::DUMMY_SP,
  88     };
  89     bump(&mut r);
  90     r
  91 }
  92
  93 impl<'a> Reader for StringReader<'a> {
  94     fn is_eof(&self) -> bool { is_eof(self) }
  95     // return the next token. EFFECT: advances the string_reader.
  96     fn next_token(&mut self) -> TokenAndSpan {
  97         let ret_val = TokenAndSpan {
  98             tok: replace(&mut self.peek_tok, token::UNDERSCORE),
  99             sp: self.peek_span,
 100         };
 101         string_advance_token(self);
 102         ret_val
 103     }
 104     fn fatal(&self, m: StrBuf) -> ! {
 105         self.span_diagnostic.span_fatal(self.peek_span, m.as_slice())
 106     }
 107     fn span_diag<'a>(&'a self) -> &'a SpanHandler { self.span_diagnostic }
 108     fn peek(&self) -> TokenAndSpan {
 109         // FIXME(pcwalton): Bad copy!
 110         TokenAndSpan {
 111             tok: self.peek_tok.clone(),
 112             sp: self.peek_span.clone(),
 113         }
 114     }
 115 }
 116
 117 impl<'a> Reader for TtReader<'a> {
 118     fn is_eof(&self) -> bool {
 119         self.cur_tok == token::EOF
 120     }
 121     fn next_token(&mut self) -> TokenAndSpan {
 122         let r = tt_next_token(self);
 123         debug!("TtReader: r={:?}", r);
 124         r
 125     }
 126     fn fatal(&self, m: StrBuf) -> ! {
 127         self.sp_diag.span_fatal(self.cur_span, m.as_slice());
 128     }
 129     fn span_diag<'a>(&'a self) -> &'a SpanHandler { self.sp_diag }
 130     fn peek(&self) -> TokenAndSpan {
 131         TokenAndSpan {
 132             tok: self.cur_tok.clone(),
 133             sp: self.cur_span.clone(),
 134         }
 135     }
 136 }
 137
 138 // report a lexical error spanning [`from_pos`, `to_pos`)
 139 fn fatal_span(rdr: &mut StringReader,
 140               from_pos: BytePos,
 141               to_pos: BytePos,
 142               m: StrBuf)
 143            -> ! {
 144     rdr.peek_span = codemap::mk_sp(from_pos, to_pos);
 145     rdr.fatal(m);
 146 }
 147
 148 // report a lexical error spanning [`from_pos`, `to_pos`), appending an
 149 // escaped character to the error message
 150 fn fatal_span_char(rdr: &mut StringReader,
 151                    from_pos: BytePos,
 152                    to_pos: BytePos,
 153                    m: StrBuf,
 154                    c: char)
 155                 -> ! {
 156     let mut m = m;
 157     m.push_str(": ");
 158     char::escape_default(c, |c| m.push_char(c));
 159     fatal_span(rdr, from_pos, to_pos, m.into_strbuf());
 160 }
 161
 162 // report a lexical error spanning [`from_pos`, `to_pos`), appending the
 163 // offending string to the error message
 164 fn fatal_span_verbose(rdr: &mut StringReader,
 165                       from_pos: BytePos,
 166                       to_pos: BytePos,
 167                       m: StrBuf)
 168                    -> ! {
 169     let mut m = m;
 170     m.push_str(": ");
 171     let from = byte_offset(rdr, from_pos).to_uint();
 172     let to = byte_offset(rdr, to_pos).to_uint();
 173     m.push_str(rdr.filemap.src.as_slice().slice(from, to));
 174     fatal_span(rdr, from_pos, to_pos, m);
 175 }
 176
 177 // EFFECT: advance peek_tok and peek_span to refer to the next token.
 178 // EFFECT: update the interner, maybe.
 179 fn string_advance_token(r: &mut StringReader) {
 180     match consume_whitespace_and_comments(r) {
 181         Some(comment) => {
 182             r.peek_span = comment.sp;
 183             r.peek_tok = comment.tok;
 184         },
 185         None => {
 186             if is_eof(r) {
 187                 r.peek_tok = token::EOF;
 188             } else {
 189                 let start_bytepos = r.last_pos;
 190                 r.peek_tok = next_token_inner(r);
 191                 r.peek_span = codemap::mk_sp(start_bytepos,
 192                                              r.last_pos);
 193             };
 194         }
 195     }
 196 }
 197
 198 fn byte_offset(rdr: &StringReader, pos: BytePos) -> BytePos {
 199     (pos - rdr.filemap.start_pos)
 200 }
 201
 202 /// Calls `f` with a string slice of the source text spanning from `start`
 203 /// up to but excluding `rdr.last_pos`, meaning the slice does not include
 204 /// the character `rdr.curr`.
 205 pub fn with_str_from<T>(
 206                      rdr: &StringReader,
 207                      start: BytePos,
 208                      f: |s: &str| -> T)
 209                      -> T {
 210     with_str_from_to(rdr, start, rdr.last_pos, f)
 211 }
 212
 213 /// Calls `f` with astring slice of the source text spanning from `start`
 214 /// up to but excluding `end`.
 215 fn with_str_from_to<T>(
 216                     rdr: &StringReader,
 217                     start: BytePos,
 218                     end: BytePos,
 219                     f: |s: &str| -> T)
 220                     -> T {
 221     f(rdr.filemap.src.as_slice().slice(
 222             byte_offset(rdr, start).to_uint(),
 223             byte_offset(rdr, end).to_uint()))
 224 }
 225
 226 // EFFECT: advance the StringReader by one character. If a newline is
 227 // discovered, add it to the FileMap's list of line start offsets.
 228 pub fn bump(rdr: &mut StringReader) {
 229     rdr.last_pos = rdr.pos;
 230     let current_byte_offset = byte_offset(rdr, rdr.pos).to_uint();
 231     if current_byte_offset < rdr.filemap.src.len() {
 232         assert!(rdr.curr.is_some());
 233         let last_char = rdr.curr.unwrap();
 234         let next = rdr.filemap
 235                       .src
 236                       .as_slice()
 237                       .char_range_at(current_byte_offset);
 238         let byte_offset_diff = next.next - current_byte_offset;
 239         rdr.pos = rdr.pos + Pos::from_uint(byte_offset_diff);
 240         rdr.curr = Some(next.ch);
 241         rdr.col = rdr.col + CharPos(1u);
 242         if last_char == '\n' {
 243             rdr.filemap.next_line(rdr.last_pos);
 244             rdr.col = CharPos(0u);
 245         }
 246
 247         if byte_offset_diff > 1 {
 248             rdr.filemap.record_multibyte_char(rdr.last_pos, byte_offset_diff);
 249         }
 250     } else {
 251         rdr.curr = None;
 252     }
 253 }
 254
 255 pub fn is_eof(rdr: &StringReader) -> bool {
 256     rdr.curr.is_none()
 257 }
 258
 259 pub fn nextch(rdr: &StringReader) -> Option<char> {
 260     let offset = byte_offset(rdr, rdr.pos).to_uint();
 261     if offset < rdr.filemap.src.len() {
 262         Some(rdr.filemap.src.as_slice().char_at(offset))
 263     } else {
 264         None
 265     }
 266 }
 267 pub fn nextch_is(rdr: &StringReader, c: char) -> bool {
 268     nextch(rdr) == Some(c)
 269 }
 270
 271 pub fn nextnextch(rdr: &StringReader) -> Option<char> {
 272     let offset = byte_offset(rdr, rdr.pos).to_uint();
 273     let s = rdr.filemap.deref().src.as_slice();
 274     if offset >= s.len() { return None }
 275     let str::CharRange { next, .. } = s.char_range_at(offset);
 276     if next < s.len() {
 277         Some(s.char_at(next))
 278     } else {
 279         None
 280     }
 281 }
 282 pub fn nextnextch_is(rdr: &StringReader, c: char) -> bool {
 283     nextnextch(rdr) == Some(c)
 284 }
 285
 286 fn hex_digit_val(c: Option<char>) -> int {
 287     let d = c.unwrap_or('\x00');
 288
 289     if in_range(c, '0', '9') { return (d as int) - ('0' as int); }
 290     if in_range(c, 'a', 'f') { return (d as int) - ('a' as int) + 10; }
 291     if in_range(c, 'A', 'F') { return (d as int) - ('A' as int) + 10; }
 292     fail!();
 293 }
 294
 295 pub fn is_whitespace(c: Option<char>) -> bool {
 296     match c.unwrap_or('\x00') { // None can be null for now... it's not whitespace
 297         ' ' | '\n' | '\t' | '\r' => true,
 298         _ => false
 299     }
 300 }
 301
 302 fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
 303     match c {
 304         Some(c) => lo <= c && c <= hi,
 305         _ => false
 306     }
 307 }
 308
 309 fn is_dec_digit(c: Option<char>) -> bool { return in_range(c, '0', '9'); }
 310
 311 fn is_hex_digit(c: Option<char>) -> bool {
 312     return in_range(c, '0', '9') || in_range(c, 'a', 'f') ||
 313             in_range(c, 'A', 'F');
 314 }
 315
 316 // EFFECT: eats whitespace and comments.
 317 // returns a Some(sugared-doc-attr) if one exists, None otherwise.
 318 fn consume_whitespace_and_comments(rdr: &mut StringReader)
 319                                 -> Option<TokenAndSpan> {
 320     while is_whitespace(rdr.curr) { bump(rdr); }
 321     return consume_any_line_comment(rdr);
 322 }
 323
 324 pub fn is_line_non_doc_comment(s: &str) -> bool {
 325     s.starts_with("////")
 326 }
 327
 328 // PRECONDITION: rdr.curr is not whitespace
 329 // EFFECT: eats any kind of comment.
 330 // returns a Some(sugared-doc-attr) if one exists, None otherwise
 331 fn consume_any_line_comment(rdr: &mut StringReader)
 332                          -> Option<TokenAndSpan> {
 333     if rdr.curr_is('/') {
 334         match nextch(rdr) {
 335             Some('/') => {
 336                 bump(rdr);
 337                 bump(rdr);
 338                 // line comments starting with "///" or "//!" are doc-comments
 339                 if rdr.curr_is('/') || rdr.curr_is('!') {
 340                     let start_bpos = rdr.pos - BytePos(3);
 341                     while !rdr.curr_is('\n') && !is_eof(rdr) {
 342                         bump(rdr);
 343                     }
 344                     let ret = with_str_from(rdr, start_bpos, |string| {
 345                         // but comments with only more "/"s are not
 346                         if !is_line_non_doc_comment(string) {
 347                             Some(TokenAndSpan{
 348                                 tok: token::DOC_COMMENT(str_to_ident(string)),
 349                                 sp: codemap::mk_sp(start_bpos, rdr.pos)
 350                             })
 351                         } else {
 352                             None
 353                         }
 354                     });
 355
 356                     if ret.is_some() {
 357                         return ret;
 358                     }
 359                 } else {
 360                     while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); }
 361                 }
 362                 // Restart whitespace munch.
 363                 consume_whitespace_and_comments(rdr)
 364             }
 365             Some('*') => { bump(rdr); bump(rdr); consume_block_comment(rdr) }
 366             _ => None
 367         }
 368     } else if rdr.curr_is('#') {
 369         if nextch_is(rdr, '!') {
 370
 371             // Parse an inner attribute.
 372             if nextnextch_is(rdr, '[') {
 373                 return None;
 374             }
 375
 376             // I guess this is the only way to figure out if
 377             // we're at the beginning of the file...
 378             let cmap = CodeMap::new();
 379             cmap.files.borrow_mut().push(rdr.filemap.clone());
 380             let loc = cmap.lookup_char_pos_adj(rdr.last_pos);
 381             if loc.line == 1u && loc.col == CharPos(0u) {
 382                 while !rdr.curr_is('\n') && !is_eof(rdr) { bump(rdr); }
 383                 return consume_whitespace_and_comments(rdr);
 384             }
 385         }
 386         None
 387     } else {
 388         None
 389     }
 390 }
 391
 392 pub fn is_block_non_doc_comment(s: &str) -> bool {
 393     s.starts_with("/***")
 394 }
 395
 396 // might return a sugared-doc-attr
 397 fn consume_block_comment(rdr: &mut StringReader) -> Option<TokenAndSpan> {
 398     // block comments starting with "/**" or "/*!" are doc-comments
 399     let is_doc_comment = rdr.curr_is('*') || rdr.curr_is('!');
 400     let start_bpos = rdr.pos - BytePos(if is_doc_comment {3} else {2});
 401
 402     let mut level: int = 1;
 403     while level > 0 {
 404         if is_eof(rdr) {
 405             let msg = if is_doc_comment {
 406                 "unterminated block doc-comment".to_strbuf()
 407             } else {
 408                 "unterminated block comment".to_strbuf()
 409             };
 410             fatal_span(rdr, start_bpos, rdr.last_pos, msg);
 411         } else if rdr.curr_is('/') && nextch_is(rdr, '*') {
 412             level += 1;
 413             bump(rdr);
 414             bump(rdr);
 415         } else if rdr.curr_is('*') && nextch_is(rdr, '/') {
 416             level -= 1;
 417             bump(rdr);
 418             bump(rdr);
 419         } else {
 420             bump(rdr);
 421         }
 422     }
 423
 424     let res = if is_doc_comment {
 425         with_str_from(rdr, start_bpos, |string| {
 426             // but comments with only "*"s between two "/"s are not
 427             if !is_block_non_doc_comment(string) {
 428                 Some(TokenAndSpan{
 429                         tok: token::DOC_COMMENT(str_to_ident(string)),
 430                         sp: codemap::mk_sp(start_bpos, rdr.pos)
 431                     })
 432             } else {
 433                 None
 434             }
 435         })
 436     } else {
 437         None
 438     };
 439
 440     // restart whitespace munch.
 441     if res.is_some() { res } else { consume_whitespace_and_comments(rdr) }
 442 }
 443
 444 fn scan_exponent(rdr: &mut StringReader, start_bpos: BytePos) -> Option<StrBuf> {
 445     // \x00 hits the `return None` case immediately, so this is fine.
 446     let mut c = rdr.curr.unwrap_or('\x00');
 447     let mut rslt = StrBuf::new();
 448     if c == 'e' || c == 'E' {
 449         rslt.push_char(c);
 450         bump(rdr);
 451         c = rdr.curr.unwrap_or('\x00');
 452         if c == '-' || c == '+' {
 453             rslt.push_char(c);
 454             bump(rdr);
 455         }
 456         let exponent = scan_digits(rdr, 10u);
 457         if exponent.len() > 0u {
 458             rslt.push_str(exponent.as_slice());
 459             return Some(rslt);
 460         } else {
 461             fatal_span(rdr, start_bpos, rdr.last_pos,
 462                        "scan_exponent: bad fp literal".to_strbuf());
 463         }
 464     } else {
 465         return None::<StrBuf>;
 466     }
 467 }
 468
 469 fn scan_digits(rdr: &mut StringReader, radix: uint) -> StrBuf {
 470     let mut rslt = StrBuf::new();
 471     loop {
 472         let c = rdr.curr;
 473         if c == Some('_') { bump(rdr); continue; }
 474         match c.and_then(|cc| char::to_digit(cc, radix)) {
 475           Some(_) => {
 476             rslt.push_char(c.unwrap());
 477             bump(rdr);
 478           }
 479           _ => return rslt
 480         }
 481     };
 482 }
 483
 484 fn check_float_base(rdr: &mut StringReader, start_bpos: BytePos, last_bpos: BytePos,
 485                     base: uint) {
 486     match base {
 487       16u => {
 488           fatal_span(rdr, start_bpos, last_bpos,
 489                      "hexadecimal float literal is not supported".to_strbuf())
 490       }
 491       8u => fatal_span(rdr, start_bpos, last_bpos,
 492                      "octal float literal is not supported".to_strbuf()),
 493       2u => fatal_span(rdr, start_bpos, last_bpos,
 494                      "binary float literal is not supported".to_strbuf()),
 495       _ => ()
 496     }
 497 }
 498
 499 fn scan_number(c: char, rdr: &mut StringReader) -> token::Token {
 500     let mut num_str;
 501     let mut base = 10u;
 502     let mut c = c;
 503     let mut n = nextch(rdr).unwrap_or('\x00');
 504     let start_bpos = rdr.last_pos;
 505     if c == '0' && n == 'x' {
 506         bump(rdr);
 507         bump(rdr);
 508         base = 16u;
 509     } else if c == '0' && n == 'o' {
 510         bump(rdr);
 511         bump(rdr);
 512         base = 8u;
 513     } else if c == '0' && n == 'b' {
 514         bump(rdr);
 515         bump(rdr);
 516         base = 2u;
 517     }
 518     num_str = scan_digits(rdr, base);
 519     c = rdr.curr.unwrap_or('\x00');
 520     nextch(rdr);
 521     if c == 'u' || c == 'i' {
 522         enum Result { Signed(ast::IntTy), Unsigned(ast::UintTy) }
 523         let signed = c == 'i';
 524         let mut tp = {
 525             if signed { Signed(ast::TyI) }
 526             else { Unsigned(ast::TyU) }
 527         };
 528         bump(rdr);
 529         c = rdr.curr.unwrap_or('\x00');
 530         if c == '8' {
 531             bump(rdr);
 532             tp = if signed { Signed(ast::TyI8) }
 533                       else { Unsigned(ast::TyU8) };
 534         }
 535         n = nextch(rdr).unwrap_or('\x00');
 536         if c == '1' && n == '6' {
 537             bump(rdr);
 538             bump(rdr);
 539             tp = if signed { Signed(ast::TyI16) }
 540                       else { Unsigned(ast::TyU16) };
 541         } else if c == '3' && n == '2' {
 542             bump(rdr);
 543             bump(rdr);
 544             tp = if signed { Signed(ast::TyI32) }
 545                       else { Unsigned(ast::TyU32) };
 546         } else if c == '6' && n == '4' {
 547             bump(rdr);
 548             bump(rdr);
 549             tp = if signed { Signed(ast::TyI64) }
 550                       else { Unsigned(ast::TyU64) };
 551         }
 552         if num_str.len() == 0u {
 553             fatal_span(rdr, start_bpos, rdr.last_pos,
 554                        "no valid digits found for number".to_strbuf());
 555         }
 556         let parsed = match from_str_radix::<u64>(num_str.as_slice(),
 557                                                  base as uint) {
 558             Some(p) => p,
 559             None => fatal_span(rdr, start_bpos, rdr.last_pos,
 560                                "int literal is too large".to_strbuf())
 561         };
 562
 563         match tp {
 564           Signed(t) => return token::LIT_INT(parsed as i64, t),
 565           Unsigned(t) => return token::LIT_UINT(parsed, t)
 566         }
 567     }
 568     let mut is_float = false;
 569     if rdr.curr_is('.') && !(ident_start(nextch(rdr)) || nextch_is(rdr, '.')) {
 570         is_float = true;
 571         bump(rdr);
 572         let dec_part = scan_digits(rdr, 10u);
 573         num_str.push_char('.');
 574         num_str.push_str(dec_part.as_slice());
 575     }
 576     match scan_exponent(rdr, start_bpos) {
 577       Some(ref s) => {
 578         is_float = true;
 579         num_str.push_str(s.as_slice());
 580       }
 581       None => ()
 582     }
 583
 584     if rdr.curr_is('f') {
 585         bump(rdr);
 586         c = rdr.curr.unwrap_or('\x00');
 587         n = nextch(rdr).unwrap_or('\x00');
 588         if c == '3' && n == '2' {
 589             bump(rdr);
 590             bump(rdr);
 591             check_float_base(rdr, start_bpos, rdr.last_pos, base);
 592             return token::LIT_FLOAT(str_to_ident(num_str.into_owned()),
 593                                     ast::TyF32);
 594         } else if c == '6' && n == '4' {
 595             bump(rdr);
 596             bump(rdr);
 597             check_float_base(rdr, start_bpos, rdr.last_pos, base);
 598             return token::LIT_FLOAT(str_to_ident(num_str.into_owned()),
 599                                     ast::TyF64);
 600             /* FIXME (#2252): if this is out of range for either a
 601             32-bit or 64-bit float, it won't be noticed till the
 602             back-end.  */
 603         } else if c == '1' && n == '2' && nextnextch(rdr).unwrap_or('\x00') == '8' {
 604             bump(rdr);
 605             bump(rdr);
 606             bump(rdr);
 607             check_float_base(rdr, start_bpos, rdr.last_pos, base);
 608             return token::LIT_FLOAT(str_to_ident(num_str.as_slice()), ast::TyF128);
 609         }
 610         fatal_span(rdr, start_bpos, rdr.last_pos,
 611                    "expected `f32`, `f64` or `f128` suffix".to_strbuf());
 612     }
 613     if is_float {
 614         check_float_base(rdr, start_bpos, rdr.last_pos, base);
 615         return token::LIT_FLOAT_UNSUFFIXED(str_to_ident(
 616                 num_str.into_owned()));
 617     } else {
 618         if num_str.len() == 0u {
 619             fatal_span(rdr, start_bpos, rdr.last_pos,
 620                        "no valid digits found for number".to_strbuf());
 621         }
 622         let parsed = match from_str_radix::<u64>(num_str.as_slice(),
 623                                                  base as uint) {
 624             Some(p) => p,
 625             None => fatal_span(rdr, start_bpos, rdr.last_pos,
 626                                "int literal is too large".to_strbuf())
 627         };
 628
 629         debug!("lexing {} as an unsuffixed integer literal",
 630                num_str.as_slice());
 631         return token::LIT_INT_UNSUFFIXED(parsed as i64);
 632     }
 633 }
 634
 635 fn scan_numeric_escape(rdr: &mut StringReader, n_hex_digits: uint) -> char {
 636     let mut accum_int = 0;
 637     let mut i = n_hex_digits;
 638     let start_bpos = rdr.last_pos;
 639     while i != 0u && !is_eof(rdr) {
 640         let n = rdr.curr;
 641         if !is_hex_digit(n) {
 642             fatal_span_char(
 643                 rdr,
 644                 rdr.last_pos,
 645                 rdr.pos,
 646                 "illegal character in numeric character escape".to_strbuf(),
 647                 n.unwrap());
 648         }
 649         bump(rdr);
 650         accum_int *= 16;
 651         accum_int += hex_digit_val(n);
 652         i -= 1u;
 653     }
 654     if i != 0 && is_eof(rdr) {
 655         fatal_span(rdr, start_bpos, rdr.last_pos,
 656                    "unterminated numeric character escape".to_strbuf());
 657     }
 658
 659     match char::from_u32(accum_int as u32) {
 660         Some(x) => x,
 661         None => fatal_span(rdr, start_bpos, rdr.last_pos,
 662                            "illegal numeric character escape".to_strbuf())
 663     }
 664 }
 665
 666 fn ident_start(c: Option<char>) -> bool {
 667     let c = match c { Some(c) => c, None => return false };
 668
 669     (c >= 'a' && c <= 'z')
 670         || (c >= 'A' && c <= 'Z')
 671         || c == '_'
 672         || (c > '\x7f' && char::is_XID_start(c))
 673 }
 674
 675 fn ident_continue(c: Option<char>) -> bool {
 676     let c = match c { Some(c) => c, None => return false };
 677
 678     (c >= 'a' && c <= 'z')
 679         || (c >= 'A' && c <= 'Z')
 680         || (c >= '0' && c <= '9')
 681         || c == '_'
 682         || (c > '\x7f' && char::is_XID_continue(c))
 683 }
 684
 685 // return the next token from the string
 686 // EFFECT: advances the input past that token
 687 // EFFECT: updates the interner
 688 fn next_token_inner(rdr: &mut StringReader) -> token::Token {
 689     let c = rdr.curr;
 690     if ident_start(c) && !nextch_is(rdr, '"') && !nextch_is(rdr, '#') {
 691         // Note: r as in r" or r#" is part of a raw string literal,
 692         // not an identifier, and is handled further down.
 693
 694         let start = rdr.last_pos;
 695         while ident_continue(rdr.curr) {
 696             bump(rdr);
 697         }
 698
 699         return with_str_from(rdr, start, |string| {
 700             if string == "_" {
 701                 token::UNDERSCORE
 702             } else {
 703                 let is_mod_name = rdr.curr_is(':') && nextch_is(rdr, ':');
 704
 705                 // FIXME: perform NFKC normalization here. (Issue #2253)
 706                 token::IDENT(str_to_ident(string), is_mod_name)
 707             }
 708         })
 709     }
 710     if is_dec_digit(c) {
 711         return scan_number(c.unwrap(), rdr);
 712     }
 713     fn binop(rdr: &mut StringReader, op: token::BinOp) -> token::Token {
 714         bump(rdr);
 715         if rdr.curr_is('=') {
 716             bump(rdr);
 717             return token::BINOPEQ(op);
 718         } else { return token::BINOP(op); }
 719     }
 720     match c.expect("next_token_inner called at EOF") {
 721
 722
 723
 724
 725
 726       // One-byte tokens.
 727       ';' => { bump(rdr); return token::SEMI; }
 728       ',' => { bump(rdr); return token::COMMA; }
 729       '.' => {
 730           bump(rdr);
 731           return if rdr.curr_is('.') {
 732               bump(rdr);
 733               if rdr.curr_is('.') {
 734                   bump(rdr);
 735                   token::DOTDOTDOT
 736               } else {
 737                   token::DOTDOT
 738               }
 739           } else {
 740               token::DOT
 741           };
 742       }
 743       '(' => { bump(rdr); return token::LPAREN; }
 744       ')' => { bump(rdr); return token::RPAREN; }
 745       '{' => { bump(rdr); return token::LBRACE; }
 746       '}' => { bump(rdr); return token::RBRACE; }
 747       '[' => { bump(rdr); return token::LBRACKET; }
 748       ']' => { bump(rdr); return token::RBRACKET; }
 749       '@' => { bump(rdr); return token::AT; }
 750       '#' => { bump(rdr); return token::POUND; }
 751       '~' => { bump(rdr); return token::TILDE; }
 752       ':' => {
 753         bump(rdr);
 754         if rdr.curr_is(':') {
 755             bump(rdr);
 756             return token::MOD_SEP;
 757         } else { return token::COLON; }
 758       }
 759
 760       '$' => { bump(rdr); return token::DOLLAR; }
 761
 762
 763
 764
 765
 766       // Multi-byte tokens.
 767       '=' => {
 768         bump(rdr);
 769         if rdr.curr_is('=') {
 770             bump(rdr);
 771             return token::EQEQ;
 772         } else if rdr.curr_is('>') {
 773             bump(rdr);
 774             return token::FAT_ARROW;
 775         } else {
 776             return token::EQ;
 777         }
 778       }
 779       '!' => {
 780         bump(rdr);
 781         if rdr.curr_is('=') {
 782             bump(rdr);
 783             return token::NE;
 784         } else { return token::NOT; }
 785       }
 786       '<' => {
 787         bump(rdr);
 788         match rdr.curr.unwrap_or('\x00') {
 789           '=' => { bump(rdr); return token::LE; }
 790           '<' => { return binop(rdr, token::SHL); }
 791           '-' => {
 792             bump(rdr);
 793             match rdr.curr.unwrap_or('\x00') {
 794               '>' => { bump(rdr); return token::DARROW; }
 795               _ => { return token::LARROW; }
 796             }
 797           }
 798           _ => { return token::LT; }
 799         }
 800       }
 801       '>' => {
 802         bump(rdr);
 803         match rdr.curr.unwrap_or('\x00') {
 804           '=' => { bump(rdr); return token::GE; }
 805           '>' => { return binop(rdr, token::SHR); }
 806           _ => { return token::GT; }
 807         }
 808       }
 809       '\'' => {
 810         // Either a character constant 'a' OR a lifetime name 'abc
 811         bump(rdr);
 812         let start = rdr.last_pos;
 813
 814         // the eof will be picked up by the final `'` check below
 815         let mut c2 = rdr.curr.unwrap_or('\x00');
 816         bump(rdr);
 817
 818         // If the character is an ident start not followed by another single
 819         // quote, then this is a lifetime name:
 820         if ident_start(Some(c2)) && !rdr.curr_is('\'') {
 821             while ident_continue(rdr.curr) {
 822                 bump(rdr);
 823             }
 824             let ident = with_str_from(rdr, start, |lifetime_name| {
 825                 str_to_ident(lifetime_name)
 826             });
 827             let tok = &token::IDENT(ident, false);
 828
 829             if token::is_keyword(token::keywords::Self, tok) {
 830                 fatal_span(rdr, start, rdr.last_pos,
 831                            "invalid lifetime name: 'self \
 832                             is no longer a special lifetime".to_strbuf());
 833             } else if token::is_any_keyword(tok) &&
 834                 !token::is_keyword(token::keywords::Static, tok) {
 835                 fatal_span(rdr, start, rdr.last_pos,
 836                            "invalid lifetime name".to_strbuf());
 837             } else {
 838                 return token::LIFETIME(ident);
 839             }
 840         }
 841
 842         // Otherwise it is a character constant:
 843         match c2 {
 844             '\\' => {
 845                 // '\X' for some X must be a character constant:
 846                 let escaped = rdr.curr;
 847                 let escaped_pos = rdr.last_pos;
 848                 bump(rdr);
 849                 match escaped {
 850                     None => {}
 851                     Some(e) => {
 852                         c2 = match e {
 853                             'n' => '\n',
 854                             'r' => '\r',
 855                             't' => '\t',
 856                             '\\' => '\\',
 857                             '\'' => '\'',
 858                             '"' => '"',
 859                             '0' => '\x00',
 860                             'x' => scan_numeric_escape(rdr, 2u),
 861                             'u' => scan_numeric_escape(rdr, 4u),
 862                             'U' => scan_numeric_escape(rdr, 8u),
 863                             c2 => {
 864                                 fatal_span_char(rdr,
 865                                                 escaped_pos,
 866                                                 rdr.last_pos,
 867                                                 "unknown character \
 868                                                  escape".to_strbuf(),
 869                                                 c2)
 870                             }
 871                         }
 872                     }
 873                 }
 874             }
 875             '\t' | '\n' | '\r' | '\'' => {
 876                 fatal_span_char(
 877                     rdr,
 878                     start,
 879                     rdr.last_pos,
 880                     "character constant must be escaped".to_strbuf(),
 881                     c2);
 882             }
 883             _ => {}
 884         }
 885         if !rdr.curr_is('\'') {
 886             fatal_span_verbose(rdr,
 887                                // Byte offsetting here is okay because the
 888                                // character before position `start` is an
 889                                // ascii single quote.
 890                                start - BytePos(1),
 891                                rdr.last_pos,
 892                                "unterminated character constant".to_strbuf());
 893         }
 894         bump(rdr); // advance curr past token
 895         return token::LIT_CHAR(c2);
 896       }
 897       '"' => {
 898         let mut accum_str = StrBuf::new();
 899         let start_bpos = rdr.last_pos;
 900         bump(rdr);
 901         while !rdr.curr_is('"') {
 902             if is_eof(rdr) {
 903                 fatal_span(rdr, start_bpos, rdr.last_pos,
 904                            "unterminated double quote string".to_strbuf());
 905             }
 906
 907             let ch = rdr.curr.unwrap();
 908             bump(rdr);
 909             match ch {
 910               '\\' => {
 911                 if is_eof(rdr) {
 912                     fatal_span(rdr, start_bpos, rdr.last_pos,
 913                            "unterminated double quote string".to_strbuf());
 914                 }
 915
 916                 let escaped = rdr.curr.unwrap();
 917                 let escaped_pos = rdr.last_pos;
 918                 bump(rdr);
 919                 match escaped {
 920                   'n' => accum_str.push_char('\n'),
 921                   'r' => accum_str.push_char('\r'),
 922                   't' => accum_str.push_char('\t'),
 923                   '\\' => accum_str.push_char('\\'),
 924                   '\'' => accum_str.push_char('\''),
 925                   '"' => accum_str.push_char('"'),
 926                   '\n' => consume_whitespace(rdr),
 927                   '0' => accum_str.push_char('\x00'),
 928                   'x' => {
 929                     accum_str.push_char(scan_numeric_escape(rdr, 2u));
 930                   }
 931                   'u' => {
 932                     accum_str.push_char(scan_numeric_escape(rdr, 4u));
 933                   }
 934                   'U' => {
 935                     accum_str.push_char(scan_numeric_escape(rdr, 8u));
 936                   }
 937                   c2 => {
 938                     fatal_span_char(rdr, escaped_pos, rdr.last_pos,
 939                                     "unknown string escape".to_strbuf(), c2);
 940                   }
 941                 }
 942               }
 943               _ => accum_str.push_char(ch)
 944             }
 945         }
 946         bump(rdr);
 947         return token::LIT_STR(str_to_ident(accum_str.as_slice()));
 948       }
 949       'r' => {
 950         let start_bpos = rdr.last_pos;
 951         bump(rdr);
 952         let mut hash_count = 0u;
 953         while rdr.curr_is('#') {
 954             bump(rdr);
 955             hash_count += 1;
 956         }
 957
 958         if is_eof(rdr) {
 959             fatal_span(rdr, start_bpos, rdr.last_pos,
 960                        "unterminated raw string".to_strbuf());
 961         } else if !rdr.curr_is('"') {
 962             fatal_span_char(rdr, start_bpos, rdr.last_pos,
 963                             "only `#` is allowed in raw string delimitation; \
 964                              found illegal character".to_strbuf(),
 965                             rdr.curr.unwrap());
 966         }
 967         bump(rdr);
 968         let content_start_bpos = rdr.last_pos;
 969         let mut content_end_bpos;
 970         'outer: loop {
 971             if is_eof(rdr) {
 972                 fatal_span(rdr, start_bpos, rdr.last_pos,
 973                            "unterminated raw string".to_strbuf());
 974             }
 975             if rdr.curr_is('"') {
 976                 content_end_bpos = rdr.last_pos;
 977                 for _ in range(0, hash_count) {
 978                     bump(rdr);
 979                     if !rdr.curr_is('#') {
 980                         continue 'outer;
 981                     }
 982                 }
 983                 break;
 984             }
 985             bump(rdr);
 986         }
 987         bump(rdr);
 988         let str_content = with_str_from_to(rdr,
 989                                            content_start_bpos,
 990                                            content_end_bpos,
 991                                            str_to_ident);
 992         return token::LIT_STR_RAW(str_content, hash_count);
 993       }
 994       '-' => {
 995         if nextch_is(rdr, '>') {
 996             bump(rdr);
 997             bump(rdr);
 998             return token::RARROW;
 999         } else { return binop(rdr, token::MINUS); }
1000       }
1001       '&' => {
1002         if nextch_is(rdr, '&') {
1003             bump(rdr);
1004             bump(rdr);
1005             return token::ANDAND;
1006         } else { return binop(rdr, token::AND); }
1007       }
1008       '|' => {
1009         match nextch(rdr) {
1010           Some('|') => { bump(rdr); bump(rdr); return token::OROR; }
1011           _ => { return binop(rdr, token::OR); }
1012         }
1013       }
1014       '+' => { return binop(rdr, token::PLUS); }
1015       '*' => { return binop(rdr, token::STAR); }
1016       '/' => { return binop(rdr, token::SLASH); }
1017       '^' => { return binop(rdr, token::CARET); }
1018       '%' => { return binop(rdr, token::PERCENT); }
1019       c => {
1020           fatal_span_char(rdr, rdr.last_pos, rdr.pos,
1021                           "unknown start of token".to_strbuf(), c);
1022       }
1023     }
1024 }
1025
1026 fn consume_whitespace(rdr: &mut StringReader) {
1027     while is_whitespace(rdr.curr) && !is_eof(rdr) { bump(rdr); }
1028 }
1029
1030 #[cfg(test)]
1031 mod test {
1032     use super::*;
1033
1034     use codemap::{BytePos, CodeMap, Span};
1035     use diagnostic;
1036     use parse::token;
1037     use parse::token::{str_to_ident};
1038     use std::io::util;
1039
1040     fn mk_sh() -> diagnostic::SpanHandler {
1041         let emitter = diagnostic::EmitterWriter::new(box util::NullWriter);
1042         let handler = diagnostic::mk_handler(box emitter);
1043         diagnostic::mk_span_handler(handler, CodeMap::new())
1044     }
1045
1046     // open a string reader for the given string
1047     fn setup<'a>(span_handler: &'a diagnostic::SpanHandler,
1048                  teststr: StrBuf) -> StringReader<'a> {
1049         let fm = span_handler.cm.new_filemap("zebra.rs".to_strbuf(), teststr);
1050         new_string_reader(span_handler, fm)
1051     }
1052
1053     #[test] fn t1 () {
1054         let span_handler = mk_sh();
1055         let mut string_reader = setup(&span_handler,
1056             "/* my source file */ \
1057              fn main() { println!(\"zebra\"); }\n".to_strbuf());
1058         let id = str_to_ident("fn");
1059         let tok1 = string_reader.next_token();
1060         let tok2 = TokenAndSpan{
1061             tok:token::IDENT(id, false),
1062             sp:Span {lo:BytePos(21),hi:BytePos(23),expn_info: None}};
1063         assert_eq!(tok1,tok2);
1064         // the 'main' id is already read:
1065         assert_eq!(string_reader.last_pos.clone(), BytePos(28));
1066         // read another token:
1067         let tok3 = string_reader.next_token();
1068         let tok4 = TokenAndSpan{
1069             tok:token::IDENT(str_to_ident("main"), false),
1070             sp:Span {lo:BytePos(24),hi:BytePos(28),expn_info: None}};
1071         assert_eq!(tok3,tok4);
1072         // the lparen is already read:
1073         assert_eq!(string_reader.last_pos.clone(), BytePos(29))
1074     }
1075
1076     // check that the given reader produces the desired stream
1077     // of tokens (stop checking after exhausting the expected vec)
1078     fn check_tokenization (mut string_reader: StringReader, expected: Vec<token::Token> ) {
1079         for expected_tok in expected.iter() {
1080             assert_eq!(&string_reader.next_token().tok, expected_tok);
1081         }
1082     }
1083
1084     // make the identifier by looking up the string in the interner
1085     fn mk_ident (id: &str, is_mod_name: bool) -> token::Token {
1086         token::IDENT (str_to_ident(id),is_mod_name)
1087     }
1088
1089     #[test] fn doublecolonparsing () {
1090         check_tokenization(setup(&mk_sh(), "a b".to_strbuf()),
1091                            vec!(mk_ident("a",false),
1092                              mk_ident("b",false)));
1093     }
1094
1095     #[test] fn dcparsing_2 () {
1096         check_tokenization(setup(&mk_sh(), "a::b".to_strbuf()),
1097                            vec!(mk_ident("a",true),
1098                              token::MOD_SEP,
1099                              mk_ident("b",false)));
1100     }
1101
1102     #[test] fn dcparsing_3 () {
1103         check_tokenization(setup(&mk_sh(), "a ::b".to_strbuf()),
1104                            vec!(mk_ident("a",false),
1105                              token::MOD_SEP,
1106                              mk_ident("b",false)));
1107     }
1108
1109     #[test] fn dcparsing_4 () {
1110         check_tokenization(setup(&mk_sh(), "a:: b".to_strbuf()),
1111                            vec!(mk_ident("a",true),
1112                              token::MOD_SEP,
1113                              mk_ident("b",false)));
1114     }
1115
1116     #[test] fn character_a() {
1117         assert_eq!(setup(&mk_sh(), "'a'".to_strbuf()).next_token().tok,
1118                    token::LIT_CHAR('a'));
1119     }
1120
1121     #[test] fn character_space() {
1122         assert_eq!(setup(&mk_sh(), "' '".to_strbuf()).next_token().tok,
1123                    token::LIT_CHAR(' '));
1124     }
1125
1126     #[test] fn character_escaped() {
1127         assert_eq!(setup(&mk_sh(), "'\\n'".to_strbuf()).next_token().tok,
1128                    token::LIT_CHAR('\n'));
1129     }
1130
1131     #[test] fn lifetime_name() {
1132         assert_eq!(setup(&mk_sh(), "'abc".to_strbuf()).next_token().tok,
1133                    token::LIFETIME(token::str_to_ident("abc")));
1134     }
1135
1136     #[test] fn raw_string() {
1137         assert_eq!(setup(&mk_sh(),
1138                          "r###\"\"#a\\b\x00c\"\"###".to_strbuf()).next_token()
1139                                                                  .tok,
1140                    token::LIT_STR_RAW(token::str_to_ident("\"#a\\b\x00c\""), 3));
1141     }
1142
1143     #[test] fn line_doc_comments() {
1144         assert!(!is_line_non_doc_comment("///"));
1145         assert!(!is_line_non_doc_comment("/// blah"));
1146         assert!(is_line_non_doc_comment("////"));
1147     }
1148
1149     #[test] fn nested_block_comments() {
1150         assert_eq!(setup(&mk_sh(),
1151                          "/* /* */ */'a'".to_strbuf()).next_token().tok,
1152                    token::LIT_CHAR('a'));
1153     }
1154
1155 }