]> git.lizzy.rs Git - rust.git/blob - src/libsyntax/parse/lexer/mod.rs
bbece1ee5e3d4474e286737cb53d74b5bbb73a6a
[rust.git] / src / libsyntax / parse / lexer / mod.rs
1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use ast::{self, Ident};
12 use syntax_pos::{self, BytePos, CharPos, Pos, Span, NO_EXPANSION};
13 use codemap::{CodeMap, FilePathMapping};
14 use errors::{FatalError, DiagnosticBuilder};
15 use parse::{token, ParseSess};
16 use str::char_at;
17 use symbol::{Symbol, keywords};
18 use core::unicode::property::Pattern_White_Space;
19
20 use std::borrow::Cow;
21 use std::char;
22 use std::mem::replace;
23 use rustc_data_structures::sync::Lrc;
24
25 pub mod comments;
26 mod tokentrees;
27 mod unicode_chars;
28
/// A lexed token paired with the span it is associated with.
29 #[derive(Clone, PartialEq, Eq, Debug)]
30 pub struct TokenAndSpan {
    /// The token itself.
31     pub tok: token::Token,
    /// The span attached to `tok`.
32     pub sp: Span,
33 }
34
35 impl Default for TokenAndSpan {
36     fn default() -> Self {
37         TokenAndSpan { tok: token::Whitespace, sp: syntax_pos::DUMMY_SP }
38     }
39 }
40
/// Lexer state for turning the source text of one `FileMap` into tokens.
41 pub struct StringReader<'a> {
    /// The parse session; its `span_diagnostic` handler is used to report lexer errors.
42     pub sess: &'a ParseSess,
43     /// The absolute offset within the codemap of the next character to read
44     pub next_pos: BytePos,
45     /// The absolute offset within the codemap of the current character
46     pub pos: BytePos,
47     /// The current character (which has been read from self.pos)
48     pub ch: Option<char>,
    /// The file map being lexed. `bump` reports newlines, multibyte chars and
    /// char widths to it.
49     pub filemap: Lrc<syntax_pos::FileMap>,
50     /// Stop reading src at this index.
51     pub end_src_index: usize,
52     /// Whether to record new-lines and multibyte chars in filemap.
53     /// This is only necessary the first time a filemap is lexed.
54     /// If part of a filemap is being re-lexed, this should be set to false.
55     pub save_new_lines_and_multibyte: bool,
56     // cached: the lookahead token and its span, handed out by `try_next_token` and `peek`
57     pub peek_tok: token::Token,
58     pub peek_span: Span,
    /// Fatal diagnostics that have not been emitted yet; `emit_fatal_errors`
    /// emits and clears them.
59     pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
60     // cache a direct reference to the source text, so that we don't have to
61     // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
62     src: Lrc<String>,
63     /// The most recent token returned by `try_real_token`.
64     token: token::Token,
    /// The span of `token`.
65     span: Span,
    /// Stack of open delimiters and their spans. Used for error message.
66     open_braces: Vec<(token::DelimToken, Span)>,
    /// When set, `mk_sp` returns this span instead of building one from byte
    /// positions, and `mk_ident` stamps it onto the identifiers it creates.
67     pub override_span: Option<Span>,
68 }
69
70 impl<'a> StringReader<'a> {
71     fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
72         unwrap_or!(self.override_span, Span::new(lo, hi, NO_EXPANSION))
73     }
74     fn mk_ident(&self, string: &str) -> Ident {
75         let mut ident = Ident::from_str(string);
76         if let Some(span) = self.override_span {
77             ident.span = span;
78         }
79         ident
80     }
81
82     fn next_token(&mut self) -> TokenAndSpan where Self: Sized {
83         let res = self.try_next_token();
84         self.unwrap_or_abort(res)
85     }
86     fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
87         match res {
88             Ok(tok) => tok,
89             Err(_) => {
90                 self.emit_fatal_errors();
91                 FatalError.raise();
92             }
93         }
94     }
95     fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
96         let mut t = self.try_next_token()?;
97         loop {
98             match t.tok {
99                 token::Whitespace | token::Comment | token::Shebang(_) => {
100                     t = self.try_next_token()?;
101                 }
102                 _ => break,
103             }
104         }
105         self.token = t.tok.clone();
106         self.span = t.sp;
107         Ok(t)
108     }
109     pub fn real_token(&mut self) -> TokenAndSpan {
110         let res = self.try_real_token();
111         self.unwrap_or_abort(res)
112     }
113     fn is_eof(&self) -> bool {
114         self.ch.is_none()
115     }
116     /// Return the next token. EFFECT: advances the string_reader.
117     pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
118         assert!(self.fatal_errs.is_empty());
119         let ret_val = TokenAndSpan {
120             tok: replace(&mut self.peek_tok, token::Whitespace),
121             sp: self.peek_span,
122         };
123         self.advance_token()?;
124         Ok(ret_val)
125     }
126
127     fn fail_unterminated_raw_string(&self, pos: BytePos, hash_count: u16) {
128         let mut err = self.struct_span_fatal(pos, pos, "unterminated raw string");
129         err.span_label(self.mk_sp(pos, pos), "unterminated raw string");
130         if hash_count > 0 {
131             err.note(&format!("this raw string should be terminated with `\"{}`",
132                               "#".repeat(hash_count as usize)));
133         }
134         err.emit();
135         FatalError.raise();
136     }
137
138     fn fatal(&self, m: &str) -> FatalError {
139         self.fatal_span(self.peek_span, m)
140     }
141     pub fn emit_fatal_errors(&mut self) {
142         for err in &mut self.fatal_errs {
143             err.emit();
144         }
145         self.fatal_errs.clear();
146     }
147     pub fn peek(&self) -> TokenAndSpan {
148         // FIXME(pcwalton): Bad copy!
149         TokenAndSpan {
150             tok: self.peek_tok.clone(),
151             sp: self.peek_span,
152         }
153     }
154 }
155
156 impl<'a> StringReader<'a> {
157     /// For comments.rs, which hackily pokes into next_pos and ch
158     pub fn new_raw(sess: &'a ParseSess, filemap: Lrc<syntax_pos::FileMap>) -> Self {
159         let mut sr = StringReader::new_raw_internal(sess, filemap);
160         sr.bump();
161         sr
162     }
163
164     fn new_raw_internal(sess: &'a ParseSess, filemap: Lrc<syntax_pos::FileMap>) -> Self {
165         if filemap.src.is_none() {
166             sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
167                                               filemap.name));
168         }
169
170         let src = (*filemap.src.as_ref().unwrap()).clone();
171
172         StringReader {
173             sess,
174             next_pos: filemap.start_pos,
175             pos: filemap.start_pos,
176             ch: Some('\n'),
177             filemap,
178             end_src_index: src.len(),
179             save_new_lines_and_multibyte: true,
180             // dummy values; not read
181             peek_tok: token::Eof,
182             peek_span: syntax_pos::DUMMY_SP,
183             src,
184             fatal_errs: Vec::new(),
185             token: token::Eof,
186             span: syntax_pos::DUMMY_SP,
187             open_braces: Vec::new(),
188             override_span: None,
189         }
190     }
191
192     pub fn new(sess: &'a ParseSess, filemap: Lrc<syntax_pos::FileMap>) -> Self {
193         let mut sr = StringReader::new_raw(sess, filemap);
194         if sr.advance_token().is_err() {
195             sr.emit_fatal_errors();
196             FatalError.raise();
197         }
198         sr
199     }
200
201     pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
202         let begin = sess.codemap().lookup_byte_offset(span.lo());
203         let end = sess.codemap().lookup_byte_offset(span.hi());
204
205         // Make the range zero-length if the span is invalid.
206         if span.lo() > span.hi() || begin.fm.start_pos != end.fm.start_pos {
207             span = span.shrink_to_lo();
208         }
209
210         let mut sr = StringReader::new_raw_internal(sess, begin.fm);
211
212         // Seek the lexer to the right byte range.
213         sr.save_new_lines_and_multibyte = false;
214         sr.next_pos = span.lo();
215         sr.end_src_index = sr.src_index(span.hi());
216
217         sr.bump();
218
219         if sr.advance_token().is_err() {
220             sr.emit_fatal_errors();
221             FatalError.raise();
222         }
223         sr
224     }
225
226     pub fn ch_is(&self, c: char) -> bool {
227         self.ch == Some(c)
228     }
229
230     /// Report a fatal lexical error with a given span.
231     pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
232         self.sess.span_diagnostic.span_fatal(sp, m)
233     }
234
235     /// Report a lexical error with a given span.
236     pub fn err_span(&self, sp: Span, m: &str) {
237         self.sess.span_diagnostic.span_err(sp, m)
238     }
239
240
241     /// Report a fatal error spanning [`from_pos`, `to_pos`).
242     fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
243         self.fatal_span(self.mk_sp(from_pos, to_pos), m)
244     }
245
246     /// Report a lexical error spanning [`from_pos`, `to_pos`).
247     fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
248         self.err_span(self.mk_sp(from_pos, to_pos), m)
249     }
250
251     /// Pushes a character to a message string for error reporting
252     fn push_escaped_char_for_msg(m: &mut String, c: char) {
253         match c {
254             '\u{20}'...'\u{7e}' => {
255                 // Don't escape \, ' or " for user-facing messages
256                 m.push(c);
257             }
258             _ => {
259                 for c in c.escape_default() {
260                     m.push(c);
261                 }
262             }
263         }
264     }
265
266     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
267     /// escaped character to the error message
268     fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
269         let mut m = m.to_string();
270         m.push_str(": ");
271         Self::push_escaped_char_for_msg(&mut m, c);
272         self.fatal_span_(from_pos, to_pos, &m[..])
273     }
274
275     fn struct_span_fatal(&self,
276                          from_pos: BytePos,
277                          to_pos: BytePos,
278                          m: &str)
279                          -> DiagnosticBuilder<'a> {
280         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m)
281     }
282
283     fn struct_fatal_span_char(&self,
284                               from_pos: BytePos,
285                               to_pos: BytePos,
286                               m: &str,
287                               c: char)
288                               -> DiagnosticBuilder<'a> {
289         let mut m = m.to_string();
290         m.push_str(": ");
291         Self::push_escaped_char_for_msg(&mut m, c);
292         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
293     }
294
295     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
296     /// escaped character to the error message
297     fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
298         let mut m = m.to_string();
299         m.push_str(": ");
300         Self::push_escaped_char_for_msg(&mut m, c);
301         self.err_span_(from_pos, to_pos, &m[..]);
302     }
303     fn struct_err_span_char(&self,
304                             from_pos: BytePos,
305                             to_pos: BytePos,
306                             m: &str,
307                             c: char)
308                             -> DiagnosticBuilder<'a> {
309         let mut m = m.to_string();
310         m.push_str(": ");
311         Self::push_escaped_char_for_msg(&mut m, c);
312         self.sess.span_diagnostic.struct_span_err(self.mk_sp(from_pos, to_pos), &m[..])
313     }
314
315     /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
316     /// offending string to the error message
317     fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
318         m.push_str(": ");
319         m.push_str(&self.src[self.src_index(from_pos)..self.src_index(to_pos)]);
320         self.fatal_span_(from_pos, to_pos, &m[..])
321     }
322
323     /// Advance peek_tok and peek_span to refer to the next token, and
324     /// possibly update the interner.
325     fn advance_token(&mut self) -> Result<(), ()> {
326         match self.scan_whitespace_or_comment() {
327             Some(comment) => {
328                 self.peek_span = comment.sp;
329                 self.peek_tok = comment.tok;
330             }
331             None => {
332                 if self.is_eof() {
333                     self.peek_tok = token::Eof;
334                     self.peek_span = self.mk_sp(self.filemap.end_pos, self.filemap.end_pos);
335                 } else {
336                     let start_bytepos = self.pos;
337                     self.peek_tok = self.next_token_inner()?;
338                     self.peek_span = self.mk_sp(start_bytepos, self.pos);
339                 };
340             }
341         }
342         Ok(())
343     }
344
345     #[inline]
346     fn src_index(&self, pos: BytePos) -> usize {
347         (pos - self.filemap.start_pos).to_usize()
348     }
349
350     /// Calls `f` with a string slice of the source text spanning from `start`
351     /// up to but excluding `self.pos`, meaning the slice does not include
352     /// the character `self.ch`.
353     pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
354         where F: FnOnce(&str) -> T
355     {
356         self.with_str_from_to(start, self.pos, f)
357     }
358
359     /// Create a Name from a given offset to the current offset, each
360     /// adjusted 1 towards each other (assumes that on either side there is a
361     /// single-byte delimiter).
362     pub fn name_from(&self, start: BytePos) -> ast::Name {
363         debug!("taking an ident from {:?} to {:?}", start, self.pos);
364         self.with_str_from(start, Symbol::intern)
365     }
366
367     /// As name_from, with an explicit endpoint.
368     pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
369         debug!("taking an ident from {:?} to {:?}", start, end);
370         self.with_str_from_to(start, end, Symbol::intern)
371     }
372
373     /// Calls `f` with a string slice of the source text spanning from `start`
374     /// up to but excluding `end`.
375     fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
376         where F: FnOnce(&str) -> T
377     {
378         f(&self.src[self.src_index(start)..self.src_index(end)])
379     }
380
381     /// Converts CRLF to LF in the given string, raising an error on bare CR.
382     fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
383         let mut i = 0;
384         while i < s.len() {
385             let ch = char_at(s, i);
386             let next = i + ch.len_utf8();
387             if ch == '\r' {
388                 if next < s.len() && char_at(s, next) == '\n' {
389                     return translate_crlf_(self, start, s, errmsg, i).into();
390                 }
391                 let pos = start + BytePos(i as u32);
392                 let end_pos = start + BytePos(next as u32);
393                 self.err_span_(pos, end_pos, errmsg);
394             }
395             i = next;
396         }
397         return s.into();
398
399         fn translate_crlf_(rdr: &StringReader,
400                            start: BytePos,
401                            s: &str,
402                            errmsg: &str,
403                            mut i: usize)
404                            -> String {
405             let mut buf = String::with_capacity(s.len());
406             let mut j = 0;
407             while i < s.len() {
408                 let ch = char_at(s, i);
409                 let next = i + ch.len_utf8();
410                 if ch == '\r' {
411                     if j < i {
412                         buf.push_str(&s[j..i]);
413                     }
414                     j = next;
415                     if next >= s.len() || char_at(s, next) != '\n' {
416                         let pos = start + BytePos(i as u32);
417                         let end_pos = start + BytePos(next as u32);
418                         rdr.err_span_(pos, end_pos, errmsg);
419                     }
420                 }
421                 i = next;
422             }
423             if j < s.len() {
424                 buf.push_str(&s[j..]);
425             }
426             buf
427         }
428     }
429
430     /// Advance the StringReader by one character. If a newline is
431     /// discovered, add it to the FileMap's list of line start offsets.
432     pub fn bump(&mut self) {
433         let next_src_index = self.src_index(self.next_pos);
434         if next_src_index < self.end_src_index {
435             let next_ch = char_at(&self.src, next_src_index);
436             let next_ch_len = next_ch.len_utf8();
437
438             if self.ch.unwrap() == '\n' {
439                 if self.save_new_lines_and_multibyte {
440                     self.filemap.next_line(self.next_pos);
441                 }
442             }
443             if next_ch_len > 1 {
444                 if self.save_new_lines_and_multibyte {
445                     self.filemap.record_multibyte_char(self.next_pos, next_ch_len);
446                 }
447             }
448             self.filemap.record_width(self.next_pos, next_ch);
449
450             self.ch = Some(next_ch);
451             self.pos = self.next_pos;
452             self.next_pos = self.next_pos + Pos::from_usize(next_ch_len);
453         } else {
454             self.ch = None;
455             self.pos = self.next_pos;
456         }
457     }
458
459     pub fn nextch(&self) -> Option<char> {
460         let next_src_index = self.src_index(self.next_pos);
461         if next_src_index < self.end_src_index {
462             Some(char_at(&self.src, next_src_index))
463         } else {
464             None
465         }
466     }
467
468     pub fn nextch_is(&self, c: char) -> bool {
469         self.nextch() == Some(c)
470     }
471
472     pub fn nextnextch(&self) -> Option<char> {
473         let next_src_index = self.src_index(self.next_pos);
474         if next_src_index < self.end_src_index {
475             let next_next_src_index =
476                 next_src_index + char_at(&self.src, next_src_index).len_utf8();
477             if next_next_src_index < self.end_src_index {
478                 return Some(char_at(&self.src, next_next_src_index));
479             }
480         }
481         None
482     }
483
484     pub fn nextnextch_is(&self, c: char) -> bool {
485         self.nextnextch() == Some(c)
486     }
487
488     /// Eats <XID_start><XID_continue>*, if possible.
489     fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
490         if !ident_start(self.ch) {
491             return None;
492         }
493         let start = self.pos;
494         while ident_continue(self.ch) {
495             self.bump();
496         }
497
498         self.with_str_from(start, |string| {
499             if string == "_" {
500                 self.sess.span_diagnostic
501                     .struct_span_warn(self.mk_sp(start, self.pos),
502                                       "underscore literal suffix is not allowed")
503                     .warn("this was previously accepted by the compiler but is \
504                           being phased out; it will become a hard error in \
505                           a future release!")
506                     .note("for more information, see issue #42326 \
507                           <https://github.com/rust-lang/rust/issues/42326>")
508                     .emit();
509                 None
510             } else {
511                 Some(Symbol::intern(string))
512             }
513         })
514     }
515
516     /// PRECONDITION: self.ch is not whitespace
517     /// Eats any kind of comment.
518     fn scan_comment(&mut self) -> Option<TokenAndSpan> {
519         if let Some(c) = self.ch {
520             if c.is_whitespace() {
521                 let msg = "called consume_any_line_comment, but there was whitespace";
522                 self.sess.span_diagnostic.span_err(self.mk_sp(self.pos, self.pos), msg);
523             }
524         }
525
526         if self.ch_is('/') {
527             match self.nextch() {
528                 Some('/') => {
529                     self.bump();
530                     self.bump();
531
532                     // line comments starting with "///" or "//!" are doc-comments
533                     let doc_comment = (self.ch_is('/') && !self.nextch_is('/')) || self.ch_is('!');
534                     let start_bpos = self.pos - BytePos(2);
535
536                     while !self.is_eof() {
537                         match self.ch.unwrap() {
538                             '\n' => break,
539                             '\r' => {
540                                 if self.nextch_is('\n') {
541                                     // CRLF
542                                     break;
543                                 } else if doc_comment {
544                                     self.err_span_(self.pos,
545                                                    self.next_pos,
546                                                    "bare CR not allowed in doc-comment");
547                                 }
548                             }
549                             _ => (),
550                         }
551                         self.bump();
552                     }
553
554                     if doc_comment {
555                         self.with_str_from(start_bpos, |string| {
556                             // comments with only more "/"s are not doc comments
557                             let tok = if is_doc_comment(string) {
558                                 token::DocComment(Symbol::intern(string))
559                             } else {
560                                 token::Comment
561                             };
562
563                             Some(TokenAndSpan {
564                                 tok,
565                                 sp: self.mk_sp(start_bpos, self.pos),
566                             })
567                         })
568                     } else {
569                         Some(TokenAndSpan {
570                             tok: token::Comment,
571                             sp: self.mk_sp(start_bpos, self.pos),
572                         })
573                     }
574                 }
575                 Some('*') => {
576                     self.bump();
577                     self.bump();
578                     self.scan_block_comment()
579                 }
580                 _ => None,
581             }
582         } else if self.ch_is('#') {
583             if self.nextch_is('!') {
584
585                 // Parse an inner attribute.
586                 if self.nextnextch_is('[') {
587                     return None;
588                 }
589
590                 // I guess this is the only way to figure out if
591                 // we're at the beginning of the file...
592                 let cmap = CodeMap::new(FilePathMapping::empty());
593                 cmap.files.borrow_mut().file_maps.push(self.filemap.clone());
594                 let loc = cmap.lookup_char_pos_adj(self.pos);
595                 debug!("Skipping a shebang");
596                 if loc.line == 1 && loc.col == CharPos(0) {
597                     // FIXME: Add shebang "token", return it
598                     let start = self.pos;
599                     while !self.ch_is('\n') && !self.is_eof() {
600                         self.bump();
601                     }
602                     return Some(TokenAndSpan {
603                         tok: token::Shebang(self.name_from(start)),
604                         sp: self.mk_sp(start, self.pos),
605                     });
606                 }
607             }
608             None
609         } else {
610             None
611         }
612     }
613
614     /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
615     /// return None.
616     fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
617         match self.ch.unwrap_or('\0') {
618             // # to handle shebang at start of file -- this is the entry point
619             // for skipping over all "junk"
620             '/' | '#' => {
621                 let c = self.scan_comment();
622                 debug!("scanning a comment {:?}", c);
623                 c
624             },
625             c if is_pattern_whitespace(Some(c)) => {
626                 let start_bpos = self.pos;
627                 while is_pattern_whitespace(self.ch) {
628                     self.bump();
629                 }
630                 let c = Some(TokenAndSpan {
631                     tok: token::Whitespace,
632                     sp: self.mk_sp(start_bpos, self.pos),
633                 });
634                 debug!("scanning whitespace: {:?}", c);
635                 c
636             }
637             _ => None,
638         }
639     }
640
641     /// Might return a sugared-doc-attr
642     fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
643         // block comments starting with "/**" or "/*!" are doc-comments
644         let is_doc_comment = self.ch_is('*') || self.ch_is('!');
645         let start_bpos = self.pos - BytePos(2);
646
647         let mut level: isize = 1;
648         let mut has_cr = false;
649         while level > 0 {
650             if self.is_eof() {
651                 let msg = if is_doc_comment {
652                     "unterminated block doc-comment"
653                 } else {
654                     "unterminated block comment"
655                 };
656                 let last_bpos = self.pos;
657                 self.fatal_span_(start_bpos, last_bpos, msg).raise();
658             }
659             let n = self.ch.unwrap();
660             match n {
661                 '/' if self.nextch_is('*') => {
662                     level += 1;
663                     self.bump();
664                 }
665                 '*' if self.nextch_is('/') => {
666                     level -= 1;
667                     self.bump();
668                 }
669                 '\r' => {
670                     has_cr = true;
671                 }
672                 _ => (),
673             }
674             self.bump();
675         }
676
677         self.with_str_from(start_bpos, |string| {
678             // but comments with only "*"s between two "/"s are not
679             let tok = if is_block_doc_comment(string) {
680                 let string = if has_cr {
681                     self.translate_crlf(start_bpos,
682                                         string,
683                                         "bare CR not allowed in block doc-comment")
684                 } else {
685                     string.into()
686                 };
687                 token::DocComment(Symbol::intern(&string[..]))
688             } else {
689                 token::Comment
690             };
691
692             Some(TokenAndSpan {
693                 tok,
694                 sp: self.mk_sp(start_bpos, self.pos),
695             })
696         })
697     }
698
699     /// Scan through any digits (base `scan_radix`) or underscores,
700     /// and return how many digits there were.
701     ///
702     /// `real_radix` represents the true radix of the number we're
703     /// interested in, and errors will be emitted for any digits
704     /// between `real_radix` and `scan_radix`.
705     fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
706         assert!(real_radix <= scan_radix);
707         let mut len = 0;
708         loop {
709             let c = self.ch;
710             if c == Some('_') {
711                 debug!("skipping a _");
712                 self.bump();
713                 continue;
714             }
715             match c.and_then(|cc| cc.to_digit(scan_radix)) {
716                 Some(_) => {
717                     debug!("{:?} in scan_digits", c);
718                     // check that the hypothetical digit is actually
719                     // in range for the true radix
720                     if c.unwrap().to_digit(real_radix).is_none() {
721                         self.err_span_(self.pos,
722                                        self.next_pos,
723                                        &format!("invalid digit for a base {} literal", real_radix));
724                     }
725                     len += 1;
726                     self.bump();
727                 }
728                 _ => return len,
729             }
730         }
731     }
732
733     /// Lex a LIT_INTEGER or a LIT_FLOAT
734     fn scan_number(&mut self, c: char) -> token::Lit {
735         let num_digits;
736         let mut base = 10;
737         let start_bpos = self.pos;
738
739         self.bump();
740
741         if c == '0' {
742             match self.ch.unwrap_or('\0') {
743                 'b' => {
744                     self.bump();
745                     base = 2;
746                     num_digits = self.scan_digits(2, 10);
747                 }
748                 'o' => {
749                     self.bump();
750                     base = 8;
751                     num_digits = self.scan_digits(8, 10);
752                 }
753                 'x' => {
754                     self.bump();
755                     base = 16;
756                     num_digits = self.scan_digits(16, 16);
757                 }
758                 '0'...'9' | '_' | '.' | 'e' | 'E' => {
759                     num_digits = self.scan_digits(10, 10) + 1;
760                 }
761                 _ => {
762                     // just a 0
763                     return token::Integer(self.name_from(start_bpos));
764                 }
765             }
766         } else if c.is_digit(10) {
767             num_digits = self.scan_digits(10, 10) + 1;
768         } else {
769             num_digits = 0;
770         }
771
772         if num_digits == 0 {
773             self.err_span_(start_bpos,
774                            self.pos,
775                            "no valid digits found for number");
776             return token::Integer(Symbol::intern("0"));
777         }
778
779         // might be a float, but don't be greedy if this is actually an
780         // integer literal followed by field/method access or a range pattern
781         // (`0..2` and `12.foo()`)
782         if self.ch_is('.') && !self.nextch_is('.') &&
783            !ident_start(self.nextch()) {
784             // might have stuff after the ., and if it does, it needs to start
785             // with a number
786             self.bump();
787             if self.ch.unwrap_or('\0').is_digit(10) {
788                 self.scan_digits(10, 10);
789                 self.scan_float_exponent();
790             }
791             let pos = self.pos;
792             self.check_float_base(start_bpos, pos, base);
793             token::Float(self.name_from(start_bpos))
794         } else {
795             // it might be a float if it has an exponent
796             if self.ch_is('e') || self.ch_is('E') {
797                 self.scan_float_exponent();
798                 let pos = self.pos;
799                 self.check_float_base(start_bpos, pos, base);
800                 return token::Float(self.name_from(start_bpos));
801             }
802             // but we certainly have an integer!
803             token::Integer(self.name_from(start_bpos))
804         }
805     }
806
807     /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
808     /// error if too many or too few digits are encountered.
809     fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
810         debug!("scanning {} digits until {:?}", n_digits, delim);
811         let start_bpos = self.pos;
812         let mut accum_int = 0;
813
814         let mut valid = true;
815         for _ in 0..n_digits {
816             if self.is_eof() {
817                 let last_bpos = self.pos;
818                 self.fatal_span_(start_bpos,
819                                  last_bpos,
820                                  "unterminated numeric character escape").raise();
821             }
822             if self.ch_is(delim) {
823                 let last_bpos = self.pos;
824                 self.err_span_(start_bpos,
825                                last_bpos,
826                                "numeric character escape is too short");
827                 valid = false;
828                 break;
829             }
830             let c = self.ch.unwrap_or('\x00');
831             accum_int *= 16;
832             accum_int += c.to_digit(16).unwrap_or_else(|| {
833                 self.err_span_char(self.pos,
834                                    self.next_pos,
835                                    "invalid character in numeric character escape",
836                                    c);
837
838                 valid = false;
839                 0
840             });
841             self.bump();
842         }
843
844         if below_0x7f_only && accum_int >= 0x80 {
845             self.err_span_(start_bpos,
846                            self.pos,
847                            "this form of character escape may only be used with characters in \
848                             the range [\\x00-\\x7f]");
849             valid = false;
850         }
851
852         match char::from_u32(accum_int) {
853             Some(_) => valid,
854             None => {
855                 let last_bpos = self.pos;
856                 self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
857                 false
858             }
859         }
860     }
861
862     /// Scan for a single (possibly escaped) byte or char
863     /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
864     /// `start` is the position of `first_source_char`, which is already consumed.
865     ///
866     /// Returns true if there was a valid char/byte, false otherwise.
867     fn scan_char_or_byte(&mut self,
868                          start: BytePos,
869                          first_source_char: char,
870                          ascii_only: bool,
871                          delim: char)
872                          -> bool {
873         match first_source_char {
874             '\\' => {
875                 // '\X' for some X must be a character constant:
876                 let escaped = self.ch;
877                 let escaped_pos = self.pos;
878                 self.bump();
879                 match escaped {
880                     None => {}  // EOF here is an error that will be checked later.
881                     Some(e) => {
882                         return match e {
883                             'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
884                             'x' => self.scan_byte_escape(delim, !ascii_only),
885                             'u' => {
886                                 let valid = if self.ch_is('{') {
887                                     self.scan_unicode_escape(delim) && !ascii_only
888                                 } else {
889                                     let span = self.mk_sp(start, self.pos);
890                                     self.sess.span_diagnostic
891                                         .struct_span_err(span, "incorrect unicode escape sequence")
892                                         .span_help(span,
893                                                    "format of unicode escape sequences is \
894                                                     `\\u{…}`")
895                                         .emit();
896                                     false
897                                 };
898                                 if ascii_only {
899                                     self.err_span_(start,
900                                                    self.pos,
901                                                    "unicode escape sequences cannot be used as a \
902                                                     byte or in a byte string");
903                                 }
904                                 valid
905
906                             }
907                             '\n' if delim == '"' => {
908                                 self.consume_whitespace();
909                                 true
910                             }
911                             '\r' if delim == '"' && self.ch_is('\n') => {
912                                 self.consume_whitespace();
913                                 true
914                             }
915                             c => {
916                                 let pos = self.pos;
917                                 let mut err = self.struct_err_span_char(escaped_pos,
918                                                                         pos,
919                                                                         if ascii_only {
920                                                                             "unknown byte escape"
921                                                                         } else {
922                                                                             "unknown character \
923                                                                              escape"
924                                                                         },
925                                                                         c);
926                                 if e == '\r' {
927                                     err.span_help(self.mk_sp(escaped_pos, pos),
928                                                   "this is an isolated carriage return; consider \
929                                                    checking your editor and version control \
930                                                    settings");
931                                 }
932                                 if (e == '{' || e == '}') && !ascii_only {
933                                     err.span_help(self.mk_sp(escaped_pos, pos),
934                                                   "if used in a formatting string, curly braces \
935                                                    are escaped with `{{` and `}}`");
936                                 }
937                                 err.emit();
938                                 false
939                             }
940                         }
941                     }
942                 }
943             }
944             '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
945                 let pos = self.pos;
946                 self.err_span_char(start,
947                                    pos,
948                                    if ascii_only {
949                                        "byte constant must be escaped"
950                                    } else {
951                                        "character constant must be escaped"
952                                    },
953                                    first_source_char);
954                 return false;
955             }
956             '\r' => {
957                 if self.ch_is('\n') {
958                     self.bump();
959                     return true;
960                 } else {
961                     self.err_span_(start,
962                                    self.pos,
963                                    "bare CR not allowed in string, use \\r instead");
964                     return false;
965                 }
966             }
967             _ => {
968                 if ascii_only && first_source_char > '\x7F' {
969                     let pos = self.pos;
970                     self.err_span_(start,
971                                    pos,
972                                    "byte constant must be ASCII. Use a \\xHH escape for a \
973                                     non-ASCII byte");
974                     return false;
975                 }
976             }
977         }
978         true
979     }
980
981     /// Scan over a `\u{...}` escape
982     ///
983     /// At this point, we have already seen the `\` and the `u`, the `{` is the current character.
984     /// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
985     /// and pass over the `}`.
986     fn scan_unicode_escape(&mut self, delim: char) -> bool {
987         self.bump(); // past the {
988         let start_bpos = self.pos;
989         let mut valid = true;
990
991         if let Some('_') = self.ch {
992             // disallow leading `_`
993             self.err_span_(self.pos,
994                            self.next_pos,
995                            "invalid start of unicode escape");
996             valid = false;
997         }
998
999         let count = self.scan_digits(16, 16);
1000
1001         if count > 6 {
1002             self.err_span_(start_bpos,
1003                            self.pos,
1004                            "overlong unicode escape (must have at most 6 hex digits)");
1005             valid = false;
1006         }
1007         loop {
1008             match self.ch {
1009                 Some('}') => {
1010                     if valid && count == 0 {
1011                         self.err_span_(start_bpos,
1012                                        self.pos,
1013                                        "empty unicode escape (must have at least 1 hex digit)");
1014                         valid = false;
1015                     }
1016                     self.bump(); // past the ending `}`
1017                     break;
1018                 },
1019                 Some(c) => {
1020                     if c == delim {
1021                         self.err_span_(self.pos,
1022                                        self.pos,
1023                                        "unterminated unicode escape (needed a `}`)");
1024                         valid = false;
1025                         break;
1026                     } else if valid {
1027                         self.err_span_char(start_bpos,
1028                                            self.pos,
1029                                            "invalid character in unicode escape",
1030                                            c);
1031                         valid = false;
1032                     }
1033                 },
1034                 None => {
1035                     self.fatal_span_(start_bpos,
1036                                      self.pos,
1037                                      "unterminated unicode escape (found EOF)").raise();
1038                 }
1039             }
1040             self.bump();
1041         }
1042         valid
1043     }
1044
1045     /// Scan over a float exponent.
1046     fn scan_float_exponent(&mut self) {
1047         if self.ch_is('e') || self.ch_is('E') {
1048             self.bump();
1049             if self.ch_is('-') || self.ch_is('+') {
1050                 self.bump();
1051             }
1052             if self.scan_digits(10, 10) == 0 {
1053                 self.err_span_(self.pos,
1054                                self.next_pos,
1055                                "expected at least one digit in exponent")
1056             }
1057         }
1058     }
1059
1060     /// Check that a base is valid for a floating literal, emitting a nice
1061     /// error if it isn't.
1062     fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1063         match base {
1064             16 => {
1065                 self.err_span_(start_bpos,
1066                                last_bpos,
1067                                "hexadecimal float literal is not supported")
1068             }
1069             8 => {
1070                 self.err_span_(start_bpos,
1071                                last_bpos,
1072                                "octal float literal is not supported")
1073             }
1074             2 => {
1075                 self.err_span_(start_bpos,
1076                                last_bpos,
1077                                "binary float literal is not supported")
1078             }
1079             _ => (),
1080         }
1081     }
1082
1083     fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1084         self.bump();
1085         if self.ch_is('=') {
1086             self.bump();
1087             token::BinOpEq(op)
1088         } else {
1089             token::BinOp(op)
1090         }
1091     }
1092
1093     /// Return the next token from the string, advances the input past that
1094     /// token, and updates the interner
1095     fn next_token_inner(&mut self) -> Result<token::Token, ()> {
1096         let c = self.ch;
1097
1098         if ident_start(c) {
1099             let (is_ident_start, is_raw_ident) =
1100                 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1101                     // r# followed by an identifier starter is a raw identifier.
1102                     // This is an exception to the r# case below.
1103                     ('r', Some('#'), x) if ident_start(x) => (true, true),
1104                     // r as in r" or r#" is part of a raw string literal.
1105                     // b as in b' is part of a byte literal.
1106                     // They are not identifiers, and are handled further down.
1107                     ('r', Some('"'), _) |
1108                     ('r', Some('#'), _) |
1109                     ('b', Some('"'), _) |
1110                     ('b', Some('\''), _) |
1111                     ('b', Some('r'), Some('"')) |
1112                     ('b', Some('r'), Some('#')) => (false, false),
1113                     _ => (true, false),
1114                 };
1115             if is_ident_start {
1116                 let raw_start = self.pos;
1117                 if is_raw_ident {
1118                     // Consume the 'r#' characters.
1119                     self.bump();
1120                     self.bump();
1121                 }
1122
1123                 let start = self.pos;
1124                 while ident_continue(self.ch) {
1125                     self.bump();
1126                 }
1127
1128                 return Ok(self.with_str_from(start, |string| {
1129                     // FIXME: perform NFKC normalization here. (Issue #2253)
1130                     let ident = self.mk_ident(string);
1131                     if is_raw_ident && (ident.is_path_segment_keyword() ||
1132                                         ident.name == keywords::Underscore.name()) {
1133                         self.fatal_span_(raw_start, self.pos,
1134                             &format!("`r#{}` is not currently supported.", ident.name)
1135                         ).raise();
1136                     }
1137                     if is_raw_ident {
1138                         let span = self.mk_sp(raw_start, self.pos);
1139                         self.sess.raw_identifier_spans.borrow_mut().push(span);
1140                     }
1141                     token::Ident(ident, is_raw_ident)
1142                 }));
1143             }
1144         }
1145
1146         if is_dec_digit(c) {
1147             let num = self.scan_number(c.unwrap());
1148             let suffix = self.scan_optional_raw_name();
1149             debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1150             return Ok(token::Literal(num, suffix));
1151         }
1152
1153         match c.expect("next_token_inner called at EOF") {
1154             // One-byte tokens.
1155             ';' => {
1156                 self.bump();
1157                 Ok(token::Semi)
1158             }
1159             ',' => {
1160                 self.bump();
1161                 Ok(token::Comma)
1162             }
1163             '.' => {
1164                 self.bump();
1165                 if self.ch_is('.') {
1166                     self.bump();
1167                     if self.ch_is('.') {
1168                         self.bump();
1169                         Ok(token::DotDotDot)
1170                     } else if self.ch_is('=') {
1171                         self.bump();
1172                         Ok(token::DotDotEq)
1173                     } else {
1174                         Ok(token::DotDot)
1175                     }
1176                 } else {
1177                     Ok(token::Dot)
1178                 }
1179             }
1180             '(' => {
1181                 self.bump();
1182                 Ok(token::OpenDelim(token::Paren))
1183             }
1184             ')' => {
1185                 self.bump();
1186                 Ok(token::CloseDelim(token::Paren))
1187             }
1188             '{' => {
1189                 self.bump();
1190                 Ok(token::OpenDelim(token::Brace))
1191             }
1192             '}' => {
1193                 self.bump();
1194                 Ok(token::CloseDelim(token::Brace))
1195             }
1196             '[' => {
1197                 self.bump();
1198                 Ok(token::OpenDelim(token::Bracket))
1199             }
1200             ']' => {
1201                 self.bump();
1202                 Ok(token::CloseDelim(token::Bracket))
1203             }
1204             '@' => {
1205                 self.bump();
1206                 Ok(token::At)
1207             }
1208             '#' => {
1209                 self.bump();
1210                 Ok(token::Pound)
1211             }
1212             '~' => {
1213                 self.bump();
1214                 Ok(token::Tilde)
1215             }
1216             '?' => {
1217                 self.bump();
1218                 Ok(token::Question)
1219             }
1220             ':' => {
1221                 self.bump();
1222                 if self.ch_is(':') {
1223                     self.bump();
1224                     Ok(token::ModSep)
1225                 } else {
1226                     Ok(token::Colon)
1227                 }
1228             }
1229
1230             '$' => {
1231                 self.bump();
1232                 Ok(token::Dollar)
1233             }
1234
1235             // Multi-byte tokens.
1236             '=' => {
1237                 self.bump();
1238                 if self.ch_is('=') {
1239                     self.bump();
1240                     Ok(token::EqEq)
1241                 } else if self.ch_is('>') {
1242                     self.bump();
1243                     Ok(token::FatArrow)
1244                 } else {
1245                     Ok(token::Eq)
1246                 }
1247             }
1248             '!' => {
1249                 self.bump();
1250                 if self.ch_is('=') {
1251                     self.bump();
1252                     Ok(token::Ne)
1253                 } else {
1254                     Ok(token::Not)
1255                 }
1256             }
1257             '<' => {
1258                 self.bump();
1259                 match self.ch.unwrap_or('\x00') {
1260                     '=' => {
1261                         self.bump();
1262                         Ok(token::Le)
1263                     }
1264                     '<' => {
1265                         Ok(self.binop(token::Shl))
1266                     }
1267                     '-' => {
1268                         self.bump();
1269                         match self.ch.unwrap_or('\x00') {
1270                             _ => {
1271                                 Ok(token::LArrow)
1272                             }
1273                         }
1274                     }
1275                     _ => {
1276                         Ok(token::Lt)
1277                     }
1278                 }
1279             }
1280             '>' => {
1281                 self.bump();
1282                 match self.ch.unwrap_or('\x00') {
1283                     '=' => {
1284                         self.bump();
1285                         Ok(token::Ge)
1286                     }
1287                     '>' => {
1288                         Ok(self.binop(token::Shr))
1289                     }
1290                     _ => {
1291                         Ok(token::Gt)
1292                     }
1293                 }
1294             }
1295             '\'' => {
1296                 // Either a character constant 'a' OR a lifetime name 'abc
1297                 let start_with_quote = self.pos;
1298                 self.bump();
1299                 let start = self.pos;
1300
1301                 // the eof will be picked up by the final `'` check below
1302                 let c2 = self.ch.unwrap_or('\x00');
1303                 self.bump();
1304
1305                 // If the character is an ident start not followed by another single
1306                 // quote, then this is a lifetime name:
1307                 if ident_start(Some(c2)) && !self.ch_is('\'') {
1308                     while ident_continue(self.ch) {
1309                         self.bump();
1310                     }
1311                     // lifetimes shouldn't end with a single quote
1312                     // if we find one, then this is an invalid character literal
1313                     if self.ch_is('\'') {
1314                         self.fatal_span_verbose(start_with_quote, self.next_pos,
1315                                 String::from("character literal may only contain one codepoint"))
1316                             .raise();
1317
1318                     }
1319
1320                     // Include the leading `'` in the real identifier, for macro
1321                     // expansion purposes. See #12512 for the gory details of why
1322                     // this is necessary.
1323                     let ident = self.with_str_from(start, |lifetime_name| {
1324                         self.mk_ident(&format!("'{}", lifetime_name))
1325                     });
1326
1327                     return Ok(token::Lifetime(ident));
1328                 }
1329
1330                 let valid = self.scan_char_or_byte(start,
1331                                                    c2,
1332                                                    // ascii_only =
1333                                                    false,
1334                                                    '\'');
1335
1336                 if !self.ch_is('\'') {
1337                     let pos = self.pos;
1338                     loop {
1339                         self.bump();
1340                         if self.ch_is('\'') {
1341                             let start = self.src_index(start);
1342                             let end = self.src_index(self.pos);
1343                             self.bump();
1344                             let span = self.mk_sp(start_with_quote, self.pos);
1345                             self.sess.span_diagnostic
1346                                 .struct_span_err(span,
1347                                                  "character literal may only contain one codepoint")
1348                                 .span_suggestion(span,
1349                                                  "if you meant to write a `str` literal, \
1350                                                   use double quotes",
1351                                                  format!("\"{}\"", &self.src[start..end]))
1352                                 .emit();
1353                             return Ok(token::Literal(token::Str_(Symbol::intern("??")), None))
1354                         }
1355                         if self.ch_is('\n') || self.is_eof() || self.ch_is('/') {
1356                             // Only attempt to infer single line string literals. If we encounter
1357                             // a slash, bail out in order to avoid nonsensical suggestion when
1358                             // involving comments.
1359                             break;
1360                         }
1361                     }
1362                     self.fatal_span_verbose(start_with_quote, pos,
1363                         String::from("character literal may only contain one codepoint")).raise();
1364                 }
1365
1366                 let id = if valid {
1367                     self.name_from(start)
1368                 } else {
1369                     Symbol::intern("0")
1370                 };
1371                 self.bump(); // advance ch past token
1372                 let suffix = self.scan_optional_raw_name();
1373                 Ok(token::Literal(token::Char(id), suffix))
1374             }
1375             'b' => {
1376                 self.bump();
1377                 let lit = match self.ch {
1378                     Some('\'') => self.scan_byte(),
1379                     Some('"') => self.scan_byte_string(),
1380                     Some('r') => self.scan_raw_byte_string(),
1381                     _ => unreachable!(),  // Should have been a token::Ident above.
1382                 };
1383                 let suffix = self.scan_optional_raw_name();
1384                 Ok(token::Literal(lit, suffix))
1385             }
1386             '"' => {
1387                 let start_bpos = self.pos;
1388                 let mut valid = true;
1389                 self.bump();
1390                 while !self.ch_is('"') {
1391                     if self.is_eof() {
1392                         let last_bpos = self.pos;
1393                         self.fatal_span_(start_bpos,
1394                                          last_bpos,
1395                                          "unterminated double quote string").raise();
1396                     }
1397
1398                     let ch_start = self.pos;
1399                     let ch = self.ch.unwrap();
1400                     self.bump();
1401                     valid &= self.scan_char_or_byte(ch_start,
1402                                                     ch,
1403                                                     // ascii_only =
1404                                                     false,
1405                                                     '"');
1406                 }
1407                 // adjust for the ASCII " at the start of the literal
1408                 let id = if valid {
1409                     self.name_from(start_bpos + BytePos(1))
1410                 } else {
1411                     Symbol::intern("??")
1412                 };
1413                 self.bump();
1414                 let suffix = self.scan_optional_raw_name();
1415                 Ok(token::Literal(token::Str_(id), suffix))
1416             }
1417             'r' => {
1418                 let start_bpos = self.pos;
1419                 self.bump();
1420                 let mut hash_count: u16 = 0;
1421                 while self.ch_is('#') {
1422                     self.bump();
1423                     hash_count += 1;
1424                 }
1425
1426                 if self.is_eof() {
1427                     self.fail_unterminated_raw_string(start_bpos, hash_count);
1428                 } else if !self.ch_is('"') {
1429                     let last_bpos = self.pos;
1430                     let curr_char = self.ch.unwrap();
1431                     self.fatal_span_char(start_bpos,
1432                                          last_bpos,
1433                                          "found invalid character; only `#` is allowed \
1434                                          in raw string delimitation",
1435                                          curr_char).raise();
1436                 }
1437                 self.bump();
1438                 let content_start_bpos = self.pos;
1439                 let mut content_end_bpos;
1440                 let mut valid = true;
1441                 'outer: loop {
1442                     if self.is_eof() {
1443                         self.fail_unterminated_raw_string(start_bpos, hash_count);
1444                     }
1445                     // if self.ch_is('"') {
1446                     // content_end_bpos = self.pos;
1447                     // for _ in 0..hash_count {
1448                     // self.bump();
1449                     // if !self.ch_is('#') {
1450                     // continue 'outer;
1451                     let c = self.ch.unwrap();
1452                     match c {
1453                         '"' => {
1454                             content_end_bpos = self.pos;
1455                             for _ in 0..hash_count {
1456                                 self.bump();
1457                                 if !self.ch_is('#') {
1458                                     continue 'outer;
1459                                 }
1460                             }
1461                             break;
1462                         }
1463                         '\r' => {
1464                             if !self.nextch_is('\n') {
1465                                 let last_bpos = self.pos;
1466                                 self.err_span_(start_bpos,
1467                                                last_bpos,
1468                                                "bare CR not allowed in raw string, use \\r \
1469                                                 instead");
1470                                 valid = false;
1471                             }
1472                         }
1473                         _ => (),
1474                     }
1475                     self.bump();
1476                 }
1477                 self.bump();
1478                 let id = if valid {
1479                     self.name_from_to(content_start_bpos, content_end_bpos)
1480                 } else {
1481                     Symbol::intern("??")
1482                 };
1483                 let suffix = self.scan_optional_raw_name();
1484                 Ok(token::Literal(token::StrRaw(id, hash_count), suffix))
1485             }
1486             '-' => {
1487                 if self.nextch_is('>') {
1488                     self.bump();
1489                     self.bump();
1490                     Ok(token::RArrow)
1491                 } else {
1492                     Ok(self.binop(token::Minus))
1493                 }
1494             }
1495             '&' => {
1496                 if self.nextch_is('&') {
1497                     self.bump();
1498                     self.bump();
1499                     Ok(token::AndAnd)
1500                 } else {
1501                     Ok(self.binop(token::And))
1502                 }
1503             }
1504             '|' => {
1505                 match self.nextch() {
1506                     Some('|') => {
1507                         self.bump();
1508                         self.bump();
1509                         Ok(token::OrOr)
1510                     }
1511                     _ => {
1512                         Ok(self.binop(token::Or))
1513                     }
1514                 }
1515             }
1516             '+' => {
1517                 Ok(self.binop(token::Plus))
1518             }
1519             '*' => {
1520                 Ok(self.binop(token::Star))
1521             }
1522             '/' => {
1523                 Ok(self.binop(token::Slash))
1524             }
1525             '^' => {
1526                 Ok(self.binop(token::Caret))
1527             }
1528             '%' => {
1529                 Ok(self.binop(token::Percent))
1530             }
1531             c => {
1532                 let last_bpos = self.pos;
1533                 let bpos = self.next_pos;
1534                 let mut err = self.struct_fatal_span_char(last_bpos,
1535                                                           bpos,
1536                                                           "unknown start of token",
1537                                                           c);
1538                 unicode_chars::check_for_substitution(self, c, &mut err);
1539                 self.fatal_errs.push(err);
1540                 Err(())
1541             }
1542         }
1543     }
1544
1545     fn consume_whitespace(&mut self) {
1546         while is_pattern_whitespace(self.ch) && !self.is_eof() {
1547             self.bump();
1548         }
1549     }
1550
1551     fn read_to_eol(&mut self) -> String {
1552         let mut val = String::new();
1553         while !self.ch_is('\n') && !self.is_eof() {
1554             val.push(self.ch.unwrap());
1555             self.bump();
1556         }
1557         if self.ch_is('\n') {
1558             self.bump();
1559         }
1560         val
1561     }
1562
1563     fn read_one_line_comment(&mut self) -> String {
1564         let val = self.read_to_eol();
1565         assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1566                 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
1567         val
1568     }
1569
1570     fn consume_non_eol_whitespace(&mut self) {
1571         while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
1572             self.bump();
1573         }
1574     }
1575
1576     fn peeking_at_comment(&self) -> bool {
1577         (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1578         // consider shebangs comments, but not inner attributes
1579         (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
1580     }
1581
1582     fn scan_byte(&mut self) -> token::Lit {
1583         self.bump();
1584         let start = self.pos;
1585
1586         // the eof will be picked up by the final `'` check below
1587         let c2 = self.ch.unwrap_or('\x00');
1588         self.bump();
1589
1590         let valid = self.scan_char_or_byte(start,
1591                                            c2,
1592                                            // ascii_only =
1593                                            true,
1594                                            '\'');
1595         if !self.ch_is('\'') {
1596             // Byte offsetting here is okay because the
1597             // character before position `start` are an
1598             // ascii single quote and ascii 'b'.
1599             let pos = self.pos;
1600             self.fatal_span_verbose(start - BytePos(2),
1601                                     pos,
1602                                     "unterminated byte constant".to_string()).raise();
1603         }
1604
1605         let id = if valid {
1606             self.name_from(start)
1607         } else {
1608             Symbol::intern("?")
1609         };
1610         self.bump(); // advance ch past token
1611         token::Byte(id)
1612     }
1613
1614     fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1615         self.scan_hex_digits(2, delim, below_0x7f_only)
1616     }
1617
1618     fn scan_byte_string(&mut self) -> token::Lit {
1619         self.bump();
1620         let start = self.pos;
1621         let mut valid = true;
1622
1623         while !self.ch_is('"') {
1624             if self.is_eof() {
1625                 let pos = self.pos;
1626                 self.fatal_span_(start, pos, "unterminated double quote byte string").raise();
1627             }
1628
1629             let ch_start = self.pos;
1630             let ch = self.ch.unwrap();
1631             self.bump();
1632             valid &= self.scan_char_or_byte(ch_start,
1633                                             ch,
1634                                             // ascii_only =
1635                                             true,
1636                                             '"');
1637         }
1638         let id = if valid {
1639             self.name_from(start)
1640         } else {
1641             Symbol::intern("??")
1642         };
1643         self.bump();
1644         token::ByteStr(id)
1645     }
1646
1647     fn scan_raw_byte_string(&mut self) -> token::Lit {
1648         let start_bpos = self.pos;
1649         self.bump();
1650         let mut hash_count = 0;
1651         while self.ch_is('#') {
1652             self.bump();
1653             hash_count += 1;
1654         }
1655
1656         if self.is_eof() {
1657             self.fail_unterminated_raw_string(start_bpos, hash_count);
1658         } else if !self.ch_is('"') {
1659             let pos = self.pos;
1660             let ch = self.ch.unwrap();
1661             self.fatal_span_char(start_bpos,
1662                                         pos,
1663                                         "found invalid character; only `#` is allowed in raw \
1664                                          string delimitation",
1665                                         ch).raise();
1666         }
1667         self.bump();
1668         let content_start_bpos = self.pos;
1669         let mut content_end_bpos;
1670         'outer: loop {
1671             match self.ch {
1672                 None => {
1673                     self.fail_unterminated_raw_string(start_bpos, hash_count);
1674                 }
1675                 Some('"') => {
1676                     content_end_bpos = self.pos;
1677                     for _ in 0..hash_count {
1678                         self.bump();
1679                         if !self.ch_is('#') {
1680                             continue 'outer;
1681                         }
1682                     }
1683                     break;
1684                 }
1685                 Some(c) => {
1686                     if c > '\x7F' {
1687                         let pos = self.pos;
1688                         self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
1689                     }
1690                 }
1691             }
1692             self.bump();
1693         }
1694         self.bump();
1695         token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
1696                                  hash_count)
1697     }
1698 }
1699
1700 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1701 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1702 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1703     c.map_or(false, Pattern_White_Space)
1704 }
1705
/// Returns `true` when `c` holds a character within the inclusive range
/// from `lo` to `hi`; `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}

/// Returns `true` when `c` holds one of the ASCII digits `'0'` through `'9'`.
fn is_dec_digit(c: Option<char>) -> bool {
    let (first, last) = ('0', '9');
    in_range(c, first, last)
}
1716
1717 pub fn is_doc_comment(s: &str) -> bool {
1718     let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1719               s.starts_with("//!");
1720     debug!("is {:?} a doc comment? {}", s, res);
1721     res
1722 }
1723
1724 pub fn is_block_doc_comment(s: &str) -> bool {
1725     // Prevent `/**/` from being parsed as a doc comment
1726     let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1727                s.starts_with("/*!")) && s.len() >= 5;
1728     debug!("is {:?} a doc comment? {}", s, res);
1729     res
1730 }
1731
1732 fn ident_start(c: Option<char>) -> bool {
1733     let c = match c {
1734         Some(c) => c,
1735         None => return false,
1736     };
1737
1738     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1739 }
1740
1741 fn ident_continue(c: Option<char>) -> bool {
1742     let c = match c {
1743         Some(c) => c,
1744         None => return false,
1745     };
1746
1747     (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1748     (c > '\x7f' && c.is_xid_continue())
1749 }
1750
1751 // The string is a valid identifier or a lifetime identifier.
1752 pub fn is_valid_ident(s: &str) -> bool {
1753     let mut chars = s.chars();
1754     ident_start(chars.next()) && chars.all(|ch| ident_continue(Some(ch)))
1755 }
1756
#[cfg(test)]
mod tests {
    use super::*;

    use ast::{Ident, CrateConfig};
    use symbol::Symbol;
    use syntax_pos::{BytePos, Span, NO_EXPANSION};
    use codemap::CodeMap;
    use errors;
    use feature_gate::UnstableFeatures;
    use parse::token;
    use std::collections::HashSet;
    use std::io;
    use std::path::PathBuf;
    use diagnostics::plugin::ErrorMap;
    use rustc_data_structures::sync::Lock;
    use with_globals;

    // build a parse session over `cm` whose diagnostics are written to `io::sink()`
    fn mk_sess(cm: Lrc<CodeMap>) -> ParseSess {
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()),
                                                          Some(cm.clone()),
                                                          false,
                                                          false);
        let span_diagnostic = errors::Handler::with_emitter(true, false, Box::new(emitter));
        ParseSess {
            span_diagnostic,
            unstable_features: UnstableFeatures::from_environment(),
            config: CrateConfig::new(),
            included_mod_stack: Lock::new(Vec::new()),
            code_map: cm,
            missing_fragment_specifiers: Lock::new(HashSet::new()),
            raw_identifier_spans: Lock::new(Vec::new()),
            registered_diagnostics: Lock::new(ErrorMap::new()),
            non_modrs_mods: Lock::new(vec![]),
        }
    }

    // create a fresh code map together with a session built on top of it
    fn new_session() -> (Lrc<CodeMap>, ParseSess) {
        let cm = Lrc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        (cm, sh)
    }

    // open a string reader for the given string
    fn setup<'a>(cm: &CodeMap,
                 sess: &'a ParseSess,
                 teststr: String)
                 -> StringReader<'a> {
        let path = PathBuf::from("zebra.rs");
        let fm = cm.new_filemap(path.into(), teststr);
        StringReader::new(sess, fm)
    }

    // lex `src` in a new file of `cm` and return only the first token
    fn first_token(cm: &CodeMap, sess: &ParseSess, src: &str) -> token::Token {
        let mut reader = setup(cm, sess, src.to_string());
        let TokenAndSpan { tok, .. } = reader.next_token();
        tok
    }

    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for want in expected.iter() {
            let got = string_reader.next_token().tok;
            assert_eq!(&got, want);
        }
    }

    // make the identifier by looking up the string in the interner
    fn mk_ident(id: &str) -> token::Token {
        let ident = Ident::from_str(id);
        token::Token::from_ast_ident(ident)
    }

    #[test]
    fn t1() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let mut string_reader = setup(&cm,
                                        &sh,
                                        "/* my source file */ fn main() { println!(\"zebra\"); }\n"
                                            .to_string());
            let id = Ident::from_str("fn");
            assert_eq!(string_reader.next_token().tok, token::Comment);
            assert_eq!(string_reader.next_token().tok, token::Whitespace);

            // the `fn` keyword, with its exact span
            let fn_tok = string_reader.next_token();
            let fn_expected = TokenAndSpan {
                tok: token::Ident(id, false),
                sp: Span::new(BytePos(21), BytePos(23), NO_EXPANSION),
            };
            assert_eq!(fn_tok, fn_expected);
            assert_eq!(string_reader.next_token().tok, token::Whitespace);
            // the 'main' id is already read:
            assert_eq!(string_reader.pos.clone(), BytePos(28));

            // read another token:
            let main_tok = string_reader.next_token();
            let main_expected = TokenAndSpan {
                tok: mk_ident("main"),
                sp: Span::new(BytePos(24), BytePos(28), NO_EXPANSION),
            };
            assert_eq!(main_tok, main_expected);
            // the lparen is already read:
            assert_eq!(string_reader.pos.clone(), BytePos(29))
        })
    }

    #[test]
    fn doublecolonparsing() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let reader = setup(&cm, &sh, "a b".to_string());
            let expected = vec![mk_ident("a"), token::Whitespace, mk_ident("b")];
            check_tokenization(reader, expected);
        })
    }

    #[test]
    fn dcparsing_2() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let reader = setup(&cm, &sh, "a::b".to_string());
            let expected = vec![mk_ident("a"), token::ModSep, mk_ident("b")];
            check_tokenization(reader, expected);
        })
    }

    #[test]
    fn dcparsing_3() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let reader = setup(&cm, &sh, "a ::b".to_string());
            let expected = vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")];
            check_tokenization(reader, expected);
        })
    }

    #[test]
    fn dcparsing_4() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let reader = setup(&cm, &sh, "a:: b".to_string());
            let expected = vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")];
            check_tokenization(reader, expected);
        })
    }

    #[test]
    fn character_a() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let tok = first_token(&cm, &sh, "'a'");
            assert_eq!(tok, token::Literal(token::Char(Symbol::intern("a")), None));
        })
    }

    #[test]
    fn character_space() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let tok = first_token(&cm, &sh, "' '");
            assert_eq!(tok, token::Literal(token::Char(Symbol::intern(" ")), None));
        })
    }

    #[test]
    fn character_escaped() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let tok = first_token(&cm, &sh, "'\\n'");
            assert_eq!(tok, token::Literal(token::Char(Symbol::intern("\\n")), None));
        })
    }

    #[test]
    fn lifetime_name() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let tok = first_token(&cm, &sh, "'abc");
            assert_eq!(tok, token::Lifetime(Ident::from_str("'abc")));
        })
    }

    #[test]
    fn raw_string() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let tok = first_token(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###");
            assert_eq!(tok,
                       token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
        })
    }

    #[test]
    fn literal_suffixes() {
        with_globals(|| {
            let (cm, sh) = new_session();
            macro_rules! test {
                ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
                    // suffix glued directly onto the literal:
                    let glued = first_token(&cm, &sh, &format!("{}suffix", $input));
                    assert_eq!(glued,
                               token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                              Some(Symbol::intern("suffix"))));
                    // with a whitespace separator:
                    let spaced = first_token(&cm, &sh, &format!("{} suffix", $input));
                    assert_eq!(spaced,
                               token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                              None));
                }}
            }

            test!("'a'", Char, "a");
            test!("b'a'", Byte, "a");
            test!("\"a\"", Str_, "a");
            test!("b\"a\"", ByteStr, "a");
            test!("1234", Integer, "1234");
            test!("0b101", Integer, "0b101");
            test!("0xABC", Integer, "0xABC");
            test!("1.0", Float, "1.0");
            test!("1.0e10", Float, "1.0e10");

            let int_tok = first_token(&cm, &sh, "2us");
            assert_eq!(int_tok,
                       token::Literal(token::Integer(Symbol::intern("2")),
                                      Some(Symbol::intern("us"))));
            let raw_tok = first_token(&cm, &sh, "r###\"raw\"###suffix");
            assert_eq!(raw_tok,
                       token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                      Some(Symbol::intern("suffix"))));
            let raw_byte_tok = first_token(&cm, &sh, "br###\"raw\"###suffix");
            assert_eq!(raw_byte_tok,
                       token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                      Some(Symbol::intern("suffix"))));
        })
    }

    #[test]
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    }

    #[test]
    fn nested_block_comments() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
            if lexer.next_token().tok != token::Comment {
                panic!("expected a comment!");
            }
            let after_comment = lexer.next_token().tok;
            assert_eq!(after_comment,
                       token::Literal(token::Char(Symbol::intern("a")), None));
        })
    }

    #[test]
    fn crlf_comments() {
        with_globals(|| {
            let (cm, sh) = new_session();
            let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
            let comment = lexer.next_token();
            assert_eq!(comment.tok, token::Comment);
            // the span of the plain comment excludes the trailing `\r\n`
            let comment_span = (comment.sp.lo(), comment.sp.hi());
            assert_eq!(comment_span, (BytePos(0), BytePos(7)));
            assert_eq!(lexer.next_token().tok, token::Whitespace);
            let doc = lexer.next_token().tok;
            assert_eq!(doc, token::DocComment(Symbol::intern("/// test")));
        })
    }
}