1 // Copyright 2012-2013 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 use ast::{self, Ident};
12 use syntax_pos::{self, BytePos, CharPos, Pos, Span, NO_EXPANSION};
13 use codemap::{CodeMap, FilePathMapping};
14 use errors::{FatalError, DiagnosticBuilder};
15 use parse::{token, ParseSess};
17 use symbol::{Symbol, keywords};
18 use std_unicode::property::Pattern_White_Space;
22 use std::mem::replace;
// A lexed token paired with the source span it was read from.
// NOTE(review): the `sp: Span` field and closing braces are missing from this
// extraction (original lines 32-34, 38-39) — the Default impl below shows the
// struct also carries a `sp` field.
29 #[derive(Clone, PartialEq, Eq, Debug)]
30 pub struct TokenAndSpan {
31 pub tok: token::Token,
// Default is a placeholder value: an Underscore token at a dummy span.
35 impl Default for TokenAndSpan {
36 fn default() -> Self {
37 TokenAndSpan { tok: token::Underscore, sp: syntax_pos::DUMMY_SP }
// Character-at-a-time lexer over a single FileMap. Maintains a one-token
// lookahead (`peek_tok`) and accumulates fatal diagnostics instead of
// aborting immediately. NOTE(review): several field declarations (e.g.
// `pos`, `col`, `ch`, `peek_span`, `token`, `span`) fall in gaps of this
// extraction; their existence is evidenced by uses later in the file.
41 pub struct StringReader<'a> {
42 pub sess: &'a ParseSess,
43 /// The absolute offset within the codemap of the next character to read
44 pub next_pos: BytePos,
45 /// The absolute offset within the codemap of the current character
47 /// The column of the next character to read
49 /// The current character (which has been read from self.pos)
// The FileMap being lexed; `source_text` below caches its `src`.
51 pub filemap: Rc<syntax_pos::FileMap>,
52 /// If Some, stop reading the source at this position (inclusive).
53 pub terminator: Option<BytePos>,
54 /// Whether to record new-lines and multibyte chars in filemap.
55 /// This is only necessary the first time a filemap is lexed.
56 /// If part of a filemap is being re-lexed, this should be set to false.
57 pub save_new_lines_and_multibyte: bool,
// One-token lookahead; consumed by try_next_token, refilled by advance_token.
59 pub peek_tok: token::Token,
// Fatal diagnostics are queued here rather than emitted eagerly; see
// emit_fatal_errors / unwrap_or_abort.
61 pub fatal_errs: Vec<DiagnosticBuilder<'a>>,
62 // cache a direct reference to the source text, so that we don't have to
63 // retrieve it via `self.filemap.src.as_ref().unwrap()` all the time.
64 source_text: Rc<String>,
65 /// Stack of open delimiters and their spans. Used for error message.
68 open_braces: Vec<(token::DelimToken, Span)>,
// If Some, every span produced by mk_sp is overridden with this one.
69 pub override_span: Option<Span>,
72 impl<'a> StringReader<'a> {
// Build a span for [lo, hi), unless `override_span` forces a fixed span.
73 fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
74 unwrap_or!(self.override_span, Span::new(lo, hi, NO_EXPANSION))
// Infallible wrapper: lex one token, aborting the session on fatal errors.
77 fn next_token(&mut self) -> TokenAndSpan where Self: Sized {
78 let res = self.try_next_token();
79 self.unwrap_or_abort(res)
// On Err, flush the queued fatal diagnostics and abort.
// NOTE(review): the Ok arm and the abort call itself are in a gap here.
81 fn unwrap_or_abort(&mut self, res: Result<TokenAndSpan, ()>) -> TokenAndSpan {
85 self.emit_fatal_errors();
// Like try_next_token, but skips trivia (whitespace/comments/shebang),
// looping until a "real" token is found; also records it in self.token.
90 fn try_real_token(&mut self) -> Result<TokenAndSpan, ()> {
91 let mut t = self.try_next_token()?;
94 token::Whitespace | token::Comment | token::Shebang(_) => {
95 t = self.try_next_token()?;
100 self.token = t.tok.clone();
// Infallible wrapper around try_real_token (aborts on fatal errors).
104 pub fn real_token(&mut self) -> TokenAndSpan {
105 let res = self.try_real_token();
106 self.unwrap_or_abort(res)
// EOF check: no current char, or past the optional `terminator` cutoff.
108 fn is_eof(&self) -> bool {
109 if self.ch.is_none() {
113 match self.terminator {
114 Some(t) => self.next_pos > t,
118 /// Return the next token. EFFECT: advances the string_reader.
119 pub fn try_next_token(&mut self) -> Result<TokenAndSpan, ()> {
120 assert!(self.fatal_errs.is_empty());
// Hand out the buffered lookahead token (Underscore is a throwaway
// placeholder; advance_token immediately refills peek_tok).
121 let ret_val = TokenAndSpan {
122 tok: replace(&mut self.peek_tok, token::Underscore),
125 self.advance_token()?;
// Fatal error located at the current lookahead token's span.
128 fn fatal(&self, m: &str) -> FatalError {
129 self.fatal_span(self.peek_span, m)
// Emit and clear all queued fatal diagnostics.
131 pub fn emit_fatal_errors(&mut self) {
132 for err in &mut self.fatal_errs {
135 self.fatal_errs.clear();
// Non-consuming look at the buffered token (clones it).
137 pub fn peek(&self) -> TokenAndSpan {
138 // FIXME(pcwalton): Bad copy!
140 tok: self.peek_tok.clone(),
146 impl<'a> StringReader<'a> {
147 /// For comments.rs, which hackily pokes into next_pos and ch
// Raw constructor: builds the reader without priming the token lookahead.
// NOTE(review): the initial bump()/return are in a gap after line 149.
148 pub fn new_raw(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
149 let mut sr = StringReader::new_raw_internal(sess, filemap);
// Field-by-field construction. Panics (ICE) if the FileMap has no source
// text attached — this lexer requires the full source in memory.
154 fn new_raw_internal(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
155 if filemap.src.is_none() {
156 sess.span_diagnostic.bug(&format!("Cannot lex filemap without source: {}",
160 let source_text = (*filemap.src.as_ref().unwrap()).clone();
164 next_pos: filemap.start_pos,
165 pos: filemap.start_pos,
170 save_new_lines_and_multibyte: true,
171 // dummy values; not read
172 peek_tok: token::Eof,
173 peek_span: syntax_pos::DUMMY_SP,
175 fatal_errs: Vec::new(),
177 span: syntax_pos::DUMMY_SP,
178 open_braces: Vec::new(),
// Public constructor: like new_raw but primes the lookahead with the
// first token, surfacing any fatal lexing error immediately.
183 pub fn new(sess: &'a ParseSess, filemap: Rc<syntax_pos::FileMap>) -> Self {
184 let mut sr = StringReader::new_raw(sess, filemap);
185 if sr.advance_token().is_err() {
186 sr.emit_fatal_errors();
// Build a reader restricted to re-lex exactly the given span.
192 pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
193 let begin = sess.codemap().lookup_byte_offset(span.lo());
194 let end = sess.codemap().lookup_byte_offset(span.hi());
196 // Make the range zero-length if the span is invalid.
// (also when lo/hi resolve to different FileMaps — cross-file spans
// cannot be re-lexed)
197 if span.lo() > span.hi() || begin.fm.start_pos != end.fm.start_pos {
198 span = span.with_hi(span.lo());
201 let mut sr = StringReader::new_raw_internal(sess, begin.fm);
203 // Seek the lexer to the right byte range.
// Don't re-record line/multibyte info: the FileMap was already lexed once.
204 sr.save_new_lines_and_multibyte = false;
205 sr.next_pos = span.lo();
206 sr.terminator = Some(span.hi());
210 if sr.advance_token().is_err() {
211 sr.emit_fatal_errors();
// True if the current character equals `c`.
// NOTE(review): the body falls in a gap of this extraction.
217 pub fn ch_is(&self, c: char) -> bool {
221 /// Report a fatal lexical error with a given span.
222 pub fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
223 self.sess.span_diagnostic.span_fatal(sp, m)
226 /// Report a lexical error with a given span.
227 pub fn err_span(&self, sp: Span, m: &str) {
228 self.sess.span_diagnostic.span_err(sp, m)
232 /// Report a fatal error spanning [`from_pos`, `to_pos`).
233 fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
234 self.fatal_span(self.mk_sp(from_pos, to_pos), m)
237 /// Report a lexical error spanning [`from_pos`, `to_pos`).
238 fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
239 self.err_span(self.mk_sp(from_pos, to_pos), m)
242 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
243 /// escaped character to the error message
244 fn fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) -> FatalError {
245 let mut m = m.to_string();
// Append the offending char escaped (e.g. \n, \u{...}) so it is printable.
247 for c in c.escape_default() {
250 self.fatal_span_(from_pos, to_pos, &m[..])
// Same as fatal_span_char but returns the DiagnosticBuilder unemitted,
// so callers can attach notes/help before emitting.
252 fn struct_fatal_span_char(&self,
257 -> DiagnosticBuilder<'a> {
258 let mut m = m.to_string();
260 for c in c.escape_default() {
263 self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
266 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending an
267 /// escaped character to the error message
268 fn err_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char) {
269 let mut m = m.to_string();
271 for c in c.escape_default() {
274 self.err_span_(from_pos, to_pos, &m[..]);
// Non-fatal variant of struct_fatal_span_char (builder returned unemitted).
276 fn struct_err_span_char(&self,
281 -> DiagnosticBuilder<'a> {
282 let mut m = m.to_string();
284 for c in c.escape_default() {
287 self.sess.span_diagnostic.struct_span_err(self.mk_sp(from_pos, to_pos), &m[..])
290 /// Report a lexical error spanning [`from_pos`, `to_pos`), appending the
291 /// offending string to the error message
292 fn fatal_span_verbose(&self, from_pos: BytePos, to_pos: BytePos, mut m: String) -> FatalError {
294 let from = self.byte_offset(from_pos).to_usize();
295 let to = self.byte_offset(to_pos).to_usize();
296 m.push_str(&self.source_text[from..to]);
297 self.fatal_span_(from_pos, to_pos, &m[..])
300 /// Advance peek_tok and peek_span to refer to the next token, and
301 /// possibly update the interner.
// Trivia (whitespace/comment/shebang) becomes the lookahead token directly;
// otherwise EOF or a real token from next_token_inner is buffered.
302 fn advance_token(&mut self) -> Result<(), ()> {
303 match self.scan_whitespace_or_comment() {
305 self.peek_span = comment.sp;
306 self.peek_tok = comment.tok;
// At EOF the span degenerates to [end_pos, end_pos).
310 self.peek_tok = token::Eof;
311 self.peek_span = self.mk_sp(self.filemap.end_pos, self.filemap.end_pos);
313 let start_bytepos = self.pos;
314 self.peek_tok = self.next_token_inner()?;
315 self.peek_span = self.mk_sp(start_bytepos, self.pos);
// Convert an absolute codemap position to an index into source_text.
322 fn byte_offset(&self, pos: BytePos) -> BytePos {
323 (pos - self.filemap.start_pos)
326 /// Calls `f` with a string slice of the source text spanning from `start`
327 /// up to but excluding `self.pos`, meaning the slice does not include
328 /// the character `self.ch`.
329 pub fn with_str_from<T, F>(&self, start: BytePos, f: F) -> T
330 where F: FnOnce(&str) -> T
332 self.with_str_from_to(start, self.pos, f)
335 /// Create a Name from a given offset to the current offset, each
336 /// adjusted 1 towards each other (assumes that on either side there is a
337 /// single-byte delimiter).
338 pub fn name_from(&self, start: BytePos) -> ast::Name {
339 debug!("taking an ident from {:?} to {:?}", start, self.pos);
340 self.with_str_from(start, Symbol::intern)
343 /// As name_from, with an explicit endpoint.
344 pub fn name_from_to(&self, start: BytePos, end: BytePos) -> ast::Name {
345 debug!("taking an ident from {:?} to {:?}", start, end);
346 self.with_str_from_to(start, end, Symbol::intern)
349 /// Calls `f` with a string slice of the source text spanning from `start`
350 /// up to but excluding `end`.
351 fn with_str_from_to<T, F>(&self, start: BytePos, end: BytePos, f: F) -> T
352 where F: FnOnce(&str) -> T
354 f(&self.source_text[self.byte_offset(start).to_usize()..self.byte_offset(end).to_usize()])
357 /// Converts CRLF to LF in the given string, raising an error on bare CR.
// Fast path: scan for the first CR; if none, the Cow borrows `s` unchanged.
// On the first CRLF, fall through to the allocating helper below.
358 fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
361 let ch = char_at(s, i);
362 let next = i + ch.len_utf8();
364 if next < s.len() && char_at(s, next) == '\n' {
365 return translate_crlf_(self, start, s, errmsg, i).into();
// A CR not followed by LF is reported as an error at its exact span.
367 let pos = start + BytePos(i as u32);
368 let end_pos = start + BytePos(next as u32);
369 self.err_span_(pos, end_pos, errmsg);
// Slow path: copy `s` into a fresh buffer, dropping the CR of each CRLF
// pair and erroring on bare CRs. NOTE(review): loop heads/tails are in
// gaps of this extraction; `j` tracks the start of the next uncopied run.
375 fn translate_crlf_(rdr: &StringReader,
381 let mut buf = String::with_capacity(s.len());
384 let ch = char_at(s, i);
385 let next = i + ch.len_utf8();
388 buf.push_str(&s[j..i]);
391 if next >= s.len() || char_at(s, next) != '\n' {
392 let pos = start + BytePos(i as u32);
393 let end_pos = start + BytePos(next as u32);
394 rdr.err_span_(pos, end_pos, errmsg);
400 buf.push_str(&s[j..]);
407 /// Advance the StringReader by one character. If a newline is
408 /// discovered, add it to the FileMap's list of line start offsets.
409 pub fn bump(&mut self) {
410 let new_pos = self.next_pos;
411 let new_byte_offset = self.byte_offset(new_pos).to_usize();
// Reading stops either at the optional terminator or at end of source.
412 let end = self.terminator.map_or(self.source_text.len(), |t| {
413 self.byte_offset(t).to_usize()
415 if new_byte_offset < end {
416 let old_ch_is_newline = self.ch.unwrap() == '\n';
417 let new_ch = char_at(&self.source_text, new_byte_offset);
418 let new_ch_len = new_ch.len_utf8();
420 self.ch = Some(new_ch);
422 self.next_pos = new_pos + Pos::from_usize(new_ch_len);
// Crossing a '\n' starts a new line: record it (first lex only) and
// reset the column counter.
423 if old_ch_is_newline {
424 if self.save_new_lines_and_multibyte {
425 self.filemap.next_line(self.pos);
427 self.col = CharPos(0);
429 self.col = self.col + CharPos(1);
// Multibyte chars are recorded so byte<->char positions can be mapped.
432 if self.save_new_lines_and_multibyte {
433 self.filemap.record_multibyte_char(self.pos, new_ch_len);
// One-character lookahead without advancing.
442 pub fn nextch(&self) -> Option<char> {
443 let offset = self.byte_offset(self.next_pos).to_usize();
444 if offset < self.source_text.len() {
445 Some(char_at(&self.source_text, offset))
451 pub fn nextch_is(&self, c: char) -> bool {
452 self.nextch() == Some(c)
// Two-character lookahead without advancing.
455 pub fn nextnextch(&self) -> Option<char> {
456 let offset = self.byte_offset(self.next_pos).to_usize();
457 let s = &self.source_text[..];
458 if offset >= s.len() {
461 let next = offset + char_at(s, offset).len_utf8();
463 Some(char_at(s, next))
469 pub fn nextnextch_is(&self, c: char) -> bool {
470 self.nextnextch() == Some(c)
473 /// Eats <XID_start><XID_continue>*, if possible.
// Used for literal suffixes (e.g. the `i32` in `1i32`). Returns None when
// the current char cannot start an identifier.
474 fn scan_optional_raw_name(&mut self) -> Option<ast::Name> {
475 if !ident_start(self.ch) {
478 let start = self.pos;
479 while ident_continue(self.ch) {
483 self.with_str_from(start, |string| {
// A bare `_` suffix gets a forward-compat warning (issue #42326) —
// presumably the branch guard is in a gap of this extraction.
485 self.sess.span_diagnostic
486 .struct_span_warn(self.mk_sp(start, self.pos),
487 "underscore literal suffix is not allowed")
488 .warn("this was previously accepted by the compiler but is \
489 being phased out; it will become a hard error in \
491 .note("for more information, see issue #42326 \
492 <https://github.com/rust-lang/rust/issues/42326>")
496 Some(Symbol::intern(string))
501 /// PRECONDITION: self.ch is not whitespace
502 /// Eats any kind of comment.
// Returns Some for line comments, block comments, and shebangs; doc
// comments (/// //! /** /*!) are returned as DocComment tokens.
503 fn scan_comment(&mut self) -> Option<TokenAndSpan> {
// Defensive check: whitespace here means the caller violated the
// precondition — report but don't abort.
504 if let Some(c) = self.ch {
505 if c.is_whitespace() {
506 let msg = "called consume_any_line_comment, but there was whitespace";
507 self.sess.span_diagnostic.span_err(self.mk_sp(self.pos, self.pos), msg);
512 match self.nextch() {
517 // line comments starting with "///" or "//!" are doc-comments
518 let doc_comment = (self.ch_is('/') && !self.nextch_is('/')) || self.ch_is('!');
// start_bpos backs up over the two '/' chars already consumed.
519 let start_bpos = self.pos - BytePos(2);
521 while !self.is_eof() {
522 match self.ch.unwrap() {
// CRLF is tolerated; a bare CR inside a doc-comment is an error
// (doc text is preserved, so bare CR would leak into output).
525 if self.nextch_is('\n') {
528 } else if doc_comment {
529 self.err_span_(self.pos,
531 "bare CR not allowed in doc-comment");
540 self.with_str_from(start_bpos, |string| {
541 // comments with only more "/"s are not doc comments
542 let tok = if is_doc_comment(string) {
543 token::DocComment(Symbol::intern(string))
550 sp: self.mk_sp(start_bpos, self.pos),
556 sp: self.mk_sp(start_bpos, self.pos),
// `/*` opens a block comment; handled by the dedicated scanner.
563 self.scan_block_comment()
567 } else if self.ch_is('#') {
568 if self.nextch_is('!') {
570 // Parse an inner attribute.
571 if self.nextnextch_is('[') {
575 // I guess this is the only way to figure out if
576 // we're at the beginning of the file...
// NOTE(review): a throwaway CodeMap is built per call just to test
// line==1/col==0 — looks expensive; worth revisiting upstream.
577 let cmap = CodeMap::new(FilePathMapping::empty());
578 cmap.files.borrow_mut().push(self.filemap.clone());
579 let loc = cmap.lookup_char_pos_adj(self.pos);
580 debug!("Skipping a shebang");
581 if loc.line == 1 && loc.col == CharPos(0) {
582 // FIXME: Add shebang "token", return it
583 let start = self.pos;
584 while !self.ch_is('\n') && !self.is_eof() {
587 return Some(TokenAndSpan {
588 tok: token::Shebang(self.name_from(start)),
589 sp: self.mk_sp(start, self.pos),
599 /// If there is whitespace, shebang, or a comment, scan it. Otherwise,
// return None so the caller lexes a real token at the current position.
601 fn scan_whitespace_or_comment(&mut self) -> Option<TokenAndSpan> {
602 match self.ch.unwrap_or('\0') {
603 // # to handle shebang at start of file -- this is the entry point
604 // for skipping over all "junk"
// NOTE(review): the '/' | '#' pattern for this arm is in a gap.
606 let c = self.scan_comment();
607 debug!("scanning a comment {:?}", c);
// A run of Pattern_White_Space chars collapses into one Whitespace token.
610 c if is_pattern_whitespace(Some(c)) => {
611 let start_bpos = self.pos;
612 while is_pattern_whitespace(self.ch) {
615 let c = Some(TokenAndSpan {
616 tok: token::Whitespace,
617 sp: self.mk_sp(start_bpos, self.pos),
619 debug!("scanning whitespace: {:?}", c);
626 /// Might return a sugared-doc-attr
// Handles nesting via `level`; tracks bare CRs so doc-comment text can be
// CRLF-normalized (or rejected) before interning.
627 fn scan_block_comment(&mut self) -> Option<TokenAndSpan> {
628 // block comments starting with "/**" or "/*!" are doc-comments
629 let is_doc_comment = self.ch_is('*') || self.ch_is('!');
// back up over the already-consumed "/*"
630 let start_bpos = self.pos - BytePos(2);
632 let mut level: isize = 1;
633 let mut has_cr = false;
// EOF before the comment closes is fatal.
636 let msg = if is_doc_comment {
637 "unterminated block doc-comment"
639 "unterminated block comment"
641 let last_bpos = self.pos;
642 panic!(self.fatal_span_(start_bpos, last_bpos, msg));
644 let n = self.ch.unwrap();
// Nested open/close adjust the nesting level.
646 '/' if self.nextch_is('*') => {
650 '*' if self.nextch_is('/') => {
662 self.with_str_from(start_bpos, |string| {
663 // but comments with only "*"s between two "/"s are not
664 let tok = if is_block_doc_comment(string) {
// Doc text must not contain bare CR; normalize CRLF to LF first.
665 let string = if has_cr {
666 self.translate_crlf(start_bpos,
668 "bare CR not allowed in block doc-comment")
672 token::DocComment(Symbol::intern(&string[..]))
679 sp: self.mk_sp(start_bpos, self.pos),
684 /// Scan through any digits (base `scan_radix`) or underscores,
685 /// and return how many digits there were.
687 /// `real_radix` represents the true radix of the number we're
688 /// interested in, and errors will be emitted for any digits
689 /// between `real_radix` and `scan_radix`.
// e.g. scan_digits(2, 10) consumes `0b12` fully but reports '2' as
// invalid for base 2 — better recovery than stopping at the bad digit.
690 fn scan_digits(&mut self, real_radix: u32, scan_radix: u32) -> usize {
691 assert!(real_radix <= scan_radix);
// Underscores are digit separators: consumed but not counted.
696 debug!("skipping a _");
700 match c.and_then(|cc| cc.to_digit(scan_radix)) {
702 debug!("{:?} in scan_digits", c);
703 // check that the hypothetical digit is actually
704 // in range for the true radix
705 if c.unwrap().to_digit(real_radix).is_none() {
706 self.err_span_(self.pos,
708 &format!("invalid digit for a base {} literal", real_radix));
718 /// Lex a LIT_INTEGER or a LIT_FLOAT
// `c` is the already-seen first digit. Dispatches on 0b/0o/0x prefixes,
// then decides integer vs. float by looking for '.', 'e'/'E'.
719 fn scan_number(&mut self, c: char) -> token::Lit {
722 let start_bpos = self.pos;
727 match self.ch.unwrap_or('\0') {
// Radix prefixes: scan_radix stays 10/16 so bad digits are consumed
// and reported by scan_digits rather than splitting the token.
731 num_digits = self.scan_digits(2, 10);
736 num_digits = self.scan_digits(8, 10);
741 num_digits = self.scan_digits(16, 16);
743 '0'...'9' | '_' | '.' | 'e' | 'E' => {
744 num_digits = self.scan_digits(10, 10) + 1;
// A lone '0' with no prefix/digits is a plain integer.
748 return token::Integer(self.name_from(start_bpos));
751 } else if c.is_digit(10) {
752 num_digits = self.scan_digits(10, 10) + 1;
// Prefix with no digits after it (e.g. `0x`): recover as literal 0.
758 self.err_span_(start_bpos,
760 "no valid digits found for number");
761 return token::Integer(Symbol::intern("0"));
764 // might be a float, but don't be greedy if this is actually an
765 // integer literal followed by field/method access or a range pattern
766 // (`0..2` and `12.foo()`)
767 if self.ch_is('.') && !self.nextch_is('.') &&
768 !ident_start(self.nextch()) {
769 // might have stuff after the ., and if it does, it needs to start
// ...with a digit (fractional part), then optional exponent.
772 if self.ch.unwrap_or('\0').is_digit(10) {
773 self.scan_digits(10, 10);
774 self.scan_float_exponent();
// Floats are only legal in base 10; see check_float_base.
777 self.check_float_base(start_bpos, pos, base);
778 token::Float(self.name_from(start_bpos))
780 // it might be a float if it has an exponent
781 if self.ch_is('e') || self.ch_is('E') {
782 self.scan_float_exponent();
784 self.check_float_base(start_bpos, pos, base);
785 return token::Float(self.name_from(start_bpos));
787 // but we certainly have an integer!
788 token::Integer(self.name_from(start_bpos))
792 /// Scan over `n_digits` hex digits, stopping at `delim`, reporting an
793 /// error if too many or too few digits are encountered.
// Used for \xHH (and byte escapes). `below_0x7f_only` enforces the ASCII
// restriction of \x in char/string literals. Returns validity.
794 fn scan_hex_digits(&mut self, n_digits: usize, delim: char, below_0x7f_only: bool) -> bool {
795 debug!("scanning {} digits until {:?}", n_digits, delim);
796 let start_bpos = self.pos;
797 let mut accum_int = 0;
799 let mut valid = true;
800 for _ in 0..n_digits {
// EOF mid-escape is fatal.
802 let last_bpos = self.pos;
803 panic!(self.fatal_span_(start_bpos,
805 "unterminated numeric character escape"));
// Hitting the delimiter early means too few digits.
807 if self.ch_is(delim) {
808 let last_bpos = self.pos;
809 self.err_span_(start_bpos,
811 "numeric character escape is too short");
// Accumulate the value; a non-hex char contributes 0 and flags invalid.
815 let c = self.ch.unwrap_or('\x00');
817 accum_int += c.to_digit(16).unwrap_or_else(|| {
818 self.err_span_char(self.pos,
820 "invalid character in numeric character escape",
829 if below_0x7f_only && accum_int >= 0x80 {
830 self.err_span_(start_bpos,
832 "this form of character escape may only be used with characters in \
833 the range [\\x00-\\x7f]");
// Reject values that are not valid Unicode scalar values.
837 match char::from_u32(accum_int) {
840 let last_bpos = self.pos;
841 self.err_span_(start_bpos, last_bpos, "invalid numeric character escape");
847 /// Scan for a single (possibly escaped) byte or char
848 /// in a byte, (non-raw) byte string, char, or (non-raw) string literal.
849 /// `start` is the position of `first_source_char`, which is already consumed.
851 /// Returns true if there was a valid char/byte, false otherwise.
// NOTE(review): the `delim` and `ascii_only` parameters appear in uses
// below but their declarations fall in a gap of this extraction.
852 fn scan_char_or_byte(&mut self,
854 first_source_char: char,
858 match first_source_char {
860 // '\X' for some X must be a character constant:
861 let escaped = self.ch;
862 let escaped_pos = self.pos;
865 None => {} // EOF here is an error that will be checked later.
// Simple single-char escapes are always valid.
868 'n' | 'r' | 't' | '\\' | '\'' | '"' | '0' => true,
// \xHH: in byte context any value is allowed, otherwise ASCII only.
869 'x' => self.scan_byte_escape(delim, !ascii_only),
// \u{...}: only valid in char/string context, and must use braces.
871 let valid = if self.ch_is('{') {
872 self.scan_unicode_escape(delim) && !ascii_only
874 let span = self.mk_sp(start, self.pos);
875 self.sess.span_diagnostic
876 .struct_span_err(span, "incorrect unicode escape sequence")
878 "format of unicode escape sequences is \
884 self.err_span_(start,
886 "unicode escape sequences cannot be used as a \
887 byte or in a byte string");
// Line-continuation escapes (backslash-newline) inside "..." swallow
// the following whitespace.
892 '\n' if delim == '"' => {
893 self.consume_whitespace();
896 '\r' if delim == '"' && self.ch_is('\n') => {
897 self.consume_whitespace();
// Unknown escape: build a rich diagnostic with targeted help notes.
902 let mut err = self.struct_err_span_char(escaped_pos,
905 "unknown byte escape"
912 err.span_help(self.mk_sp(escaped_pos, pos),
913 "this is an isolated carriage return; consider \
914 checking your editor and version control \
917 if (e == '{' || e == '}') && !ascii_only {
918 err.span_help(self.mk_sp(escaped_pos, pos),
919 "if used in a formatting string, curly braces \
920 are escaped with `{{` and `}}`");
// Control/quote chars must be escaped inside '...' literals.
929 '\t' | '\n' | '\r' | '\'' if delim == '\'' => {
931 self.err_span_char(start,
934 "byte constant must be escaped"
936 "character constant must be escaped"
// Bare CR is only OK as part of a CRLF pair inside strings.
942 if self.ch_is('\n') {
946 self.err_span_(start,
948 "bare CR not allowed in string, use \\r instead");
// In byte context, unescaped non-ASCII source chars are rejected.
953 if ascii_only && first_source_char > '\x7F' {
955 self.err_span_(start,
957 "byte constant must be ASCII. Use a \\xHH escape for a \
966 /// Scan over a \u{...} escape
968 /// At this point, we have already seen the \ and the u, the { is the current character. We
969 /// will read at least one digit, and up to 6, and pass over the }.
// Returns whether the escape named a valid Unicode scalar value.
970 fn scan_unicode_escape(&mut self, delim: char) -> bool {
971 self.bump(); // past the {
972 let start_bpos = self.pos;
974 let mut accum_int = 0;
975 let mut valid = true;
977 while !self.ch_is('}') && count <= 6 {
978 let c = match self.ch {
// EOF inside the braces is fatal.
981 panic!(self.fatal_span_(start_bpos,
983 "unterminated unicode escape (found EOF)"));
987 accum_int += c.to_digit(16).unwrap_or_else(|| {
// Hitting the literal's own delimiter means the `}` never came.
989 panic!(self.fatal_span_(self.pos,
991 "unterminated unicode escape (needed a `}`)"));
993 self.err_span_char(self.pos,
995 "invalid character in unicode escape",
// More than 6 hex digits can't fit in a scalar value.
1006 self.err_span_(start_bpos,
1008 "overlong unicode escape (can have at most 6 hex digits)");
// Empty braces or a non-scalar value (e.g. surrogate) are invalid.
1012 if valid && (char::from_u32(accum_int).is_none() || count == 0) {
1013 self.err_span_(start_bpos,
1015 "invalid unicode character escape");
1019 self.bump(); // past the ending }
1023 /// Scan over a float exponent.
// Consumes `e`/`E`, an optional sign, then at least one decimal digit.
1024 fn scan_float_exponent(&mut self) {
1025 if self.ch_is('e') || self.ch_is('E') {
1027 if self.ch_is('-') || self.ch_is('+') {
1030 if self.scan_digits(10, 10) == 0 {
1031 self.err_span_(self.pos,
1033 "expected at least one digit in exponent")
1038 /// Check that a base is valid for a floating literal, emitting a nice
1039 /// error if it isn't.
// Only base 10 is legal; 16/8/2 each get a specific message.
1040 fn check_float_base(&mut self, start_bpos: BytePos, last_bpos: BytePos, base: usize) {
1043 self.err_span_(start_bpos,
1045 "hexadecimal float literal is not supported")
1048 self.err_span_(start_bpos,
1050 "octal float literal is not supported")
1053 self.err_span_(start_bpos,
1055 "binary float literal is not supported")
// Lex `op` or `op=` (e.g. `+` vs `+=`) depending on the next char.
1061 fn binop(&mut self, op: token::BinOpToken) -> token::Token {
1063 if self.ch_is('=') {
1071 /// Return the next token from the string, advances the input past that
1072 /// token, and updates the interner
// Main dispatch: identifiers/keywords, numbers, then a big match on the
// first character for operators, delimiters, lifetimes, and literals.
// NOTE(review): many match-arm heads and closing braces fall in gaps of
// this extraction; comments below mark the logical sections that remain.
1073 fn next_token_inner(&mut self) -> Result<token::Token, ()> {
// --- identifiers (but not r"/r#"/b'/b"/br prefixes, which start literals)
1075 if ident_start(c) &&
1076 match (c.unwrap(), self.nextch(), self.nextnextch()) {
1077 // Note: r as in r" or r#" is part of a raw string literal,
1078 // b as in b' is part of a byte literal.
1079 // They are not identifiers, and are handled further down.
1080 ('r', Some('"'), _) |
1081 ('r', Some('#'), _) |
1082 ('b', Some('"'), _) |
1083 ('b', Some('\''), _) |
1084 ('b', Some('r'), Some('"')) |
1085 ('b', Some('r'), Some('#')) => false,
1088 let start = self.pos;
1089 while ident_continue(self.ch) {
1093 return Ok(self.with_str_from(start, |string| {
1097 // FIXME: perform NFKC normalization here. (Issue #2253)
1098 token::Ident(Ident::from_str(string))
// --- numeric literals (integer/float), with optional suffix
1103 if is_dec_digit(c) {
1104 let num = self.scan_number(c.unwrap());
1105 let suffix = self.scan_optional_raw_name();
1106 debug!("next_token_inner: scanned number {:?}, {:?}", num, suffix);
1107 return Ok(token::Literal(num, suffix));
// --- one- and multi-character punctuation
1110 match c.expect("next_token_inner called at EOF") {
// `.` / `..` / `...`
1122 if self.ch_is('.') {
1124 if self.ch_is('.') {
1126 Ok(token::DotDotDot)
1136 Ok(token::OpenDelim(token::Paren))
1140 Ok(token::CloseDelim(token::Paren))
1144 Ok(token::OpenDelim(token::Brace))
1148 Ok(token::CloseDelim(token::Brace))
1152 Ok(token::OpenDelim(token::Bracket))
1156 Ok(token::CloseDelim(token::Bracket))
// `:` / `::`
1176 if self.ch_is(':') {
1189 // Multi-byte tokens.
// `=` family: `==`, `=>`, `=`
1192 if self.ch_is('=') {
1195 } else if self.ch_is('>') {
// `!` family and comparison/shift operators
1204 if self.ch_is('=') {
1213 match self.ch.unwrap_or('\x00') {
1219 Ok(self.binop(token::Shl))
1223 match self.ch.unwrap_or('\x00') {
1236 match self.ch.unwrap_or('\x00') {
1242 Ok(self.binop(token::Shr))
// --- `'`: char literal or lifetime
1250 // Either a character constant 'a' OR a lifetime name 'abc
1251 let start_with_quote = self.pos;
1253 let start = self.pos;
1255 // the eof will be picked up by the final `'` check below
1256 let c2 = self.ch.unwrap_or('\x00');
1259 // If the character is an ident start not followed by another single
1260 // quote, then this is a lifetime name:
1261 if ident_start(Some(c2)) && !self.ch_is('\'') {
1262 while ident_continue(self.ch) {
1265 // lifetimes shouldn't end with a single quote
1266 // if we find one, then this is an invalid character literal
1267 if self.ch_is('\'') {
1268 panic!(self.fatal_span_verbose(
1269 start_with_quote, self.next_pos,
1270 String::from("character literal may only contain one codepoint")));
1274 // Include the leading `'` in the real identifier, for macro
1275 // expansion purposes. See #12512 for the gory details of why
1276 // this is necessary.
1277 let ident = self.with_str_from(start, |lifetime_name| {
1278 Ident::from_str(&format!("'{}", lifetime_name))
1281 // Conjure up a "keyword checking ident" to make sure that
1282 // the lifetime name is not a keyword.
1283 let keyword_checking_ident = self.with_str_from(start, |lifetime_name| {
1284 Ident::from_str(lifetime_name)
1286 let keyword_checking_token = &token::Ident(keyword_checking_ident);
1287 let last_bpos = self.pos;
// `'static` is the one reserved name allowed as a lifetime.
1288 if keyword_checking_token.is_reserved_ident() &&
1289 !keyword_checking_token.is_keyword(keywords::Static) {
1290 self.err_span_(start, last_bpos, "lifetimes cannot use keyword names");
1293 return Ok(token::Lifetime(ident));
// Otherwise: a char literal; must close with exactly one codepoint.
1296 let valid = self.scan_char_or_byte(start,
1302 if !self.ch_is('\'') {
1303 panic!(self.fatal_span_verbose(
1304 start_with_quote, self.pos,
1305 String::from("character literal may only contain one codepoint")));
1309 self.name_from(start)
1313 self.bump(); // advance ch past token
1314 let suffix = self.scan_optional_raw_name();
1315 Ok(token::Literal(token::Char(id), suffix))
// --- `b` prefix: byte / byte string / raw byte string literals
1319 let lit = match self.ch {
1320 Some('\'') => self.scan_byte(),
1321 Some('"') => self.scan_byte_string(),
1322 Some('r') => self.scan_raw_byte_string(),
1323 _ => unreachable!(), // Should have been a token::Ident above.
1325 let suffix = self.scan_optional_raw_name();
1326 Ok(token::Literal(lit, suffix))
// --- `"` string literal
1329 let start_bpos = self.pos;
1330 let mut valid = true;
1332 while !self.ch_is('"') {
1334 let last_bpos = self.pos;
1335 panic!(self.fatal_span_(start_bpos,
1337 "unterminated double quote string"));
1340 let ch_start = self.pos;
1341 let ch = self.ch.unwrap();
1343 valid &= self.scan_char_or_byte(ch_start,
1349 // adjust for the ASCII " at the start of the literal
1351 self.name_from(start_bpos + BytePos(1))
// "??" is the placeholder name interned for invalid literals.
1353 Symbol::intern("??")
1356 let suffix = self.scan_optional_raw_name();
1357 Ok(token::Literal(token::Str_(id), suffix))
// --- `r` raw string literal: r"..." / r#"..."# with matching hash count
1360 let start_bpos = self.pos;
1362 let mut hash_count = 0;
1363 while self.ch_is('#') {
1369 let last_bpos = self.pos;
1370 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1371 } else if !self.ch_is('"') {
1372 let last_bpos = self.pos;
1373 let curr_char = self.ch.unwrap();
1374 panic!(self.fatal_span_char(start_bpos,
1376 "found invalid character; only `#` is allowed \
1377 in raw string delimitation",
1381 let content_start_bpos = self.pos;
1382 let mut content_end_bpos;
1383 let mut valid = true;
1386 let last_bpos = self.pos;
1387 panic!(self.fatal_span_(start_bpos, last_bpos, "unterminated raw string"));
1389 // if self.ch_is('"') {
1390 // content_end_bpos = self.pos;
1391 // for _ in 0..hash_count {
1393 // if !self.ch_is('#') {
1395 let c = self.ch.unwrap();
// Closing `"` must be followed by hash_count `#`s to terminate.
1398 content_end_bpos = self.pos;
1399 for _ in 0..hash_count {
1401 if !self.ch_is('#') {
// Bare CR (not CRLF) is rejected in raw strings too — no escape exists.
1408 if !self.nextch_is('\n') {
1409 let last_bpos = self.pos;
1410 self.err_span_(start_bpos,
1412 "bare CR not allowed in raw string, use \\r \
1423 self.name_from_to(content_start_bpos, content_end_bpos)
1425 Symbol::intern("??")
1427 let suffix = self.scan_optional_raw_name();
1428 Ok(token::Literal(token::StrRaw(id, hash_count), suffix))
// --- remaining operators: `-`/`->`, `&`/`&&`, `|`/`||`, etc.
1431 if self.nextch_is('>') {
1436 Ok(self.binop(token::Minus))
1440 if self.nextch_is('&') {
1445 Ok(self.binop(token::And))
1449 match self.nextch() {
1456 Ok(self.binop(token::Or))
1461 Ok(self.binop(token::Plus))
1464 Ok(self.binop(token::Star))
1467 Ok(self.binop(token::Slash))
1470 Ok(self.binop(token::Caret))
1473 Ok(self.binop(token::Percent))
// --- anything else: unknown start of token; suggest lookalike chars
1476 let last_bpos = self.pos;
1477 let bpos = self.next_pos;
1478 let mut err = self.struct_fatal_span_char(last_bpos,
1480 "unknown start of token",
1482 unicode_chars::check_for_substitution(self, c, &mut err);
1483 self.fatal_errs.push(err);
// Skip over any run of Pattern_White_Space characters.
1489 fn consume_whitespace(&mut self) {
1490 while is_pattern_whitespace(self.ch) && !self.is_eof() {
// Collect characters up to (and consuming) the next '\n' or EOF.
1495 fn read_to_eol(&mut self) -> String {
1496 let mut val = String::new();
1497 while !self.ch_is('\n') && !self.is_eof() {
1498 val.push(self.ch.unwrap());
1501 if self.ch_is('\n') {
// Read one line comment; asserts it starts with "//" or "#!" (shebang).
1507 fn read_one_line_comment(&mut self) -> String {
1508 let val = self.read_to_eol();
1509 assert!((val.as_bytes()[0] == b'/' && val.as_bytes()[1] == b'/') ||
1510 (val.as_bytes()[0] == b'#' && val.as_bytes()[1] == b'!'));
// Skip whitespace but stop at newlines (used for line-oriented scanning).
1514 fn consume_non_eol_whitespace(&mut self) {
1515 while is_pattern_whitespace(self.ch) && !self.ch_is('\n') && !self.is_eof() {
// True when positioned at "//", "/*", or a shebang "#!" that is not the
// inner-attribute form "#![".
1520 fn peeking_at_comment(&self) -> bool {
1521 (self.ch_is('/') && self.nextch_is('/')) || (self.ch_is('/') && self.nextch_is('*')) ||
1522 // consider shebangs comments, but not inner attributes
1523 (self.ch_is('#') && self.nextch_is('!') && !self.nextnextch_is('['))
// Lex a byte literal b'x' (the b and opening ' are already consumed).
1526 fn scan_byte(&mut self) -> token::Lit {
1528 let start = self.pos;
1530 // the eof will be picked up by the final `'` check below
1531 let c2 = self.ch.unwrap_or('\x00');
// ascii_only context: scan_char_or_byte rejects non-ASCII source chars.
1534 let valid = self.scan_char_or_byte(start,
1539 if !self.ch_is('\'') {
1540 // Byte offsetting here is okay because the
1541 // character before position `start` are an
1542 // ascii single quote and ascii 'b'.
1544 panic!(self.fatal_span_verbose(start - BytePos(2),
1546 "unterminated byte constant".to_string()));
1550 self.name_from(start)
1554 self.bump(); // advance ch past token
// \x escape helper shared with string scanning; two hex digits.
1558 fn scan_byte_escape(&mut self, delim: char, below_0x7f_only: bool) -> bool {
1559 self.scan_hex_digits(2, delim, below_0x7f_only)
// Lex a byte string literal b"..." (prefix and opening " already consumed).
// Each content char goes through scan_char_or_byte in ascii_only mode.
1562 fn scan_byte_string(&mut self) -> token::Lit {
1564 let start = self.pos;
1565 let mut valid = true;
1567 while !self.ch_is('"') {
1570 panic!(self.fatal_span_(start, pos, "unterminated double quote byte string"));
1573 let ch_start = self.pos;
1574 let ch = self.ch.unwrap();
1576 valid &= self.scan_char_or_byte(ch_start,
1583 self.name_from(start)
// "??" is the placeholder name interned when the literal was invalid.
1585 Symbol::intern("??")
// Lex a raw byte string br"..." / br#"..."#: count leading #s, require a
// matching run of #s after the closing quote, and reject non-ASCII bytes.
1591 fn scan_raw_byte_string(&mut self) -> token::Lit {
1592 let start_bpos = self.pos;
1594 let mut hash_count = 0;
1595 while self.ch_is('#') {
1602 panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"));
1603 } else if !self.ch_is('"') {
1605 let ch = self.ch.unwrap();
1606 panic!(self.fatal_span_char(start_bpos,
1608 "found invalid character; only `#` is allowed in raw \
1609 string delimitation",
1613 let content_start_bpos = self.pos;
1614 let mut content_end_bpos;
1619 panic!(self.fatal_span_(start_bpos, pos, "unterminated raw string"))
// A closing " only terminates if followed by hash_count #s.
1622 content_end_bpos = self.pos;
1623 for _ in 0..hash_count {
1625 if !self.ch_is('#') {
// Raw byte strings have no escapes, so non-ASCII is a hard error here.
1634 self.err_span_char(pos, pos, "raw byte string must be ASCII", c);
1641 token::ByteStrRaw(self.name_from_to(content_start_bpos, content_end_bpos),
1646 // This tests the character for the unicode property 'PATTERN_WHITE_SPACE' which
1647 // is guaranteed to be forward compatible. http://unicode.org/reports/tr31/#R3
1648 pub fn is_pattern_whitespace(c: Option<char>) -> bool {
1649 c.map_or(false, Pattern_White_Space)
/// Returns true when `c` holds a character in the inclusive range
/// `lo..=hi`; `None` is never in range.
fn in_range(c: Option<char>, lo: char, hi: char) -> bool {
    c.map_or(false, |ch| lo <= ch && ch <= hi)
}
/// Returns true when `c` holds an ASCII decimal digit (`'0'`-`'9'`).
fn is_dec_digit(c: Option<char>) -> bool {
    // range test inlined (equivalent to `in_range(c, '0', '9')`)
    c.map_or(false, |ch| '0' <= ch && ch <= '9')
}
1663 pub fn is_doc_comment(s: &str) -> bool {
1664 let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
1665 s.starts_with("//!");
1666 debug!("is {:?} a doc comment? {}", s, res);
1670 pub fn is_block_doc_comment(s: &str) -> bool {
1671 // Prevent `/**/` from being parsed as a doc comment
1672 let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
1673 s.starts_with("/*!")) && s.len() >= 5;
1674 debug!("is {:?} a doc comment? {}", s, res);
1678 fn ident_start(c: Option<char>) -> bool {
1681 None => return false,
1684 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || (c > '\x7f' && c.is_xid_start())
1687 fn ident_continue(c: Option<char>) -> bool {
1690 None => return false,
1693 (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' ||
1694 (c > '\x7f' && c.is_xid_continue())
1701 use ast::{Ident, CrateConfig};
1703 use syntax_pos::{BytePos, Span, NO_EXPANSION};
1704 use codemap::CodeMap;
1706 use feature_gate::UnstableFeatures;
1708 use std::cell::RefCell;
1709 use std::collections::HashSet;
    // Builds a ParseSess whose diagnostics go to `io::sink()`, so tests
    // that trigger lexer errors stay silent.
    // NOTE(review): sampled fragment — the `ParseSess { ... }` struct
    // literal's opening/closing lines are not visible here.
    fn mk_sess(cm: Rc<CodeMap>) -> ParseSess {
        let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()), Some(cm.clone()));
        // fields of the ParseSess struct literal:
        span_diagnostic: errors::Handler::with_emitter(true, false, Box::new(emitter)),
        unstable_features: UnstableFeatures::from_environment(),
        config: CrateConfig::new(),
        included_mod_stack: RefCell::new(Vec::new()),
        missing_fragment_specifiers: RefCell::new(HashSet::new()),
    // open a string reader for the given string
    // Creates a FileMap named "zebra.rs" holding `teststr` and wraps it in
    // a StringReader tied to `sess`.
    // NOTE(review): sampled fragment — the `teststr: String` parameter line
    // is not visible here.
    fn setup<'a>(cm: &CodeMap,
                 sess: &'a ParseSess,
                 -> StringReader<'a> {
        let fm = cm.new_filemap("zebra.rs".to_string(), teststr);
        StringReader::new(sess, fm)
        // Lexes `/* my source file */ fn main() { println!("zebra"); }\n`
        // and checks the token stream: comment, whitespace, `fn` ident
        // spanning bytes 21..23, whitespace, `main` ident spanning 24..28,
        // verifying `pos` advances past each consumed token.
        // NOTE(review): sampled fragment — the `#[test] fn` header and some
        // lines of the setup call are not visible here.
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        let mut string_reader = setup(&cm,
                                      "/* my source file */ fn main() { println!(\"zebra\"); }\n"
        let id = Ident::from_str("fn");
        assert_eq!(string_reader.next_token().tok, token::Comment);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        let tok1 = string_reader.next_token();
        let tok2 = TokenAndSpan {
            tok: token::Ident(id),
            sp: Span::new(BytePos(21), BytePos(23), NO_EXPANSION),
        assert_eq!(tok1, tok2);
        assert_eq!(string_reader.next_token().tok, token::Whitespace);
        // the 'main' id is already read:
        assert_eq!(string_reader.pos.clone(), BytePos(28));
        // read another token:
        let tok3 = string_reader.next_token();
        let tok4 = TokenAndSpan {
            tok: token::Ident(Ident::from_str("main")),
            sp: Span::new(BytePos(24), BytePos(28), NO_EXPANSION),
        assert_eq!(tok3, tok4);
        // the lparen is already read:
        assert_eq!(string_reader.pos.clone(), BytePos(29))
    // check that the given reader produces the desired stream
    // of tokens (stop checking after exhausting the expected vec)
    // Asserts token-by-token equality; the reader may yield more tokens
    // after `expected` is exhausted.
    fn check_tokenization(mut string_reader: StringReader, expected: Vec<token::Token>) {
        for expected_tok in &expected {
            assert_eq!(&string_reader.next_token().tok, expected_tok);
1773 // make the identifier by looking up the string in the interner
1774 fn mk_ident(id: &str) -> token::Token {
1775 token::Ident(Ident::from_str(id))
    // `a b` lexes as ident, whitespace, ident.
    // NOTE(review): the `#[test]` attribute line is not visible in this
    // sampled fragment.
    fn doublecolonparsing() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a b".to_string()),
                           vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
        // `a::b` lexes as ident, `::` (ModSep), ident — no whitespace tokens.
        // NOTE(review): sampled fragment — the `#[test] fn` header is not
        // visible here.
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a::b".to_string()),
                           vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
        // `a ::b` — whitespace before `::` yields an explicit Whitespace token.
        // NOTE(review): sampled fragment — the `#[test] fn` header is not
        // visible here.
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a ::b".to_string()),
                           vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
        // `a:: b` — whitespace after `::` yields a Whitespace token.
        // NOTE(review): sampled fragment — the `#[test] fn` header is not
        // visible here.
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        check_tokenization(setup(&cm, &sh, "a:: b".to_string()),
                           vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
        // `'a'` lexes as a char literal with no suffix.
        // NOTE(review): sampled fragment — the `#[test] fn` header is not
        // visible here.
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'a'".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    // A space inside single quotes (`' '`) is a valid char literal,
    // not whitespace.
    fn character_space() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "' '".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern(" ")), None));
    // `'\n'` — the escape is preserved verbatim (`\n`, two characters)
    // in the interned literal text, not decoded to a newline.
    fn character_escaped() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'\\n'".to_string()).next_token().tok,
                   token::Literal(token::Char(Symbol::intern("\\n")), None));
    // `'abc` with no closing quote lexes as a lifetime token whose ident
    // includes the leading apostrophe.
    fn lifetime_name() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "'abc".to_string()).next_token().tok,
                   token::Lifetime(Ident::from_str("'abc")));
        // A raw string `r###"..."###` keeps quotes, backslashes and NUL
        // bytes verbatim; the token records the hash count (3).
        // NOTE(review): sampled fragment — the `#[test] fn` header and part
        // of the assertion chain are not visible here.
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        assert_eq!(setup(&cm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string())
                   token::Literal(token::StrRaw(Symbol::intern("\"#a\\b\x00c\""), 3), None));
    // Checks that a suffix directly attached to a literal (e.g. `1234suffix`)
    // becomes part of the Literal token, while a space-separated suffix does
    // not. Covers char, byte, string, byte-string, integer, float and raw
    // string literals; suffixes are not validated at lex time (`2us` lexes).
    // NOTE(review): sampled fragment — the `macro_rules! test` header and
    // several closing lines are not visible here.
    fn literal_suffixes() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
            // attached suffix is captured in the Literal token
            assert_eq!(setup(&cm, &sh, format!("{}suffix", $input)).next_token().tok,
                       token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
                                      Some(Symbol::intern("suffix"))));
            // with a whitespace separator:
            assert_eq!(setup(&cm, &sh, format!("{} suffix", $input)).next_token().tok,
                       token::Literal(token::$tok_type(Symbol::intern($tok_contents)),
        test!("'a'", Char, "a");
        test!("b'a'", Byte, "a");
        test!("\"a\"", Str_, "a");
        test!("b\"a\"", ByteStr, "a");
        test!("1234", Integer, "1234");
        test!("0b101", Integer, "0b101");
        test!("0xABC", Integer, "0xABC");
        test!("1.0", Float, "1.0");
        test!("1.0e10", Float, "1.0e10");
        // even invalid suffixes like `us` are lexed; validation happens later
        assert_eq!(setup(&cm, &sh, "2us".to_string()).next_token().tok,
                   token::Literal(token::Integer(Symbol::intern("2")),
                                  Some(Symbol::intern("us"))));
        // raw strings: the suffix follows the closing hashes
        assert_eq!(setup(&cm, &sh, "r###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::StrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
        assert_eq!(setup(&cm, &sh, "br###\"raw\"###suffix".to_string()).next_token().tok,
                   token::Literal(token::ByteStrRaw(Symbol::intern("raw"), 3),
                                  Some(Symbol::intern("suffix"))));
    // `///` and `/// blah` are doc comments; `////` is an ordinary comment.
    fn line_doc_comments() {
        assert!(is_doc_comment("///"));
        assert!(is_doc_comment("/// blah"));
        assert!(!is_doc_comment("////"));
    // A nested block comment `/* /* */ */` lexes as a single Comment token,
    // after which the following char literal is read normally.
    fn nested_block_comments() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "/* /* */ */'a'".to_string());
        match lexer.next_token().tok {
            token::Comment => {}
            _ => panic!("expected a comment!"),
        assert_eq!(lexer.next_token().tok,
                   token::Literal(token::Char(Symbol::intern("a")), None));
    // Checks CRLF handling in comments: the span of `// test\r\n` ends at
    // byte 7 (before the `\r`), and the doc comment's interned text is
    // `/// test` with the `\r` stripped.
    fn crlf_comments() {
        let cm = Rc::new(CodeMap::new(FilePathMapping::empty()));
        let sh = mk_sess(cm.clone());
        let mut lexer = setup(&cm, &sh, "// test\r\n/// test\r\n".to_string());
        let comment = lexer.next_token();
        assert_eq!(comment.tok, token::Comment);
        // span excludes the carriage return
        assert_eq!((comment.sp.lo(), comment.sp.hi()), (BytePos(0), BytePos(7)));
        assert_eq!(lexer.next_token().tok, token::Whitespace);
        assert_eq!(lexer.next_token().tok,
                   token::DocComment(Symbol::intern("/// test")));