src/libsyntax/parse/lexer/mod.rs

   1 use crate::parse::ParseSess;
   2 use crate::parse::token::{self, Token, TokenKind};
   3 use crate::symbol::{sym, Symbol};
   4 use crate::parse::unescape_error_reporting::{emit_unescape_error, push_escaped_char};
   5
   6 use errors::{FatalError, Diagnostic, DiagnosticBuilder};
   7 use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
   8 use rustc_lexer::Base;
   9 use rustc_lexer::unescape;
  10
  11 use std::borrow::Cow;
  12 use std::char;
  13 use std::iter;
  14 use std::convert::TryInto;
  15 use rustc_data_structures::sync::Lrc;
  16 use log::debug;
  17
  18 pub mod comments;
  19 mod tokentrees;
  20 mod unicode_chars;
  21
  22 #[derive(Clone, Debug)]
  23 pub struct UnmatchedBrace {
  24     pub expected_delim: token::DelimToken,
  25     pub found_delim: token::DelimToken,
  26     pub found_span: Span,
  27     pub unclosed_span: Option<Span>,
  28     pub candidate_span: Option<Span>,
  29 }
  30
  31 pub struct StringReader<'a> {
  32     crate sess: &'a ParseSess,
  33     /// The absolute offset within the source_map of the current character
  34     crate pos: BytePos,
  35     /// The current character (which has been read from self.pos)
  36     crate source_file: Lrc<syntax_pos::SourceFile>,
  37     /// Stop reading src at this index.
  38     crate end_src_index: usize,
  39     fatal_errs: Vec<DiagnosticBuilder<'a>>,
  40     // cache a direct reference to the source text, so that we don't have to
  41     // retrieve it via `self.source_file.src.as_ref().unwrap()` all the time.
  42     src: Lrc<String>,
  43     override_span: Option<Span>,
  44 }
  45
  46 impl<'a> StringReader<'a> {
  47     pub fn new(sess: &'a ParseSess,
  48                source_file: Lrc<syntax_pos::SourceFile>,
  49                override_span: Option<Span>) -> Self {
  50         if source_file.src.is_none() {
  51             sess.span_diagnostic.bug(&format!("Cannot lex source_file without source: {}",
  52                                               source_file.name));
  53         }
  54
  55         let src = (*source_file.src.as_ref().unwrap()).clone();
  56
  57         StringReader {
  58             sess,
  59             pos: source_file.start_pos,
  60             source_file,
  61             end_src_index: src.len(),
  62             src,
  63             fatal_errs: Vec::new(),
  64             override_span,
  65         }
  66     }
  67
  68     pub fn retokenize(sess: &'a ParseSess, mut span: Span) -> Self {
  69         let begin = sess.source_map().lookup_byte_offset(span.lo());
  70         let end = sess.source_map().lookup_byte_offset(span.hi());
  71
  72         // Make the range zero-length if the span is invalid.
  73         if span.lo() > span.hi() || begin.sf.start_pos != end.sf.start_pos {
  74             span = span.shrink_to_lo();
  75         }
  76
  77         let mut sr = StringReader::new(sess, begin.sf, None);
  78
  79         // Seek the lexer to the right byte range.
  80         sr.end_src_index = sr.src_index(span.hi());
  81
  82         sr
  83     }
  84
  85
  86     fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
  87         self.override_span.unwrap_or_else(|| Span::new(lo, hi, NO_EXPANSION))
  88     }
  89
  90     fn unwrap_or_abort(&mut self, res: Result<Token, ()>) -> Token {
  91         match res {
  92             Ok(tok) => tok,
  93             Err(_) => {
  94                 self.emit_fatal_errors();
  95                 FatalError.raise();
  96             }
  97         }
  98     }
  99
 100     /// Returns the next token, including trivia like whitespace or comments.
 101     ///
 102     /// `Err(())` means that some errors were encountered, which can be
 103     /// retrieved using `buffer_fatal_errors`.
 104     pub fn try_next_token(&mut self) -> Result<Token, ()> {
 105         assert!(self.fatal_errs.is_empty());
 106
 107         let start_src_index = self.src_index(self.pos);
 108         let text: &str = &self.src[start_src_index..self.end_src_index];
 109
 110         if text.is_empty() {
 111             let span = self.mk_sp(self.source_file.end_pos, self.source_file.end_pos);
 112             return Ok(Token::new(token::Eof, span));
 113         }
 114
 115         {
 116             let is_beginning_of_file = self.pos == self.source_file.start_pos;
 117             if is_beginning_of_file {
 118                 if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
 119                     let start = self.pos;
 120                     self.pos = self.pos + BytePos::from_usize(shebang_len);
 121
 122                     let sym = self.symbol_from(start + BytePos::from_usize("#!".len()));
 123                     let kind = token::Shebang(sym);
 124
 125                     let span = self.mk_sp(start, self.pos);
 126                     return Ok(Token::new(kind, span));
 127                 }
 128             }
 129         }
 130
 131         let token = rustc_lexer::first_token(text);
 132
 133         let start = self.pos;
 134         self.pos = self.pos + BytePos::from_usize(token.len);
 135
 136         debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));
 137
 138         // This could use `?`, but that makes code significantly (10-20%) slower.
 139         // https://github.com/rust-lang/rust/issues/37939
 140         let kind = match self.cook_lexer_token(token.kind, start) {
 141             Ok(it) => it,
 142             Err(err) => return Err(self.fatal_errs.push(err)),
 143         };
 144
 145         let span = self.mk_sp(start, self.pos);
 146         Ok(Token::new(kind, span))
 147     }
 148
 149     /// Returns the next token, including trivia like whitespace or comments.
 150     ///
 151     /// Aborts in case of an error.
 152     pub fn next_token(&mut self) -> Token {
 153         let res = self.try_next_token();
 154         self.unwrap_or_abort(res)
 155     }
 156
 157     fn emit_fatal_errors(&mut self) {
 158         for err in &mut self.fatal_errs {
 159             err.emit();
 160         }
 161
 162         self.fatal_errs.clear();
 163     }
 164
 165     pub fn buffer_fatal_errors(&mut self) -> Vec<Diagnostic> {
 166         let mut buffer = Vec::new();
 167
 168         for err in self.fatal_errs.drain(..) {
 169             err.buffer(&mut buffer);
 170         }
 171
 172         buffer
 173     }
 174
 175     /// Report a fatal lexical error with a given span.
 176     fn fatal_span(&self, sp: Span, m: &str) -> FatalError {
 177         self.sess.span_diagnostic.span_fatal(sp, m)
 178     }
 179
 180     /// Report a lexical error with a given span.
 181     fn err_span(&self, sp: Span, m: &str) {
 182         self.sess.span_diagnostic.struct_span_err(sp, m).emit();
 183     }
 184
 185
 186     /// Report a fatal error spanning [`from_pos`, `to_pos`).
 187     fn fatal_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) -> FatalError {
 188         self.fatal_span(self.mk_sp(from_pos, to_pos), m)
 189     }
 190
 191     /// Report a lexical error spanning [`from_pos`, `to_pos`).
 192     fn err_span_(&self, from_pos: BytePos, to_pos: BytePos, m: &str) {
 193         self.err_span(self.mk_sp(from_pos, to_pos), m)
 194     }
 195
 196     fn struct_span_fatal(&self, from_pos: BytePos, to_pos: BytePos, m: &str)
 197         -> DiagnosticBuilder<'a>
 198     {
 199         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), m)
 200     }
 201
 202     fn struct_fatal_span_char(&self, from_pos: BytePos, to_pos: BytePos, m: &str, c: char)
 203         -> DiagnosticBuilder<'a>
 204     {
 205         let mut m = m.to_string();
 206         m.push_str(": ");
 207         push_escaped_char(&mut m, c);
 208
 209         self.sess.span_diagnostic.struct_span_fatal(self.mk_sp(from_pos, to_pos), &m[..])
 210     }
 211
 212     /// Turns simple `rustc_lexer::TokenKind` enum into a rich
 213     /// `libsyntax::TokenKind`. This turns strings into interned
 214     /// symbols and runs additional validation.
 215     fn cook_lexer_token(
 216         &self,
 217         token: rustc_lexer::TokenKind,
 218         start: BytePos,
 219     ) -> Result<TokenKind, DiagnosticBuilder<'a>> {
 220         let kind = match token {
 221             rustc_lexer::TokenKind::LineComment => {
 222                 let string = self.str_from(start);
 223                 // comments with only more "/"s are not doc comments
 224                 let tok = if is_doc_comment(string) {
 225                     let mut idx = 0;
 226                     loop {
 227                         idx = match string[idx..].find('\r') {
 228                             None => break,
 229                             Some(it) => idx + it + 1
 230                         };
 231                         if string[idx..].chars().next() != Some('\n') {
 232                             self.err_span_(start + BytePos(idx as u32 - 1),
 233                                             start + BytePos(idx as u32),
 234                                             "bare CR not allowed in doc-comment");
 235                         }
 236                     }
 237                     token::DocComment(Symbol::intern(string))
 238                 } else {
 239                     token::Comment
 240                 };
 241
 242                 tok
 243             }
 244             rustc_lexer::TokenKind::BlockComment { terminated } => {
 245                 let string = self.str_from(start);
 246                 // block comments starting with "/**" or "/*!" are doc-comments
 247                 // but comments with only "*"s between two "/"s are not
 248                 let is_doc_comment = is_block_doc_comment(string);
 249
 250                 if !terminated {
 251                     let msg = if is_doc_comment {
 252                         "unterminated block doc-comment"
 253                     } else {
 254                         "unterminated block comment"
 255                     };
 256                     let last_bpos = self.pos;
 257                     self.fatal_span_(start, last_bpos, msg).raise();
 258                 }
 259
 260                 let tok = if is_doc_comment {
 261                     let has_cr = string.contains('\r');
 262                     let string = if has_cr {
 263                         self.translate_crlf(start,
 264                                             string,
 265                                             "bare CR not allowed in block doc-comment")
 266                     } else {
 267                         string.into()
 268                     };
 269                     token::DocComment(Symbol::intern(&string[..]))
 270                 } else {
 271                     token::Comment
 272                 };
 273
 274                 tok
 275             }
 276             rustc_lexer::TokenKind::Whitespace => token::Whitespace,
 277             rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => {
 278                 let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
 279                 let mut ident_start = start;
 280                 if is_raw_ident {
 281                     ident_start = ident_start + BytePos(2);
 282                 }
 283                 // FIXME: perform NFKC normalization here. (Issue #2253)
 284                 let sym = self.symbol_from(ident_start);
 285                 if is_raw_ident {
 286                     let span = self.mk_sp(start, self.pos);
 287                     if !sym.can_be_raw() {
 288                         self.err_span(span, &format!("`{}` cannot be a raw identifier", sym));
 289                     }
 290                     self.sess.raw_identifier_spans.borrow_mut().push(span);
 291                 }
 292                 token::Ident(sym, is_raw_ident)
 293             }
 294             rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
 295                 let suffix_start = start + BytePos(suffix_start as u32);
 296                 let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
 297                 let suffix = if suffix_start < self.pos {
 298                     let string = self.str_from(suffix_start);
 299                     if string == "_" {
 300                         self.sess.span_diagnostic
 301                             .struct_span_warn(self.mk_sp(suffix_start, self.pos),
 302                                               "underscore literal suffix is not allowed")
 303                             .warn("this was previously accepted by the compiler but is \
 304                                    being phased out; it will become a hard error in \
 305                                    a future release!")
 306                             .note("for more information, see issue #42326 \
 307                                    <https://github.com/rust-lang/rust/issues/42326>")
 308                             .emit();
 309                         None
 310                     } else {
 311                         Some(Symbol::intern(string))
 312                     }
 313                 } else {
 314                     None
 315                 };
 316                 token::Literal(token::Lit { kind, symbol, suffix })
 317             }
 318             rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
 319                 // Include the leading `'` in the real identifier, for macro
 320                 // expansion purposes. See #12512 for the gory details of why
 321                 // this is necessary.
 322                 let lifetime_name = self.str_from(start);
 323                 if starts_with_number {
 324                     self.err_span_(
 325                         start,
 326                         self.pos,
 327                         "lifetimes cannot start with a number",
 328                     );
 329                 }
 330                 let ident = Symbol::intern(lifetime_name);
 331                 token::Lifetime(ident)
 332             }
 333             rustc_lexer::TokenKind::Semi => token::Semi,
 334             rustc_lexer::TokenKind::Comma => token::Comma,
 335             rustc_lexer::TokenKind::DotDotDot => token::DotDotDot,
 336             rustc_lexer::TokenKind::DotDotEq => token::DotDotEq,
 337             rustc_lexer::TokenKind::DotDot => token::DotDot,
 338             rustc_lexer::TokenKind::Dot => token::Dot,
 339             rustc_lexer::TokenKind::OpenParen => token::OpenDelim(token::Paren),
 340             rustc_lexer::TokenKind::CloseParen => token::CloseDelim(token::Paren),
 341             rustc_lexer::TokenKind::OpenBrace => token::OpenDelim(token::Brace),
 342             rustc_lexer::TokenKind::CloseBrace => token::CloseDelim(token::Brace),
 343             rustc_lexer::TokenKind::OpenBracket => token::OpenDelim(token::Bracket),
 344             rustc_lexer::TokenKind::CloseBracket => token::CloseDelim(token::Bracket),
 345             rustc_lexer::TokenKind::At => token::At,
 346             rustc_lexer::TokenKind::Pound => token::Pound,
 347             rustc_lexer::TokenKind::Tilde => token::Tilde,
 348             rustc_lexer::TokenKind::Question => token::Question,
 349             rustc_lexer::TokenKind::ColonColon => token::ModSep,
 350             rustc_lexer::TokenKind::Colon => token::Colon,
 351             rustc_lexer::TokenKind::Dollar => token::Dollar,
 352             rustc_lexer::TokenKind::EqEq => token::EqEq,
 353             rustc_lexer::TokenKind::Eq => token::Eq,
 354             rustc_lexer::TokenKind::FatArrow => token::FatArrow,
 355             rustc_lexer::TokenKind::Ne => token::Ne,
 356             rustc_lexer::TokenKind::Not => token::Not,
 357             rustc_lexer::TokenKind::Le => token::Le,
 358             rustc_lexer::TokenKind::LArrow => token::LArrow,
 359             rustc_lexer::TokenKind::Lt => token::Lt,
 360             rustc_lexer::TokenKind::ShlEq => token::BinOpEq(token::Shl),
 361             rustc_lexer::TokenKind::Shl => token::BinOp(token::Shl),
 362             rustc_lexer::TokenKind::Ge => token::Ge,
 363             rustc_lexer::TokenKind::Gt => token::Gt,
 364             rustc_lexer::TokenKind::ShrEq => token::BinOpEq(token::Shr),
 365             rustc_lexer::TokenKind::Shr => token::BinOp(token::Shr),
 366             rustc_lexer::TokenKind::RArrow => token::RArrow,
 367             rustc_lexer::TokenKind::Minus => token::BinOp(token::Minus),
 368             rustc_lexer::TokenKind::MinusEq => token::BinOpEq(token::Minus),
 369             rustc_lexer::TokenKind::And => token::BinOp(token::And),
 370             rustc_lexer::TokenKind::AndEq => token::BinOpEq(token::And),
 371             rustc_lexer::TokenKind::AndAnd => token::AndAnd,
 372             rustc_lexer::TokenKind::Or => token::BinOp(token::Or),
 373             rustc_lexer::TokenKind::OrEq => token::BinOpEq(token::Or),
 374             rustc_lexer::TokenKind::OrOr => token::OrOr,
 375             rustc_lexer::TokenKind::Plus => token::BinOp(token::Plus),
 376             rustc_lexer::TokenKind::PlusEq => token::BinOpEq(token::Plus),
 377             rustc_lexer::TokenKind::Star => token::BinOp(token::Star),
 378             rustc_lexer::TokenKind::StarEq => token::BinOpEq(token::Star),
 379             rustc_lexer::TokenKind::Slash => token::BinOp(token::Slash),
 380             rustc_lexer::TokenKind::SlashEq => token::BinOpEq(token::Slash),
 381             rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
 382             rustc_lexer::TokenKind::CaretEq => token::BinOpEq(token::Caret),
 383             rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 384             rustc_lexer::TokenKind::PercentEq => token::BinOpEq(token::Percent),
 385
 386             rustc_lexer::TokenKind::Unknown => {
 387                 let c = self.str_from(start).chars().next().unwrap();
 388                 let mut err = self.struct_fatal_span_char(start,
 389                                                           self.pos,
 390                                                           "unknown start of token",
 391                                                           c);
 392                 unicode_chars::check_for_substitution(self, start, c, &mut err);
 393                 return Err(err)
 394             }
 395         };
 396         Ok(kind)
 397     }
 398
 399     fn cook_lexer_literal(
 400         &self,
 401         start: BytePos,
 402         suffix_start: BytePos,
 403         kind: rustc_lexer::LiteralKind
 404     ) -> (token::LitKind, Symbol) {
 405         match kind {
 406             rustc_lexer::LiteralKind::Char { terminated } => {
 407                 if !terminated {
 408                     self.fatal_span_(start, suffix_start,
 409                                      "unterminated character literal".into())
 410                         .raise()
 411                 }
 412                 let content_start = start + BytePos(1);
 413                 let content_end = suffix_start - BytePos(1);
 414                 self.validate_char_escape(content_start, content_end);
 415                 let id = self.symbol_from_to(content_start, content_end);
 416                 (token::Char, id)
 417             },
 418             rustc_lexer::LiteralKind::Byte { terminated } => {
 419                 if !terminated {
 420                     self.fatal_span_(start + BytePos(1), suffix_start,
 421                                      "unterminated byte constant".into())
 422                         .raise()
 423                 }
 424                 let content_start = start + BytePos(2);
 425                 let content_end = suffix_start - BytePos(1);
 426                 self.validate_byte_escape(content_start, content_end);
 427                 let id = self.symbol_from_to(content_start, content_end);
 428                 (token::Byte, id)
 429             },
 430             rustc_lexer::LiteralKind::Str { terminated } => {
 431                 if !terminated {
 432                     self.fatal_span_(start, suffix_start,
 433                                      "unterminated double quote string".into())
 434                         .raise()
 435                 }
 436                 let content_start = start + BytePos(1);
 437                 let content_end = suffix_start - BytePos(1);
 438                 self.validate_str_escape(content_start, content_end);
 439                 let id = self.symbol_from_to(content_start, content_end);
 440                 (token::Str, id)
 441             }
 442             rustc_lexer::LiteralKind::ByteStr { terminated } => {
 443                 if !terminated {
 444                     self.fatal_span_(start + BytePos(1), suffix_start,
 445                                      "unterminated double quote byte string".into())
 446                         .raise()
 447                 }
 448                 let content_start = start + BytePos(2);
 449                 let content_end = suffix_start - BytePos(1);
 450                 self.validate_byte_str_escape(content_start, content_end);
 451                 let id = self.symbol_from_to(content_start, content_end);
 452                 (token::ByteStr, id)
 453             }
 454             rustc_lexer::LiteralKind::RawStr { n_hashes, started, terminated } => {
 455                 if !started {
 456                     self.report_non_started_raw_string(start);
 457                 }
 458                 if !terminated {
 459                     self.report_unterminated_raw_string(start, n_hashes)
 460                 }
 461                 let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
 462                 let n = u32::from(n_hashes);
 463                 let content_start = start + BytePos(2 + n);
 464                 let content_end = suffix_start - BytePos(1 + n);
 465                 self.validate_raw_str_escape(content_start, content_end);
 466                 let id = self.symbol_from_to(content_start, content_end);
 467                 (token::StrRaw(n_hashes), id)
 468             }
 469             rustc_lexer::LiteralKind::RawByteStr { n_hashes, started, terminated } => {
 470                 if !started {
 471                     self.report_non_started_raw_string(start);
 472                 }
 473                 if !terminated {
 474                     self.report_unterminated_raw_string(start, n_hashes)
 475                 }
 476                 let n_hashes: u16 = self.restrict_n_hashes(start, n_hashes);
 477                 let n = u32::from(n_hashes);
 478                 let content_start = start + BytePos(3 + n);
 479                 let content_end = suffix_start - BytePos(1 + n);
 480                 self.validate_raw_byte_str_escape(content_start, content_end);
 481                 let id = self.symbol_from_to(content_start, content_end);
 482                 (token::ByteStrRaw(n_hashes), id)
 483             }
 484             rustc_lexer::LiteralKind::Int { base, empty_int } => {
 485                 if empty_int {
 486                     self.err_span_(start, suffix_start, "no valid digits found for number");
 487                     (token::Integer, sym::integer(0))
 488                 } else {
 489                     self.validate_int_literal(base, start, suffix_start);
 490                     (token::Integer, self.symbol_from_to(start, suffix_start))
 491                 }
 492             },
 493             rustc_lexer::LiteralKind::Float { base, empty_exponent } => {
 494                 if empty_exponent {
 495                     let mut err = self.struct_span_fatal(
 496                         start, self.pos,
 497                         "expected at least one digit in exponent"
 498                     );
 499                     err.emit();
 500                 }
 501
 502                 match base {
 503                     Base::Hexadecimal => {
 504                         self.err_span_(start, suffix_start,
 505                                        "hexadecimal float literal is not supported")
 506                     }
 507                     Base::Octal => {
 508                         self.err_span_(start, suffix_start,
 509                                        "octal float literal is not supported")
 510                     }
 511                     Base::Binary => {
 512                         self.err_span_(start, suffix_start,
 513                                        "binary float literal is not supported")
 514                     }
 515                     _ => ()
 516                 }
 517
 518                 let id = self.symbol_from_to(start, suffix_start);
 519                 (token::Float, id)
 520             },
 521         }
 522     }
 523
 524     #[inline]
 525     fn src_index(&self, pos: BytePos) -> usize {
 526         (pos - self.source_file.start_pos).to_usize()
 527     }
 528
 529     /// Slice of the source text from `start` up to but excluding `self.pos`,
 530     /// meaning the slice does not include the character `self.ch`.
 531     fn str_from(&self, start: BytePos) -> &str
 532     {
 533         self.str_from_to(start, self.pos)
 534     }
 535
 536     /// Creates a Symbol from a given offset to the current offset.
 537     fn symbol_from(&self, start: BytePos) -> Symbol {
 538         debug!("taking an ident from {:?} to {:?}", start, self.pos);
 539         Symbol::intern(self.str_from(start))
 540     }
 541
 542     /// As symbol_from, with an explicit endpoint.
 543     fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
 544         debug!("taking an ident from {:?} to {:?}", start, end);
 545         Symbol::intern(self.str_from_to(start, end))
 546     }
 547
 548     /// Slice of the source text spanning from `start` up to but excluding `end`.
 549     fn str_from_to(&self, start: BytePos, end: BytePos) -> &str
 550     {
 551         &self.src[self.src_index(start)..self.src_index(end)]
 552     }
 553
 554     /// Converts CRLF to LF in the given string, raising an error on bare CR.
 555     fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
 556         let mut chars = s.char_indices().peekable();
 557         while let Some((i, ch)) = chars.next() {
 558             if ch == '\r' {
 559                 if let Some((lf_idx, '\n')) = chars.peek() {
 560                     return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
 561                 }
 562                 let pos = start + BytePos(i as u32);
 563                 let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
 564                 self.err_span_(pos, end_pos, errmsg);
 565             }
 566         }
 567         return s.into();
 568
 569         fn translate_crlf_(rdr: &StringReader<'_>,
 570                            start: BytePos,
 571                            s: &str,
 572                            mut j: usize,
 573                            mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
 574                            errmsg: &str)
 575                            -> String {
 576             let mut buf = String::with_capacity(s.len());
 577             // Skip first CR
 578             buf.push_str(&s[.. j - 1]);
 579             while let Some((i, ch)) = chars.next() {
 580                 if ch == '\r' {
 581                     if j < i {
 582                         buf.push_str(&s[j..i]);
 583                     }
 584                     let next = i + ch.len_utf8();
 585                     j = next;
 586                     if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
 587                         let pos = start + BytePos(i as u32);
 588                         let end_pos = start + BytePos(next as u32);
 589                         rdr.err_span_(pos, end_pos, errmsg);
 590                     }
 591                 }
 592             }
 593             if j < s.len() {
 594                 buf.push_str(&s[j..]);
 595             }
 596             buf
 597         }
 598     }
 599
 600     fn report_non_started_raw_string(&self, start: BytePos) -> ! {
 601         let bad_char = self.str_from(start).chars().last().unwrap();
 602         self
 603             .struct_fatal_span_char(
 604                 start,
 605                 self.pos,
 606                 "found invalid character; only `#` is allowed \
 607                  in raw string delimitation",
 608                 bad_char,
 609             )
 610             .emit();
 611         FatalError.raise()
 612     }
 613
 614     fn report_unterminated_raw_string(&self, start: BytePos, n_hashes: usize) -> ! {
 615         let mut err = self.struct_span_fatal(
 616             start, start,
 617             "unterminated raw string",
 618         );
 619         err.span_label(
 620             self.mk_sp(start, start),
 621             "unterminated raw string",
 622         );
 623
 624         if n_hashes > 0 {
 625             err.note(&format!("this raw string should be terminated with `\"{}`",
 626                                 "#".repeat(n_hashes as usize)));
 627         }
 628
 629         err.emit();
 630         FatalError.raise()
 631     }
 632
 633     fn restrict_n_hashes(&self, start: BytePos, n_hashes: usize) -> u16 {
 634         match n_hashes.try_into() {
 635             Ok(n_hashes) => n_hashes,
 636             Err(_) => {
 637                 self.fatal_span_(start,
 638                                  self.pos,
 639                                  "too many `#` symbols: raw strings may be \
 640                                   delimited by up to 65535 `#` symbols").raise();
 641             }
 642         }
 643     }
 644
 645     fn validate_char_escape(&self, content_start: BytePos, content_end: BytePos) {
 646         let lit = self.str_from_to(content_start, content_end);
 647         if let Err((off, err)) = unescape::unescape_char(lit) {
 648             emit_unescape_error(
 649                 &self.sess.span_diagnostic,
 650                 lit,
 651                 self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
 652                 unescape::Mode::Char,
 653                 0..off,
 654                 err,
 655             )
 656         }
 657     }
 658
 659     fn validate_byte_escape(&self, content_start: BytePos, content_end: BytePos) {
 660         let lit = self.str_from_to(content_start, content_end);
 661         if let Err((off, err)) = unescape::unescape_byte(lit) {
 662             emit_unescape_error(
 663                 &self.sess.span_diagnostic,
 664                 lit,
 665                 self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
 666                 unescape::Mode::Byte,
 667                 0..off,
 668                 err,
 669             )
 670         }
 671     }
 672
 673     fn validate_str_escape(&self, content_start: BytePos, content_end: BytePos) {
 674         let lit = self.str_from_to(content_start, content_end);
 675         unescape::unescape_str(lit, &mut |range, c| {
 676             if let Err(err) = c {
 677                 emit_unescape_error(
 678                     &self.sess.span_diagnostic,
 679                     lit,
 680                     self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
 681                     unescape::Mode::Str,
 682                     range,
 683                     err,
 684                 )
 685             }
 686         })
 687     }
 688
 689     fn validate_raw_str_escape(&self, content_start: BytePos, content_end: BytePos) {
 690         let lit = self.str_from_to(content_start, content_end);
 691         unescape::unescape_raw_str(lit, &mut |range, c| {
 692             if let Err(err) = c {
 693                 emit_unescape_error(
 694                     &self.sess.span_diagnostic,
 695                     lit,
 696                     self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
 697                     unescape::Mode::Str,
 698                     range,
 699                     err,
 700                 )
 701             }
 702         })
 703     }
 704
 705     fn validate_raw_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
 706         let lit = self.str_from_to(content_start, content_end);
 707         unescape::unescape_raw_byte_str(lit, &mut |range, c| {
 708             if let Err(err) = c {
 709                 emit_unescape_error(
 710                     &self.sess.span_diagnostic,
 711                     lit,
 712                     self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
 713                     unescape::Mode::ByteStr,
 714                     range,
 715                     err,
 716                 )
 717             }
 718         })
 719     }
 720
 721     fn validate_byte_str_escape(&self, content_start: BytePos, content_end: BytePos) {
 722         let lit = self.str_from_to(content_start, content_end);
 723         unescape::unescape_byte_str(lit, &mut |range, c| {
 724             if let Err(err) = c {
 725                 emit_unescape_error(
 726                     &self.sess.span_diagnostic,
 727                     lit,
 728                     self.mk_sp(content_start - BytePos(1), content_end + BytePos(1)),
 729                     unescape::Mode::ByteStr,
 730                     range,
 731                     err,
 732                 )
 733             }
 734         })
 735     }
 736
 737     fn validate_int_literal(&self, base: Base, content_start: BytePos, content_end: BytePos) {
 738         let base = match base {
 739             Base::Binary => 2,
 740             Base::Octal => 8,
 741             _ => return,
 742         };
 743         let s = self.str_from_to(content_start + BytePos(2), content_end);
 744         for (idx, c) in s.char_indices() {
 745             let idx = idx as u32;
 746             if c != '_' && c.to_digit(base).is_none() {
 747                 let lo = content_start + BytePos(2 + idx);
 748                 let hi = content_start + BytePos(2 + idx + c.len_utf8() as u32);
 749                 self.err_span_(lo, hi,
 750                                &format!("invalid digit for a base {} literal", base));
 751
 752             }
 753         }
 754     }
 755 }
 756
 757 fn is_doc_comment(s: &str) -> bool {
 758     let res = (s.starts_with("///") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'/') ||
 759               s.starts_with("//!");
 760     debug!("is {:?} a doc comment? {}", s, res);
 761     res
 762 }
 763
 764 fn is_block_doc_comment(s: &str) -> bool {
 765     // Prevent `/**/` from being parsed as a doc comment
 766     let res = ((s.starts_with("/**") && *s.as_bytes().get(3).unwrap_or(&b' ') != b'*') ||
 767                s.starts_with("/*!")) && s.len() >= 5;
 768     debug!("is {:?} a doc comment? {}", s, res);
 769     res
 770 }
 771
 772 #[cfg(test)]
 773 mod tests {
 774     use super::*;
 775
 776     use crate::ast::CrateConfig;
 777     use crate::symbol::Symbol;
 778     use crate::source_map::{SourceMap, FilePathMapping};
 779     use crate::feature_gate::UnstableFeatures;
 780     use crate::parse::token;
 781     use crate::diagnostics::plugin::ErrorMap;
 782     use crate::with_default_globals;
 783     use std::io;
 784     use std::path::PathBuf;
 785     use syntax_pos::{BytePos, Span, NO_EXPANSION, edition::Edition};
 786     use rustc_data_structures::fx::{FxHashSet, FxHashMap};
 787     use rustc_data_structures::sync::Lock;
 788
 789     fn mk_sess(sm: Lrc<SourceMap>) -> ParseSess {
 790         let emitter = errors::emitter::EmitterWriter::new(Box::new(io::sink()),
 791                                                           Some(sm.clone()),
 792                                                           false,
 793                                                           false,
 794                                                           false);
 795         ParseSess {
 796             span_diagnostic: errors::Handler::with_emitter(true, None, Box::new(emitter)),
 797             unstable_features: UnstableFeatures::from_environment(),
 798             config: CrateConfig::default(),
 799             included_mod_stack: Lock::new(Vec::new()),
 800             source_map: sm,
 801             missing_fragment_specifiers: Lock::new(FxHashSet::default()),
 802             raw_identifier_spans: Lock::new(Vec::new()),
 803             registered_diagnostics: Lock::new(ErrorMap::new()),
 804             buffered_lints: Lock::new(vec![]),
 805             edition: Edition::from_session(),
 806             ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
 807             param_attr_spans: Lock::new(Vec::new()),
 808             let_chains_spans: Lock::new(Vec::new()),
 809             async_closure_spans: Lock::new(Vec::new()),
 810         }
 811     }
 812
 813     // open a string reader for the given string
 814     fn setup<'a>(sm: &SourceMap,
 815                  sess: &'a ParseSess,
 816                  teststr: String)
 817                  -> StringReader<'a> {
 818         let sf = sm.new_source_file(PathBuf::from(teststr.clone()).into(), teststr);
 819         StringReader::new(sess, sf, None)
 820     }
 821
 822     #[test]
 823     fn t1() {
 824         with_default_globals(|| {
 825             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 826             let sh = mk_sess(sm.clone());
 827             let mut string_reader = setup(&sm,
 828                                         &sh,
 829                                         "/* my source file */ fn main() { println!(\"zebra\"); }\n"
 830                                             .to_string());
 831             assert_eq!(string_reader.next_token(), token::Comment);
 832             assert_eq!(string_reader.next_token(), token::Whitespace);
 833             let tok1 = string_reader.next_token();
 834             let tok2 = Token::new(
 835                 mk_ident("fn"),
 836                 Span::new(BytePos(21), BytePos(23), NO_EXPANSION),
 837             );
 838             assert_eq!(tok1.kind, tok2.kind);
 839             assert_eq!(tok1.span, tok2.span);
 840             assert_eq!(string_reader.next_token(), token::Whitespace);
 841             // read another token:
 842             let tok3 = string_reader.next_token();
 843             assert_eq!(string_reader.pos.clone(), BytePos(28));
 844             let tok4 = Token::new(
 845                 mk_ident("main"),
 846                 Span::new(BytePos(24), BytePos(28), NO_EXPANSION),
 847             );
 848             assert_eq!(tok3.kind, tok4.kind);
 849             assert_eq!(tok3.span, tok4.span);
 850
 851             assert_eq!(string_reader.next_token(), token::OpenDelim(token::Paren));
 852             assert_eq!(string_reader.pos.clone(), BytePos(29))
 853         })
 854     }
 855
 856     // check that the given reader produces the desired stream
 857     // of tokens (stop checking after exhausting the expected vec)
 858     fn check_tokenization(mut string_reader: StringReader<'_>, expected: Vec<TokenKind>) {
 859         for expected_tok in &expected {
 860             assert_eq!(&string_reader.next_token(), expected_tok);
 861         }
 862     }
 863
 864     // make the identifier by looking up the string in the interner
 865     fn mk_ident(id: &str) -> TokenKind {
 866         token::Ident(Symbol::intern(id), false)
 867     }
 868
 869     fn mk_lit(kind: token::LitKind, symbol: &str, suffix: Option<&str>) -> TokenKind {
 870         TokenKind::lit(kind, Symbol::intern(symbol), suffix.map(Symbol::intern))
 871     }
 872
 873     #[test]
 874     fn doublecolonparsing() {
 875         with_default_globals(|| {
 876             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 877             let sh = mk_sess(sm.clone());
 878             check_tokenization(setup(&sm, &sh, "a b".to_string()),
 879                             vec![mk_ident("a"), token::Whitespace, mk_ident("b")]);
 880         })
 881     }
 882
 883     #[test]
 884     fn dcparsing_2() {
 885         with_default_globals(|| {
 886             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 887             let sh = mk_sess(sm.clone());
 888             check_tokenization(setup(&sm, &sh, "a::b".to_string()),
 889                             vec![mk_ident("a"), token::ModSep, mk_ident("b")]);
 890         })
 891     }
 892
 893     #[test]
 894     fn dcparsing_3() {
 895         with_default_globals(|| {
 896             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 897             let sh = mk_sess(sm.clone());
 898             check_tokenization(setup(&sm, &sh, "a ::b".to_string()),
 899                             vec![mk_ident("a"), token::Whitespace, token::ModSep, mk_ident("b")]);
 900         })
 901     }
 902
 903     #[test]
 904     fn dcparsing_4() {
 905         with_default_globals(|| {
 906             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 907             let sh = mk_sess(sm.clone());
 908             check_tokenization(setup(&sm, &sh, "a:: b".to_string()),
 909                             vec![mk_ident("a"), token::ModSep, token::Whitespace, mk_ident("b")]);
 910         })
 911     }
 912
 913     #[test]
 914     fn character_a() {
 915         with_default_globals(|| {
 916             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 917             let sh = mk_sess(sm.clone());
 918             assert_eq!(setup(&sm, &sh, "'a'".to_string()).next_token(),
 919                        mk_lit(token::Char, "a", None));
 920         })
 921     }
 922
 923     #[test]
 924     fn character_space() {
 925         with_default_globals(|| {
 926             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 927             let sh = mk_sess(sm.clone());
 928             assert_eq!(setup(&sm, &sh, "' '".to_string()).next_token(),
 929                        mk_lit(token::Char, " ", None));
 930         })
 931     }
 932
 933     #[test]
 934     fn character_escaped() {
 935         with_default_globals(|| {
 936             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 937             let sh = mk_sess(sm.clone());
 938             assert_eq!(setup(&sm, &sh, "'\\n'".to_string()).next_token(),
 939                        mk_lit(token::Char, "\\n", None));
 940         })
 941     }
 942
 943     #[test]
 944     fn lifetime_name() {
 945         with_default_globals(|| {
 946             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 947             let sh = mk_sess(sm.clone());
 948             assert_eq!(setup(&sm, &sh, "'abc".to_string()).next_token(),
 949                        token::Lifetime(Symbol::intern("'abc")));
 950         })
 951     }
 952
 953     #[test]
 954     fn raw_string() {
 955         with_default_globals(|| {
 956             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 957             let sh = mk_sess(sm.clone());
 958             assert_eq!(setup(&sm, &sh, "r###\"\"#a\\b\x00c\"\"###".to_string()).next_token(),
 959                        mk_lit(token::StrRaw(3), "\"#a\\b\x00c\"", None));
 960         })
 961     }
 962
 963     #[test]
 964     fn literal_suffixes() {
 965         with_default_globals(|| {
 966             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
 967             let sh = mk_sess(sm.clone());
 968             macro_rules! test {
 969                 ($input: expr, $tok_type: ident, $tok_contents: expr) => {{
 970                     assert_eq!(setup(&sm, &sh, format!("{}suffix", $input)).next_token(),
 971                                mk_lit(token::$tok_type, $tok_contents, Some("suffix")));
 972                     // with a whitespace separator:
 973                     assert_eq!(setup(&sm, &sh, format!("{} suffix", $input)).next_token(),
 974                                mk_lit(token::$tok_type, $tok_contents, None));
 975                 }}
 976             }
 977
 978             test!("'a'", Char, "a");
 979             test!("b'a'", Byte, "a");
 980             test!("\"a\"", Str, "a");
 981             test!("b\"a\"", ByteStr, "a");
 982             test!("1234", Integer, "1234");
 983             test!("0b101", Integer, "0b101");
 984             test!("0xABC", Integer, "0xABC");
 985             test!("1.0", Float, "1.0");
 986             test!("1.0e10", Float, "1.0e10");
 987
 988             assert_eq!(setup(&sm, &sh, "2us".to_string()).next_token(),
 989                        mk_lit(token::Integer, "2", Some("us")));
 990             assert_eq!(setup(&sm, &sh, "r###\"raw\"###suffix".to_string()).next_token(),
 991                        mk_lit(token::StrRaw(3), "raw", Some("suffix")));
 992             assert_eq!(setup(&sm, &sh, "br###\"raw\"###suffix".to_string()).next_token(),
 993                        mk_lit(token::ByteStrRaw(3), "raw", Some("suffix")));
 994         })
 995     }
 996
 997     #[test]
 998     fn line_doc_comments() {
 999         assert!(is_doc_comment("///"));
1000         assert!(is_doc_comment("/// blah"));
1001         assert!(!is_doc_comment("////"));
1002     }
1003
1004     #[test]
1005     fn nested_block_comments() {
1006         with_default_globals(|| {
1007             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
1008             let sh = mk_sess(sm.clone());
1009             let mut lexer = setup(&sm, &sh, "/* /* */ */'a'".to_string());
1010             assert_eq!(lexer.next_token(), token::Comment);
1011             assert_eq!(lexer.next_token(), mk_lit(token::Char, "a", None));
1012         })
1013     }
1014
1015     #[test]
1016     fn crlf_comments() {
1017         with_default_globals(|| {
1018             let sm = Lrc::new(SourceMap::new(FilePathMapping::empty()));
1019             let sh = mk_sess(sm.clone());
1020             let mut lexer = setup(&sm, &sh, "// test\r\n/// test\r\n".to_string());
1021             let comment = lexer.next_token();
1022             assert_eq!(comment.kind, token::Comment);
1023             assert_eq!((comment.span.lo(), comment.span.hi()), (BytePos(0), BytePos(7)));
1024             assert_eq!(lexer.next_token(), token::Whitespace);
1025             assert_eq!(lexer.next_token(), token::DocComment(Symbol::intern("/// test")));
1026         })
1027     }
1028 }