Rollup merge of #105524 - Ayush1325:libc-free, r=ChrisDenton

[rust.git] / compiler / rustc_parse / src / lexer / mod.rs
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs

index f027843e6b43d469ab5483494543ee9d89b42d98..e957224a03377805bbfb7fa666d60df3c29cb607 100644 (file)
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -17,6 +17,7 @@
  use rustc_span::symbol::{sym, Symbol};
  use rustc_span::{edition::Edition, BytePos, Pos, Span};
  
+mod diagnostics;
  mod tokentrees;
  mod unescape_error_reporting;
  mod unicode_chars;
@@ -52,8 +53,15 @@ pub(crate) fn parse_token_trees<'a>(
      }
  
      let cursor = Cursor::new(src);
-    let string_reader =
-        StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
+    let string_reader = StringReader {
+        sess,
+        start_pos,
+        pos: start_pos,
+        src,
+        cursor,
+        override_span,
+        nbsp_is_whitespace: false,
+    };
      tokentrees::TokenTreesReader::parse_all_token_trees(string_reader)
  }
  
@@ -68,6 +76,10 @@ struct StringReader<'a> {
      /// Cursor for getting lexer tokens.
      cursor: Cursor<'a>,
      override_span: Option<Span>,
+    /// When an "unknown start of token: \u{a0}" error has already been emitted
+    /// earlier in this file, it's safe to treat further occurrences of the
+    /// non-breaking space character as whitespace.
+    nbsp_is_whitespace: bool,
  }
  
  impl<'a> StringReader<'a> {
@@ -79,7 +91,7 @@ fn mk_sp(&self, lo: BytePos, hi: BytePos) -> Span {
      /// preceded by whitespace.
      fn next_token(&mut self) -> (Token, bool) {
          let mut preceded_by_whitespace = false;
-
+        let mut swallow_next_invalid = 0;
          // Skip trivial (whitespace & comments) tokens
          loop {
              let token = self.cursor.advance_token();
@@ -232,19 +244,44 @@ fn next_token(&mut self) -> (Token, bool) {
                  rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
  
                  rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
-                    let c = self.str_from(start).chars().next().unwrap();
+                    // Don't emit diagnostics for sequences of the same invalid token
+                    if swallow_next_invalid > 0 {
+                        swallow_next_invalid -= 1;
+                        continue;
+                    }
+                    let mut it = self.str_from_to_end(start).chars();
+                    let c = it.next().unwrap();
+                    if c == '\u{00a0}' {
+                        // If an error has already been reported on non-breaking
+                        // space characters earlier in the file, treat all
+                        // subsequent occurrences as whitespace.
+                        if self.nbsp_is_whitespace {
+                            preceded_by_whitespace = true;
+                            continue;
+                        }
+                        self.nbsp_is_whitespace = true;
+                    }
+                    let repeats = it.take_while(|c1| *c1 == c).count();
                      let mut err =
-                        self.struct_err_span_char(start, self.pos, "unknown start of token", c);
+                        self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
                      // FIXME: the lexer could be used to turn the ASCII version of unicode
                      // homoglyphs, instead of keeping a table in `check_for_substitution`into the
                      // token. Ideally, this should be inside `rustc_lexer`. However, we should
                      // first remove compound tokens like `<<` from `rustc_lexer`, and then add
                      // fancier error recovery to it, as there will be less overall work to do this
                      // way.
-                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
+                    let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
                      if c == '\x00' {
                          err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
                      }
+                    if repeats > 0 {
+                        if repeats == 1 {
+                            err.note(format!("character appears once more"));
+                        } else {
+                            err.note(format!("character appears {repeats} more times"));
+                        }
+                        swallow_next_invalid = repeats;
+                    }
                      err.emit();
                      if let Some(token) = token {
                          token
@@ -471,7 +508,7 @@ fn src_index(&self, pos: BytePos) -> usize {
  
      /// Slice of the source text from `start` up to but excluding `self.pos`,
      /// meaning the slice does not include the character `self.ch`.
-    fn str_from(&self, start: BytePos) -> &str {
+    fn str_from(&self, start: BytePos) -> &'a str {
          self.str_from_to(start, self.pos)
      }
  
@@ -482,10 +519,15 @@ fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
      }
  
      /// Slice of the source text spanning from `start` up to but excluding `end`.
-    fn str_from_to(&self, start: BytePos, end: BytePos) -> &str {
+    fn str_from_to(&self, start: BytePos, end: BytePos) -> &'a str {
          &self.src[self.src_index(start)..self.src_index(end)]
      }
  
+    /// Slice of the source text spanning from `start` to the end of the source text.
+    fn str_from_to_end(&self, start: BytePos) -> &'a str {
+        &self.src[self.src_index(start)..]
+    }
+
      fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
          match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
              Err(RawStrError::InvalidStarter { bad_char }) => {