1 //! This is an NFA-based parser, which calls out to the main Rust parser for named non-terminals
2 //! (which it commits to fully when it hits one in a grammar). There's a set of current NFA threads
3 //! and a set of next ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
4 //! pathological cases, is worse than traditional use of NFA or Earley parsing, but it's an easier
5 //! fit for Macro-by-Example-style rules.
7 //! (In order to prevent the pathological case, we'd need to lazily construct the resulting
8 //! `NamedMatch`es at the very end. It'd be a pain, and require more memory to keep around old
9 //! matcher positions, but it would also save overhead.)
11 //! We don't say this parser uses the Earley algorithm, because it's unnecessarily inaccurate.
12 //! The macro parser restricts itself to the features of finite state automata. Earley parsers
13 //! can be described as an extension of NFAs with completion rules, prediction rules, and recursion.
15 //! Quick intro to how the parser works:
17 //! A "matcher position" (a.k.a. "position" or "mp") is a dot in the middle of a matcher, usually
18 //! written as a `·`. For example `· a $( a )* a b` is one, as is `a $( · a )* a b`.
20 //! The parser walks through the input a token at a time, maintaining a list
21 //! of threads consistent with the current position in the input string: `cur_mps`.
23 //! As it processes them, it fills up `eof_mps` with threads that would be valid if
24 //! the macro invocation is now over, `bb_mps` with threads that are waiting on
25 //! a Rust non-terminal like `$e:expr`, and `next_mps` with threads that are waiting
26 //! on a particular token. Most of the logic concerns moving the · through the
27 //! repetitions indicated by Kleene stars. The rules for moving the · without
28 //! consuming any input are called epsilon transitions. It only advances or calls
29 //! out to the real Rust parser when no `cur_mps` threads remain.
34 //! Start parsing a a a a b against [· a $( a )* a b].
36 //! Remaining input: a a a a b
37 //! next: [· a $( a )* a b]
39 //! - - - Advance over an a. - - -
41 //! Remaining input: a a a b
42 //! cur: [a · $( a )* a b]
43 //! Descend/Skip (first position).
44 //! next: [a $( · a )* a b] [a $( a )* · a b].
46 //! - - - Advance over an a. - - -
48 //! Remaining input: a a b
49 //! cur: [a $( a · )* a b] [a $( a )* a · b]
50 //! Follow epsilon transition: Finish/Repeat (first position)
51 //! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
53 //! - - - Advance over an a. - - - (this looks exactly like the last step)
55 //! Remaining input: a b
56 //! cur: [a $( a · )* a b] [a $( a )* a · b]
57 //! Follow epsilon transition: Finish/Repeat (first position)
58 //! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
60 //! - - - Advance over an a. - - - (this looks exactly like the last step)
62 //! Remaining input: b
63 //! cur: [a $( a · )* a b] [a $( a )* a · b]
64 //! Follow epsilon transition: Finish/Repeat (first position)
65 //! next: [a $( a )* · a b] [a $( · a )* a b] [a $( a )* a · b]
67 //! - - - Advance over a b. - - -
69 //! Remaining input: ''
70 //! eof: [a $( a )* a b ·]
73 pub(crate) use NamedMatch::*;
74 pub(crate) use ParseResult::*;
75 use rustc_errors::ErrorGuaranteed;
77 use crate::mbe::{KleeneOp, TokenTree};
79 use rustc_ast::token::{self, DocComment, Nonterminal, NonterminalKind, Token};
80 use rustc_lint_defs::pluralize;
81 use rustc_parse::parser::{NtOrTt, Parser};
82 use rustc_span::symbol::MacroRulesNormalizedIdent;
85 use rustc_data_structures::fx::FxHashMap;
86 use rustc_data_structures::sync::Lrc;
87 use rustc_span::symbol::Ident;
89 use std::collections::hash_map::Entry::{Occupied, Vacant};
91 /// A unit within a matcher that a `MatcherPos` can refer to. Similar to (and derived from)
92 /// `mbe::TokenTree`, but designed specifically for fast and easy traversal during matching.
93 /// Notable differences to `mbe::TokenTree`:
94 /// - It is non-recursive, i.e. there is no nesting.
95 /// - The end pieces of each sequence (the separator, if present, and the Kleene op) are
96 /// represented explicitly, as is the very end of the matcher.
98 /// This means a matcher can be represented by `&[MatcherLoc]`, and traversal mostly involves
99 /// simply incrementing the current matcher position index by one.
100 pub(super) enum MatcherLoc {
107 num_metavar_decls: usize,
108 idx_first_after: usize,
112 SequenceKleeneOpNoSep {
119 SequenceKleeneOpAfterSep {
125 kind: Option<NonterminalKind>,
132 pub(super) fn compute_locs(matcher: &[TokenTree]) -> Vec<MatcherLoc> {
135 locs: &mut Vec<MatcherLoc>,
136 next_metavar: &mut usize,
141 TokenTree::Token(token) => {
142 locs.push(MatcherLoc::Token { token: token.clone() });
144 TokenTree::Delimited(span, delimited) => {
145 let open_token = Token::new(token::OpenDelim(delimited.delim), span.open);
146 let close_token = Token::new(token::CloseDelim(delimited.delim), span.close);
148 locs.push(MatcherLoc::Delimited);
149 locs.push(MatcherLoc::Token { token: open_token });
150 inner(&delimited.tts, locs, next_metavar, seq_depth);
151 locs.push(MatcherLoc::Token { token: close_token });
153 TokenTree::Sequence(_, seq) => {
154 // We can't determine `idx_first_after` and construct the final
155 // `MatcherLoc::Sequence` until after `inner()` is called and the sequence end
156 // pieces are processed. So we push a dummy value (`Eof` is cheapest to
157 // construct) now, and overwrite it with the proper value below.
158 let dummy = MatcherLoc::Eof;
161 let next_metavar_orig = *next_metavar;
162 let op = seq.kleene.op;
163 let idx_first = locs.len();
164 let idx_seq = idx_first - 1;
165 inner(&seq.tts, locs, next_metavar, seq_depth + 1);
167 if let Some(separator) = &seq.separator {
168 locs.push(MatcherLoc::SequenceSep { separator: separator.clone() });
169 locs.push(MatcherLoc::SequenceKleeneOpAfterSep { idx_first });
171 locs.push(MatcherLoc::SequenceKleeneOpNoSep { op, idx_first });
174 // Overwrite the dummy value pushed above with the proper value.
175 locs[idx_seq] = MatcherLoc::Sequence {
177 num_metavar_decls: seq.num_captures,
178 idx_first_after: locs.len(),
179 next_metavar: next_metavar_orig,
183 &TokenTree::MetaVarDecl(span, bind, kind) => {
184 locs.push(MatcherLoc::MetaVarDecl {
188 next_metavar: *next_metavar,
193 TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(),
198 let mut locs = vec![];
199 let mut next_metavar = 0;
200 inner(matcher, &mut locs, &mut next_metavar, /* seq_depth */ 0);
202 // A final entry is needed for eof.
203 locs.push(MatcherLoc::Eof);
208 /// A single matcher position, representing the state of matching.
210 /// The index into `TtParser::locs`, which represents the "dot".
213 /// The matches made against metavar decls so far. On a successful match, this vector ends up
214 /// with one element per metavar decl in the matcher. Each element records token trees matched
215 /// against the relevant metavar by the black box parser. An element will be a `MatchedSeq` if
216 /// the corresponding metavar decl is within a sequence.
218 /// It is critical to performance that this is an `Lrc`, because it gets cloned frequently when
219 /// processing sequences. Mostly for sequence-ending possibilities that must be tried but end up failing.
221 matches: Lrc<Vec<NamedMatch>>,
224 // This type is used a lot. Make sure it doesn't unintentionally get bigger.
225 #[cfg(all(target_arch = "x86_64", target_pointer_width = "64"))]
226 rustc_data_structures::static_assert_size!(MatcherPos, 16);
229 /// Adds `m` as a named match for the `metavar_idx`-th metavar. There are only two call sites,
230 /// and both are hot enough to be always worth inlining.
232 fn push_match(&mut self, metavar_idx: usize, seq_depth: usize, m: NamedMatch) {
233 let matches = Lrc::make_mut(&mut self.matches);
236 // We are not within a sequence. Just append `m`.
237 assert_eq!(metavar_idx, matches.len());
241 // We are within a sequence. Find the final `MatchedSeq` at the appropriate depth
242 // and append `m` to its vector.
243 let mut curr = &mut matches[metavar_idx];
244 for _ in 0..seq_depth - 1 {
246 MatchedSeq(seq) => curr = seq.last_mut().unwrap(),
251 MatchedSeq(seq) => seq.push(m),
259 enum EofMatcherPositions {
265 /// Represents the possible results of an attempted parse.
266 pub(crate) enum ParseResult<T> {
267 /// Parsed successfully.
269 /// Arm failed to match. If the first parameter is `token::Eof`, it indicates an unexpected
270 /// end of macro invocation. Otherwise, it indicates that no rules expected the given token.
271 Failure(Token, &'static str),
272 /// Fatal error (malformed macro?). Abort compilation.
273 Error(rustc_span::Span, String),
274 ErrorReported(ErrorGuaranteed),
277 /// A `ParseResult` where the `Success` variant contains a mapping of
278 /// `MacroRulesNormalizedIdent`s to `NamedMatch`es. This represents the mapping
279 /// of metavars to the token trees they bind to.
280 pub(crate) type NamedParseResult = ParseResult<FxHashMap<MacroRulesNormalizedIdent, NamedMatch>>;
282 /// Count how many metavar declarations are in `matcher`.
283 pub(super) fn count_metavar_decls(matcher: &[TokenTree]) -> usize {
287 TokenTree::MetaVarDecl(..) => 1,
288 TokenTree::Sequence(_, seq) => seq.num_captures,
289 TokenTree::Delimited(_, delim) => count_metavar_decls(&delim.tts),
290 TokenTree::Token(..) => 0,
291 TokenTree::MetaVar(..) | TokenTree::MetaVarExpr(..) => unreachable!(),
296 /// `NamedMatch` is a pattern-match result for a single metavar. All
297 /// `MatchedNonterminal`s in the `NamedMatch` have the same non-terminal type
298 /// (expr, item, etc).
300 /// The in-memory structure of a particular `NamedMatch` represents the match
301 /// that occurred when a particular subset of a matcher was applied to a
302 /// particular token tree.
304 /// The width of each `MatchedSeq` in the `NamedMatch`, and the identity of
305 /// the `MatchedTokenTree`s and `MatchedNonterminal`s, will depend on the token tree it was applied
306 /// to: each `MatchedSeq` corresponds to a single repetition in the originating
307 /// token tree. The depth of the `NamedMatch` structure will therefore depend
308 /// only on the nesting depth of repetitions in the originating token tree it
309 /// was derived from.
311 /// In layperson's terms: `NamedMatch` will form a tree representing nested matches of a particular
312 /// meta variable. For example, if we are matching the following macro against the following invocation:
316 /// macro_rules! foo {
317 /// ($($($x:ident),+);+) => {}
320 /// foo!(a, b, c, d; a, b, c, d, e);
323 /// Then, the tree will have the following shape:
325 /// ```ignore (private-internal)
326 /// # use NamedMatch::*;
329 /// MatchedNonterminal(a),
330 /// MatchedNonterminal(b),
331 /// MatchedNonterminal(c),
332 /// MatchedNonterminal(d),
335 /// MatchedNonterminal(a),
336 /// MatchedNonterminal(b),
337 /// MatchedNonterminal(c),
338 /// MatchedNonterminal(d),
339 /// MatchedNonterminal(e),
343 #[derive(Debug, Clone)]
344 pub(crate) enum NamedMatch {
345 MatchedSeq(Vec<NamedMatch>),
347 // A metavar match of type `tt`.
348 MatchedTokenTree(rustc_ast::tokenstream::TokenTree),
350 // A metavar match of any type other than `tt`.
351 MatchedNonterminal(Lrc<Nonterminal>),
354 /// Performs a token equality check, ignoring syntax context (that is, an unhygienic comparison)
355 fn token_name_eq(t1: &Token, t2: &Token) -> bool {
356 if let (Some((ident1, is_raw1)), Some((ident2, is_raw2))) = (t1.ident(), t2.ident()) {
357 ident1.name == ident2.name && is_raw1 == is_raw2
358 } else if let (Some(ident1), Some(ident2)) = (t1.lifetime(), t2.lifetime()) {
359 ident1.name == ident2.name
365 // Note: the vectors could be created and dropped within `parse_tt`, but to avoid excess
366 // allocations we have a single vector for each kind that is cleared and reused repeatedly.
367 pub struct TtParser {
370 /// The set of current mps to be processed. This should be empty by the end of a successful
371 /// execution of `parse_tt_inner`.
372 cur_mps: Vec<MatcherPos>,
374 /// The set of newly generated mps. These are used to replenish `cur_mps` in the function `parse_tt`.
376 next_mps: Vec<MatcherPos>,
378 /// The set of mps that are waiting for the black-box parser.
379 bb_mps: Vec<MatcherPos>,
381 /// Pre-allocate an empty match array, so it can be cloned cheaply for macros with many rules
382 /// that have no metavars.
383 empty_matches: Lrc<Vec<NamedMatch>>,
387 pub(super) fn new(macro_name: Ident) -> TtParser {
393 empty_matches: Lrc::new(vec![]),
397 /// Process the matcher positions of `cur_mps` until it is empty. In the process, this will
398 /// produce more mps in `next_mps` and `bb_mps`.
402 /// `Some(result)` if everything is finished, `None` otherwise. Note that matches are kept
403 /// track of through the mps generated.
406 matcher: &[MatcherLoc],
408 ) -> Option<NamedParseResult> {
409 // Matcher positions that would be valid if the macro invocation was over now. Only
410 // modified if `token == Eof`.
411 let mut eof_mps = EofMatcherPositions::None;
413 while let Some(mut mp) = self.cur_mps.pop() {
414 match &matcher[mp.idx] {
415 MatcherLoc::Token { token: t } => {
416 // If it's a doc comment, we just ignore it and move on to the next tt in the
417 // matcher. This is a bug, but #95267 showed that existing programs rely on
418 // this behaviour, and changing it would require some care and a transition period.
421 // If the token matches, we can just advance the parser.
423 // Otherwise, this match has failed, there is nothing to do, and hopefully
424 // another mp in `cur_mps` will match.
425 if matches!(t, Token { kind: DocComment(..), .. }) {
427 self.cur_mps.push(mp);
428 } else if token_name_eq(&t, token) {
430 self.next_mps.push(mp);
433 MatcherLoc::Delimited => {
434 // Entering the delimiter is trivial.
436 self.cur_mps.push(mp);
438 &MatcherLoc::Sequence {
445 // Install an empty vec for each metavar within the sequence.
446 for metavar_idx in next_metavar..next_metavar + num_metavar_decls {
447 mp.push_match(metavar_idx, seq_depth, MatchedSeq(vec![]));
450 if op == KleeneOp::ZeroOrMore || op == KleeneOp::ZeroOrOne {
451 // Try zero matches of this sequence, by skipping over it.
452 self.cur_mps.push(MatcherPos {
453 idx: idx_first_after,
454 matches: Lrc::clone(&mp.matches),
458 // Try one or more matches of this sequence, by entering it.
460 self.cur_mps.push(mp);
462 &MatcherLoc::SequenceKleeneOpNoSep { op, idx_first } => {
463 // We are past the end of a sequence with no separator. Try ending the
464 // sequence. If that's not possible, `ending_mp` will fail quietly when it is
465 // processed next time around the loop.
466 let ending_mp = MatcherPos {
467 idx: mp.idx + 1, // +1 skips the Kleene op
468 matches: Lrc::clone(&mp.matches),
470 self.cur_mps.push(ending_mp);
472 if op != KleeneOp::ZeroOrOne {
473 // Try another repetition.
475 self.cur_mps.push(mp);
478 MatcherLoc::SequenceSep { separator } => {
479 // We are past the end of a sequence with a separator but we haven't seen the
480 // separator yet. Try ending the sequence. If that's not possible, `ending_mp`
481 // will fail quietly when it is processed next time around the loop.
482 let ending_mp = MatcherPos {
483 idx: mp.idx + 2, // +2 skips the separator and the Kleene op
484 matches: Lrc::clone(&mp.matches),
486 self.cur_mps.push(ending_mp);
488 if token_name_eq(token, separator) {
489 // The separator matches the current token. Advance past it.
491 self.next_mps.push(mp);
494 &MatcherLoc::SequenceKleeneOpAfterSep { idx_first } => {
495 // We are past the sequence separator. This can't be a `?` Kleene op, because
496 // they don't permit separators. Try another repetition.
498 self.cur_mps.push(mp);
500 &MatcherLoc::MetaVarDecl { span, kind, .. } => {
501 // Built-in nonterminals never start with these tokens, so we can eliminate
502 // them from consideration. We use the span of the metavariable declaration
503 // to determine any edition-specific matching behavior for non-terminals.
504 if let Some(kind) = kind {
505 if Parser::nonterminal_may_begin_with(kind, token) {
506 self.bb_mps.push(mp);
509 // E.g. `$e` instead of `$e:expr`, reported as a hard error if actually used.
510 // Both this check and the one in `nameize` are necessary, surprisingly.
511 return Some(Error(span, "missing fragment specifier".to_string()));
515 // We are past the matcher's end, and not in a sequence. Try to end things.
516 debug_assert_eq!(mp.idx, matcher.len() - 1);
517 if *token == token::Eof {
518 eof_mps = match eof_mps {
519 EofMatcherPositions::None => EofMatcherPositions::One(mp),
520 EofMatcherPositions::One(_) | EofMatcherPositions::Multiple => {
521 EofMatcherPositions::Multiple
529 // If we reached the end of input, check that there is EXACTLY ONE possible matcher.
530 // Otherwise, either the parse is ambiguous (which is an error) or there is a syntax error.
531 if *token == token::Eof {
533 EofMatcherPositions::One(mut eof_mp) => {
534 // Need to take ownership of the matches from within the `Lrc`.
535 Lrc::make_mut(&mut eof_mp.matches);
536 let matches = Lrc::try_unwrap(eof_mp.matches).unwrap().into_iter();
537 self.nameize(matcher, matches)
539 EofMatcherPositions::Multiple => {
540 Error(token.span, "ambiguity: multiple successful parses".to_string())
542 EofMatcherPositions::None => Failure(
545 if token.span.is_dummy() { token.span } else { token.span.shrink_to_hi() },
547 "missing tokens in macro arguments",
555 /// Match the token stream from `parser` against `matcher`.
556 pub(super) fn parse_tt(
558 parser: &mut Cow<'_, Parser<'_>>,
559 matcher: &[MatcherLoc],
560 ) -> NamedParseResult {
561 // A queue of possible matcher positions. We initialize it with the matcher position in
562 // which the "dot" is before the first token of the first token tree in `matcher`.
563 // `parse_tt_inner` then processes all of these possible matcher positions and produces
564 // possible next positions into `next_mps`. After some post-processing, the contents of
565 // `next_mps` replenish `cur_mps` and we start over again.
566 self.cur_mps.clear();
567 self.cur_mps.push(MatcherPos { idx: 0, matches: self.empty_matches.clone() });
570 self.next_mps.clear();
573 // Process `cur_mps` until either we have finished the input or we need to get some
574 // parsing from the black-box parser done.
575 if let Some(res) = self.parse_tt_inner(matcher, &parser.token) {
579 // `parse_tt_inner` handled all of `cur_mps`, so it's empty.
580 assert!(self.cur_mps.is_empty());
582 // Error messages here could be improved with links to original rules.
583 match (self.next_mps.len(), self.bb_mps.len()) {
585 // There are no possible next positions AND we aren't waiting for the black-box
586 // parser: syntax error.
588 parser.token.clone(),
589 "no rules expected this token in macro call",
594 // Dump all possible `next_mps` into `cur_mps` for the next iteration. Then
595 // process the next token.
596 self.cur_mps.append(&mut self.next_mps);
597 parser.to_mut().bump();
601 // We need to call the black-box parser to get some nonterminal.
602 let mut mp = self.bb_mps.pop().unwrap();
603 let loc = &matcher[mp.idx];
604 if let &MatcherLoc::MetaVarDecl {
612 // We use the span of the metavariable declaration to determine any
613 // edition-specific matching behavior for non-terminals.
614 let nt = match parser.to_mut().parse_nonterminal(kind) {
616 let guarantee = err.span_label(
619 "while parsing argument for this `{kind}` macro fragment"
623 return ErrorReported(guarantee);
628 NtOrTt::Nt(nt) => MatchedNonterminal(Lrc::new(nt)),
629 NtOrTt::Tt(tt) => MatchedTokenTree(tt),
631 mp.push_match(next_metavar, seq_depth, m);
636 self.cur_mps.push(mp);
640 // Too many possibilities!
641 return self.ambiguity_error(matcher, parser.token.span);
645 assert!(!self.cur_mps.is_empty());
651 matcher: &[MatcherLoc],
652 token_span: rustc_span::Span,
653 ) -> NamedParseResult {
657 .map(|mp| match &matcher[mp.idx] {
658 MatcherLoc::MetaVarDecl { bind, kind: Some(kind), .. } => {
659 format!("{} ('{}')", kind, bind)
663 .collect::<Vec<String>>()
669 "local ambiguity when calling macro `{}`: multiple parsing options: {}",
671 match self.next_mps.len() {
672 0 => format!("built-in NTs {}.", nts),
673 n => format!("built-in NTs {} or {n} other option{s}.", nts, s = pluralize!(n)),
679 fn nameize<I: Iterator<Item = NamedMatch>>(
681 matcher: &[MatcherLoc],
683 ) -> NamedParseResult {
684 // Make sure that each metavar has _exactly one_ binding. If so, insert the binding into the
685 // `NamedParseResult`. Otherwise, it's an error.
686 let mut ret_val = FxHashMap::default();
688 if let &MatcherLoc::MetaVarDecl { span, bind, kind, .. } = loc {
690 match ret_val.entry(MacroRulesNormalizedIdent::new(bind)) {
691 Vacant(spot) => spot.insert(res.next().unwrap()),
693 return Error(span, format!("duplicated bind name: {}", bind));
697 // E.g. `$e` instead of `$e:expr`, reported as a hard error if actually used.
698 // Both this check and the one in `parse_tt_inner` are necessary, surprisingly.
699 return Error(span, "missing fragment specifier".to_string());