src/libsyntax/ext/tt/macro_parser.rs

   1 // Copyright 2012 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 // Earley-like parser for macros.
  12
  13 use ast;
  14 use ast::{Matcher, MatchTok, MatchSeq, MatchNonterminal, Ident};
  15 use codemap::{BytePos, mk_sp};
  16 use codemap;
  17 use parse::lexer::*; //resolve bug?
  18 use parse::ParseSess;
  19 use parse::attr::ParserAttr;
  20 use parse::parser::{LifetimeAndTypesWithoutColons, Parser};
  21 use parse::token::{Token, EOF, Nonterminal};
  22 use parse::token;
  23
  24 use collections::HashMap;
  25 use std::vec_ng::Vec;
  26
  27 /* This is an Earley-like parser, without support for in-grammar nonterminals,
  28 only by calling out to the main rust parser for named nonterminals (which it
  29 commits to fully when it hits one in a grammar). This means that there are no
  30 completer or predictor rules, and therefore no need to store one column per
  31 token: instead, there's a set of current Earley items and a set of next
  32 ones. Instead of NTs, we have a special case for Kleene star. The big-O, in
  33 pathological cases, is worse than traditional Earley parsing, but it's an
  34 easier fit for Macro-by-Example-style rules, and I think the overhead is
  35 lower. (In order to prevent the pathological case, we'd need to lazily
  36 construct the resulting `NamedMatch`es at the very end. It'd be a pain,
  37 and require more memory to keep around old items, but it would also save
  38 overhead)*/
  39
  40 /* Quick intro to how the parser works:
  41
A 'position' is a point in the middle of a matcher, conventionally drawn as a
dot (`·`). For example `· a $( a )* a b` is a position, as is `a $( · a )* a b`.
  44
The parser walks through the input a token at a time, maintaining a list
of items consistent with the current position in the input string: `cur_eis`.
  47
As it processes them, it fills up `eof_eis` with items that would be valid if
the macro invocation is now over, `bb_eis` with items that are waiting on
a Rust nonterminal like `$e:expr`, and `next_eis` with items that are waiting
on a particular token. Most of the logic concerns moving the · through the
repetitions indicated by Kleene stars. It only advances or calls out to the
real Rust parser when no `cur_eis` items remain.
  54
  55 Example: Start parsing `a a a a b` against [· a $( a )* a b].
  56
  57 Remaining input: `a a a a b`
  58 next_eis: [· a $( a )* a b]
  59
  60 - - - Advance over an `a`. - - -
  61
  62 Remaining input: `a a a b`
  63 cur: [a · $( a )* a b]
  64 Descend/Skip (first item).
  65 next: [a $( · a )* a b]  [a $( a )* · a b].
  66
  67 - - - Advance over an `a`. - - -
  68
  69 Remaining input: `a a b`
  70 cur: [a $( a · )* a b]  next: [a $( a )* a · b]
  71 Finish/Repeat (first item)
  72 next: [a $( a )* · a b]  [a $( · a )* a b]  [a $( a )* a · b]
  73
  74 - - - Advance over an `a`. - - - (this looks exactly like the last step)
  75
  76 Remaining input: `a b`
  77 cur: [a $( a · )* a b]  next: [a $( a )* a · b]
  78 Finish/Repeat (first item)
  79 next: [a $( a )* · a b]  [a $( · a )* a b]  [a $( a )* a · b]
  80
  81 - - - Advance over an `a`. - - - (this looks exactly like the last step)
  82
  83 Remaining input: `b`
  84 cur: [a $( a · )* a b]  next: [a $( a )* a · b]
  85 Finish/Repeat (first item)
  86 next: [a $( a )* · a b]  [a $( · a )* a b]
  87
  88 - - - Advance over a `b`. - - -
  89
  90 Remaining input: ``
  91 eof: [a $( a )* a b ·]
  92
  93  */
  94
  95
  96 /* to avoid costly uniqueness checks, we require that `MatchSeq` always has a
  97 nonempty body. */
  98
  99
 100 #[deriving(Clone)]
 101 pub struct MatcherPos {
 102     elts: Vec<ast::Matcher> , // maybe should be <'>? Need to understand regions.
 103     sep: Option<Token>,
 104     idx: uint,
 105     up: Option<~MatcherPos>,
 106     matches: Vec<Vec<@NamedMatch>>,
 107     match_lo: uint, match_hi: uint,
 108     sp_lo: BytePos,
 109 }
 110
 111 pub fn count_names(ms: &[Matcher]) -> uint {
 112     ms.iter().fold(0, |ct, m| {
 113         ct + match m.node {
 114             MatchTok(_) => 0u,
 115             MatchSeq(ref more_ms, _, _, _, _) => {
 116                 count_names(more_ms.as_slice())
 117             }
 118             MatchNonterminal(_, _, _) => 1u
 119         }})
 120 }
 121
 122 pub fn initial_matcher_pos(ms: Vec<Matcher> , sep: Option<Token>, lo: BytePos)
 123                         -> ~MatcherPos {
 124     let mut match_idx_hi = 0u;
 125     for elt in ms.iter() {
 126         match elt.node {
 127             MatchTok(_) => (),
 128             MatchSeq(_,_,_,_,hi) => {
 129                 match_idx_hi = hi;       // it is monotonic...
 130             }
 131             MatchNonterminal(_,_,pos) => {
 132                 match_idx_hi = pos+1u;  // ...so latest is highest
 133             }
 134         }
 135     }
 136     let matches = Vec::from_fn(count_names(ms.as_slice()), |_i| Vec::new());
 137     ~MatcherPos {
 138         elts: ms,
 139         sep: sep,
 140         idx: 0u,
 141         up: None,
 142         matches: matches,
 143         match_lo: 0u,
 144         match_hi: match_idx_hi,
 145         sp_lo: lo
 146     }
 147 }
 148
 149 // NamedMatch is a pattern-match result for a single ast::MatchNonterminal:
 150 // so it is associated with a single ident in a parse, and all
 151 // MatchedNonterminal's in the NamedMatch have the same nonterminal type
 152 // (expr, item, etc). All the leaves in a single NamedMatch correspond to a
 153 // single matcher_nonterminal in the ast::Matcher that produced it.
 154 //
 155 // It should probably be renamed, it has more or less exact correspondence to
 156 // ast::match nodes, and the in-memory structure of a particular NamedMatch
 157 // represents the match that occurred when a particular subset of an
 158 // ast::match -- those ast::Matcher nodes leading to a single
 159 // MatchNonterminal -- was applied to a particular token tree.
 160 //
 161 // The width of each MatchedSeq in the NamedMatch, and the identity of the
 162 // MatchedNonterminal's, will depend on the token tree it was applied to: each
 163 // MatchedSeq corresponds to a single MatchSeq in the originating
 164 // ast::Matcher. The depth of the NamedMatch structure will therefore depend
 165 // only on the nesting depth of ast::MatchSeq's in the originating
 166 // ast::Matcher it was derived from.
 167
 168 pub enum NamedMatch {
 169     MatchedSeq(Vec<@NamedMatch> , codemap::Span),
 170     MatchedNonterminal(Nonterminal)
 171 }
 172
 173 pub fn nameize(p_s: @ParseSess, ms: &[Matcher], res: &[@NamedMatch])
 174             -> HashMap<Ident, @NamedMatch> {
 175     fn n_rec(p_s: @ParseSess, m: &Matcher, res: &[@NamedMatch],
 176              ret_val: &mut HashMap<Ident, @NamedMatch>) {
 177         match *m {
 178           codemap::Spanned {node: MatchTok(_), .. } => (),
 179           codemap::Spanned {node: MatchSeq(ref more_ms, _, _, _, _), .. } => {
 180             for next_m in more_ms.iter() {
 181                 n_rec(p_s, next_m, res, ret_val)
 182             };
 183           }
 184           codemap::Spanned {
 185                 node: MatchNonterminal(bind_name, _, idx),
 186                 span
 187           } => {
 188             if ret_val.contains_key(&bind_name) {
 189                 let string = token::get_ident(bind_name);
 190                 p_s.span_diagnostic
 191                    .span_fatal(span, "duplicated bind name: " + string.get())
 192             }
 193             ret_val.insert(bind_name, res[idx]);
 194           }
 195         }
 196     }
 197     let mut ret_val = HashMap::new();
 198     for m in ms.iter() { n_rec(p_s, m, res, &mut ret_val) }
 199     ret_val
 200 }
 201
 202 pub enum ParseResult {
 203     Success(HashMap<Ident, @NamedMatch>),
 204     Failure(codemap::Span, ~str),
 205     Error(codemap::Span, ~str)
 206 }
 207
 208 pub fn parse_or_else<R: Reader>(sess: @ParseSess,
 209                                 cfg: ast::CrateConfig,
 210                                 rdr: R,
 211                                 ms: Vec<Matcher> )
 212                                 -> HashMap<Ident, @NamedMatch> {
 213     match parse(sess, cfg, rdr, ms.as_slice()) {
 214         Success(m) => m,
 215         Failure(sp, str) => sess.span_diagnostic.span_fatal(sp, str),
 216         Error(sp, str) => sess.span_diagnostic.span_fatal(sp, str)
 217     }
 218 }
 219
 220 // perform a token equality check, ignoring syntax context (that is, an unhygienic comparison)
 221 pub fn token_name_eq(t1 : &Token, t2 : &Token) -> bool {
 222     match (t1,t2) {
 223         (&token::IDENT(id1,_),&token::IDENT(id2,_))
 224         | (&token::LIFETIME(id1),&token::LIFETIME(id2)) =>
 225             id1.name == id2.name,
 226         _ => *t1 == *t2
 227     }
 228 }
 229
 230 pub fn parse<R: Reader>(sess: @ParseSess,
 231                         cfg: ast::CrateConfig,
 232                         rdr: R,
 233                         ms: &[Matcher])
 234                         -> ParseResult {
 235     let mut cur_eis = Vec::new();
 236     cur_eis.push(initial_matcher_pos(ms.iter()
 237                                        .map(|x| (*x).clone())
 238                                        .collect(),
 239                                      None,
 240                                      rdr.peek().sp.lo));
 241
 242     loop {
 243         let mut bb_eis = Vec::new(); // black-box parsed by parser.rs
 244         let mut next_eis = Vec::new(); // or proceed normally
 245         let mut eof_eis = Vec::new();
 246
 247         let TokenAndSpan {tok: tok, sp: sp} = rdr.peek();
 248
 249         /* we append new items to this while we go */
 250         loop {
 251             let ei = match cur_eis.pop() {
 252                 None => break, /* for each Earley Item */
 253                 Some(ei) => ei,
 254             };
 255
 256             let idx = ei.idx;
 257             let len = ei.elts.len();
 258
 259             /* at end of sequence */
 260             if idx >= len {
 261                 // can't move out of `match`es, so:
 262                 if ei.up.is_some() {
 263                     // hack: a matcher sequence is repeating iff it has a
 264                     // parent (the top level is just a container)
 265
 266
 267                     // disregard separator, try to go up
 268                     // (remove this condition to make trailing seps ok)
 269                     if idx == len {
 270                         // pop from the matcher position
 271
 272                         let mut new_pos = ei.up.clone().unwrap();
 273
 274                         // update matches (the MBE "parse tree") by appending
 275                         // each tree as a subtree.
 276
 277                         // I bet this is a perf problem: we're preemptively
 278                         // doing a lot of array work that will get thrown away
 279                         // most of the time.
 280
 281                         // Only touch the binders we have actually bound
 282                         for idx in range(ei.match_lo, ei.match_hi) {
 283                             let sub = (*ei.matches.get(idx)).clone();
 284                             new_pos.matches
 285                                    .get_mut(idx)
 286                                    .push(@MatchedSeq(sub, mk_sp(ei.sp_lo,
 287                                                                 sp.hi)));
 288                         }
 289
 290                         new_pos.idx += 1;
 291                         cur_eis.push(new_pos);
 292                     }
 293
 294                     // can we go around again?
 295
 296                     // the *_t vars are workarounds for the lack of unary move
 297                     match ei.sep {
 298                       Some(ref t) if idx == len => { // we need a separator
 299                         // i'm conflicted about whether this should be hygienic....
 300                         // though in this case, if the separators are never legal
 301                         // idents, it shouldn't matter.
 302                         if token_name_eq(&tok, t) { //pass the separator
 303                             let mut ei_t = ei.clone();
 304                             ei_t.idx += 1;
 305                             next_eis.push(ei_t);
 306                         }
 307                       }
 308                       _ => { // we don't need a separator
 309                         let mut ei_t = ei;
 310                         ei_t.idx = 0;
 311                         cur_eis.push(ei_t);
 312                       }
 313                     }
 314                 } else {
 315                     eof_eis.push(ei);
 316                 }
 317             } else {
 318                 match ei.elts.get(idx).node.clone() {
 319                   /* need to descend into sequence */
 320                   MatchSeq(ref matchers, ref sep, zero_ok,
 321                            match_idx_lo, match_idx_hi) => {
 322                     if zero_ok {
 323                         let mut new_ei = ei.clone();
 324                         new_ei.idx += 1u;
 325                         //we specifically matched zero repeats.
 326                         for idx in range(match_idx_lo, match_idx_hi) {
 327                             new_ei.matches
 328                                   .get_mut(idx)
 329                                   .push(@MatchedSeq(Vec::new(), sp));
 330                         }
 331
 332                         cur_eis.push(new_ei);
 333                     }
 334
 335                     let matches = Vec::from_elem(ei.matches.len(), Vec::new());
 336                     let ei_t = ei;
 337                     cur_eis.push(~MatcherPos {
 338                         elts: (*matchers).clone(),
 339                         sep: (*sep).clone(),
 340                         idx: 0u,
 341                         up: Some(ei_t),
 342                         matches: matches,
 343                         match_lo: match_idx_lo, match_hi: match_idx_hi,
 344                         sp_lo: sp.lo
 345                     });
 346                   }
 347                   MatchNonterminal(_,_,_) => { bb_eis.push(ei) }
 348                   MatchTok(ref t) => {
 349                     let mut ei_t = ei.clone();
 350                     //if (token_name_eq(t,&tok)) {
 351                     if token::mtwt_token_eq(t,&tok) {
 352                         ei_t.idx += 1;
 353                         next_eis.push(ei_t);
 354                     }
 355                   }
 356                 }
 357             }
 358         }
 359
 360         /* error messages here could be improved with links to orig. rules */
 361         if token_name_eq(&tok, &EOF) {
 362             if eof_eis.len() == 1u {
 363                 let mut v = Vec::new();
 364                 for dv in eof_eis.get_mut(0).matches.mut_iter() {
 365                     v.push(dv.pop().unwrap());
 366                 }
 367                 return Success(nameize(sess, ms, v.as_slice()));
 368             } else if eof_eis.len() > 1u {
 369                 return Error(sp, ~"ambiguity: multiple successful parses");
 370             } else {
 371                 return Failure(sp, ~"unexpected end of macro invocation");
 372             }
 373         } else {
 374             if (bb_eis.len() > 0u && next_eis.len() > 0u)
 375                 || bb_eis.len() > 1u {
 376                 let nts = bb_eis.map(|ei| {
 377                     match ei.elts.get(ei.idx).node {
 378                       MatchNonterminal(bind, name, _) => {
 379                         format!("{} ('{}')",
 380                                 token::get_ident(name),
 381                                 token::get_ident(bind))
 382                       }
 383                       _ => fail!()
 384                     } }).connect(" or ");
 385                 return Error(sp, format!(
 386                     "local ambiguity: multiple parsing options: \
 387                      built-in NTs {} or {} other options.",
 388                     nts, next_eis.len()));
 389             } else if bb_eis.len() == 0u && next_eis.len() == 0u {
 390                 return Failure(sp, format!("no rules expected the token `{}`",
 391                             token::to_str(&tok)));
 392             } else if next_eis.len() > 0u {
 393                 /* Now process the next token */
 394                 while next_eis.len() > 0u {
 395                     cur_eis.push(next_eis.pop().unwrap());
 396                 }
 397                 rdr.next_token();
 398             } else /* bb_eis.len() == 1 */ {
 399                 let mut rust_parser = Parser(sess, cfg.clone(), rdr.dup());
 400
 401                 let mut ei = bb_eis.pop().unwrap();
 402                 match ei.elts.get(ei.idx).node {
 403                   MatchNonterminal(_, name, idx) => {
 404                     let name_string = token::get_ident(name);
 405                     ei.matches.get_mut(idx).push(@MatchedNonterminal(
 406                         parse_nt(&mut rust_parser, name_string.get())));
 407                     ei.idx += 1u;
 408                   }
 409                   _ => fail!()
 410                 }
 411                 cur_eis.push(ei);
 412
 413                 for _ in range(0, rust_parser.tokens_consumed) {
 414                     let _ = rdr.next_token();
 415                 }
 416             }
 417         }
 418
 419         assert!(cur_eis.len() > 0u);
 420     }
 421 }
 422
 423 pub fn parse_nt(p: &mut Parser, name: &str) -> Nonterminal {
 424     match name {
 425       "item" => match p.parse_item(Vec::new()) {
 426         Some(i) => token::NtItem(i),
 427         None => p.fatal("expected an item keyword")
 428       },
 429       "block" => token::NtBlock(p.parse_block()),
 430       "stmt" => token::NtStmt(p.parse_stmt(Vec::new())),
 431       "pat" => token::NtPat(p.parse_pat()),
 432       "expr" => token::NtExpr(p.parse_expr()),
 433       "ty" => token::NtTy(p.parse_ty(false /* no need to disambiguate*/)),
 434       // this could be handled like a token, since it is one
 435       "ident" => match p.token {
 436         token::IDENT(sn,b) => { p.bump(); token::NtIdent(~sn,b) }
 437         _ => {
 438             let token_str = token::to_str(&p.token);
 439             p.fatal(~"expected ident, found " + token_str)
 440         }
 441       },
 442       "path" => {
 443         token::NtPath(~p.parse_path(LifetimeAndTypesWithoutColons).path)
 444       }
 445       "attr" => token::NtAttr(@p.parse_attribute(false)),
 446       "tt" => {
 447         p.quote_depth += 1u; //but in theory, non-quoted tts might be useful
 448         let res = token::NtTT(@p.parse_token_tree());
 449         p.quote_depth -= 1u;
 450         res
 451       }
 452       "matchers" => token::NtMatchers(p.parse_matchers()),
 453       _ => p.fatal(~"unsupported builtin nonterminal parser: " + name)
 454     }
 455 }