src/libregex_macros/lib.rs

   1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 //! This crate provides the `regex!` macro. Its use is documented in the
  12 //! `regex` crate.
  13
  14 #![crate_name = "regex_macros"]
  15 #![crate_type = "dylib"]
  16 #![experimental = "use the crates.io `regex_macros` library instead"]
  17 #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
  18        html_favicon_url = "http://www.rust-lang.org/favicon.ico",
  19        html_root_url = "http://doc.rust-lang.org/nightly/")]
  20
  21 #![feature(plugin_registrar, quote)]
  22 #![feature(unboxed_closures)]
  23
  24 extern crate regex;
  25 extern crate syntax;
  26 extern crate rustc;
  27
  28 use std::rc::Rc;
  29
  30 use syntax::ast;
  31 use syntax::codemap;
  32 use syntax::ext::build::AstBuilder;
  33 use syntax::ext::base::{ExtCtxt, MacResult, MacExpr, DummyResult};
  34 use syntax::parse::token;
  35 use syntax::print::pprust;
  36 use syntax::fold::Folder;
  37 use syntax::ptr::P;
  38
  39 use rustc::plugin::Registry;
  40
  41 use regex::Regex;
  42 use regex::native::{
  43     OneChar, CharClass, Any, Save, Jump, Split,
  44     Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
  45     Program, Dynamic, ExDynamic, Native,
  46     FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
  47 };
  48
  49 /// For the `regex!` syntax extension. Do not use.
  50 #[plugin_registrar]
  51 #[doc(hidden)]
  52 pub fn plugin_registrar(reg: &mut Registry) {
  53     reg.register_macro("regex", native);
  54 }
  55
  56 /// Generates specialized code for the Pike VM for a particular regular
  57 /// expression.
  58 ///
  59 /// There are two primary differences between the code generated here and the
  60 /// general code in vm.rs.
  61 ///
  62 /// 1. All heap allocation is removed. Sized vector types are used instead.
  63 ///    Care must be taken to make sure that these vectors are not copied
  64 ///    gratuitously. (If you're not sure, run the benchmarks. They will yell
  65 ///    at you if you do.)
  66 /// 2. The main `match instruction { ... }` expressions are replaced with more
  67 ///    direct `match pc { ... }`. The generators can be found in
  68 ///    `step_insts` and `add_insts`.
  69 ///
  70 /// Other more minor changes include eliding code when possible (although this
  71 /// isn't completely thorough at the moment), and translating character class
  72 /// matching from using a binary search to a simple `match` expression (see
  73 /// `match_class`).
  74 ///
  75 /// It is strongly recommended to read the dynamic implementation in vm.rs
  76 /// first before trying to understand the code generator. The implementation
  77 /// strategy is identical and vm.rs has comments and will be easier to follow.
  78 #[allow(experimental)]
  79 fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
  80           -> Box<MacResult+'static> {
  81     let regex = match parse(cx, tts) {
  82         Some(r) => r,
  83         // error is logged in 'parse' with cx.span_err
  84         None => return DummyResult::any(sp),
  85     };
  86     let re = match Regex::new(regex.as_slice()) {
  87         Ok(re) => re,
  88         Err(err) => {
  89             cx.span_err(sp, err.to_string().as_slice());
  90             return DummyResult::any(sp)
  91         }
  92     };
  93     let prog = match re {
  94         Dynamic(ExDynamic { ref prog, .. }) => prog.clone(),
  95         Native(_) => unreachable!(),
  96     };
  97
  98     let mut gen = NfaGen {
  99         cx: &*cx, sp: sp, prog: prog,
 100         names: re.names_iter().collect(), original: re.as_str().to_string(),
 101     };
 102     MacExpr::new(gen.code())
 103 }
 104
 105 struct NfaGen<'a> {
 106     cx: &'a ExtCtxt<'a>,
 107     sp: codemap::Span,
 108     prog: Program,
 109     names: Vec<Option<String>>,
 110     original: String,
 111 }
 112
 113 impl<'a> NfaGen<'a> {
 114     fn code(&mut self) -> P<ast::Expr> {
 115         // Most or all of the following things are used in the quasiquoted
 116         // expression returned.
 117         let num_cap_locs = 2 * self.prog.num_captures();
 118         let num_insts = self.prog.insts.len();
 119         let cap_names = self.vec_expr(self.names.iter(),
 120             |cx, name| match *name {
 121                 Some(ref name) => {
 122                     let name = name.as_slice();
 123                     quote_expr!(cx, Some($name))
 124                 }
 125                 None => cx.expr_none(self.sp),
 126             }
 127         );
 128         let prefix_anchor =
 129             match self.prog.insts[1] {
 130                 EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
 131                 _ => false,
 132             };
 133         let init_groups = self.vec_expr(range(0, num_cap_locs),
 134                                         |cx, _| cx.expr_none(self.sp));
 135
 136         let prefix_lit = Rc::new(self.prog.prefix.as_bytes().to_vec());
 137         let prefix_bytes = self.cx.expr_lit(self.sp, ast::LitBinary(prefix_lit));
 138
 139         let check_prefix = self.check_prefix();
 140         let step_insts = self.step_insts();
 141         let add_insts = self.add_insts();
 142         let regex = self.original.as_slice();
 143
 144         quote_expr!(self.cx, {
 145 // When `regex!` is bound to a name that is not used, we have to make sure
 146 // that dead_code warnings don't bubble up to the user from the generated
 147 // code. Therefore, we suppress them by allowing dead_code. The effect is that
 148 // the user is only warned about *their* unused variable/code, and not the
 149 // unused code generated by regex!. See #14185 for an example.
 150 #[allow(dead_code)]
 151 static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
 152
 153 #[allow(dead_code)]
 154 fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
 155             start: uint, end: uint) -> Vec<Option<uint>> {
 156     #![allow(unused_imports)]
 157     #![allow(unused_mut)]
 158
 159     use regex::native::{
 160         MatchKind, Exists, Location, Submatches,
 161         StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
 162         CharReader, find_prefix,
 163     };
 164
 165     return Nfa {
 166         which: which,
 167         input: input,
 168         ic: 0,
 169         chars: CharReader::new(input),
 170     }.run(start, end);
 171
 172     type Captures = [Option<uint>; $num_cap_locs];
 173
 174     struct Nfa<'t> {
 175         which: MatchKind,
 176         input: &'t str,
 177         ic: uint,
 178         chars: CharReader<'t>,
 179     }
 180
 181     impl<'t> Nfa<'t> {
 182         #[allow(unused_variables)]
 183         fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> {
 184             let mut matched = false;
 185             let prefix_bytes: &[u8] = $prefix_bytes;
 186             let mut clist = &mut Threads::new(self.which);
 187             let mut nlist = &mut Threads::new(self.which);
 188
 189             let mut groups = $init_groups;
 190
 191             self.ic = start;
 192             let mut next_ic = self.chars.set(start);
 193             while self.ic <= end {
 194                 if clist.size == 0 {
 195                     if matched {
 196                         break
 197                     }
 198                     $check_prefix
 199                 }
 200                 if clist.size == 0 || (!$prefix_anchor && !matched) {
 201                     self.add(clist, 0, &mut groups)
 202                 }
 203
 204                 self.ic = next_ic;
 205                 next_ic = self.chars.advance();
 206
 207                 for i in range(0, clist.size) {
 208                     let pc = clist.pc(i);
 209                     let step_state = self.step(&mut groups, nlist,
 210                                                clist.groups(i), pc);
 211                     match step_state {
 212                         StepMatchEarlyReturn =>
 213                             return vec![Some(0u), Some(0u)],
 214                         StepMatch => { matched = true; break },
 215                         StepContinue => {},
 216                     }
 217                 }
 218                 ::std::mem::swap(&mut clist, &mut nlist);
 219                 nlist.empty();
 220             }
 221             match self.which {
 222                 Exists if matched     => vec![Some(0u), Some(0u)],
 223                 Exists                => vec![None, None],
 224                 Location | Submatches => groups.iter().map(|x| *x).collect(),
 225             }
 226         }
 227
 228         // Sometimes `nlist` is never used (for empty regexes).
 229         #[allow(unused_variables)]
 230         #[inline]
 231         fn step(&self, groups: &mut Captures, nlist: &mut Threads,
 232                 caps: &mut Captures, pc: uint) -> StepState {
 233             $step_insts
 234             StepContinue
 235         }
 236
 237         fn add(&self, nlist: &mut Threads, pc: uint,
 238                groups: &mut Captures) {
 239             if nlist.contains(pc) {
 240                 return
 241             }
 242             $add_insts
 243         }
 244     }
 245
 246     struct Thread {
 247         pc: uint,
 248         groups: Captures,
 249     }
 250
 251     struct Threads {
 252         which: MatchKind,
 253         queue: [Thread; $num_insts],
 254         sparse: [uint; $num_insts],
 255         size: uint,
 256     }
 257
 258     impl Threads {
 259         fn new(which: MatchKind) -> Threads {
 260             Threads {
 261                 which: which,
 262                 // These unsafe blocks are used for performance reasons, as it
 263                 // gives us a zero-cost initialization of a sparse set. The
 264                 // trick is described in more detail here:
 265                 // http://research.swtch.com/sparse
 266                 // The idea here is to avoid initializing threads that never
 267                 // need to be initialized, particularly for larger regexs with
 268                 // a lot of instructions.
 269                 queue: unsafe { ::std::mem::uninitialized() },
 270                 sparse: unsafe { ::std::mem::uninitialized() },
 271                 size: 0,
 272             }
 273         }
 274
 275         #[inline]
 276         fn add(&mut self, pc: uint, groups: &Captures) {
 277             let t = &mut self.queue[self.size];
 278             t.pc = pc;
 279             match self.which {
 280                 Exists => {},
 281                 Location => {
 282                     t.groups[0] = groups[0];
 283                     t.groups[1] = groups[1];
 284                 }
 285                 Submatches => {
 286                     for (slot, val) in t.groups.iter_mut().zip(groups.iter()) {
 287                         *slot = *val;
 288                     }
 289                 }
 290             }
 291             self.sparse[pc] = self.size;
 292             self.size += 1;
 293         }
 294
 295         #[inline]
 296         fn add_empty(&mut self, pc: uint) {
 297             self.queue[self.size].pc = pc;
 298             self.sparse[pc] = self.size;
 299             self.size += 1;
 300         }
 301
 302         #[inline]
 303         fn contains(&self, pc: uint) -> bool {
 304             let s = self.sparse[pc];
 305             s < self.size && self.queue[s].pc == pc
 306         }
 307
 308         #[inline]
 309         fn empty(&mut self) {
 310             self.size = 0;
 311         }
 312
 313         #[inline]
 314         fn pc(&self, i: uint) -> uint {
 315             self.queue[i].pc
 316         }
 317
 318         #[inline]
 319         fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures {
 320             &mut self.queue[i].groups
 321         }
 322     }
 323 }
 324
 325 ::regex::native::Native(::regex::native::ExNative {
 326     original: $regex,
 327     names: &CAP_NAMES,
 328     prog: exec,
 329 })
 330         })
 331     }
 332
 333     // Generates code for the `add` method, which is responsible for adding
 334     // zero-width states to the next queue of states to visit.
 335     fn add_insts(&self) -> P<ast::Expr> {
 336         let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
 337             let nextpc = pc + 1;
 338             let body = match *inst {
 339                 EmptyBegin(flags) => {
 340                     let cond =
 341                         if flags & FLAG_MULTI > 0 {
 342                             quote_expr!(self.cx,
 343                                 self.chars.is_begin()
 344                                 || self.chars.prev == Some('\n')
 345                             )
 346                         } else {
 347                             quote_expr!(self.cx, self.chars.is_begin())
 348                         };
 349                     quote_expr!(self.cx, {
 350                         nlist.add_empty($pc);
 351                         if $cond { self.add(nlist, $nextpc, &mut *groups) }
 352                     })
 353                 }
 354                 EmptyEnd(flags) => {
 355                     let cond =
 356                         if flags & FLAG_MULTI > 0 {
 357                             quote_expr!(self.cx,
 358                                 self.chars.is_end()
 359                                 || self.chars.cur == Some('\n')
 360                             )
 361                         } else {
 362                             quote_expr!(self.cx, self.chars.is_end())
 363                         };
 364                     quote_expr!(self.cx, {
 365                         nlist.add_empty($pc);
 366                         if $cond { self.add(nlist, $nextpc, &mut *groups) }
 367                     })
 368                 }
 369                 EmptyWordBoundary(flags) => {
 370                     let cond =
 371                         if flags & FLAG_NEGATED > 0 {
 372                             quote_expr!(self.cx, !self.chars.is_word_boundary())
 373                         } else {
 374                             quote_expr!(self.cx, self.chars.is_word_boundary())
 375                         };
 376                     quote_expr!(self.cx, {
 377                         nlist.add_empty($pc);
 378                         if $cond { self.add(nlist, $nextpc, &mut *groups) }
 379                     })
 380                 }
 381                 Save(slot) => {
 382                     let save = quote_expr!(self.cx, {
 383                         let old = groups[$slot];
 384                         groups[$slot] = Some(self.ic);
 385                         self.add(nlist, $nextpc, &mut *groups);
 386                         groups[$slot] = old;
 387                     });
 388                     let add = quote_expr!(self.cx, {
 389                         self.add(nlist, $nextpc, &mut *groups);
 390                     });
 391                     // If this is saving a submatch location but we request
 392                     // existence or only full match location, then we can skip
 393                     // right over it every time.
 394                     if slot > 1 {
 395                         quote_expr!(self.cx, {
 396                             nlist.add_empty($pc);
 397                             match self.which {
 398                                 Submatches => $save,
 399                                 Exists | Location => $add,
 400                             }
 401                         })
 402                     } else {
 403                         quote_expr!(self.cx, {
 404                             nlist.add_empty($pc);
 405                             match self.which {
 406                                 Submatches | Location => $save,
 407                                 Exists => $add,
 408                             }
 409                         })
 410                     }
 411                 }
 412                 Jump(to) => {
 413                     quote_expr!(self.cx, {
 414                         nlist.add_empty($pc);
 415                         self.add(nlist, $to, &mut *groups);
 416                     })
 417                 }
 418                 Split(x, y) => {
 419                     quote_expr!(self.cx, {
 420                         nlist.add_empty($pc);
 421                         self.add(nlist, $x, &mut *groups);
 422                         self.add(nlist, $y, &mut *groups);
 423                     })
 424                 }
 425                 // For Match, OneChar, CharClass, Any
 426                 _ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
 427             };
 428             self.arm_inst(pc, body)
 429         }).collect::<Vec<ast::Arm>>();
 430
 431         self.match_insts(arms)
 432     }
 433
 434     // Generates the code for the `step` method, which processes all states
 435     // in the current queue that consume a single character.
 436     fn step_insts(&self) -> P<ast::Expr> {
 437         let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
 438             let nextpc = pc + 1;
 439             let body = match *inst {
 440                 Match => {
 441                     quote_expr!(self.cx, {
 442                         match self.which {
 443                             Exists => {
 444                                 return StepMatchEarlyReturn
 445                             }
 446                             Location => {
 447                                 groups[0] = caps[0];
 448                                 groups[1] = caps[1];
 449                                 return StepMatch
 450                             }
 451                             Submatches => {
 452                                 for (slot, val) in groups.iter_mut().zip(caps.iter()) {
 453                                     *slot = *val;
 454                                 }
 455                                 return StepMatch
 456                             }
 457                         }
 458                     })
 459                 }
 460                 OneChar(c, flags) => {
 461                     if flags & FLAG_NOCASE > 0 {
 462                         let upc = c.to_uppercase();
 463                         quote_expr!(self.cx, {
 464                             let upc = self.chars.prev.map(|c| c.to_uppercase());
 465                             if upc == Some($upc) {
 466                                 self.add(nlist, $nextpc, caps);
 467                             }
 468                         })
 469                     } else {
 470                         quote_expr!(self.cx, {
 471                             if self.chars.prev == Some($c) {
 472                                 self.add(nlist, $nextpc, caps);
 473                             }
 474                         })
 475                     }
 476                 }
 477                 CharClass(ref ranges, flags) => {
 478                     let negate = flags & FLAG_NEGATED > 0;
 479                     let casei = flags & FLAG_NOCASE > 0;
 480                     let get_char =
 481                         if casei {
 482                             quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase())
 483                         } else {
 484                             quote_expr!(self.cx, self.chars.prev.unwrap())
 485                         };
 486                     let negcond =
 487                         if negate {
 488                             quote_expr!(self.cx, !found)
 489                         } else {
 490                             quote_expr!(self.cx, found)
 491                         };
 492                     let mranges = self.match_class(casei, ranges.as_slice());
 493                     quote_expr!(self.cx, {
 494                         if self.chars.prev.is_some() {
 495                             let c = $get_char;
 496                             let found = $mranges;
 497                             if $negcond {
 498                                 self.add(nlist, $nextpc, caps);
 499                             }
 500                         }
 501                     })
 502                 }
 503                 Any(flags) => {
 504                     if flags & FLAG_DOTNL > 0 {
 505                         quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
 506                     } else {
 507                         quote_expr!(self.cx, {
 508                             if self.chars.prev != Some('\n') {
 509                                 self.add(nlist, $nextpc, caps)
 510                             }
 511                             ()
 512                         })
 513                     }
 514                 }
 515                 // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
 516                 _ => self.empty_block(),
 517             };
 518             self.arm_inst(pc, body)
 519         }).collect::<Vec<ast::Arm>>();
 520
 521         self.match_insts(arms)
 522     }
 523
 524     // Translates a character class into a match expression.
 525     // This avoids a binary search (and is hopefully replaced by a jump
 526     // table).
 527     fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
 528         let mut arms = ranges.iter().map(|&(mut start, mut end)| {
 529             if casei {
 530                 start = start.to_uppercase();
 531                 end = end.to_uppercase();
 532             }
 533             let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
 534                                                          quote_expr!(self.cx, $end)));
 535             self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))
 536         }).collect::<Vec<ast::Arm>>();
 537
 538         arms.push(self.wild_arm_expr(quote_expr!(self.cx, false)));
 539
 540         let match_on = quote_expr!(self.cx, c);
 541         self.cx.expr_match(self.sp, match_on, arms)
 542     }
 543
 544     // Generates code for checking a literal prefix of the search string.
 545     // The code is only generated if the regex *has* a literal prefix.
 546     // Otherwise, a no-op is returned.
 547     fn check_prefix(&self) -> P<ast::Expr> {
 548         if self.prog.prefix.len() == 0 {
 549             self.empty_block()
 550         } else {
 551             quote_expr!(self.cx,
 552                 if clist.size == 0 {
 553                     let haystack = self.input.as_bytes()[self.ic..];
 554                     match find_prefix(prefix_bytes, haystack) {
 555                         None => break,
 556                         Some(i) => {
 557                             self.ic += i;
 558                             next_ic = self.chars.set(self.ic);
 559                         }
 560                     }
 561                 }
 562             )
 563         }
 564     }
 565
 566     // Builds a `match pc { ... }` expression from a list of arms, specifically
 567     // for matching the current program counter with an instruction.
 568     // A wild-card arm is automatically added that executes a no-op. It will
 569     // never be used, but is added to satisfy the compiler complaining about
 570     // non-exhaustive patterns.
 571     fn match_insts(&self, mut arms: Vec<ast::Arm>) -> P<ast::Expr> {
 572         arms.push(self.wild_arm_expr(self.empty_block()));
 573         self.cx.expr_match(self.sp, quote_expr!(self.cx, pc), arms)
 574     }
 575
 576     fn empty_block(&self) -> P<ast::Expr> {
 577         quote_expr!(self.cx, {})
 578     }
 579
 580     // Creates a match arm for the instruction at `pc` with the expression
 581     // `body`.
 582     fn arm_inst(&self, pc: uint, body: P<ast::Expr>) -> ast::Arm {
 583         let pc_pat = self.cx.pat_lit(self.sp, quote_expr!(self.cx, $pc));
 584
 585         self.cx.arm(self.sp, vec!(pc_pat), body)
 586     }
 587
 588     // Creates a wild-card match arm with the expression `body`.
 589     fn wild_arm_expr(&self, body: P<ast::Expr>) -> ast::Arm {
 590         ast::Arm {
 591             attrs: vec!(),
 592             pats: vec!(P(ast::Pat{
 593                 id: ast::DUMMY_NODE_ID,
 594                 span: self.sp,
 595                 node: ast::PatWild(ast::PatWildSingle),
 596             })),
 597             guard: None,
 598             body: body,
 599         }
 600     }
 601
 602
 603     // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr`
 604     // on each element in `xs`.
 605     fn vec_expr<T, It, F>(&self, xs: It, mut to_expr: F) -> P<ast::Expr> where
 606         It: Iterator<Item=T>,
 607         F: FnMut(&ExtCtxt, T) -> P<ast::Expr>,
 608     {
 609         let exprs = xs.map(|x| to_expr(self.cx, x)).collect();
 610         self.cx.expr_vec(self.sp, exprs)
 611     }
 612 }
 613
 614 /// Looks for a single string literal and returns it.
 615 /// Otherwise, logs an error with cx.span_err and returns None.
 616 fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<String> {
 617     let mut parser = cx.new_parser_from_tts(tts);
 618     let entry = cx.expander().fold_expr(parser.parse_expr());
 619     let regex = match entry.node {
 620         ast::ExprLit(ref lit) => {
 621             match lit.node {
 622                 ast::LitStr(ref s, _) => s.to_string(),
 623                 _ => {
 624                     cx.span_err(entry.span, format!(
 625                         "expected string literal but got `{}`",
 626                         pprust::lit_to_string(&**lit)).as_slice());
 627                     return None
 628                 }
 629             }
 630         }
 631         _ => {
 632             cx.span_err(entry.span, format!(
 633                 "expected string literal but got `{}`",
 634                 pprust::expr_to_string(&*entry)).as_slice());
 635             return None
 636         }
 637     };
 638     if !parser.eat(&token::Eof) {
 639         cx.span_err(parser.span, "only one string literal allowed");
 640         return None;
 641     }
 642     Some(regex)
 643 }