1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! This crate provides the `regex!` macro. Its use is documented in the
14 #![crate_name = "regex_macros"]
15 #![crate_type = "dylib"]
16 #![experimental = "use the crates.io `regex_macros` library instead"]
17 #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
18 html_favicon_url = "http://www.rust-lang.org/favicon.ico",
19 html_root_url = "http://doc.rust-lang.org/nightly/")]
21 #![feature(plugin_registrar, quote)]
22 #![feature(unboxed_closures)]
32 use syntax::ext::build::AstBuilder;
33 use syntax::ext::base::{ExtCtxt, MacResult, MacExpr, DummyResult};
34 use syntax::parse::token;
35 use syntax::print::pprust;
36 use syntax::fold::Folder;
39 use rustc::plugin::Registry;
43 OneChar, CharClass, Any, Save, Jump, Split,
44 Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
45 Program, Dynamic, ExDynamic, Native,
46 FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
49 /// For the `regex!` syntax extension. Do not use.
52 pub fn plugin_registrar(reg: &mut Registry) {
53 reg.register_macro("regex", native);
56 /// Generates specialized code for the Pike VM for a particular regular
59 /// There are two primary differences between the code generated here and the
60 /// general code in vm.rs.
62 /// 1. All heap allocation is removed. Sized vector types are used instead.
63 /// Care must be taken to make sure that these vectors are not copied
64 /// gratuitously. (If you're not sure, run the benchmarks. They will yell
65 /// at you if you do.)
66 /// 2. The main `match instruction { ... }` expressions are replaced with more
67 /// direct `match pc { ... }`. The generators can be found in
68 /// `step_insts` and `add_insts`.
70 /// Other more minor changes include eliding code when possible (although this
71 /// isn't completely thorough at the moment), and translating character class
72 /// matching from using a binary search to a simple `match` expression (see
75 /// It is strongly recommended to read the dynamic implementation in vm.rs
76 /// first before trying to understand the code generator. The implementation
77 /// strategy is identical and vm.rs has comments and will be easier to follow.
78 #[allow(experimental)]
79 fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
80 -> Box<MacResult+'static> {
81 let regex = match parse(cx, tts) {
83 // error is logged in 'parse' with cx.span_err
84 None => return DummyResult::any(sp),
86 let re = match Regex::new(regex.as_slice()) {
89 cx.span_err(sp, err.to_string().as_slice());
90 return DummyResult::any(sp)
94 Dynamic(ExDynamic { ref prog, .. }) => prog.clone(),
95 Native(_) => unreachable!(),
98 let mut gen = NfaGen {
99 cx: &*cx, sp: sp, prog: prog,
100 names: re.names_iter().collect(), original: re.as_str().to_string(),
102 MacExpr::new(gen.code())
109 names: Vec<Option<String>>,
113 impl<'a> NfaGen<'a> {
114 fn code(&mut self) -> P<ast::Expr> {
115 // Most or all of the following things are used in the quasiquoted
116 // expression returned.
117 let num_cap_locs = 2 * self.prog.num_captures();
118 let num_insts = self.prog.insts.len();
119 let cap_names = self.vec_expr(self.names.iter(),
120 |cx, name| match *name {
122 let name = name.as_slice();
123 quote_expr!(cx, Some($name))
125 None => cx.expr_none(self.sp),
129 match self.prog.insts[1] {
130 EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
133 let init_groups = self.vec_expr(range(0, num_cap_locs),
134 |cx, _| cx.expr_none(self.sp));
136 let prefix_lit = Rc::new(self.prog.prefix.as_bytes().to_vec());
137 let prefix_bytes = self.cx.expr_lit(self.sp, ast::LitBinary(prefix_lit));
139 let check_prefix = self.check_prefix();
140 let step_insts = self.step_insts();
141 let add_insts = self.add_insts();
142 let regex = self.original.as_slice();
144 quote_expr!(self.cx, {
145 // When `regex!` is bound to a name that is not used, we have to make sure
146 // that dead_code warnings don't bubble up to the user from the generated
147 // code. Therefore, we suppress them by allowing dead_code. The effect is that
148 // the user is only warned about *their* unused variable/code, and not the
149 // unused code generated by regex!. See #14185 for an example.
151 static CAP_NAMES: &'static [Option<&'static str>] = &$cap_names;
154 fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
155 start: uint, end: uint) -> Vec<Option<uint>> {
156 #![allow(unused_imports)]
157 #![allow(unused_mut)]
160 MatchKind, Exists, Location, Submatches,
161 StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
162 CharReader, find_prefix,
169 chars: CharReader::new(input),
172 type Captures = [Option<uint>; $num_cap_locs];
178 chars: CharReader<'t>,
182 #[allow(unused_variables)]
183 fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> {
184 let mut matched = false;
185 let prefix_bytes: &[u8] = $prefix_bytes;
186 let mut clist = &mut Threads::new(self.which);
187 let mut nlist = &mut Threads::new(self.which);
189 let mut groups = $init_groups;
192 let mut next_ic = self.chars.set(start);
193 while self.ic <= end {
200 if clist.size == 0 || (!$prefix_anchor && !matched) {
201 self.add(clist, 0, &mut groups)
205 next_ic = self.chars.advance();
207 for i in range(0, clist.size) {
208 let pc = clist.pc(i);
209 let step_state = self.step(&mut groups, nlist,
210 clist.groups(i), pc);
212 StepMatchEarlyReturn =>
213 return vec![Some(0u), Some(0u)],
214 StepMatch => { matched = true; break },
218 ::std::mem::swap(&mut clist, &mut nlist);
222 Exists if matched => vec![Some(0u), Some(0u)],
223 Exists => vec![None, None],
224 Location | Submatches => groups.iter().map(|x| *x).collect(),
228 // Sometimes `nlist` is never used (for empty regexes).
229 #[allow(unused_variables)]
231 fn step(&self, groups: &mut Captures, nlist: &mut Threads,
232 caps: &mut Captures, pc: uint) -> StepState {
237 fn add(&self, nlist: &mut Threads, pc: uint,
238 groups: &mut Captures) {
239 if nlist.contains(pc) {
253 queue: [Thread; $num_insts],
254 sparse: [uint; $num_insts],
259 fn new(which: MatchKind) -> Threads {
262 // These unsafe blocks are used for performance reasons, as it
263 // gives us a zero-cost initialization of a sparse set. The
264 // trick is described in more detail here:
265 // http://research.swtch.com/sparse
266 // The idea here is to avoid initializing threads that never
267 // need to be initialized, particularly for larger regexs with
268 // a lot of instructions.
269 queue: unsafe { ::std::mem::uninitialized() },
270 sparse: unsafe { ::std::mem::uninitialized() },
276 fn add(&mut self, pc: uint, groups: &Captures) {
277 let t = &mut self.queue[self.size];
282 t.groups[0] = groups[0];
283 t.groups[1] = groups[1];
286 for (slot, val) in t.groups.iter_mut().zip(groups.iter()) {
291 self.sparse[pc] = self.size;
296 fn add_empty(&mut self, pc: uint) {
297 self.queue[self.size].pc = pc;
298 self.sparse[pc] = self.size;
303 fn contains(&self, pc: uint) -> bool {
304 let s = self.sparse[pc];
305 s < self.size && self.queue[s].pc == pc
309 fn empty(&mut self) {
314 fn pc(&self, i: uint) -> uint {
319 fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures {
320 &mut self.queue[i].groups
325 ::regex::native::Native(::regex::native::ExNative {
333 // Generates code for the `add` method, which is responsible for adding
334 // zero-width states to the next queue of states to visit.
335 fn add_insts(&self) -> P<ast::Expr> {
336 let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
338 let body = match *inst {
339 EmptyBegin(flags) => {
341 if flags & FLAG_MULTI > 0 {
343 self.chars.is_begin()
344 || self.chars.prev == Some('\n')
347 quote_expr!(self.cx, self.chars.is_begin())
349 quote_expr!(self.cx, {
350 nlist.add_empty($pc);
351 if $cond { self.add(nlist, $nextpc, &mut *groups) }
356 if flags & FLAG_MULTI > 0 {
359 || self.chars.cur == Some('\n')
362 quote_expr!(self.cx, self.chars.is_end())
364 quote_expr!(self.cx, {
365 nlist.add_empty($pc);
366 if $cond { self.add(nlist, $nextpc, &mut *groups) }
369 EmptyWordBoundary(flags) => {
371 if flags & FLAG_NEGATED > 0 {
372 quote_expr!(self.cx, !self.chars.is_word_boundary())
374 quote_expr!(self.cx, self.chars.is_word_boundary())
376 quote_expr!(self.cx, {
377 nlist.add_empty($pc);
378 if $cond { self.add(nlist, $nextpc, &mut *groups) }
382 let save = quote_expr!(self.cx, {
383 let old = groups[$slot];
384 groups[$slot] = Some(self.ic);
385 self.add(nlist, $nextpc, &mut *groups);
388 let add = quote_expr!(self.cx, {
389 self.add(nlist, $nextpc, &mut *groups);
391 // If this is saving a submatch location but we request
392 // existence or only full match location, then we can skip
393 // right over it every time.
395 quote_expr!(self.cx, {
396 nlist.add_empty($pc);
399 Exists | Location => $add,
403 quote_expr!(self.cx, {
404 nlist.add_empty($pc);
406 Submatches | Location => $save,
413 quote_expr!(self.cx, {
414 nlist.add_empty($pc);
415 self.add(nlist, $to, &mut *groups);
419 quote_expr!(self.cx, {
420 nlist.add_empty($pc);
421 self.add(nlist, $x, &mut *groups);
422 self.add(nlist, $y, &mut *groups);
425 // For Match, OneChar, CharClass, Any
426 _ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
428 self.arm_inst(pc, body)
429 }).collect::<Vec<ast::Arm>>();
431 self.match_insts(arms)
434 // Generates the code for the `step` method, which processes all states
435 // in the current queue that consume a single character.
436 fn step_insts(&self) -> P<ast::Expr> {
437 let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
439 let body = match *inst {
441 quote_expr!(self.cx, {
444 return StepMatchEarlyReturn
452 for (slot, val) in groups.iter_mut().zip(caps.iter()) {
460 OneChar(c, flags) => {
461 if flags & FLAG_NOCASE > 0 {
462 let upc = c.to_uppercase();
463 quote_expr!(self.cx, {
464 let upc = self.chars.prev.map(|c| c.to_uppercase());
465 if upc == Some($upc) {
466 self.add(nlist, $nextpc, caps);
470 quote_expr!(self.cx, {
471 if self.chars.prev == Some($c) {
472 self.add(nlist, $nextpc, caps);
477 CharClass(ref ranges, flags) => {
478 let negate = flags & FLAG_NEGATED > 0;
479 let casei = flags & FLAG_NOCASE > 0;
482 quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase())
484 quote_expr!(self.cx, self.chars.prev.unwrap())
488 quote_expr!(self.cx, !found)
490 quote_expr!(self.cx, found)
492 let mranges = self.match_class(casei, ranges.as_slice());
493 quote_expr!(self.cx, {
494 if self.chars.prev.is_some() {
496 let found = $mranges;
498 self.add(nlist, $nextpc, caps);
504 if flags & FLAG_DOTNL > 0 {
505 quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
507 quote_expr!(self.cx, {
508 if self.chars.prev != Some('\n') {
509 self.add(nlist, $nextpc, caps)
515 // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
516 _ => self.empty_block(),
518 self.arm_inst(pc, body)
519 }).collect::<Vec<ast::Arm>>();
521 self.match_insts(arms)
524 // Translates a character class into a match expression.
525 // This avoids a binary search (and is hopefully replaced by a jump
527 fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> P<ast::Expr> {
528 let mut arms = ranges.iter().map(|&(mut start, mut end)| {
530 start = start.to_uppercase();
531 end = end.to_uppercase();
533 let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
534 quote_expr!(self.cx, $end)));
535 self.cx.arm(self.sp, vec!(pat), quote_expr!(self.cx, true))
536 }).collect::<Vec<ast::Arm>>();
538 arms.push(self.wild_arm_expr(quote_expr!(self.cx, false)));
540 let match_on = quote_expr!(self.cx, c);
541 self.cx.expr_match(self.sp, match_on, arms)
544 // Generates code for checking a literal prefix of the search string.
545 // The code is only generated if the regex *has* a literal prefix.
546 // Otherwise, a no-op is returned.
547 fn check_prefix(&self) -> P<ast::Expr> {
548 if self.prog.prefix.len() == 0 {
553 let haystack = self.input.as_bytes()[self.ic..];
554 match find_prefix(prefix_bytes, haystack) {
558 next_ic = self.chars.set(self.ic);
566 // Builds a `match pc { ... }` expression from a list of arms, specifically
567 // for matching the current program counter with an instruction.
568 // A wild-card arm is automatically added that executes a no-op. It will
569 // never be used, but is added to satisfy the compiler complaining about
570 // non-exhaustive patterns.
571 fn match_insts(&self, mut arms: Vec<ast::Arm>) -> P<ast::Expr> {
572 arms.push(self.wild_arm_expr(self.empty_block()));
573 self.cx.expr_match(self.sp, quote_expr!(self.cx, pc), arms)
576 fn empty_block(&self) -> P<ast::Expr> {
577 quote_expr!(self.cx, {})
580 // Creates a match arm for the instruction at `pc` with the expression
582 fn arm_inst(&self, pc: uint, body: P<ast::Expr>) -> ast::Arm {
583 let pc_pat = self.cx.pat_lit(self.sp, quote_expr!(self.cx, $pc));
585 self.cx.arm(self.sp, vec!(pc_pat), body)
588 // Creates a wild-card match arm with the expression `body`.
589 fn wild_arm_expr(&self, body: P<ast::Expr>) -> ast::Arm {
592 pats: vec!(P(ast::Pat{
593 id: ast::DUMMY_NODE_ID,
595 node: ast::PatWild(ast::PatWildSingle),
603 // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr`
604 // on each element in `xs`.
605 fn vec_expr<T, It, F>(&self, xs: It, mut to_expr: F) -> P<ast::Expr> where
606 It: Iterator<Item=T>,
607 F: FnMut(&ExtCtxt, T) -> P<ast::Expr>,
609 let exprs = xs.map(|x| to_expr(self.cx, x)).collect();
610 self.cx.expr_vec(self.sp, exprs)
614 /// Looks for a single string literal and returns it.
615 /// Otherwise, logs an error with cx.span_err and returns None.
616 fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<String> {
617 let mut parser = cx.new_parser_from_tts(tts);
618 let entry = cx.expander().fold_expr(parser.parse_expr());
619 let regex = match entry.node {
620 ast::ExprLit(ref lit) => {
622 ast::LitStr(ref s, _) => s.to_string(),
624 cx.span_err(entry.span, format!(
625 "expected string literal but got `{}`",
626 pprust::lit_to_string(&**lit)).as_slice());
632 cx.span_err(entry.span, format!(
633 "expected string literal but got `{}`",
634 pprust::expr_to_string(&*entry)).as_slice());
638 if !parser.eat(&token::Eof) {
639 cx.span_err(parser.span, "only one string literal allowed");