1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! This crate provides the `regex!` macro. Its use is documented in the
14 #![crate_id = "regex_macros#0.11-pre"]
15 #![crate_type = "dylib"]
17 #![license = "MIT/ASL2"]
18 #![doc(html_logo_url = "http://www.rust-lang.org/logos/rust-logo-128x128-blk-v2.png",
19 html_favicon_url = "http://www.rust-lang.org/favicon.ico",
20 html_root_url = "http://static.rust-lang.org/doc/master")]
22 #![feature(macro_registrar, managed_boxes, quote)]
31 use syntax::ext::build::AstBuilder;
32 use syntax::ext::base::{
33 SyntaxExtension, ExtCtxt, MacResult, MacExpr, DummyResult,
34 NormalTT, BasicMacroExpander,
37 use syntax::parse::token;
38 use syntax::print::pprust;
42 OneChar, CharClass, Any, Save, Jump, Split,
43 Match, EmptyBegin, EmptyEnd, EmptyWordBoundary,
44 Program, Dynamic, Native,
45 FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED,
48 /// For the `regex!` syntax extension. Do not use.
51 pub fn macro_registrar(register: |ast::Name, SyntaxExtension|) {
52 let expander = box BasicMacroExpander { expander: native, span: None };
53 register(token::intern("regex"), NormalTT(expander, None))
56 /// Generates specialized code for the Pike VM for a particular regular
59 /// There are two primary differences between the code generated here and the
60 /// general code in vm.rs.
62 /// 1. All heap allocation is removed. Sized vector types are used instead.
63 /// Care must be taken to make sure that these vectors are not copied
64 /// gratuitously. (If you're not sure, run the benchmarks. They will yell
65 /// at you if you do.)
66 /// 2. The main `match instruction { ... }` expressions are replaced with more
67 /// direct `match pc { ... }`. The generators can be found in
68 /// `step_insts` and `add_insts`.
70 /// Other more minor changes include eliding code when possible (although this
71 /// isn't completely thorough at the moment), and translating character class
72 /// matching from using a binary search to a simple `match` expression (see
75 /// It is strongly recommended to read the dynamic implementation in vm.rs
76 /// first before trying to understand the code generator. The implementation
77 /// strategy is identical and vm.rs has comments and will be easier to follow.
78 fn native(cx: &mut ExtCtxt, sp: codemap::Span, tts: &[ast::TokenTree])
80 let regex = match parse(cx, tts) {
82 // error is logged in 'parse' with cx.span_err
83 None => return DummyResult::any(sp),
85 let re = match Regex::new(regex.to_owned()) {
88 cx.span_err(sp, err.to_str());
89 return DummyResult::any(sp)
92 let prog = match re.p {
93 Dynamic(ref prog) => prog.clone(),
94 Native(_) => unreachable!(),
97 let mut gen = NfaGen {
98 cx: &*cx, sp: sp, prog: prog,
99 names: re.names.clone(), original: re.original.clone(),
101 MacExpr::new(gen.code())
108 names: Vec<Option<~str>>,
112 impl<'a> NfaGen<'a> {
113 fn code(&mut self) -> @ast::Expr {
114 // Most or all of the following things are used in the quasiquoted
115 // expression returned.
116 let num_cap_locs = 2 * self.prog.num_captures();
117 let num_insts = self.prog.insts.len();
118 let cap_names = self.vec_expr(self.names.as_slice().iter(),
119 |cx, name| match *name {
121 let name = name.as_slice();
122 quote_expr!(cx, Some($name.to_owned()))
124 None => cx.expr_none(self.sp),
128 match self.prog.insts.as_slice()[1] {
129 EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
132 let init_groups = self.vec_expr(range(0, num_cap_locs),
133 |cx, _| cx.expr_none(self.sp));
135 let prefix_lit = Rc::new(Vec::from_slice(self.prog.prefix.as_slice().as_bytes()));
136 let prefix_bytes = self.cx.expr_lit(self.sp, ast::LitBinary(prefix_lit));
138 let check_prefix = self.check_prefix();
139 let step_insts = self.step_insts();
140 let add_insts = self.add_insts();
141 let regex = self.original.as_slice();
143 quote_expr!(self.cx, {
144 fn exec<'t>(which: ::regex::native::MatchKind, input: &'t str,
145 start: uint, end: uint) -> Vec<Option<uint>> {
146 #![allow(unused_imports)]
148 MatchKind, Exists, Location, Submatches,
149 StepState, StepMatchEarlyReturn, StepMatch, StepContinue,
150 CharReader, find_prefix,
157 chars: CharReader::new(input),
160 type Captures = [Option<uint>, ..$num_cap_locs];
166 chars: CharReader<'t>,
170 #[allow(unused_variable)]
171 fn run(&mut self, start: uint, end: uint) -> Vec<Option<uint>> {
172 let mut matched = false;
173 let prefix_bytes: &[u8] = &$prefix_bytes;
174 let mut clist = &mut Threads::new(self.which);
175 let mut nlist = &mut Threads::new(self.which);
177 let mut groups = $init_groups;
180 let mut next_ic = self.chars.set(start);
181 while self.ic <= end {
188 if clist.size == 0 || (!$prefix_anchor && !matched) {
189 self.add(clist, 0, &mut groups)
193 next_ic = self.chars.advance();
195 for i in range(0, clist.size) {
196 let pc = clist.pc(i);
197 let step_state = self.step(&mut groups, nlist,
198 clist.groups(i), pc);
200 StepMatchEarlyReturn =>
201 return vec![Some(0u), Some(0u)],
202 StepMatch => { matched = true; break },
206 ::std::mem::swap(&mut clist, &mut nlist);
210 Exists if matched => vec![Some(0u), Some(0u)],
211 Exists => vec![None, None],
212 Location | Submatches => groups.iter().map(|x| *x).collect(),
216 // Sometimes `nlist` is never used (for empty regexes).
217 #[allow(unused_variable)]
219 fn step(&self, groups: &mut Captures, nlist: &mut Threads,
220 caps: &mut Captures, pc: uint) -> StepState {
225 fn add(&self, nlist: &mut Threads, pc: uint,
226 groups: &mut Captures) {
227 if nlist.contains(pc) {
241 queue: [Thread, ..$num_insts],
242 sparse: [uint, ..$num_insts],
247 fn new(which: MatchKind) -> Threads {
250 // These unsafe blocks are used for performance reasons, as it
251 // gives us a zero-cost initialization of a sparse set. The
252 // trick is described in more detail here:
253 // http://research.swtch.com/sparse
254 // The idea here is to avoid initializing threads that never
255 // need to be initialized, particularly for larger regexs with
256 // a lot of instructions.
257 queue: unsafe { ::std::mem::uninit() },
258 sparse: unsafe { ::std::mem::uninit() },
264 fn add(&mut self, pc: uint, groups: &Captures) {
265 let t = &mut self.queue[self.size];
270 t.groups[0] = groups[0];
271 t.groups[1] = groups[1];
274 for (slot, val) in t.groups.mut_iter().zip(groups.iter()) {
279 self.sparse[pc] = self.size;
284 fn add_empty(&mut self, pc: uint) {
285 self.queue[self.size].pc = pc;
286 self.sparse[pc] = self.size;
291 fn contains(&self, pc: uint) -> bool {
292 let s = self.sparse[pc];
293 s < self.size && self.queue[s].pc == pc
297 fn empty(&mut self) {
302 fn pc(&self, i: uint) -> uint {
307 fn groups<'r>(&'r mut self, i: uint) -> &'r mut Captures {
308 &'r mut self.queue[i].groups
314 original: $regex.to_owned(),
315 names: vec!$cap_names,
316 p: ::regex::native::Native(exec),
321 // Generates code for the `add` method, which is responsible for adding
322 // zero-width states to the next queue of states to visit.
323 fn add_insts(&self) -> @ast::Expr {
324 let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
326 let body = match *inst {
327 EmptyBegin(flags) => {
329 if flags & FLAG_MULTI > 0 {
331 self.chars.is_begin()
332 || self.chars.prev == Some('\n')
335 quote_expr!(self.cx, self.chars.is_begin())
337 quote_expr!(self.cx, {
338 nlist.add_empty($pc);
339 if $cond { self.add(nlist, $nextpc, &mut *groups) }
344 if flags & FLAG_MULTI > 0 {
347 || self.chars.cur == Some('\n')
350 quote_expr!(self.cx, self.chars.is_end())
352 quote_expr!(self.cx, {
353 nlist.add_empty($pc);
354 if $cond { self.add(nlist, $nextpc, &mut *groups) }
357 EmptyWordBoundary(flags) => {
359 if flags & FLAG_NEGATED > 0 {
360 quote_expr!(self.cx, !self.chars.is_word_boundary())
362 quote_expr!(self.cx, self.chars.is_word_boundary())
364 quote_expr!(self.cx, {
365 nlist.add_empty($pc);
366 if $cond { self.add(nlist, $nextpc, &mut *groups) }
370 let save = quote_expr!(self.cx, {
371 let old = groups[$slot];
372 groups[$slot] = Some(self.ic);
373 self.add(nlist, $nextpc, &mut *groups);
376 let add = quote_expr!(self.cx, {
377 self.add(nlist, $nextpc, &mut *groups);
379 // If this is saving a submatch location but we request
380 // existence or only full match location, then we can skip
381 // right over it every time.
383 quote_expr!(self.cx, {
384 nlist.add_empty($pc);
387 Exists | Location => $add,
391 quote_expr!(self.cx, {
392 nlist.add_empty($pc);
394 Submatches | Location => $save,
401 quote_expr!(self.cx, {
402 nlist.add_empty($pc);
403 self.add(nlist, $to, &mut *groups);
407 quote_expr!(self.cx, {
408 nlist.add_empty($pc);
409 self.add(nlist, $x, &mut *groups);
410 self.add(nlist, $y, &mut *groups);
413 // For Match, OneChar, CharClass, Any
414 _ => quote_expr!(self.cx, nlist.add($pc, &*groups)),
416 self.arm_inst(pc, body)
417 }).collect::<Vec<ast::Arm>>();
419 self.match_insts(arms)
422 // Generates the code for the `step` method, which processes all states
423 // in the current queue that consume a single character.
424 fn step_insts(&self) -> @ast::Expr {
425 let arms = self.prog.insts.iter().enumerate().map(|(pc, inst)| {
427 let body = match *inst {
429 quote_expr!(self.cx, {
432 return StepMatchEarlyReturn
440 for (slot, val) in groups.mut_iter().zip(caps.iter()) {
448 OneChar(c, flags) => {
449 if flags & FLAG_NOCASE > 0 {
450 let upc = c.to_uppercase();
451 quote_expr!(self.cx, {
452 let upc = self.chars.prev.map(|c| c.to_uppercase());
453 if upc == Some($upc) {
454 self.add(nlist, $nextpc, caps);
458 quote_expr!(self.cx, {
459 if self.chars.prev == Some($c) {
460 self.add(nlist, $nextpc, caps);
465 CharClass(ref ranges, flags) => {
466 let negate = flags & FLAG_NEGATED > 0;
467 let casei = flags & FLAG_NOCASE > 0;
470 quote_expr!(self.cx, self.chars.prev.unwrap().to_uppercase())
472 quote_expr!(self.cx, self.chars.prev.unwrap())
476 quote_expr!(self.cx, !found)
478 quote_expr!(self.cx, found)
480 let mranges = self.match_class(casei, ranges.as_slice());
481 quote_expr!(self.cx, {
482 if self.chars.prev.is_some() {
484 let found = $mranges;
486 self.add(nlist, $nextpc, caps);
492 if flags & FLAG_DOTNL > 0 {
493 quote_expr!(self.cx, self.add(nlist, $nextpc, caps))
495 quote_expr!(self.cx, {
496 if self.chars.prev != Some('\n') {
497 self.add(nlist, $nextpc, caps)
503 // EmptyBegin, EmptyEnd, EmptyWordBoundary, Save, Jump, Split
504 _ => self.empty_block(),
506 self.arm_inst(pc, body)
507 }).collect::<Vec<ast::Arm>>();
509 self.match_insts(arms)
512 // Translates a character class into a match expression.
513 // This avoids a binary search (and is hopefully replaced by a jump
515 fn match_class(&self, casei: bool, ranges: &[(char, char)]) -> @ast::Expr {
516 let expr_true = quote_expr!(self.cx, true);
518 let mut arms = ranges.iter().map(|&(mut start, mut end)| {
520 start = start.to_uppercase();
521 end = end.to_uppercase();
523 let pat = self.cx.pat(self.sp, ast::PatRange(quote_expr!(self.cx, $start),
524 quote_expr!(self.cx, $end)));
525 self.cx.arm(self.sp, vec!(pat), expr_true)
526 }).collect::<Vec<ast::Arm>>();
528 arms.push(self.wild_arm_expr(quote_expr!(self.cx, false)));
530 let match_on = quote_expr!(self.cx, c);
531 self.cx.expr_match(self.sp, match_on, arms)
534 // Generates code for checking a literal prefix of the search string.
535 // The code is only generated if the regex *has* a literal prefix.
536 // Otherwise, a no-op is returned.
537 fn check_prefix(&self) -> @ast::Expr {
538 if self.prog.prefix.len() == 0 {
543 let haystack = self.input.as_bytes().slice_from(self.ic);
544 match find_prefix(prefix_bytes, haystack) {
548 next_ic = self.chars.set(self.ic);
556 // Builds a `match pc { ... }` expression from a list of arms, specifically
557 // for matching the current program counter with an instruction.
558 // A wild-card arm is automatically added that executes a no-op. It will
559 // never be used, but is added to satisfy the compiler complaining about
560 // non-exhaustive patterns.
561 fn match_insts(&self, mut arms: Vec<ast::Arm>) -> @ast::Expr {
562 arms.push(self.wild_arm_expr(self.empty_block()));
563 self.cx.expr_match(self.sp, quote_expr!(self.cx, pc), arms)
566 fn empty_block(&self) -> @ast::Expr {
567 quote_expr!(self.cx, {})
570 // Creates a match arm for the instruction at `pc` with the expression
572 fn arm_inst(&self, pc: uint, body: @ast::Expr) -> ast::Arm {
573 let pc_pat = self.cx.pat_lit(self.sp, quote_expr!(self.cx, $pc));
575 self.cx.arm(self.sp, vec!(pc_pat), body)
578 // Creates a wild-card match arm with the expression `body`.
579 fn wild_arm_expr(&self, body: @ast::Expr) -> ast::Arm {
582 pats: vec!(@ast::Pat{
583 id: ast::DUMMY_NODE_ID,
593 // Converts `xs` to a `[x1, x2, .., xN]` expression by calling `to_expr`
594 // on each element in `xs`.
595 fn vec_expr<T, It: Iterator<T>>(&self, xs: It, to_expr: |&ExtCtxt, T| -> @ast::Expr)
597 let exprs = xs.map(|x| to_expr(self.cx, x)).collect();
598 self.cx.expr_vec(self.sp, exprs)
602 /// Looks for a single string literal and returns it.
603 /// Otherwise, logs an error with cx.span_err and returns None.
604 fn parse(cx: &mut ExtCtxt, tts: &[ast::TokenTree]) -> Option<~str> {
605 let mut parser = parse::new_parser_from_tts(cx.parse_sess(), cx.cfg(),
606 Vec::from_slice(tts));
607 let entry = cx.expand_expr(parser.parse_expr());
608 let regex = match entry.node {
609 ast::ExprLit(lit) => {
611 ast::LitStr(ref s, _) => s.to_str(),
613 cx.span_err(entry.span, format!(
614 "expected string literal but got `{}`",
615 pprust::lit_to_str(lit)));
621 cx.span_err(entry.span, format!(
622 "expected string literal but got `{}`",
623 pprust::expr_to_str(entry)));
627 if !parser.eat(&token::EOF) {
628 cx.span_err(parser.span, "only one string literal allowed");