1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 // FIXME: Currently, the VM simulates an NFA. It would be nice to have another
12 // VM that simulates a DFA.
14 // According to Russ Cox[1], a DFA performs better than an NFA, principally
15 // because it reuses states previously computed by the machine *and* doesn't
16 // keep track of capture groups. The drawback of a DFA (aside from its
17 // complexity) is that it can't accurately return the locations of submatches.
18 // The NFA *can* do that. (This is my understanding anyway.)
20 // Cox suggests that a DFA ought to be used to answer "does this match" and
21 // "where does it match" questions. (In the latter, the starting position of
22 // the match is computed by executing the regex backwards.) Cox also suggests
23 // that a DFA should be run when asking "where are the submatches", which can
24 // 1) quickly answer "no" if there's no match and 2) discover the substring
25 // that matches, which means running the NFA on smaller input.
27 // Currently, the NFA simulation implemented below does some dirty tricks to
28 // avoid tracking capture groups when they aren't needed (which only works
29 // for 'is_match', not 'find'). This is a half-measure, but does provide some
32 // AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
34 // [1] - http://swtch.com/~rsc/regex/regex3.html
36 pub use self::MatchKind::*;
37 pub use self::StepState::*;
40 use std::cmp::Ordering::{mod, Less, Equal, Greater};
42 use std::iter::repeat;
43 use std::slice::SliceExt;
46 Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary,
49 use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
50 use unicode::regex::PERLW;
52 pub type CaptureLocs = Vec<Option<uint>>;
54 /// Indicates the type of match to be performed by the VM.
57 /// Only checks if a match exists or not. Does not return location.
59 /// Returns the start and end indices of the entire match in the input
62 /// Returns the start and end indices of each submatch in the input given.
66 /// Runs an NFA simulation on the compiled expression given on the search text
67 /// `input`. The search begins at byte index `start` and ends at byte index
68 /// `end`. (The range is specified here so that zero-width assertions will work
69 /// correctly when searching for successive non-overlapping matches.)
71 /// The `which` parameter indicates what kind of capture information the caller
72 /// wants. There are three choices: match existence only, the location of the
73 /// entire match or the locations of the entire match in addition to the
74 /// locations of each submatch.
75 pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str,
76 start: uint, end: uint) -> CaptureLocs {
84 chars: CharReader::new(input),
95 chars: CharReader<'t>,
98 /// Indicates the next action to take after a single non-empty instruction
102 /// This is returned if and only if a Match instruction is reached and
103 /// we only care about the existence of a match. It instructs the VM to
105 StepMatchEarlyReturn,
106 /// Indicates that a match was found. Thus, the rest of the states in the
107 /// *current* queue should be dropped (i.e., leftmost-first semantics).
108 /// States in the "next" queue can still be processed.
110 /// No match was found. Continue with the next state in the queue.
114 impl<'r, 't> Nfa<'r, 't> {
115 fn run(&mut self) -> CaptureLocs {
116 let ncaps = match self.which {
119 Submatches => self.prog.num_captures(),
121 let mut matched = false;
122 let ninsts = self.prog.insts.len();
123 let mut clist = &mut Threads::new(self.which, ninsts, ncaps);
124 let mut nlist = &mut Threads::new(self.which, ninsts, ncaps);
126 let mut groups: Vec<_> = repeat(None).take(ncaps * 2).collect();
128 // Determine if the expression starts with a '^' so we can avoid
130 // Make sure multi-line mode isn't enabled for it, otherwise we can't
131 // drop the initial .*?
133 match self.prog.insts[1] {
134 EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
138 self.ic = self.start;
139 let mut next_ic = self.chars.set(self.start);
140 while self.ic <= self.end {
142 // We have a match and we're done exploring alternatives.
148 // If there are no threads to try, then we'll have to start
149 // over at the beginning of the regex.
150 // BUT, if there's a literal prefix for the program, try to
151 // jump ahead quickly. If it can't be found, then we can bail
153 if self.prog.prefix.len() > 0 && clist.size == 0 {
154 let needle = self.prog.prefix.as_bytes();
155 let haystack = self.input.as_bytes()[self.ic..];
156 match find_prefix(needle, haystack) {
160 next_ic = self.chars.set(self.ic);
166 // This simulates a preceding '.*?' for every regex by adding
167 // a state starting at the current position in the input for the
168 // beginning of the program only if we don't already have a match.
169 if clist.size == 0 || (!prefix_anchor && !matched) {
170 self.add(clist, 0, groups.as_mut_slice())
173 // Now we try to read the next character.
174 // As a result, the 'step' method will look at the previous
177 next_ic = self.chars.advance();
179 for i in range(0, clist.size) {
180 let pc = clist.pc(i);
181 let step_state = self.step(groups.as_mut_slice(), nlist,
182 clist.groups(i), pc);
184 StepMatchEarlyReturn => return vec![Some(0), Some(0)],
185 StepMatch => { matched = true; break },
189 mem::swap(&mut clist, &mut nlist);
193 Exists if matched => vec![Some(0), Some(0)],
194 Exists => vec![None, None],
195 Location | Submatches => groups,
199 fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads,
200 caps: &mut [Option<uint>], pc: uint)
202 match self.prog.insts[pc] {
206 return StepMatchEarlyReturn
214 for (slot, val) in groups.iter_mut().zip(caps.iter()) {
221 OneChar(c, flags) => {
222 if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) {
223 self.add(nlist, pc+1, caps);
226 CharClass(ref ranges, flags) => {
227 if self.chars.prev.is_some() {
228 let c = self.chars.prev.unwrap();
229 let negate = flags & FLAG_NEGATED > 0;
230 let casei = flags & FLAG_NOCASE > 0;
231 let found = ranges.as_slice();
232 let found = found.binary_search_by(|&rc| class_cmp(casei, c, rc)).is_ok();
234 self.add(nlist, pc+1, caps);
239 if flags & FLAG_DOTNL > 0
240 || !self.char_eq(false, self.chars.prev, '\n') {
241 self.add(nlist, pc+1, caps)
244 EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_)
245 | Save(_) | Jump(_) | Split(_, _) => {},
250 fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) {
251 if nlist.contains(pc) {
254 // We have to add states to the threads list even if they're empty.
255 // TL;DR - It prevents cycles.
256 // If we didn't care about cycles, we'd *only* add threads that
257 // correspond to non-jumping instructions (OneChar, Any, Match, etc.).
258 // But, it's possible for valid regexes (like '(a*)*') to result in
259 // a cycle in the instruction list. e.g., We'll keep chasing the Split
260 // instructions forever.
261 // So we add these instructions to our thread queue, but in the main
262 // VM loop, we look for them but simply ignore them.
263 // Adding them to the queue prevents them from being revisited so we
264 // can avoid cycles (and the inevitable stack overflow).
266 // We make a minor optimization by indicating that the state is "empty"
267 // so that its capture groups are not filled in.
268 match self.prog.insts[pc] {
269 EmptyBegin(flags) => {
270 let multi = flags & FLAG_MULTI > 0;
271 nlist.add(pc, groups, true);
272 if self.chars.is_begin()
273 || (multi && self.char_is(self.chars.prev, '\n')) {
274 self.add(nlist, pc + 1, groups)
278 let multi = flags & FLAG_MULTI > 0;
279 nlist.add(pc, groups, true);
280 if self.chars.is_end()
281 || (multi && self.char_is(self.chars.cur, '\n')) {
282 self.add(nlist, pc + 1, groups)
285 EmptyWordBoundary(flags) => {
286 nlist.add(pc, groups, true);
287 if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) {
288 self.add(nlist, pc + 1, groups)
292 nlist.add(pc, groups, true);
294 Location if slot <= 1 => {
295 let old = groups[slot];
296 groups[slot] = Some(self.ic);
297 self.add(nlist, pc + 1, groups);
301 let old = groups[slot];
302 groups[slot] = Some(self.ic);
303 self.add(nlist, pc + 1, groups);
306 Exists | Location => self.add(nlist, pc + 1, groups),
310 nlist.add(pc, groups, true);
311 self.add(nlist, to, groups)
314 nlist.add(pc, groups, true);
315 self.add(nlist, x, groups);
316 self.add(nlist, y, groups);
318 Match | OneChar(_, _) | CharClass(_, _) | Any(_) => {
319 nlist.add(pc, groups, false);
324 // FIXME: For case insensitive comparisons, it uses the uppercase
325 // character and tests for equality. IIUC, this does not generalize to
326 // all of Unicode. I believe we need to check the entire fold for each
327 // character. This will be easy to add if and when it gets added to Rust's
330 fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool {
335 || (casei && regc.to_uppercase() == textc.to_uppercase())
341 fn char_is(&self, textc: Option<char>, regc: char) -> bool {
346 /// CharReader is responsible for maintaining a "previous" and a "current"
347 /// character. This one-character lookahead is necessary for assertions that
348 /// look one character before or after the current position.
349 pub struct CharReader<'t> {
350 /// The previous character read. It is None only when processing the first
351 /// character of the input.
352 pub prev: Option<char>,
353 /// The current character.
354 pub cur: Option<char>,
359 impl<'t> CharReader<'t> {
360 /// Returns a new CharReader that advances through the input given.
361 /// Note that a CharReader has no knowledge of the range in which to search
363 pub fn new(input: &'t str) -> CharReader<'t> {
372 /// Sets the previous and current character given any arbitrary byte
373 /// index (at a Unicode codepoint boundary).
375 pub fn set(&mut self, ic: uint) -> uint {
380 if self.input.len() == 0 {
384 let i = cmp::min(ic, self.input.len());
385 let prev = self.input.char_range_at_reverse(i);
386 self.prev = Some(prev.ch);
388 if ic < self.input.len() {
389 let cur = self.input.char_range_at(ic);
390 self.cur = Some(cur.ch);
391 self.next = cur.next;
398 /// Does the same as `set`, except it always advances to the next
399 /// character in the input (and therefore does half as many UTF8 decodings).
401 pub fn advance(&mut self) -> uint {
402 self.prev = self.cur;
403 if self.next < self.input.len() {
404 let cur = self.input.char_range_at(self.next);
405 self.cur = Some(cur.ch);
406 self.next = cur.next;
409 self.next = self.input.len() + 1;
414 /// Returns true if and only if this is the beginning of the input
415 /// (ignoring the range of the input to search).
417 pub fn is_begin(&self) -> bool { self.prev.is_none() }
419 /// Returns true if and only if this is the end of the input
420 /// (ignoring the range of the input to search).
422 pub fn is_end(&self) -> bool { self.cur.is_none() }
424 /// Returns true if and only if the current position is a word boundary.
425 /// (Ignoring the range of the input to search.)
426 pub fn is_word_boundary(&self) -> bool {
428 return is_word(self.cur)
431 return is_word(self.prev)
433 (is_word(self.cur) && !is_word(self.prev))
434 || (is_word(self.prev) && !is_word(self.cur))
440 groups: Vec<Option<uint>>,
451 // This is using a wicked neat trick to provide constant time lookup
452 // for threads in the queue using a sparse set. A queue of threads is
453 // allocated once with maximal size when the VM initializes and is reused
454 // throughout execution. That is, there should be zero allocation during
455 // the execution of a VM.
457 // See http://research.swtch.com/sparse for the deets.
458 fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads {
461 queue: range(0, num_insts).map(|_| {
462 Thread { pc: 0, groups: repeat(None).take(ncaps * 2).collect() }
464 sparse: repeat(0u).take(num_insts).collect(),
469 fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) {
470 let t = &mut self.queue[self.size];
472 match (empty, self.which) {
473 (_, Exists) | (true, _) => {},
474 (false, Location) => {
475 t.groups[0] = groups[0];
476 t.groups[1] = groups[1];
478 (false, Submatches) => {
479 for (slot, val) in t.groups.iter_mut().zip(groups.iter()) {
484 self.sparse[pc] = self.size;
489 fn contains(&self, pc: uint) -> bool {
490 let s = self.sparse[pc];
491 s < self.size && self.queue[s].pc == pc
495 fn empty(&mut self) {
500 fn pc(&self, i: uint) -> uint {
505 fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] {
506 self.queue[i].groups.as_mut_slice()
510 /// Returns true if the character is a word character, according to the
511 /// (Unicode friendly) Perl character class '\w'.
512 /// Note that this is only used for testing word boundaries. The actual '\w'
513 /// is encoded as a CharClass instruction.
514 pub fn is_word(c: Option<char>) -> bool {
516 None => return false,
519 // Try the common ASCII case before invoking binary search.
521 '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true,
522 _ => PERLW.binary_search_by(|&(start, end)| {
523 if c >= start && c <= end {
525 } else if start > c {
534 /// Given a character and a single character class range, return an ordering
535 /// indicating whether the character is less than the start of the range,
536 /// in the range (inclusive) or greater than the end of the range.
538 /// If `casei` is `true`, then this ordering is computed case insensitively.
540 /// This function is meant to be used with a binary search.
542 fn class_cmp(casei: bool, mut textc: char,
543 (mut start, mut end): (char, char)) -> Ordering {
545 // FIXME: This is pretty ridiculous. All of this case conversion
546 // can be moved outside this function:
547 // 1) textc should be uppercased outside the bsearch.
548 // 2) the character class itself should be uppercased either in the
549 // parser or the compiler.
550 // FIXME: This is too simplistic for correct Unicode support.
552 textc = textc.to_uppercase();
553 start = start.to_uppercase();
554 end = end.to_uppercase();
556 if textc >= start && textc <= end {
558 } else if start > textc {
565 /// Returns the starting location of `needle` in `haystack`.
566 /// If `needle` is not in `haystack`, then `None` is returned.
568 /// Note that this is using a naive substring algorithm.
570 pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> {
571 let (hlen, nlen) = (haystack.len(), needle.len());
572 if nlen > hlen || nlen == 0 {
575 for (offset, window) in haystack.windows(nlen).enumerate() {
576 if window == needle {