src/libregex/vm.rs

   1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 // FIXME: Currently, the VM simulates an NFA. It would be nice to have another
  12 // VM that simulates a DFA.
  13 //
  14 // According to Russ Cox[1], a DFA performs better than an NFA, principally
  15 // because it reuses states previously computed by the machine *and* doesn't
  16 // keep track of capture groups. The drawback of a DFA (aside from its
  17 // complexity) is that it can't accurately return the locations of submatches.
  18 // The NFA *can* do that. (This is my understanding anyway.)
  19 //
  20 // Cox suggests that a DFA ought to be used to answer "does this match" and
  21 // "where does it match" questions. (In the latter, the starting position of
  22 // the match is computed by executing the regex backwards.) Cox also suggests
  23 // that a DFA should be run when asking "where are the submatches", which can
  24 // 1) quickly answer "no" is there's no match and 2) discover the substring
  25 // that matches, which means running the NFA on smaller input.
  26 //
  27 // Currently, the NFA simulation implemented below does some dirty tricks to
  28 // avoid tracking capture groups when they aren't needed (which only works
  29 // for 'is_match', not 'find'). This is a half-measure, but does provide some
  30 // perf improvement.
  31 //
  32 // AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
  33 //
  34 // [1] - http://swtch.com/~rsc/regex/regex3.html
  35
  36 use std::cmp;
  37 use std::mem;
  38 use std::slice::MutableVector;
  39 use compile::{
  40     Program,
  41     Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary,
  42     Save, Jump, Split,
  43 };
  44 use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
  45 use unicode::regex::PERLW;
  46
  47 pub type CaptureLocs = Vec<Option<uint>>;
  48
  49 /// Indicates the type of match to be performed by the VM.
  50 pub enum MatchKind {
  51     /// Only checks if a match exists or not. Does not return location.
  52     Exists,
  53     /// Returns the start and end indices of the entire match in the input
  54     /// given.
  55     Location,
  56     /// Returns the start and end indices of each submatch in the input given.
  57     Submatches,
  58 }
  59
  60 /// Runs an NFA simulation on the compiled expression given on the search text
  61 /// `input`. The search begins at byte index `start` and ends at byte index
  62 /// `end`. (The range is specified here so that zero-width assertions will work
  63 /// correctly when searching for successive non-overlapping matches.)
  64 ///
  65 /// The `which` parameter indicates what kind of capture information the caller
  66 /// wants. There are three choices: match existence only, the location of the
  67 /// entire match or the locations of the entire match in addition to the
  68 /// locations of each submatch.
  69 pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str,
  70                    start: uint, end: uint) -> CaptureLocs {
  71     Nfa {
  72         which: which,
  73         prog: prog,
  74         input: input,
  75         start: start,
  76         end: end,
  77         ic: 0,
  78         chars: CharReader::new(input),
  79     }.run()
  80 }
  81
  82 struct Nfa<'r, 't> {
  83     which: MatchKind,
  84     prog: &'r Program,
  85     input: &'t str,
  86     start: uint,
  87     end: uint,
  88     ic: uint,
  89     chars: CharReader<'t>,
  90 }
  91
  92 /// Indicates the next action to take after a single non-empty instruction
  93 /// is processed.
  94 pub enum StepState {
  95     /// This is returned if and only if a Match instruction is reached and
  96     /// we only care about the existence of a match. It instructs the VM to
  97     /// quit early.
  98     StepMatchEarlyReturn,
  99     /// Indicates that a match was found. Thus, the rest of the states in the
 100     /// *current* queue should be dropped (i.e., leftmost-first semantics).
 101     /// States in the "next" queue can still be processed.
 102     StepMatch,
 103     /// No match was found. Continue with the next state in the queue.
 104     StepContinue,
 105 }
 106
 107 impl<'r, 't> Nfa<'r, 't> {
 108     fn run(&mut self) -> CaptureLocs {
 109         let ncaps = match self.which {
 110             Exists => 0,
 111             Location => 1,
 112             Submatches => self.prog.num_captures(),
 113         };
 114         let mut matched = false;
 115         let ninsts = self.prog.insts.len();
 116         let mut clist = &mut Threads::new(self.which, ninsts, ncaps);
 117         let mut nlist = &mut Threads::new(self.which, ninsts, ncaps);
 118
 119         let mut groups = Vec::from_elem(ncaps * 2, None);
 120
 121         // Determine if the expression starts with a '^' so we can avoid
 122         // simulating .*?
 123         // Make sure multi-line mode isn't enabled for it, otherwise we can't
 124         // drop the initial .*?
 125         let prefix_anchor =
 126             match self.prog.insts[1] {
 127                 EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
 128                 _ => false,
 129             };
 130
 131         self.ic = self.start;
 132         let mut next_ic = self.chars.set(self.start);
 133         while self.ic <= self.end {
 134             if clist.size == 0 {
 135                 // We have a match and we're done exploring alternatives.
 136                 // Time to quit.
 137                 if matched {
 138                     break
 139                 }
 140
 141                 // If there are no threads to try, then we'll have to start
 142                 // over at the beginning of the regex.
 143                 // BUT, if there's a literal prefix for the program, try to
 144                 // jump ahead quickly. If it can't be found, then we can bail
 145                 // out early.
 146                 if self.prog.prefix.len() > 0 && clist.size == 0 {
 147                     let needle = self.prog.prefix.as_slice().as_bytes();
 148                     let haystack = self.input.as_bytes().slice_from(self.ic);
 149                     match find_prefix(needle, haystack) {
 150                         None => break,
 151                         Some(i) => {
 152                             self.ic += i;
 153                             next_ic = self.chars.set(self.ic);
 154                         }
 155                     }
 156                 }
 157             }
 158
 159             // This simulates a preceding '.*?' for every regex by adding
 160             // a state starting at the current position in the input for the
 161             // beginning of the program only if we don't already have a match.
 162             if clist.size == 0 || (!prefix_anchor && !matched) {
 163                 self.add(clist, 0, groups.as_mut_slice())
 164             }
 165
 166             // Now we try to read the next character.
 167             // As a result, the 'step' method will look at the previous
 168             // character.
 169             self.ic = next_ic;
 170             next_ic = self.chars.advance();
 171
 172             for i in range(0, clist.size) {
 173                 let pc = clist.pc(i);
 174                 let step_state = self.step(groups.as_mut_slice(), nlist,
 175                                            clist.groups(i), pc);
 176                 match step_state {
 177                     StepMatchEarlyReturn => return vec![Some(0), Some(0)],
 178                     StepMatch => { matched = true; break },
 179                     StepContinue => {},
 180                 }
 181             }
 182             mem::swap(&mut clist, &mut nlist);
 183             nlist.empty();
 184         }
 185         match self.which {
 186             Exists if matched     => vec![Some(0), Some(0)],
 187             Exists                => vec![None, None],
 188             Location | Submatches => groups,
 189         }
 190     }
 191
 192     fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads,
 193             caps: &mut [Option<uint>], pc: uint)
 194            -> StepState {
 195         match self.prog.insts[pc] {
 196             Match => {
 197                 match self.which {
 198                     Exists => {
 199                         return StepMatchEarlyReturn
 200                     }
 201                     Location => {
 202                         groups[0] = caps[0];
 203                         groups[1] = caps[1];
 204                         return StepMatch
 205                     }
 206                     Submatches => {
 207                         for (slot, val) in groups.mut_iter().zip(caps.iter()) {
 208                             *slot = *val;
 209                         }
 210                         return StepMatch
 211                     }
 212                 }
 213             }
 214             OneChar(c, flags) => {
 215                 if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) {
 216                     self.add(nlist, pc+1, caps);
 217                 }
 218             }
 219             CharClass(ref ranges, flags) => {
 220                 if self.chars.prev.is_some() {
 221                     let c = self.chars.prev.unwrap();
 222                     let negate = flags & FLAG_NEGATED > 0;
 223                     let casei = flags & FLAG_NOCASE > 0;
 224                     let found = ranges.as_slice();
 225                     let found = found.bsearch(|&rc| class_cmp(casei, c, rc));
 226                     let found = found.is_some();
 227                     if found ^ negate {
 228                         self.add(nlist, pc+1, caps);
 229                     }
 230                 }
 231             }
 232             Any(flags) => {
 233                 if flags & FLAG_DOTNL > 0
 234                    || !self.char_eq(false, self.chars.prev, '\n') {
 235                     self.add(nlist, pc+1, caps)
 236                 }
 237             }
 238             EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_)
 239             | Save(_) | Jump(_) | Split(_, _) => {},
 240         }
 241         StepContinue
 242     }
 243
 244     fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) {
 245         if nlist.contains(pc) {
 246             return
 247         }
 248         // We have to add states to the threads list even if their empty.
 249         // TL;DR - It prevents cycles.
 250         // If we didn't care about cycles, we'd *only* add threads that
 251         // correspond to non-jumping instructions (OneChar, Any, Match, etc.).
 252         // But, it's possible for valid regexs (like '(a*)*') to result in
 253         // a cycle in the instruction list. e.g., We'll keep chasing the Split
 254         // instructions forever.
 255         // So we add these instructions to our thread queue, but in the main
 256         // VM loop, we look for them but simply ignore them.
 257         // Adding them to the queue prevents them from being revisited so we
 258         // can avoid cycles (and the inevitable stack overflow).
 259         //
 260         // We make a minor optimization by indicating that the state is "empty"
 261         // so that its capture groups are not filled in.
 262         match self.prog.insts[pc] {
 263             EmptyBegin(flags) => {
 264                 let multi = flags & FLAG_MULTI > 0;
 265                 nlist.add(pc, groups, true);
 266                 if self.chars.is_begin()
 267                    || (multi && self.char_is(self.chars.prev, '\n')) {
 268                     self.add(nlist, pc + 1, groups)
 269                 }
 270             }
 271             EmptyEnd(flags) => {
 272                 let multi = flags & FLAG_MULTI > 0;
 273                 nlist.add(pc, groups, true);
 274                 if self.chars.is_end()
 275                    || (multi && self.char_is(self.chars.cur, '\n')) {
 276                     self.add(nlist, pc + 1, groups)
 277                 }
 278             }
 279             EmptyWordBoundary(flags) => {
 280                 nlist.add(pc, groups, true);
 281                 if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) {
 282                     self.add(nlist, pc + 1, groups)
 283                 }
 284             }
 285             Save(slot) => {
 286                 nlist.add(pc, groups, true);
 287                 match self.which {
 288                     Location if slot <= 1 => {
 289                         let old = groups[slot];
 290                         groups[slot] = Some(self.ic);
 291                         self.add(nlist, pc + 1, groups);
 292                         groups[slot] = old;
 293                     }
 294                     Submatches => {
 295                         let old = groups[slot];
 296                         groups[slot] = Some(self.ic);
 297                         self.add(nlist, pc + 1, groups);
 298                         groups[slot] = old;
 299                     }
 300                     Exists | Location => self.add(nlist, pc + 1, groups),
 301                 }
 302             }
 303             Jump(to) => {
 304                 nlist.add(pc, groups, true);
 305                 self.add(nlist, to, groups)
 306             }
 307             Split(x, y) => {
 308                 nlist.add(pc, groups, true);
 309                 self.add(nlist, x, groups);
 310                 self.add(nlist, y, groups);
 311             }
 312             Match | OneChar(_, _) | CharClass(_, _) | Any(_) => {
 313                 nlist.add(pc, groups, false);
 314             }
 315         }
 316     }
 317
 318     // FIXME: For case insensitive comparisons, it uses the uppercase
 319     // character and tests for equality. IIUC, this does not generalize to
 320     // all of Unicode. I believe we need to check the entire fold for each
 321     // character. This will be easy to add if and when it gets added to Rust's
 322     // standard library.
 323     #[inline]
 324     fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool {
 325         match textc {
 326             None => false,
 327             Some(textc) => {
 328                 regc == textc
 329                     || (casei && regc.to_uppercase() == textc.to_uppercase())
 330             }
 331         }
 332     }
 333
 334     #[inline]
 335     fn char_is(&self, textc: Option<char>, regc: char) -> bool {
 336         textc == Some(regc)
 337     }
 338 }
 339
 340 /// CharReader is responsible for maintaining a "previous" and a "current"
 341 /// character. This one-character lookahead is necessary for assertions that
 342 /// look one character before or after the current position.
 343 pub struct CharReader<'t> {
 344     /// The previous character read. It is None only when processing the first
 345     /// character of the input.
 346     pub prev: Option<char>,
 347     /// The current character.
 348     pub cur: Option<char>,
 349     input: &'t str,
 350     next: uint,
 351 }
 352
 353 impl<'t> CharReader<'t> {
 354     /// Returns a new CharReader that advances through the input given.
 355     /// Note that a CharReader has no knowledge of the range in which to search
 356     /// the input.
 357     pub fn new(input: &'t str) -> CharReader<'t> {
 358         CharReader {
 359             prev: None,
 360             cur: None,
 361             input: input,
 362             next: 0,
 363        }
 364     }
 365
 366     /// Sets the previous and current character given any arbitrary byte
 367     /// index (at a unicode codepoint boundary).
 368     #[inline]
 369     pub fn set(&mut self, ic: uint) -> uint {
 370         self.prev = None;
 371         self.cur = None;
 372         self.next = 0;
 373
 374         if self.input.len() == 0 {
 375             return 1
 376         }
 377         if ic > 0 {
 378             let i = cmp::min(ic, self.input.len());
 379             let prev = self.input.char_range_at_reverse(i);
 380             self.prev = Some(prev.ch);
 381         }
 382         if ic < self.input.len() {
 383             let cur = self.input.char_range_at(ic);
 384             self.cur = Some(cur.ch);
 385             self.next = cur.next;
 386             self.next
 387         } else {
 388             self.input.len() + 1
 389         }
 390     }
 391
 392     /// Does the same as `set`, except it always advances to the next
 393     /// character in the input (and therefore does half as many UTF8 decodings).
 394     #[inline]
 395     pub fn advance(&mut self) -> uint {
 396         self.prev = self.cur;
 397         if self.next < self.input.len() {
 398             let cur = self.input.char_range_at(self.next);
 399             self.cur = Some(cur.ch);
 400             self.next = cur.next;
 401         } else {
 402             self.cur = None;
 403             self.next = self.input.len() + 1;
 404         }
 405         self.next
 406     }
 407
 408     /// Returns true if and only if this is the beginning of the input
 409     /// (ignoring the range of the input to search).
 410     #[inline]
 411     pub fn is_begin(&self) -> bool { self.prev.is_none() }
 412
 413     /// Returns true if and only if this is the end of the input
 414     /// (ignoring the range of the input to search).
 415     #[inline]
 416     pub fn is_end(&self) -> bool { self.cur.is_none() }
 417
 418     /// Returns true if and only if the current position is a word boundary.
 419     /// (Ignoring the range of the input to search.)
 420     pub fn is_word_boundary(&self) -> bool {
 421         if self.is_begin() {
 422             return is_word(self.cur)
 423         }
 424         if self.is_end() {
 425             return is_word(self.prev)
 426         }
 427         (is_word(self.cur) && !is_word(self.prev))
 428         || (is_word(self.prev) && !is_word(self.cur))
 429     }
 430 }
 431
 432 struct Thread {
 433     pc: uint,
 434     groups: Vec<Option<uint>>,
 435 }
 436
 437 struct Threads {
 438     which: MatchKind,
 439     queue: Vec<Thread>,
 440     sparse: Vec<uint>,
 441     size: uint,
 442 }
 443
 444 impl Threads {
 445     // This is using a wicked neat trick to provide constant time lookup
 446     // for threads in the queue using a sparse set. A queue of threads is
 447     // allocated once with maximal size when the VM initializes and is reused
 448     // throughout execution. That is, there should be zero allocation during
 449     // the execution of a VM.
 450     //
 451     // See http://research.swtch.com/sparse for the deets.
 452     fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads {
 453         Threads {
 454             which: which,
 455             queue: Vec::from_fn(num_insts, |_| {
 456                 Thread { pc: 0, groups: Vec::from_elem(ncaps * 2, None) }
 457             }),
 458             sparse: Vec::from_elem(num_insts, 0u),
 459             size: 0,
 460         }
 461     }
 462
 463     fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) {
 464         let t = self.queue.get_mut(self.size);
 465         t.pc = pc;
 466         match (empty, self.which) {
 467             (_, Exists) | (true, _) => {},
 468             (false, Location) => {
 469                 *t.groups.get_mut(0) = groups[0];
 470                 *t.groups.get_mut(1) = groups[1];
 471             }
 472             (false, Submatches) => {
 473                 for (slot, val) in t.groups.mut_iter().zip(groups.iter()) {
 474                     *slot = *val;
 475                 }
 476             }
 477         }
 478         *self.sparse.get_mut(pc) = self.size;
 479         self.size += 1;
 480     }
 481
 482     #[inline]
 483     fn contains(&self, pc: uint) -> bool {
 484         let s = self.sparse[pc];
 485         s < self.size && self.queue[s].pc == pc
 486     }
 487
 488     #[inline]
 489     fn empty(&mut self) {
 490         self.size = 0;
 491     }
 492
 493     #[inline]
 494     fn pc(&self, i: uint) -> uint {
 495         self.queue[i].pc
 496     }
 497
 498     #[inline]
 499     fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] {
 500         self.queue.get_mut(i).groups.as_mut_slice()
 501     }
 502 }
 503
 504 /// Returns true if the character is a word character, according to the
 505 /// (Unicode friendly) Perl character class '\w'.
 506 /// Note that this is only use for testing word boundaries. The actual '\w'
 507 /// is encoded as a CharClass instruction.
 508 pub fn is_word(c: Option<char>) -> bool {
 509     let c = match c {
 510         None => return false,
 511         Some(c) => c,
 512     };
 513     // Try the common ASCII case before invoking binary search.
 514     match c {
 515         '_' | '0' .. '9' | 'a' .. 'z' | 'A' .. 'Z' => true,
 516         _ => PERLW.bsearch(|&(start, end)| {
 517             if c >= start && c <= end {
 518                 Equal
 519             } else if start > c {
 520                 Greater
 521             } else {
 522                 Less
 523             }
 524         }).is_some()
 525     }
 526 }
 527
 528 /// Given a character and a single character class range, return an ordering
 529 /// indicating whether the character is less than the start of the range,
 530 /// in the range (inclusive) or greater than the end of the range.
 531 ///
 532 /// If `casei` is `true`, then this ordering is computed case insensitively.
 533 ///
 534 /// This function is meant to be used with a binary search.
 535 #[inline]
 536 fn class_cmp(casei: bool, mut textc: char,
 537              (mut start, mut end): (char, char)) -> Ordering {
 538     if casei {
 539         // FIXME: This is pretty ridiculous. All of this case conversion
 540         // can be moved outside this function:
 541         // 1) textc should be uppercased outside the bsearch.
 542         // 2) the character class itself should be uppercased either in the
 543         //    parser or the compiler.
 544         // FIXME: This is too simplistic for correct Unicode support.
 545         //        See also: char_eq
 546         textc = textc.to_uppercase();
 547         start = start.to_uppercase();
 548         end = end.to_uppercase();
 549     }
 550     if textc >= start && textc <= end {
 551         Equal
 552     } else if start > textc {
 553         Greater
 554     } else {
 555         Less
 556     }
 557 }
 558
 559 /// Returns the starting location of `needle` in `haystack`.
 560 /// If `needle` is not in `haystack`, then `None` is returned.
 561 ///
 562 /// Note that this is using a naive substring algorithm.
 563 #[inline]
 564 pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> {
 565     let (hlen, nlen) = (haystack.len(), needle.len());
 566     if nlen > hlen || nlen == 0 {
 567         return None
 568     }
 569     for (offset, window) in haystack.windows(nlen).enumerate() {
 570         if window == needle {
 571             return Some(offset)
 572         }
 573     }
 574     None
 575 }