src/libregex/vm.rs

   1 // Copyright 2014 The Rust Project Developers. See the COPYRIGHT
   2 // file at the top-level directory of this distribution and at
   3 // http://rust-lang.org/COPYRIGHT.
   4 //
   5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
   6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
   7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
   8 // option. This file may not be copied, modified, or distributed
   9 // except according to those terms.
  10
  11 // FIXME: Currently, the VM simulates an NFA. It would be nice to have another
  12 // VM that simulates a DFA.
  13 //
  14 // According to Russ Cox[1], a DFA performs better than an NFA, principally
  15 // because it reuses states previously computed by the machine *and* doesn't
  16 // keep track of capture groups. The drawback of a DFA (aside from its
  17 // complexity) is that it can't accurately return the locations of submatches.
  18 // The NFA *can* do that. (This is my understanding anyway.)
  19 //
  20 // Cox suggests that a DFA ought to be used to answer "does this match" and
  21 // "where does it match" questions. (In the latter, the starting position of
  22 // the match is computed by executing the regex backwards.) Cox also suggests
  23 // that a DFA should be run when asking "where are the submatches", which can
  24 // 1) quickly answer "no" is there's no match and 2) discover the substring
  25 // that matches, which means running the NFA on smaller input.
  26 //
  27 // Currently, the NFA simulation implemented below does some dirty tricks to
  28 // avoid tracking capture groups when they aren't needed (which only works
  29 // for 'is_match', not 'find'). This is a half-measure, but does provide some
  30 // perf improvement.
  31 //
  32 // AFAIK, the DFA/NFA approach is implemented in RE2/C++ but *not* in RE2/Go.
  33 //
  34 // [1] - http://swtch.com/~rsc/regex/regex3.html
  35
  36 pub use self::MatchKind::*;
  37 pub use self::StepState::*;
  38
  39 use std::cmp;
  40 use std::cmp::Ordering::{mod, Less, Equal, Greater};
  41 use std::mem;
  42 use std::iter::repeat;
  43 use std::slice::SliceExt;
  44 use compile::{
  45     Program,
  46     Match, OneChar, CharClass, Any, EmptyBegin, EmptyEnd, EmptyWordBoundary,
  47     Save, Jump, Split,
  48 };
  49 use parse::{FLAG_NOCASE, FLAG_MULTI, FLAG_DOTNL, FLAG_NEGATED};
  50 use unicode::regex::PERLW;
  51
  52 pub type CaptureLocs = Vec<Option<uint>>;
  53
  54 /// Indicates the type of match to be performed by the VM.
  55 #[deriving(Copy)]
  56 pub enum MatchKind {
  57     /// Only checks if a match exists or not. Does not return location.
  58     Exists,
  59     /// Returns the start and end indices of the entire match in the input
  60     /// given.
  61     Location,
  62     /// Returns the start and end indices of each submatch in the input given.
  63     Submatches,
  64 }
  65
  66 /// Runs an NFA simulation on the compiled expression given on the search text
  67 /// `input`. The search begins at byte index `start` and ends at byte index
  68 /// `end`. (The range is specified here so that zero-width assertions will work
  69 /// correctly when searching for successive non-overlapping matches.)
  70 ///
  71 /// The `which` parameter indicates what kind of capture information the caller
  72 /// wants. There are three choices: match existence only, the location of the
  73 /// entire match or the locations of the entire match in addition to the
  74 /// locations of each submatch.
  75 pub fn run<'r, 't>(which: MatchKind, prog: &'r Program, input: &'t str,
  76                    start: uint, end: uint) -> CaptureLocs {
  77     Nfa {
  78         which: which,
  79         prog: prog,
  80         input: input,
  81         start: start,
  82         end: end,
  83         ic: 0,
  84         chars: CharReader::new(input),
  85     }.run()
  86 }
  87
  88 struct Nfa<'r, 't> {
  89     which: MatchKind,
  90     prog: &'r Program,
  91     input: &'t str,
  92     start: uint,
  93     end: uint,
  94     ic: uint,
  95     chars: CharReader<'t>,
  96 }
  97
  98 /// Indicates the next action to take after a single non-empty instruction
  99 /// is processed.
 100 #[deriving(Copy)]
 101 pub enum StepState {
 102     /// This is returned if and only if a Match instruction is reached and
 103     /// we only care about the existence of a match. It instructs the VM to
 104     /// quit early.
 105     StepMatchEarlyReturn,
 106     /// Indicates that a match was found. Thus, the rest of the states in the
 107     /// *current* queue should be dropped (i.e., leftmost-first semantics).
 108     /// States in the "next" queue can still be processed.
 109     StepMatch,
 110     /// No match was found. Continue with the next state in the queue.
 111     StepContinue,
 112 }
 113
 114 impl<'r, 't> Nfa<'r, 't> {
 115     fn run(&mut self) -> CaptureLocs {
 116         let ncaps = match self.which {
 117             Exists => 0,
 118             Location => 1,
 119             Submatches => self.prog.num_captures(),
 120         };
 121         let mut matched = false;
 122         let ninsts = self.prog.insts.len();
 123         let mut clist = &mut Threads::new(self.which, ninsts, ncaps);
 124         let mut nlist = &mut Threads::new(self.which, ninsts, ncaps);
 125
 126         let mut groups: Vec<_> = repeat(None).take(ncaps * 2).collect();
 127
 128         // Determine if the expression starts with a '^' so we can avoid
 129         // simulating .*?
 130         // Make sure multi-line mode isn't enabled for it, otherwise we can't
 131         // drop the initial .*?
 132         let prefix_anchor =
 133             match self.prog.insts[1] {
 134                 EmptyBegin(flags) if flags & FLAG_MULTI == 0 => true,
 135                 _ => false,
 136             };
 137
 138         self.ic = self.start;
 139         let mut next_ic = self.chars.set(self.start);
 140         while self.ic <= self.end {
 141             if clist.size == 0 {
 142                 // We have a match and we're done exploring alternatives.
 143                 // Time to quit.
 144                 if matched {
 145                     break
 146                 }
 147
 148                 // If there are no threads to try, then we'll have to start
 149                 // over at the beginning of the regex.
 150                 // BUT, if there's a literal prefix for the program, try to
 151                 // jump ahead quickly. If it can't be found, then we can bail
 152                 // out early.
 153                 if self.prog.prefix.len() > 0 && clist.size == 0 {
 154                     let needle = self.prog.prefix.as_bytes();
 155                     let haystack = self.input.as_bytes()[self.ic..];
 156                     match find_prefix(needle, haystack) {
 157                         None => break,
 158                         Some(i) => {
 159                             self.ic += i;
 160                             next_ic = self.chars.set(self.ic);
 161                         }
 162                     }
 163                 }
 164             }
 165
 166             // This simulates a preceding '.*?' for every regex by adding
 167             // a state starting at the current position in the input for the
 168             // beginning of the program only if we don't already have a match.
 169             if clist.size == 0 || (!prefix_anchor && !matched) {
 170                 self.add(clist, 0, groups.as_mut_slice())
 171             }
 172
 173             // Now we try to read the next character.
 174             // As a result, the 'step' method will look at the previous
 175             // character.
 176             self.ic = next_ic;
 177             next_ic = self.chars.advance();
 178
 179             for i in range(0, clist.size) {
 180                 let pc = clist.pc(i);
 181                 let step_state = self.step(groups.as_mut_slice(), nlist,
 182                                            clist.groups(i), pc);
 183                 match step_state {
 184                     StepMatchEarlyReturn => return vec![Some(0), Some(0)],
 185                     StepMatch => { matched = true; break },
 186                     StepContinue => {},
 187                 }
 188             }
 189             mem::swap(&mut clist, &mut nlist);
 190             nlist.empty();
 191         }
 192         match self.which {
 193             Exists if matched     => vec![Some(0), Some(0)],
 194             Exists                => vec![None, None],
 195             Location | Submatches => groups,
 196         }
 197     }
 198
 199     fn step(&self, groups: &mut [Option<uint>], nlist: &mut Threads,
 200             caps: &mut [Option<uint>], pc: uint)
 201            -> StepState {
 202         match self.prog.insts[pc] {
 203             Match => {
 204                 match self.which {
 205                     Exists => {
 206                         return StepMatchEarlyReturn
 207                     }
 208                     Location => {
 209                         groups[0] = caps[0];
 210                         groups[1] = caps[1];
 211                         return StepMatch
 212                     }
 213                     Submatches => {
 214                         for (slot, val) in groups.iter_mut().zip(caps.iter()) {
 215                             *slot = *val;
 216                         }
 217                         return StepMatch
 218                     }
 219                 }
 220             }
 221             OneChar(c, flags) => {
 222                 if self.char_eq(flags & FLAG_NOCASE > 0, self.chars.prev, c) {
 223                     self.add(nlist, pc+1, caps);
 224                 }
 225             }
 226             CharClass(ref ranges, flags) => {
 227                 if self.chars.prev.is_some() {
 228                     let c = self.chars.prev.unwrap();
 229                     let negate = flags & FLAG_NEGATED > 0;
 230                     let casei = flags & FLAG_NOCASE > 0;
 231                     let found = ranges.as_slice();
 232                     let found = found.binary_search_by(|&rc| class_cmp(casei, c, rc)).is_ok();
 233                     if found ^ negate {
 234                         self.add(nlist, pc+1, caps);
 235                     }
 236                 }
 237             }
 238             Any(flags) => {
 239                 if flags & FLAG_DOTNL > 0
 240                    || !self.char_eq(false, self.chars.prev, '\n') {
 241                     self.add(nlist, pc+1, caps)
 242                 }
 243             }
 244             EmptyBegin(_) | EmptyEnd(_) | EmptyWordBoundary(_)
 245             | Save(_) | Jump(_) | Split(_, _) => {},
 246         }
 247         StepContinue
 248     }
 249
 250     fn add(&self, nlist: &mut Threads, pc: uint, groups: &mut [Option<uint>]) {
 251         if nlist.contains(pc) {
 252             return
 253         }
 254         // We have to add states to the threads list even if their empty.
 255         // TL;DR - It prevents cycles.
 256         // If we didn't care about cycles, we'd *only* add threads that
 257         // correspond to non-jumping instructions (OneChar, Any, Match, etc.).
 258         // But, it's possible for valid regexs (like '(a*)*') to result in
 259         // a cycle in the instruction list. e.g., We'll keep chasing the Split
 260         // instructions forever.
 261         // So we add these instructions to our thread queue, but in the main
 262         // VM loop, we look for them but simply ignore them.
 263         // Adding them to the queue prevents them from being revisited so we
 264         // can avoid cycles (and the inevitable stack overflow).
 265         //
 266         // We make a minor optimization by indicating that the state is "empty"
 267         // so that its capture groups are not filled in.
 268         match self.prog.insts[pc] {
 269             EmptyBegin(flags) => {
 270                 let multi = flags & FLAG_MULTI > 0;
 271                 nlist.add(pc, groups, true);
 272                 if self.chars.is_begin()
 273                    || (multi && self.char_is(self.chars.prev, '\n')) {
 274                     self.add(nlist, pc + 1, groups)
 275                 }
 276             }
 277             EmptyEnd(flags) => {
 278                 let multi = flags & FLAG_MULTI > 0;
 279                 nlist.add(pc, groups, true);
 280                 if self.chars.is_end()
 281                    || (multi && self.char_is(self.chars.cur, '\n')) {
 282                     self.add(nlist, pc + 1, groups)
 283                 }
 284             }
 285             EmptyWordBoundary(flags) => {
 286                 nlist.add(pc, groups, true);
 287                 if self.chars.is_word_boundary() == !(flags & FLAG_NEGATED > 0) {
 288                     self.add(nlist, pc + 1, groups)
 289                 }
 290             }
 291             Save(slot) => {
 292                 nlist.add(pc, groups, true);
 293                 match self.which {
 294                     Location if slot <= 1 => {
 295                         let old = groups[slot];
 296                         groups[slot] = Some(self.ic);
 297                         self.add(nlist, pc + 1, groups);
 298                         groups[slot] = old;
 299                     }
 300                     Submatches => {
 301                         let old = groups[slot];
 302                         groups[slot] = Some(self.ic);
 303                         self.add(nlist, pc + 1, groups);
 304                         groups[slot] = old;
 305                     }
 306                     Exists | Location => self.add(nlist, pc + 1, groups),
 307                 }
 308             }
 309             Jump(to) => {
 310                 nlist.add(pc, groups, true);
 311                 self.add(nlist, to, groups)
 312             }
 313             Split(x, y) => {
 314                 nlist.add(pc, groups, true);
 315                 self.add(nlist, x, groups);
 316                 self.add(nlist, y, groups);
 317             }
 318             Match | OneChar(_, _) | CharClass(_, _) | Any(_) => {
 319                 nlist.add(pc, groups, false);
 320             }
 321         }
 322     }
 323
 324     // FIXME: For case insensitive comparisons, it uses the uppercase
 325     // character and tests for equality. IIUC, this does not generalize to
 326     // all of Unicode. I believe we need to check the entire fold for each
 327     // character. This will be easy to add if and when it gets added to Rust's
 328     // standard library.
 329     #[inline]
 330     fn char_eq(&self, casei: bool, textc: Option<char>, regc: char) -> bool {
 331         match textc {
 332             None => false,
 333             Some(textc) => {
 334                 regc == textc
 335                     || (casei && regc.to_uppercase() == textc.to_uppercase())
 336             }
 337         }
 338     }
 339
 340     #[inline]
 341     fn char_is(&self, textc: Option<char>, regc: char) -> bool {
 342         textc == Some(regc)
 343     }
 344 }
 345
 346 /// CharReader is responsible for maintaining a "previous" and a "current"
 347 /// character. This one-character lookahead is necessary for assertions that
 348 /// look one character before or after the current position.
 349 pub struct CharReader<'t> {
 350     /// The previous character read. It is None only when processing the first
 351     /// character of the input.
 352     pub prev: Option<char>,
 353     /// The current character.
 354     pub cur: Option<char>,
 355     input: &'t str,
 356     next: uint,
 357 }
 358
 359 impl<'t> CharReader<'t> {
 360     /// Returns a new CharReader that advances through the input given.
 361     /// Note that a CharReader has no knowledge of the range in which to search
 362     /// the input.
 363     pub fn new(input: &'t str) -> CharReader<'t> {
 364         CharReader {
 365             prev: None,
 366             cur: None,
 367             input: input,
 368             next: 0,
 369        }
 370     }
 371
 372     /// Sets the previous and current character given any arbitrary byte
 373     /// index (at a Unicode codepoint boundary).
 374     #[inline]
 375     pub fn set(&mut self, ic: uint) -> uint {
 376         self.prev = None;
 377         self.cur = None;
 378         self.next = 0;
 379
 380         if self.input.len() == 0 {
 381             return 1
 382         }
 383         if ic > 0 {
 384             let i = cmp::min(ic, self.input.len());
 385             let prev = self.input.char_range_at_reverse(i);
 386             self.prev = Some(prev.ch);
 387         }
 388         if ic < self.input.len() {
 389             let cur = self.input.char_range_at(ic);
 390             self.cur = Some(cur.ch);
 391             self.next = cur.next;
 392             self.next
 393         } else {
 394             self.input.len() + 1
 395         }
 396     }
 397
 398     /// Does the same as `set`, except it always advances to the next
 399     /// character in the input (and therefore does half as many UTF8 decodings).
 400     #[inline]
 401     pub fn advance(&mut self) -> uint {
 402         self.prev = self.cur;
 403         if self.next < self.input.len() {
 404             let cur = self.input.char_range_at(self.next);
 405             self.cur = Some(cur.ch);
 406             self.next = cur.next;
 407         } else {
 408             self.cur = None;
 409             self.next = self.input.len() + 1;
 410         }
 411         self.next
 412     }
 413
 414     /// Returns true if and only if this is the beginning of the input
 415     /// (ignoring the range of the input to search).
 416     #[inline]
 417     pub fn is_begin(&self) -> bool { self.prev.is_none() }
 418
 419     /// Returns true if and only if this is the end of the input
 420     /// (ignoring the range of the input to search).
 421     #[inline]
 422     pub fn is_end(&self) -> bool { self.cur.is_none() }
 423
 424     /// Returns true if and only if the current position is a word boundary.
 425     /// (Ignoring the range of the input to search.)
 426     pub fn is_word_boundary(&self) -> bool {
 427         if self.is_begin() {
 428             return is_word(self.cur)
 429         }
 430         if self.is_end() {
 431             return is_word(self.prev)
 432         }
 433         (is_word(self.cur) && !is_word(self.prev))
 434         || (is_word(self.prev) && !is_word(self.cur))
 435     }
 436 }
 437
 438 struct Thread {
 439     pc: uint,
 440     groups: Vec<Option<uint>>,
 441 }
 442
 443 struct Threads {
 444     which: MatchKind,
 445     queue: Vec<Thread>,
 446     sparse: Vec<uint>,
 447     size: uint,
 448 }
 449
 450 impl Threads {
 451     // This is using a wicked neat trick to provide constant time lookup
 452     // for threads in the queue using a sparse set. A queue of threads is
 453     // allocated once with maximal size when the VM initializes and is reused
 454     // throughout execution. That is, there should be zero allocation during
 455     // the execution of a VM.
 456     //
 457     // See http://research.swtch.com/sparse for the deets.
 458     fn new(which: MatchKind, num_insts: uint, ncaps: uint) -> Threads {
 459         Threads {
 460             which: which,
 461             queue: range(0, num_insts).map(|_| {
 462                 Thread { pc: 0, groups: repeat(None).take(ncaps * 2).collect() }
 463             }).collect(),
 464             sparse: repeat(0u).take(num_insts).collect(),
 465             size: 0,
 466         }
 467     }
 468
 469     fn add(&mut self, pc: uint, groups: &[Option<uint>], empty: bool) {
 470         let t = &mut self.queue[self.size];
 471         t.pc = pc;
 472         match (empty, self.which) {
 473             (_, Exists) | (true, _) => {},
 474             (false, Location) => {
 475                 t.groups[0] = groups[0];
 476                 t.groups[1] = groups[1];
 477             }
 478             (false, Submatches) => {
 479                 for (slot, val) in t.groups.iter_mut().zip(groups.iter()) {
 480                     *slot = *val;
 481                 }
 482             }
 483         }
 484         self.sparse[pc] = self.size;
 485         self.size += 1;
 486     }
 487
 488     #[inline]
 489     fn contains(&self, pc: uint) -> bool {
 490         let s = self.sparse[pc];
 491         s < self.size && self.queue[s].pc == pc
 492     }
 493
 494     #[inline]
 495     fn empty(&mut self) {
 496         self.size = 0;
 497     }
 498
 499     #[inline]
 500     fn pc(&self, i: uint) -> uint {
 501         self.queue[i].pc
 502     }
 503
 504     #[inline]
 505     fn groups<'r>(&'r mut self, i: uint) -> &'r mut [Option<uint>] {
 506         self.queue[i].groups.as_mut_slice()
 507     }
 508 }
 509
 510 /// Returns true if the character is a word character, according to the
 511 /// (Unicode friendly) Perl character class '\w'.
 512 /// Note that this is only use for testing word boundaries. The actual '\w'
 513 /// is encoded as a CharClass instruction.
 514 pub fn is_word(c: Option<char>) -> bool {
 515     let c = match c {
 516         None => return false,
 517         Some(c) => c,
 518     };
 519     // Try the common ASCII case before invoking binary search.
 520     match c {
 521         '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true,
 522         _ => PERLW.binary_search_by(|&(start, end)| {
 523             if c >= start && c <= end {
 524                 Equal
 525             } else if start > c {
 526                 Greater
 527             } else {
 528                 Less
 529             }
 530         }).is_ok()
 531     }
 532 }
 533
 534 /// Given a character and a single character class range, return an ordering
 535 /// indicating whether the character is less than the start of the range,
 536 /// in the range (inclusive) or greater than the end of the range.
 537 ///
 538 /// If `casei` is `true`, then this ordering is computed case insensitively.
 539 ///
 540 /// This function is meant to be used with a binary search.
 541 #[inline]
 542 fn class_cmp(casei: bool, mut textc: char,
 543              (mut start, mut end): (char, char)) -> Ordering {
 544     if casei {
 545         // FIXME: This is pretty ridiculous. All of this case conversion
 546         // can be moved outside this function:
 547         // 1) textc should be uppercased outside the bsearch.
 548         // 2) the character class itself should be uppercased either in the
 549         //    parser or the compiler.
 550         // FIXME: This is too simplistic for correct Unicode support.
 551         //        See also: char_eq
 552         textc = textc.to_uppercase();
 553         start = start.to_uppercase();
 554         end = end.to_uppercase();
 555     }
 556     if textc >= start && textc <= end {
 557         Equal
 558     } else if start > textc {
 559         Greater
 560     } else {
 561         Less
 562     }
 563 }
 564
 565 /// Returns the starting location of `needle` in `haystack`.
 566 /// If `needle` is not in `haystack`, then `None` is returned.
 567 ///
 568 /// Note that this is using a naive substring algorithm.
 569 #[inline]
 570 pub fn find_prefix(needle: &[u8], haystack: &[u8]) -> Option<uint> {
 571     let (hlen, nlen) = (haystack.len(), needle.len());
 572     if nlen > hlen || nlen == 0 {
 573         return None
 574     }
 575     for (offset, window) in haystack.windows(nlen).enumerate() {
 576         if window == needle {
 577             return Some(offset)
 578         }
 579     }
 580     None
 581 }