StrSearcher: Update substring search to use the Two Way algorithm

author Ulrik Sverdrup <root@localhost>

Sun, 14 Jun 2015 21:17:17 +0000 (23:17 +0200)

committer Ulrik Sverdrup <root@localhost>

Sun, 21 Jun 2015 17:58:50 +0000 (19:58 +0200)
author Ulrik Sverdrup <root@localhost>
Sun, 14 Jun 2015 21:17:17 +0000 (23:17 +0200)
committer Ulrik Sverdrup <root@localhost>
Sun, 21 Jun 2015 17:58:50 +0000 (19:58 +0200)
diff --git a/src/libcollectionstest/str.rs b/src/libcollectionstest/str.rs

index 3f32136bc263974c62a1dfd6dea73f9879d88183..87a018ced195accee2164df7b341a5c85d1913a3 100644 (file)
--- a/src/libcollectionstest/str.rs
+++ b/src/libcollectionstest/str.rs
@@ -705,7 +705,7 @@ fn test_split_at() {
  #[should_panic]
  fn test_split_at_boundscheck() {
      let s = "ศไทย中华Việt Nam";
-    let (a, b) = s.split_at(1);
+    s.split_at(1);
  }
  
  #[test]
@@ -1820,6 +1820,14 @@ fn cmp_search_to_vec<'a, P: Pattern<'a>>(rev: bool, pat: P, haystack: &'a str,
          Match (4, 6),
          Reject(6, 7),
      ]);
+    make_test!(str_searcher_ascii_haystack_seq, "bb", "abbcbbbbd", [
+        Reject(0, 1),
+        Match (1, 3),
+        Reject(3, 4),
+        Match (4, 6),
+        Match (6, 8),
+        Reject(8, 9),
+    ]);
      make_test!(str_searcher_empty_needle_ascii_haystack, "", "abbcbbd", [
          Match (0, 0),
          Reject(0, 1),
diff --git a/src/libcore/str/mod.rs b/src/libcore/str/mod.rs

index 5a621176c4a812b45976d9744964f2ea31eeab62..bd6e1a4063abda86a8ab293dbdaed054b62c21b0 100644 (file)
--- a/src/libcore/str/mod.rs
+++ b/src/libcore/str/mod.rs
@@ -15,13 +15,12 @@
  #![doc(primitive = "str")]
  #![stable(feature = "rust1", since = "1.0.0")]
  
-use self::OldSearcher::{TwoWay, TwoWayLong};
  use self::pattern::Pattern;
  use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher};
  
  use char::CharExt;
  use clone::Clone;
-use cmp::{self, Eq};
+use cmp::Eq;
  use convert::AsRef;
  use default::Default;
  use fmt;
@@ -33,7 +32,6 @@
  use raw::{Repr, Slice};
  use result::Result::{self, Ok, Err};
  use slice::{self, SliceExt};
-use usize;
  
  pub mod pattern;
  
@@ -870,301 +868,6 @@ fn next_back(&mut self) -> Option<&'a str> {
      }
  }
  
-/// The internal state of an iterator that searches for matches of a substring
-/// within a larger string using two-way search
-#[derive(Clone)]
-struct TwoWaySearcher {
-    // constants
-    crit_pos: usize,
-    period: usize,
-    byteset: u64,
-
-    // variables
-    position: usize,
-    memory: usize
-}
-
-/*
-    This is the Two-Way search algorithm, which was introduced in the paper:
-    Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
-
-    Here's some background information.
-
-    A *word* is a string of symbols. The *length* of a word should be a familiar
-    notion, and here we denote it for any word x by |x|.
-    (We also allow for the possibility of the *empty word*, a word of length zero).
-
-    If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
-    *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
-    For example, both 1 and 2 are periods for the string "aa". As another example,
-    the only period of the string "abcd" is 4.
-
-    We denote by period(x) the *smallest* period of x (provided that x is non-empty).
-    This is always well-defined since every non-empty word x has at least one period,
-    |x|. We sometimes call this *the period* of x.
-
-    If u, v and x are words such that x = uv, where uv is the concatenation of u and
-    v, then we say that (u, v) is a *factorization* of x.
-
-    Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
-    that both of the following hold
-
-      - either w is a suffix of u or u is a suffix of w
-      - either w is a prefix of v or v is a prefix of w
-
-    then w is said to be a *repetition* for the factorization (u, v).
-
-    Just to unpack this, there are four possibilities here. Let w = "abc". Then we
-    might have:
-
-      - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
-      - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
-      - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
-      - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
-
-    Note that the word vu is a repetition for any factorization (u,v) of x = uv,
-    so every factorization has at least one repetition.
-
-    If x is a string and (u, v) is a factorization for x, then a *local period* for
-    (u, v) is an integer r such that there is some word w such that |w| = r and w is
-    a repetition for (u, v).
-
-    We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
-    call this *the local period* of (u, v). Provided that x = uv is non-empty, this
-    is well-defined (because each non-empty word has at least one factorization, as
-    noted above).
-
-    It can be proven that the following is an equivalent definition of a local period
-    for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
-    all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
-    defined. (i.e. i > 0 and i + r < |x|).
-
-    Using the above reformulation, it is easy to prove that
-
-        1 <= local_period(u, v) <= period(uv)
-
-    A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
-    *critical factorization*.
-
-    The algorithm hinges on the following theorem, which is stated without proof:
-
-    **Critical Factorization Theorem** Any word x has at least one critical
-    factorization (u, v) such that |u| < period(x).
-
-    The purpose of maximal_suffix is to find such a critical factorization.
-
-*/
-impl TwoWaySearcher {
-    #[allow(dead_code)]
-    fn new(needle: &[u8]) -> TwoWaySearcher {
-        let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
-        let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
-
-        let (crit_pos, period) =
-            if crit_pos_false > crit_pos_true {
-                (crit_pos_false, period_false)
-            } else {
-                (crit_pos_true, period_true)
-            };
-
-        // This isn't in the original algorithm, as far as I'm aware.
-        let byteset = needle.iter()
-                            .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
-
-        // A particularly readable explanation of what's going on here can be found
-        // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
-        // see the code for "Algorithm CP" on p. 323.
-        //
-        // What's going on is we have some critical factorization (u, v) of the
-        // needle, and we want to determine whether u is a suffix of
-        // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
-        // "Algorithm CP2", which is optimized for when the period of the needle
-        // is large.
-        if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
-            TwoWaySearcher {
-                crit_pos: crit_pos,
-                period: period,
-                byteset: byteset,
-
-                position: 0,
-                memory: 0
-            }
-        } else {
-            TwoWaySearcher {
-                crit_pos: crit_pos,
-                period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
-                byteset: byteset,
-
-                position: 0,
-                memory: usize::MAX // Dummy value to signify that the period is long
-            }
-        }
-    }
-
-    // One of the main ideas of Two-Way is that we factorize the needle into
-    // two halves, (u, v), and begin trying to find v in the haystack by scanning
-    // left to right. If v matches, we try to match u by scanning right to left.
-    // How far we can jump when we encounter a mismatch is all based on the fact
-    // that (u, v) is a critical factorization for the needle.
-    #[inline]
-    fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
-            -> Option<(usize, usize)> {
-        'search: loop {
-            // Check that we have room to search in
-            if self.position + needle.len() > haystack.len() {
-                return None;
-            }
-
-            // Quickly skip by large portions unrelated to our substring
-            if (self.byteset >>
-                    ((haystack[self.position + needle.len() - 1] & 0x3f)
-                     as usize)) & 1 == 0 {
-                self.position += needle.len();
-                if !long_period {
-                    self.memory = 0;
-                }
-                continue 'search;
-            }
-
-            // See if the right part of the needle matches
-            let start = if long_period { self.crit_pos }
-                        else { cmp::max(self.crit_pos, self.memory) };
-            for i in start..needle.len() {
-                if needle[i] != haystack[self.position + i] {
-                    self.position += i - self.crit_pos + 1;
-                    if !long_period {
-                        self.memory = 0;
-                    }
-                    continue 'search;
-                }
-            }
-
-            // See if the left part of the needle matches
-            let start = if long_period { 0 } else { self.memory };
-            for i in (start..self.crit_pos).rev() {
-                if needle[i] != haystack[self.position + i] {
-                    self.position += self.period;
-                    if !long_period {
-                        self.memory = needle.len() - self.period;
-                    }
-                    continue 'search;
-                }
-            }
-
-            // We have found a match!
-            let match_pos = self.position;
-            self.position += needle.len(); // add self.period for all matches
-            if !long_period {
-                self.memory = 0; // set to needle.len() - self.period for all matches
-            }
-            return Some((match_pos, match_pos + needle.len()));
-        }
-    }
-
-    // Computes a critical factorization (u, v) of `arr`.
-    // Specifically, returns (i, p), where i is the starting index of v in some
-    // critical factorization (u, v) and p = period(v)
-    #[inline]
-    #[allow(dead_code)]
-    #[allow(deprecated)]
-    fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
-        let mut left: usize = !0; // Corresponds to i in the paper
-        let mut right = 0; // Corresponds to j in the paper
-        let mut offset = 1; // Corresponds to k in the paper
-        let mut period = 1; // Corresponds to p in the paper
-
-        while right + offset < arr.len() {
-            let a;
-            let b;
-            if reversed {
-                a = arr[left.wrapping_add(offset)];
-                b = arr[right + offset];
-            } else {
-                a = arr[right + offset];
-                b = arr[left.wrapping_add(offset)];
-            }
-            if a < b {
-                // Suffix is smaller, period is entire prefix so far.
-                right += offset;
-                offset = 1;
-                period = right.wrapping_sub(left);
-            } else if a == b {
-                // Advance through repetition of the current period.
-                if offset == period {
-                    right += offset;
-                    offset = 1;
-                } else {
-                    offset += 1;
-                }
-            } else {
-                // Suffix is larger, start over from current location.
-                left = right;
-                right += 1;
-                offset = 1;
-                period = 1;
-            }
-        }
-        (left.wrapping_add(1), period)
-    }
-}
-
-/// The internal state of an iterator that searches for matches of a substring
-/// within a larger string using a dynamically chosen search algorithm
-#[derive(Clone)]
-// NB: This is kept around for convenience because
-// it is planned to be used again in the future
-enum OldSearcher {
-    TwoWay(TwoWaySearcher),
-    TwoWayLong(TwoWaySearcher),
-}
-
-impl OldSearcher {
-    #[allow(dead_code)]
-    fn new(haystack: &[u8], needle: &[u8]) -> OldSearcher {
-        if needle.is_empty() {
-            // Handle specially
-            unimplemented!()
-        // FIXME: Tune this.
-        // FIXME(#16715): This unsigned integer addition will probably not
-        // overflow because that would mean that the memory almost solely
-        // consists of the needle. Needs #16715 to be formally fixed.
-        } else if needle.len() + 20 > haystack.len() {
-            // Use naive searcher
-            unimplemented!()
-        } else {
-            let searcher = TwoWaySearcher::new(needle);
-            if searcher.memory == usize::MAX { // If the period is long
-                TwoWayLong(searcher)
-            } else {
-                TwoWay(searcher)
-            }
-        }
-    }
-}
-
-#[derive(Clone)]
-// NB: This is kept around for convenience because
-// it is planned to be used again in the future
-struct OldMatchIndices<'a, 'b> {
-    // constants
-    haystack: &'a str,
-    needle: &'b str,
-    searcher: OldSearcher
-}
-
-impl<'a, 'b>  OldMatchIndices<'a, 'b> {
-    #[inline]
-    #[allow(dead_code)]
-    fn next(&mut self) -> Option<(usize, usize)> {
-        match self.searcher {
-            TwoWay(ref mut searcher)
-                => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
-            TwoWayLong(ref mut searcher)
-                => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true),
-        }
-    }
-}
-
  /*
  Section: Comparing strings
  */
diff --git a/src/libcore/str/pattern.rs b/src/libcore/str/pattern.rs

index 8bdbab55211d8cce2dc735d1d3fd670450098d15..de40a57fe9c17b2f6ba8e553f3b0d43aa14d1aad 100644 (file)
--- a/src/libcore/str/pattern.rs
+++ b/src/libcore/str/pattern.rs
@@ -17,6 +17,8 @@
              reason = "API not fully fleshed out and ready to be stabilized")]
  
  use prelude::*;
+use core::cmp;
+use usize;
  
  // Pattern
  
@@ -345,141 +347,16 @@ impl<'a, C: CharEq> DoubleEndedSearcher<'a> for CharEqSearcher<'a, C> {}
  // Impl for &str
  /////////////////////////////////////////////////////////////////////////////
  
-// Todo: Optimize the naive implementation here
-
-/// Associated type for `<&str as Pattern<'a>>::Searcher`.
-#[derive(Clone)]
-pub struct StrSearcher<'a, 'b> {
-    haystack: &'a str,
-    needle: &'b str,
-    start: usize,
-    end: usize,
-    state: State,
-}
-
-#[derive(Clone, PartialEq)]
-enum State { Done, NotDone, Reject(usize, usize) }
-impl State {
-    #[inline] fn done(&self) -> bool { *self == State::Done }
-    #[inline] fn take(&mut self) -> State { ::mem::replace(self, State::NotDone) }
-}
-
  /// Non-allocating substring search.
  ///
-/// Will handle the pattern `""` as returning empty matches at each utf8
+/// Will handle the pattern `""` as returning empty matches at each character
  /// boundary.
  impl<'a, 'b> Pattern<'a> for &'b str {
      type Searcher = StrSearcher<'a, 'b>;
  
      #[inline]
      fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
-        StrSearcher {
-            haystack: haystack,
-            needle: self,
-            start: 0,
-            end: haystack.len(),
-            state: State::NotDone,
-        }
-    }
-}
-
-unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b>  {
-    #[inline]
-    fn haystack(&self) -> &'a str {
-        self.haystack
-    }
-
-    #[inline]
-    fn next(&mut self) -> SearchStep {
-        str_search_step(self,
-        |m: &mut StrSearcher| {
-            // Forward step for empty needle
-            let current_start = m.start;
-            if !m.state.done() {
-                m.start = m.haystack.char_range_at(current_start).next;
-                m.state = State::Reject(current_start, m.start);
-            }
-            SearchStep::Match(current_start, current_start)
-        },
-        |m: &mut StrSearcher| {
-            // Forward step for nonempty needle
-            let current_start = m.start;
-            // Compare byte window because this might break utf8 boundaries
-            let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()];
-            if possible_match == m.needle.as_bytes() {
-                m.start += m.needle.len();
-                SearchStep::Match(current_start, m.start)
-            } else {
-                // Skip a char
-                let haystack_suffix = &m.haystack[m.start..];
-                m.start += haystack_suffix.chars().next().unwrap().len_utf8();
-                SearchStep::Reject(current_start, m.start)
-            }
-        })
-    }
-}
-
-unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b>  {
-    #[inline]
-    fn next_back(&mut self) -> SearchStep {
-        str_search_step(self,
-        |m: &mut StrSearcher| {
-            // Backward step for empty needle
-            let current_end = m.end;
-            if !m.state.done() {
-                m.end = m.haystack.char_range_at_reverse(current_end).next;
-                m.state = State::Reject(m.end, current_end);
-            }
-            SearchStep::Match(current_end, current_end)
-        },
-        |m: &mut StrSearcher| {
-            // Backward step for nonempty needle
-            let current_end = m.end;
-            // Compare byte window because this might break utf8 boundaries
-            let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end];
-            if possible_match == m.needle.as_bytes() {
-                m.end -= m.needle.len();
-                SearchStep::Match(m.end, current_end)
-            } else {
-                // Skip a char
-                let haystack_prefix = &m.haystack[..m.end];
-                m.end -= haystack_prefix.chars().rev().next().unwrap().len_utf8();
-                SearchStep::Reject(m.end, current_end)
-            }
-        })
-    }
-}
-
-// Helper function for encapsulating the common control flow
-// of doing a search step from the front or doing a search step from the back
-fn str_search_step<F, G>(mut m: &mut StrSearcher,
-                         empty_needle_step: F,
-                         nonempty_needle_step: G) -> SearchStep
-    where F: FnOnce(&mut StrSearcher) -> SearchStep,
-          G: FnOnce(&mut StrSearcher) -> SearchStep
-{
-    if m.state.done() {
-        SearchStep::Done
-    } else if m.needle.is_empty() && m.start <= m.end {
-        // Case for needle == ""
-        if let State::Reject(a, b) = m.state.take() {
-            SearchStep::Reject(a, b)
-        } else {
-            if m.start == m.end {
-                m.state = State::Done;
-            }
-            empty_needle_step(&mut m)
-        }
-    } else if m.start + m.needle.len() <= m.end {
-        // Case for needle != ""
-        nonempty_needle_step(&mut m)
-    } else if m.start < m.end {
-        // Remaining slice shorter than needle, reject it
-        m.state = State::Done;
-        SearchStep::Reject(m.start, m.end)
-    } else {
-        m.state = State::Done;
-        SearchStep::Done
+        StrSearcher::new(haystack, self)
      }
  }
  
@@ -633,3 +510,464 @@ impl<'a, F> Pattern<'a> for F where F: FnMut(char) -> bool {
  impl<'a, 'b> Pattern<'a> for &'b &'b str {
      pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
  }
+
+
+/////////////////////////////////////////////////////////////////////////////
+// Two Way substring searcher
+/////////////////////////////////////////////////////////////////////////////
+
+#[derive(Clone, Debug)]
+/// Associated type for `<&str as Pattern<'a>>::Searcher`.
+pub struct StrSearcher<'a, 'b> {
+    haystack: &'a str,
+    needle: &'b str,
+
+    searcher: StrSearcherImpl,
+}
+
+#[derive(Clone, Debug)]
+enum StrSearcherImpl {
+    Empty(EmptyNeedle),
+    TwoWay {
+        last_match_fw: Option<(usize, usize)>,
+        last_match_bw: Option<(usize, usize)>,
+        searcher: TwoWaySearcher,
+    }
+}
+
+#[derive(Clone, Debug)]
+struct EmptyNeedle {
+    position: usize,
+    end: usize,
+    is_match_fw: bool,
+    is_match_bw: bool,
+}
+
+impl<'a, 'b> StrSearcher<'a, 'b> {
+    fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
+        if needle.is_empty() {
+            StrSearcher {
+                haystack: haystack,
+                needle: needle,
+                searcher: StrSearcherImpl::Empty(EmptyNeedle {
+                    position: 0,
+                    end: haystack.len(),
+                    is_match_fw: true,
+                    is_match_bw: true,
+                }),
+            }
+        } else {
+            StrSearcher {
+                haystack: haystack,
+                needle: needle,
+                searcher: StrSearcherImpl::TwoWay {
+                    last_match_fw: None,
+                    last_match_bw: None,
+                    searcher: TwoWaySearcher::new(needle.as_bytes(), haystack.len())
+                },
+            }
+        }
+    }
+}
+
+unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
+    fn haystack(&self) -> &'a str { self.haystack }
+
+    #[inline]
+    fn next(&mut self) -> SearchStep {
+        match self.searcher {
+            StrSearcherImpl::Empty(ref mut searcher) => {
+                // empty needle rejects every char and matches every empty string between them
+                let is_match = searcher.is_match_fw;
+                searcher.is_match_fw = !searcher.is_match_fw;
+                let pos = searcher.position;
+                match self.haystack[pos..].chars().next() {
+                    _ if is_match => SearchStep::Match(pos, pos),
+                    None => SearchStep::Done,
+                    Some(ch) => {
+                        searcher.position += ch.len_utf8();
+                        SearchStep::Reject(pos, searcher.position)
+                    }
+                }
+            }
+            StrSearcherImpl::TwoWay { ref mut last_match_fw, ref mut searcher, .. } => {
+                // TwoWaySearcher produces valid *Match* indices that split at char boundaries
+                // as long as it does correct matching and that haystack and needle are
+                // valid UTF-8
+                // *Rejects* fall on the same indices (the intervals between matches)
+                // so they are always on character boundaries.
+                if let Some((a, b)) = last_match_fw.take() {
+                    return SearchStep::Match(a, b);
+                }
+                let last_pos = searcher.position;
+                let is_long = searcher.memory == usize::MAX;
+                let next_match = searcher.next(self.haystack.as_bytes(),
+                                               self.needle.as_bytes(),
+                                               is_long);
+                match next_match {
+                    None => if last_pos != self.haystack.len() {
+                        SearchStep::Reject(last_pos, self.haystack.len())
+                    } else {
+                        SearchStep::Done
+                    },
+                    Some((a, b)) => {
+                        if a == last_pos {
+                            SearchStep::Match(a, b)
+                        } else {
+                            *last_match_fw = Some((a, b));
+                            SearchStep::Reject(last_pos, a)
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+}
+unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
+    #[inline]
+    fn next_back(&mut self) -> SearchStep {
+        match self.searcher {
+            StrSearcherImpl::Empty(ref mut searcher) => {
+                let is_match = searcher.is_match_bw;
+                searcher.is_match_bw = !searcher.is_match_bw;
+                let end = searcher.end;
+                match self.haystack[..end].chars().next_back() {
+                    _ if is_match => SearchStep::Match(end, end),
+                    None => SearchStep::Done,
+                    Some(ch) => {
+                        searcher.end -= ch.len_utf8();
+                        SearchStep::Reject(searcher.end, end)
+                    }
+                }
+            }
+            StrSearcherImpl::TwoWay { ref mut last_match_bw, ref mut searcher, .. } => {
+                if let Some((a, b)) = last_match_bw.take() {
+                    return SearchStep::Match(a, b);
+                }
+                let last_end = searcher.end;
+                let next_match = searcher.next_back(self.haystack.as_bytes(),
+                                                    self.needle.as_bytes());
+                match next_match {
+                    None => if last_end != 0 {
+                        SearchStep::Reject(0, last_end)
+                    } else {
+                        SearchStep::Done
+                    },
+                    Some((a, b)) => {
+                        if b == last_end {
+                            SearchStep::Match(a, b)
+                        } else {
+                            *last_match_bw = Some((a, b));
+                            SearchStep::Reject(b, last_end)
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+/// The internal state of an iterator that searches for matches of a substring
+/// within a larger string using two-way search
+#[derive(Clone, Debug)]
+struct TwoWaySearcher {
+    // constants
+    crit_pos: usize,
+    period: usize,
+    byteset: u64,
+
+    // variables
+    position: usize,
+    end: usize,
+    memory: usize
+}
+
+/*
+    This is the Two-Way search algorithm, which was introduced in the paper:
+    Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+
+    Here's some background information.
+
+    A *word* is a string of symbols. The *length* of a word should be a familiar
+    notion, and here we denote it for any word x by |x|.
+    (We also allow for the possibility of the *empty word*, a word of length zero).
+
+    If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
+    *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
+    For example, both 1 and 2 are periods for the string "aa". As another example,
+    the only period of the string "abcd" is 4.
+
+    We denote by period(x) the *smallest* period of x (provided that x is non-empty).
+    This is always well-defined since every non-empty word x has at least one period,
+    |x|. We sometimes call this *the period* of x.
+
+    If u, v and x are words such that x = uv, where uv is the concatenation of u and
+    v, then we say that (u, v) is a *factorization* of x.
+
+    Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
+    that both of the following hold
+
+      - either w is a suffix of u or u is a suffix of w
+      - either w is a prefix of v or v is a prefix of w
+
+    then w is said to be a *repetition* for the factorization (u, v).
+
+    Just to unpack this, there are four possibilities here. Let w = "abc". Then we
+    might have:
+
+      - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
+      - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
+      - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
+      - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
+
+    Note that the word vu is a repetition for any factorization (u,v) of x = uv,
+    so every factorization has at least one repetition.
+
+    If x is a string and (u, v) is a factorization for x, then a *local period* for
+    (u, v) is an integer r such that there is some word w such that |w| = r and w is
+    a repetition for (u, v).
+
+    We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
+    call this *the local period* of (u, v). Provided that x = uv is non-empty, this
+    is well-defined (because each non-empty word has at least one factorization, as
+    noted above).
+
+    It can be proven that the following is an equivalent definition of a local period
+    for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
+    all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
+    defined. (i.e. i > 0 and i + r < |x|).
+
+    Using the above reformulation, it is easy to prove that
+
+        1 <= local_period(u, v) <= period(uv)
+
+    A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
+    *critical factorization*.
+
+    The algorithm hinges on the following theorem, which is stated without proof:
+
+    **Critical Factorization Theorem** Any word x has at least one critical
+    factorization (u, v) such that |u| < period(x).
+
+    The purpose of maximal_suffix is to find such a critical factorization.
+
+*/
+impl TwoWaySearcher {
+    fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
+        let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
+        let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
+
+        let (crit_pos, period) =
+            if crit_pos_false > crit_pos_true {
+                (crit_pos_false, period_false)
+            } else {
+                (crit_pos_true, period_true)
+            };
+
+        // This isn't in the original algorithm, as far as I'm aware.
+        let byteset = needle.iter()
+                            .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
+
+        // A particularly readable explanation of what's going on here can be found
+        // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
+        // see the code for "Algorithm CP" on p. 323.
+        //
+        // What's going on is we have some critical factorization (u, v) of the
+        // needle, and we want to determine whether u is a suffix of
+        // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
+        // "Algorithm CP2", which is optimized for when the period of the needle
+        // is large.
+        if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
+            // short period case
+            TwoWaySearcher {
+                crit_pos: crit_pos,
+                period: period,
+                byteset: byteset,
+
+                position: 0,
+                end: end,
+                memory: 0
+            }
+        } else {
+            // long period case
+            // we have an approximation to the actual period, and don't use memory.
+            TwoWaySearcher {
+                crit_pos: crit_pos,
+                period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
+                byteset: byteset,
+
+                position: 0,
+                end: end,
+                memory: usize::MAX // Dummy value to signify that the period is long
+            }
+        }
+    }
+
+    #[inline(always)]
+    fn byteset_contains(&self, byte: u8) -> bool {
+        (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
+    }
+
+    // One of the main ideas of Two-Way is that we factorize the needle into
+    // two halves, (u, v), and begin trying to find v in the haystack by scanning
+    // left to right. If v matches, we try to match u by scanning right to left.
+    // How far we can jump when we encounter a mismatch is all based on the fact
+    // that (u, v) is a critical factorization for the needle.
+    #[inline]
+    fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
+            -> Option<(usize, usize)> {
+        // `next()` uses `self.position` as its cursor
+        'search: loop {
+            // Check that we have room to search in
+            if self.position + needle.len() > haystack.len() {
+                self.position = haystack.len();
+                return None;
+            }
+
+            // Quickly skip by large portions unrelated to our substring
+            if !self.byteset_contains(haystack[self.position + needle.len() - 1]) {
+                self.position += needle.len();
+                if !long_period {
+                    self.memory = 0;
+                }
+                continue 'search;
+            }
+
+            // See if the right part of the needle matches
+            let start = if long_period { self.crit_pos }
+                        else { cmp::max(self.crit_pos, self.memory) };
+            for i in start..needle.len() {
+                if needle[i] != haystack[self.position + i] {
+                    self.position += i - self.crit_pos + 1;
+                    if !long_period {
+                        self.memory = 0;
+                    }
+                    continue 'search;
+                }
+            }
+
+            // See if the left part of the needle matches
+            let start = if long_period { 0 } else { self.memory };
+            for i in (start..self.crit_pos).rev() {
+                if needle[i] != haystack[self.position + i] {
+                    self.position += self.period;
+                    if !long_period {
+                        self.memory = needle.len() - self.period;
+                    }
+                    continue 'search;
+                }
+            }
+
+            // We have found a match!
+            let match_pos = self.position;
+
+            // Note: add self.period instead of needle.len() to have overlapping matches
+            self.position += needle.len();
+            if !long_period {
+                self.memory = 0; // set to needle.len() - self.period for overlapping matches
+            }
+
+            return Some((match_pos, match_pos + needle.len()));
+        }
+    }
+
+    // Follows the ideas in `next()`.
+    //
+    // All the definitions are completely symmetrical, with period(x) = period(reverse(x))
+    // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
+    // is a critical factorization, so is (reverse(v), reverse(u)). Similarly,
+    // the "period" stored in self.period is the real period if long_period is
+    // false, and so is still valid for a reversed needle, and if long_period is
+    // true, all the algorithm requires is that self.period is less than or
+    // equal to the real period, which must be true for the forward case anyway.
+    //
+    // To search in reverse through the haystack, we search forward through
+    // a reversed haystack with a reversed needle, and the above paragraph shows
+    // that the precomputed parameters can be left alone.
+    #[inline]
+    fn next_back(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(usize, usize)> {
+        // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
+        // are independent.
+        'search: loop {
+            // Check that we have room to search in
+            if needle.len() > self.end {
+                self.end = 0;
+                return None;
+            }
+
+            // Quickly skip by large portions unrelated to our substring
+            if !self.byteset_contains(haystack[self.end - needle.len()]) {
+                self.end -= needle.len();
+                continue 'search;
+            }
+
+            // See if the left part of the needle matches
+            for i in (0..self.crit_pos).rev() {
+                if needle[i] != haystack[self.end - needle.len() + i] {
+                    self.end -= self.crit_pos - i;
+                    continue 'search;
+                }
+            }
+
+            // See if the right part of the needle matches
+            for i in self.crit_pos..needle.len() {
+                if needle[i] != haystack[self.end - needle.len() + i] {
+                    self.end -= self.period;
+                    continue 'search;
+                }
+            }
+
+            // We have found a match!
+            let match_pos = self.end - needle.len();
+            // Note: sub self.period instead of needle.len() to have overlapping matches
+            self.end -= needle.len();
+
+            return Some((match_pos, match_pos + needle.len()));
+        }
+    }
+
+    // Computes a critical factorization (u, v) of `arr`.
+    // Specifically, returns (i, p), where i is the starting index of v in some
+    // critical factorization (u, v) and p = period(v)
+    #[inline]
+    fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
+        let mut left: usize = !0; // Corresponds to i in the paper
+        let mut right = 0; // Corresponds to j in the paper
+        let mut offset = 1; // Corresponds to k in the paper
+        let mut period = 1; // Corresponds to p in the paper
+
+        while right + offset < arr.len() {
+            let a;
+            let b;
+            if reversed {
+                a = arr[left.wrapping_add(offset)];
+                b = arr[right + offset];
+            } else {
+                a = arr[right + offset];
+                b = arr[left.wrapping_add(offset)];
+            }
+            if a < b {
+                // Suffix is smaller, period is entire prefix so far.
+                right += offset;
+                offset = 1;
+                period = right.wrapping_sub(left);
+            } else if a == b {
+                // Advance through repetition of the current period.
+                if offset == period {
+                    right += offset;
+                    offset = 1;
+                } else {
+                    offset += 1;
+                }
+            } else {
+                // Suffix is larger, start over from current location.
+                left = right;
+                right += 1;
+                offset = 1;
+                period = 1;
+            }
+        }
+        (left.wrapping_add(1), period)
+    }
+}
author	Ulrik Sverdrup <root@localhost>
	Sun, 14 Jun 2015 21:17:17 +0000 (23:17 +0200)
committer	Ulrik Sverdrup <root@localhost>
	Sun, 21 Jun 2015 17:58:50 +0000 (19:58 +0200)
src/libcollectionstest/str.rs		patch \| blob \| history
src/libcore/str/mod.rs		patch \| blob \| history
src/libcore/str/pattern.rs		patch \| blob \| history