if searcher.end == 0 {
return SearchStep::Done;
}
+ let is_long = searcher.memory == usize::MAX;
match searcher.next_back::<RejectAndMatch>(self.haystack.as_bytes(),
- self.needle.as_bytes())
+ self.needle.as_bytes(),
+ is_long)
{
SearchStep::Reject(mut a, b) => {
// skip to next char boundary
}
}
StrSearcherImpl::TwoWay(ref mut searcher) => {
- searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
- self.needle.as_bytes())
+ let is_long = searcher.memory == usize::MAX;
+ if is_long {
+ searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
+ self.needle.as_bytes(),
+ true)
+ } else {
+ searcher.next_back::<MatchOnly>(self.haystack.as_bytes(),
+ self.needle.as_bytes(),
+ false)
+ }
}
}
}
#[derive(Clone, Debug)]
struct TwoWaySearcher {
// constants
+ /// critical factorization index
crit_pos: usize,
+ /// critical factorization index for reversed needle
+ crit_pos_back: usize,
period: usize,
+ /// `byteset` is an extension (not part of the two way algorithm);
+ /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
+ /// to a (byte & 63) == j present in the needle.
byteset: u64,
// variables
position: usize,
end: usize,
- memory: usize
+ memory: usize,
+ memory_back: usize,
}
/*
The purpose of maximal_suffix is to find such a critical factorization.
+ If the period is short, compute another factorization x = u' v' to use
+ for reverse search, chosen instead so that |v'| < period(x).
+
*/
impl TwoWaySearcher {
fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
(crit_pos_true, period_true)
};
- // This isn't in the original algorithm, as far as I'm aware.
- let byteset = needle.iter()
- .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
-
// A particularly readable explanation of what's going on here can be found
// in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
// see the code for "Algorithm CP" on p. 323.
// "Algorithm CP2", which is optimized for when the period of the needle
// is large.
if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
- // short period case
+ // short period case -- the period is exact
+ let byteset = needle[..period].iter()
+ .fold(0, |a, &b| (1 << (b & 0x3f)) | a);
+
+ // compute a separate critical factorization for the reversed needle
+ // x = u' v' where |v'| < period(x).
+ //
+ // This is sped up by the period being known already.
+ // Note that a case like x = "acba" may be factored exactly forwards
+ // (crit_pos = 1, period = 3) while being factored with approximate
+ // period in reverse (crit_pos = 2, period = 2). We use the given
+ // reverse factorization but keep the exact period.
+ let crit_pos_back = needle.len() - cmp::max(
+ TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
+ TwoWaySearcher::reverse_maximal_suffix(needle, period, true));
+
TwoWaySearcher {
crit_pos: crit_pos,
+ crit_pos_back: crit_pos_back,
period: period,
byteset: byteset,
position: 0,
end: end,
- memory: 0
+ memory: 0,
+ // index into the needle at or after which bytes are already known to match
+ memory_back: needle.len(),
}
} else {
- // long period case
- // we have an approximation to the actual period, and don't use memory.
+ // long period case -- we have an approximation to the actual period,
+ // and don't use memorization.
+
+ let byteset = needle.iter()
+ .fold(0, |a, &b| (1 << (b & 0x3f)) | a);
+
TwoWaySearcher {
crit_pos: crit_pos,
+ crit_pos_back: crit_pos,
period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
byteset: byteset,
position: 0,
end: end,
- memory: usize::MAX // Dummy value to signify that the period is long
+ memory: usize::MAX, // Dummy value to signify that the period is long
+ memory_back: usize::MAX,
}
}
}
// Follows the ideas in `next()`.
//
- // All the definitions are completely symmetrical, with period(x) = period(reverse(x))
+ // The definitions are symmetrical, with period(x) = period(reverse(x))
// and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
- // is a critical factorization, so is (reverse(v), reverse(u)). Similarly,
- // the "period" stored in self.period is the real period if long_period is
- // false, and so is still valid for a reversed needle, and if long_period is
- // true, all the algorithm requires is that self.period is less than or
- // equal to the real period, which must be true for the forward case anyway.
+ // is a critical factorization, so is (reverse(v), reverse(u)).
+ //
+ // For the short period case, using memorization, we rely on |u| < period(x).
+ // For this case we have computed a critical factorization x = u' v'
+ // where |v'| < period(x) instead (field `crit_pos_back`).
//
// To search in reverse through the haystack, we search forward through
- // a reversed haystack with a reversed needle, and the above paragraph shows
- // that the precomputed parameters can be left alone.
+ // a reversed haystack with a reversed needle, matching first u' and then v'.
#[inline]
- fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8])
+ fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
-> S::Output
where S: TwoWayStrategy
{
// Quickly skip by large portions unrelated to our substring
if !self.byteset_contains(haystack[self.end - needle.len()]) {
self.end -= needle.len();
+ if !long_period {
+ self.memory_back = needle.len();
+ }
continue 'search;
}
// See if the left part of the needle matches
- for i in (0..self.crit_pos).rev() {
+ let crit = if long_period { self.crit_pos_back }
+ else { cmp::min(self.crit_pos_back, self.memory_back) };
+ for i in (0..crit).rev() {
if needle[i] != haystack[self.end - needle.len() + i] {
- self.end -= self.crit_pos - i;
+ self.end -= self.crit_pos_back - i;
+ if !long_period {
+ self.memory_back = needle.len();
+ }
continue 'search;
}
}
// See if the right part of the needle matches
- for i in self.crit_pos..needle.len() {
+ let needle_end = if long_period { needle.len() }
+ else { self.memory_back };
+ for i in self.crit_pos_back..needle_end {
if needle[i] != haystack[self.end - needle.len() + i] {
self.end -= self.period;
+ if !long_period {
+ self.memory_back = self.period;
+ }
continue 'search;
}
}
let match_pos = self.end - needle.len();
// Note: sub self.period instead of needle.len() to have overlapping matches
self.end -= needle.len();
+ if !long_period {
+ self.memory_back = needle.len();
+ }
return S::matching(match_pos, match_pos + needle.len());
}
}
- // Computes a critical factorization (u, v) of `arr`.
- // Specifically, returns (i, p), where i is the starting index of v in some
- // critical factorization (u, v) and p = period(v)
+ // Compute the maximal suffix of `arr`.
+ //
+ // The maximal suffix is a possible critical factorization (u, v) of `arr`.
+ //
+ // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
+ // period of v.
+ //
+ // `order_greater` determines if lexical order is `<` or `>`. Both
+ // orders must be computed -- the ordering with the largest `i` gives
+ // a critical factorization.
+ //
+ // For long period cases, the resulting period is not exact (it is too short).
#[inline]
- fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
- let mut left: usize = !0; // Corresponds to i in the paper
- let mut right = 0; // Corresponds to j in the paper
- let mut offset = 1; // Corresponds to k in the paper
+ fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
+ let mut left = 0; // Corresponds to i in the paper
+ let mut right = 1; // Corresponds to j in the paper
+ let mut offset = 0; // Corresponds to k in the paper
let mut period = 1; // Corresponds to p in the paper
- while right + offset < arr.len() {
- let a;
- let b;
- if reversed {
- a = arr[left.wrapping_add(offset)];
- b = arr[right + offset];
+ while let Some(&a) = arr.get(right + offset) {
+ // `left + offset` is in bounds whenever `right + offset` is, because `left < right`.
+ let b = arr[left + offset];
+ if (a < b && !order_greater) || (a > b && order_greater) {
+ // Suffix is smaller, period is entire prefix so far.
+ right += offset + 1;
+ offset = 0;
+ period = right - left;
+ } else if a == b {
+ // Advance through repetition of the current period.
+ if offset + 1 == period {
+ right += offset + 1;
+ offset = 0;
+ } else {
+ offset += 1;
+ }
} else {
- a = arr[right + offset];
- b = arr[left.wrapping_add(offset)];
+ // Suffix is larger, start over from current location.
+ left = right;
+ right += 1;
+ offset = 0;
+ period = 1;
}
- if a < b {
+ }
+ (left, period)
+ }
+
+ // Compute the maximal suffix of the reverse of `arr`.
+ //
+ // The maximal suffix is a possible critical factorization (u', v') of `arr`.
+ //
+ // Returns `i` where `i` is the starting index of v', from the back;
+ // returns immediately when a period of `known_period` is reached.
+ //
+ // `order_greater` determines if lexical order is `<` or `>`. Both
+ // orders must be computed -- the ordering with the largest `i` gives
+ // a critical factorization.
+ //
+ // For long period cases, the resulting period is not exact (it is too short).
+ fn reverse_maximal_suffix(arr: &[u8], known_period: usize,
+ order_greater: bool) -> usize
+ {
+ let mut left = 0; // Corresponds to i in the paper
+ let mut right = 1; // Corresponds to j in the paper
+ let mut offset = 0; // Corresponds to k in the paper
+ let mut period = 1; // Corresponds to p in the paper
+ let n = arr.len();
+
+ while right + offset < n {
+ let a = arr[n - (1 + right + offset)];
+ let b = arr[n - (1 + left + offset)];
+ if (a < b && !order_greater) || (a > b && order_greater) {
// Suffix is smaller, period is entire prefix so far.
- right += offset;
- offset = 1;
- period = right.wrapping_sub(left);
+ right += offset + 1;
+ offset = 0;
+ period = right - left;
} else if a == b {
// Advance through repetition of the current period.
- if offset == period {
- right += offset;
- offset = 1;
+ if offset + 1 == period {
+ right += offset + 1;
+ offset = 0;
} else {
offset += 1;
}
// Suffix is larger, start over from current location.
left = right;
right += 1;
- offset = 1;
+ offset = 0;
period = 1;
}
+ if period == known_period {
+ break;
+ }
}
- (left.wrapping_add(1), period)
+ debug_assert!(period <= known_period);
+ left
}
}