#![doc(primitive = "str")]
#![stable(feature = "rust1", since = "1.0.0")]
-use self::OldSearcher::{TwoWay, TwoWayLong};
use self::pattern::Pattern;
use self::pattern::{Searcher, ReverseSearcher, DoubleEndedSearcher};
use char::CharExt;
use clone::Clone;
-use cmp::{self, Eq};
+use cmp::Eq;
use convert::AsRef;
use default::Default;
use fmt;
use raw::{Repr, Slice};
use result::Result::{self, Ok, Err};
use slice::{self, SliceExt};
-use usize;
pub mod pattern;
}
}
-/// The internal state of an iterator that searches for matches of a substring
-/// within a larger string using two-way search
-#[derive(Clone)]
-struct TwoWaySearcher {
- // constants
- crit_pos: usize,
- period: usize,
- byteset: u64,
-
- // variables
- position: usize,
- memory: usize
-}
-
-/*
- This is the Two-Way search algorithm, which was introduced in the paper:
- Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
-
- Here's some background information.
-
- A *word* is a string of symbols. The *length* of a word should be a familiar
- notion, and here we denote it for any word x by |x|.
- (We also allow for the possibility of the *empty word*, a word of length zero).
-
- If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
- *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
- For example, both 1 and 2 are periods for the string "aa". As another example,
- the only period of the string "abcd" is 4.
-
- We denote by period(x) the *smallest* period of x (provided that x is non-empty).
- This is always well-defined since every non-empty word x has at least one period,
- |x|. We sometimes call this *the period* of x.
-
- If u, v and x are words such that x = uv, where uv is the concatenation of u and
- v, then we say that (u, v) is a *factorization* of x.
-
- Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
- that both of the following hold
-
- - either w is a suffix of u or u is a suffix of w
- - either w is a prefix of v or v is a prefix of w
-
- then w is said to be a *repetition* for the factorization (u, v).
-
- Just to unpack this, there are four possibilities here. Let w = "abc". Then we
- might have:
-
- - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
- - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
- - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
- - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
-
- Note that the word vu is a repetition for any factorization (u,v) of x = uv,
- so every factorization has at least one repetition.
-
- If x is a string and (u, v) is a factorization for x, then a *local period* for
- (u, v) is an integer r such that there is some word w such that |w| = r and w is
- a repetition for (u, v).
-
- We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
- call this *the local period* of (u, v). Provided that x = uv is non-empty, this
- is well-defined (because each non-empty word has at least one factorization, as
- noted above).
-
- It can be proven that the following is an equivalent definition of a local period
- for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
- all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
- defined. (i.e. i > 0 and i + r < |x|).
-
- Using the above reformulation, it is easy to prove that
-
- 1 <= local_period(u, v) <= period(uv)
-
- A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
- *critical factorization*.
-
- The algorithm hinges on the following theorem, which is stated without proof:
-
- **Critical Factorization Theorem** Any word x has at least one critical
- factorization (u, v) such that |u| < period(x).
-
- The purpose of maximal_suffix is to find such a critical factorization.
-
-*/
-impl TwoWaySearcher {
- #[allow(dead_code)]
- fn new(needle: &[u8]) -> TwoWaySearcher {
- let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
- let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
-
- let (crit_pos, period) =
- if crit_pos_false > crit_pos_true {
- (crit_pos_false, period_false)
- } else {
- (crit_pos_true, period_true)
- };
-
- // This isn't in the original algorithm, as far as I'm aware.
- let byteset = needle.iter()
- .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
-
- // A particularly readable explanation of what's going on here can be found
- // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
- // see the code for "Algorithm CP" on p. 323.
- //
- // What's going on is we have some critical factorization (u, v) of the
- // needle, and we want to determine whether u is a suffix of
- // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
- // "Algorithm CP2", which is optimized for when the period of the needle
- // is large.
- if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
- TwoWaySearcher {
- crit_pos: crit_pos,
- period: period,
- byteset: byteset,
-
- position: 0,
- memory: 0
- }
- } else {
- TwoWaySearcher {
- crit_pos: crit_pos,
- period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
- byteset: byteset,
-
- position: 0,
- memory: usize::MAX // Dummy value to signify that the period is long
- }
- }
- }
-
- // One of the main ideas of Two-Way is that we factorize the needle into
- // two halves, (u, v), and begin trying to find v in the haystack by scanning
- // left to right. If v matches, we try to match u by scanning right to left.
- // How far we can jump when we encounter a mismatch is all based on the fact
- // that (u, v) is a critical factorization for the needle.
- #[inline]
- fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
- -> Option<(usize, usize)> {
- 'search: loop {
- // Check that we have room to search in
- if self.position + needle.len() > haystack.len() {
- return None;
- }
-
- // Quickly skip by large portions unrelated to our substring
- if (self.byteset >>
- ((haystack[self.position + needle.len() - 1] & 0x3f)
- as usize)) & 1 == 0 {
- self.position += needle.len();
- if !long_period {
- self.memory = 0;
- }
- continue 'search;
- }
-
- // See if the right part of the needle matches
- let start = if long_period { self.crit_pos }
- else { cmp::max(self.crit_pos, self.memory) };
- for i in start..needle.len() {
- if needle[i] != haystack[self.position + i] {
- self.position += i - self.crit_pos + 1;
- if !long_period {
- self.memory = 0;
- }
- continue 'search;
- }
- }
-
- // See if the left part of the needle matches
- let start = if long_period { 0 } else { self.memory };
- for i in (start..self.crit_pos).rev() {
- if needle[i] != haystack[self.position + i] {
- self.position += self.period;
- if !long_period {
- self.memory = needle.len() - self.period;
- }
- continue 'search;
- }
- }
-
- // We have found a match!
- let match_pos = self.position;
- self.position += needle.len(); // add self.period for all matches
- if !long_period {
- self.memory = 0; // set to needle.len() - self.period for all matches
- }
- return Some((match_pos, match_pos + needle.len()));
- }
- }
-
- // Computes a critical factorization (u, v) of `arr`.
- // Specifically, returns (i, p), where i is the starting index of v in some
- // critical factorization (u, v) and p = period(v)
- #[inline]
- #[allow(dead_code)]
- #[allow(deprecated)]
- fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
- let mut left: usize = !0; // Corresponds to i in the paper
- let mut right = 0; // Corresponds to j in the paper
- let mut offset = 1; // Corresponds to k in the paper
- let mut period = 1; // Corresponds to p in the paper
-
- while right + offset < arr.len() {
- let a;
- let b;
- if reversed {
- a = arr[left.wrapping_add(offset)];
- b = arr[right + offset];
- } else {
- a = arr[right + offset];
- b = arr[left.wrapping_add(offset)];
- }
- if a < b {
- // Suffix is smaller, period is entire prefix so far.
- right += offset;
- offset = 1;
- period = right.wrapping_sub(left);
- } else if a == b {
- // Advance through repetition of the current period.
- if offset == period {
- right += offset;
- offset = 1;
- } else {
- offset += 1;
- }
- } else {
- // Suffix is larger, start over from current location.
- left = right;
- right += 1;
- offset = 1;
- period = 1;
- }
- }
- (left.wrapping_add(1), period)
- }
-}
-
-/// The internal state of an iterator that searches for matches of a substring
-/// within a larger string using a dynamically chosen search algorithm
-#[derive(Clone)]
-// NB: This is kept around for convenience because
-// it is planned to be used again in the future
-enum OldSearcher {
- TwoWay(TwoWaySearcher),
- TwoWayLong(TwoWaySearcher),
-}
-
-impl OldSearcher {
- #[allow(dead_code)]
- fn new(haystack: &[u8], needle: &[u8]) -> OldSearcher {
- if needle.is_empty() {
- // Handle specially
- unimplemented!()
- // FIXME: Tune this.
- // FIXME(#16715): This unsigned integer addition will probably not
- // overflow because that would mean that the memory almost solely
- // consists of the needle. Needs #16715 to be formally fixed.
- } else if needle.len() + 20 > haystack.len() {
- // Use naive searcher
- unimplemented!()
- } else {
- let searcher = TwoWaySearcher::new(needle);
- if searcher.memory == usize::MAX { // If the period is long
- TwoWayLong(searcher)
- } else {
- TwoWay(searcher)
- }
- }
- }
-}
-
-#[derive(Clone)]
-// NB: This is kept around for convenience because
-// it is planned to be used again in the future
-struct OldMatchIndices<'a, 'b> {
- // constants
- haystack: &'a str,
- needle: &'b str,
- searcher: OldSearcher
-}
-
-impl<'a, 'b> OldMatchIndices<'a, 'b> {
- #[inline]
- #[allow(dead_code)]
- fn next(&mut self) -> Option<(usize, usize)> {
- match self.searcher {
- TwoWay(ref mut searcher)
- => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), false),
- TwoWayLong(ref mut searcher)
- => searcher.next(self.haystack.as_bytes(), self.needle.as_bytes(), true),
- }
- }
-}
-
/*
Section: Comparing strings
*/
reason = "API not fully fleshed out and ready to be stabilized")]
use prelude::*;
+use core::cmp;
+use usize;
// Pattern
// Impl for &str
/////////////////////////////////////////////////////////////////////////////
-// Todo: Optimize the naive implementation here
-
-/// Associated type for `<&str as Pattern<'a>>::Searcher`.
-#[derive(Clone)]
-pub struct StrSearcher<'a, 'b> {
- haystack: &'a str,
- needle: &'b str,
- start: usize,
- end: usize,
- state: State,
-}
-
-#[derive(Clone, PartialEq)]
-enum State { Done, NotDone, Reject(usize, usize) }
-impl State {
- #[inline] fn done(&self) -> bool { *self == State::Done }
- #[inline] fn take(&mut self) -> State { ::mem::replace(self, State::NotDone) }
-}
-
/// Non-allocating substring search.
///
-/// Will handle the pattern `""` as returning empty matches at each utf8
+/// Will handle the pattern `""` as returning empty matches at each character
/// boundary.
impl<'a, 'b> Pattern<'a> for &'b str {
type Searcher = StrSearcher<'a, 'b>;
#[inline]
fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
- StrSearcher {
- haystack: haystack,
- needle: self,
- start: 0,
- end: haystack.len(),
- state: State::NotDone,
- }
- }
-}
-
-unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
- #[inline]
- fn haystack(&self) -> &'a str {
- self.haystack
- }
-
- #[inline]
- fn next(&mut self) -> SearchStep {
- str_search_step(self,
- |m: &mut StrSearcher| {
- // Forward step for empty needle
- let current_start = m.start;
- if !m.state.done() {
- m.start = m.haystack.char_range_at(current_start).next;
- m.state = State::Reject(current_start, m.start);
- }
- SearchStep::Match(current_start, current_start)
- },
- |m: &mut StrSearcher| {
- // Forward step for nonempty needle
- let current_start = m.start;
- // Compare byte window because this might break utf8 boundaries
- let possible_match = &m.haystack.as_bytes()[m.start .. m.start + m.needle.len()];
- if possible_match == m.needle.as_bytes() {
- m.start += m.needle.len();
- SearchStep::Match(current_start, m.start)
- } else {
- // Skip a char
- let haystack_suffix = &m.haystack[m.start..];
- m.start += haystack_suffix.chars().next().unwrap().len_utf8();
- SearchStep::Reject(current_start, m.start)
- }
- })
- }
-}
-
-unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
- #[inline]
- fn next_back(&mut self) -> SearchStep {
- str_search_step(self,
- |m: &mut StrSearcher| {
- // Backward step for empty needle
- let current_end = m.end;
- if !m.state.done() {
- m.end = m.haystack.char_range_at_reverse(current_end).next;
- m.state = State::Reject(m.end, current_end);
- }
- SearchStep::Match(current_end, current_end)
- },
- |m: &mut StrSearcher| {
- // Backward step for nonempty needle
- let current_end = m.end;
- // Compare byte window because this might break utf8 boundaries
- let possible_match = &m.haystack.as_bytes()[m.end - m.needle.len() .. m.end];
- if possible_match == m.needle.as_bytes() {
- m.end -= m.needle.len();
- SearchStep::Match(m.end, current_end)
- } else {
- // Skip a char
- let haystack_prefix = &m.haystack[..m.end];
- m.end -= haystack_prefix.chars().rev().next().unwrap().len_utf8();
- SearchStep::Reject(m.end, current_end)
- }
- })
- }
-}
-
-// Helper function for encapsulating the common control flow
-// of doing a search step from the front or doing a search step from the back
-fn str_search_step<F, G>(mut m: &mut StrSearcher,
- empty_needle_step: F,
- nonempty_needle_step: G) -> SearchStep
- where F: FnOnce(&mut StrSearcher) -> SearchStep,
- G: FnOnce(&mut StrSearcher) -> SearchStep
-{
- if m.state.done() {
- SearchStep::Done
- } else if m.needle.is_empty() && m.start <= m.end {
- // Case for needle == ""
- if let State::Reject(a, b) = m.state.take() {
- SearchStep::Reject(a, b)
- } else {
- if m.start == m.end {
- m.state = State::Done;
- }
- empty_needle_step(&mut m)
- }
- } else if m.start + m.needle.len() <= m.end {
- // Case for needle != ""
- nonempty_needle_step(&mut m)
- } else if m.start < m.end {
- // Remaining slice shorter than needle, reject it
- m.state = State::Done;
- SearchStep::Reject(m.start, m.end)
- } else {
- m.state = State::Done;
- SearchStep::Done
+ StrSearcher::new(haystack, self)
}
}
impl<'a, 'b> Pattern<'a> for &'b &'b str {
pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
}
+
+
+/////////////////////////////////////////////////////////////////////////////
+// Two Way substring searcher
+/////////////////////////////////////////////////////////////////////////////
+
+#[derive(Clone, Debug)]
+/// Associated type for `<&str as Pattern<'a>>::Searcher`.
+pub struct StrSearcher<'a, 'b> {
+ haystack: &'a str,
+ needle: &'b str,
+
+ searcher: StrSearcherImpl,
+}
+
+#[derive(Clone, Debug)]
+enum StrSearcherImpl {
+ Empty(EmptyNeedle),
+ TwoWay {
+ last_match_fw: Option<(usize, usize)>,
+ last_match_bw: Option<(usize, usize)>,
+ searcher: TwoWaySearcher,
+ }
+}
+
+#[derive(Clone, Debug)]
+struct EmptyNeedle {
+ position: usize,
+ end: usize,
+ is_match_fw: bool,
+ is_match_bw: bool,
+}
+
+impl<'a, 'b> StrSearcher<'a, 'b> {
+ fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
+ if needle.is_empty() {
+ StrSearcher {
+ haystack: haystack,
+ needle: needle,
+ searcher: StrSearcherImpl::Empty(EmptyNeedle {
+ position: 0,
+ end: haystack.len(),
+ is_match_fw: true,
+ is_match_bw: true,
+ }),
+ }
+ } else {
+ StrSearcher {
+ haystack: haystack,
+ needle: needle,
+ searcher: StrSearcherImpl::TwoWay {
+ last_match_fw: None,
+ last_match_bw: None,
+ searcher: TwoWaySearcher::new(needle.as_bytes(), haystack.len())
+ },
+ }
+ }
+ }
+}
+
+unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
+ fn haystack(&self) -> &'a str { self.haystack }
+
+ #[inline]
+ fn next(&mut self) -> SearchStep {
+ match self.searcher {
+ StrSearcherImpl::Empty(ref mut searcher) => {
+ // empty needle rejects every char and matches every empty string between them
+ let is_match = searcher.is_match_fw;
+ searcher.is_match_fw = !searcher.is_match_fw;
+ let pos = searcher.position;
+ match self.haystack[pos..].chars().next() {
+ _ if is_match => SearchStep::Match(pos, pos),
+ None => SearchStep::Done,
+ Some(ch) => {
+ searcher.position += ch.len_utf8();
+ SearchStep::Reject(pos, searcher.position)
+ }
+ }
+ }
+ StrSearcherImpl::TwoWay { ref mut last_match_fw, ref mut searcher, .. } => {
+ // TwoWaySearcher produces valid *Match* indices that split at char boundaries
+ // as long as it does correct matching and that haystack and needle are
+ // valid UTF-8
+ // *Rejects* fall on the same indices (the intervals between matches)
+ // so they are always on character boundaries.
+ if let Some((a, b)) = last_match_fw.take() {
+ return SearchStep::Match(a, b);
+ }
+ let last_pos = searcher.position;
+ let is_long = searcher.memory == usize::MAX;
+ let next_match = searcher.next(self.haystack.as_bytes(),
+ self.needle.as_bytes(),
+ is_long);
+ match next_match {
+ None => if last_pos != self.haystack.len() {
+ SearchStep::Reject(last_pos, self.haystack.len())
+ } else {
+ SearchStep::Done
+ },
+ Some((a, b)) => {
+ if a == last_pos {
+ SearchStep::Match(a, b)
+ } else {
+ *last_match_fw = Some((a, b));
+ SearchStep::Reject(last_pos, a)
+ }
+ }
+ }
+ }
+ }
+ }
+
+}
+unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
+ #[inline]
+ fn next_back(&mut self) -> SearchStep {
+ match self.searcher {
+ StrSearcherImpl::Empty(ref mut searcher) => {
+ let is_match = searcher.is_match_bw;
+ searcher.is_match_bw = !searcher.is_match_bw;
+ let end = searcher.end;
+ match self.haystack[..end].chars().next_back() {
+ _ if is_match => SearchStep::Match(end, end),
+ None => SearchStep::Done,
+ Some(ch) => {
+ searcher.end -= ch.len_utf8();
+ SearchStep::Reject(searcher.end, end)
+ }
+ }
+ }
+ StrSearcherImpl::TwoWay { ref mut last_match_bw, ref mut searcher, .. } => {
+ if let Some((a, b)) = last_match_bw.take() {
+ return SearchStep::Match(a, b);
+ }
+ let last_end = searcher.end;
+ let next_match = searcher.next_back(self.haystack.as_bytes(),
+ self.needle.as_bytes());
+ match next_match {
+ None => if last_end != 0 {
+ SearchStep::Reject(0, last_end)
+ } else {
+ SearchStep::Done
+ },
+ Some((a, b)) => {
+ if b == last_end {
+ SearchStep::Match(a, b)
+ } else {
+ *last_match_bw = Some((a, b));
+ SearchStep::Reject(b, last_end)
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+/// The internal state of an iterator that searches for matches of a substring
+/// within a larger string using two-way search
+#[derive(Clone, Debug)]
+struct TwoWaySearcher {
+ // constants
+ crit_pos: usize,
+ period: usize,
+ byteset: u64,
+
+ // variables
+ position: usize,
+ end: usize,
+ memory: usize
+}
+
+/*
+ This is the Two-Way search algorithm, which was introduced in the paper:
+ Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
+
+ Here's some background information.
+
+ A *word* is a string of symbols. The *length* of a word should be a familiar
+ notion, and here we denote it for any word x by |x|.
+ (We also allow for the possibility of the *empty word*, a word of length zero).
+
+ If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
+ *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
+ For example, both 1 and 2 are periods for the string "aa". As another example,
+ the only period of the string "abcd" is 4.
+
+ We denote by period(x) the *smallest* period of x (provided that x is non-empty).
+ This is always well-defined since every non-empty word x has at least one period,
+ |x|. We sometimes call this *the period* of x.
+
+ If u, v and x are words such that x = uv, where uv is the concatenation of u and
+ v, then we say that (u, v) is a *factorization* of x.
+
+ Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
+ that both of the following hold
+
+ - either w is a suffix of u or u is a suffix of w
+ - either w is a prefix of v or v is a prefix of w
+
+ then w is said to be a *repetition* for the factorization (u, v).
+
+ Just to unpack this, there are four possibilities here. Let w = "abc". Then we
+ might have:
+
+ - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
+ - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
+ - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
+ - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
+
+ Note that the word vu is a repetition for any factorization (u,v) of x = uv,
+ so every factorization has at least one repetition.
+
+ If x is a string and (u, v) is a factorization for x, then a *local period* for
+ (u, v) is an integer r such that there is some word w such that |w| = r and w is
+ a repetition for (u, v).
+
+ We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
+ call this *the local period* of (u, v). Provided that x = uv is non-empty, this
+ is well-defined (because each non-empty word has at least one factorization, as
+ noted above).
+
+ It can be proven that the following is an equivalent definition of a local period
+ for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
+ all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
+ defined. (i.e. i > 0 and i + r < |x|).
+
+ Using the above reformulation, it is easy to prove that
+
+ 1 <= local_period(u, v) <= period(uv)
+
+ A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
+ *critical factorization*.
+
+ The algorithm hinges on the following theorem, which is stated without proof:
+
+ **Critical Factorization Theorem** Any word x has at least one critical
+ factorization (u, v) such that |u| < period(x).
+
+ The purpose of maximal_suffix is to find such a critical factorization.
+
+*/
+impl TwoWaySearcher {
+ fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
+ let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
+ let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
+
+ let (crit_pos, period) =
+ if crit_pos_false > crit_pos_true {
+ (crit_pos_false, period_false)
+ } else {
+ (crit_pos_true, period_true)
+ };
+
+ // This isn't in the original algorithm, as far as I'm aware.
+ let byteset = needle.iter()
+ .fold(0, |a, &b| (1 << ((b & 0x3f) as usize)) | a);
+
+ // A particularly readable explanation of what's going on here can be found
+ // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
+ // see the code for "Algorithm CP" on p. 323.
+ //
+ // What's going on is we have some critical factorization (u, v) of the
+ // needle, and we want to determine whether u is a suffix of
+ // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
+ // "Algorithm CP2", which is optimized for when the period of the needle
+ // is large.
+ if &needle[..crit_pos] == &needle[period.. period + crit_pos] {
+ // short period case
+ TwoWaySearcher {
+ crit_pos: crit_pos,
+ period: period,
+ byteset: byteset,
+
+ position: 0,
+ end: end,
+ memory: 0
+ }
+ } else {
+ // long period case
+ // we have an approximation to the actual period, and don't use memory.
+ TwoWaySearcher {
+ crit_pos: crit_pos,
+ period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
+ byteset: byteset,
+
+ position: 0,
+ end: end,
+ memory: usize::MAX // Dummy value to signify that the period is long
+ }
+ }
+ }
+
+ #[inline(always)]
+ fn byteset_contains(&self, byte: u8) -> bool {
+ (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
+ }
+
+ // One of the main ideas of Two-Way is that we factorize the needle into
+ // two halves, (u, v), and begin trying to find v in the haystack by scanning
+ // left to right. If v matches, we try to match u by scanning right to left.
+ // How far we can jump when we encounter a mismatch is all based on the fact
+ // that (u, v) is a critical factorization for the needle.
+ #[inline]
+ fn next(&mut self, haystack: &[u8], needle: &[u8], long_period: bool)
+ -> Option<(usize, usize)> {
+ // `next()` uses `self.position` as its cursor
+ 'search: loop {
+ // Check that we have room to search in
+ if self.position + needle.len() > haystack.len() {
+ self.position = haystack.len();
+ return None;
+ }
+
+ // Quickly skip by large portions unrelated to our substring
+ if !self.byteset_contains(haystack[self.position + needle.len() - 1]) {
+ self.position += needle.len();
+ if !long_period {
+ self.memory = 0;
+ }
+ continue 'search;
+ }
+
+ // See if the right part of the needle matches
+ let start = if long_period { self.crit_pos }
+ else { cmp::max(self.crit_pos, self.memory) };
+ for i in start..needle.len() {
+ if needle[i] != haystack[self.position + i] {
+ self.position += i - self.crit_pos + 1;
+ if !long_period {
+ self.memory = 0;
+ }
+ continue 'search;
+ }
+ }
+
+ // See if the left part of the needle matches
+ let start = if long_period { 0 } else { self.memory };
+ for i in (start..self.crit_pos).rev() {
+ if needle[i] != haystack[self.position + i] {
+ self.position += self.period;
+ if !long_period {
+ self.memory = needle.len() - self.period;
+ }
+ continue 'search;
+ }
+ }
+
+ // We have found a match!
+ let match_pos = self.position;
+
+ // Note: add self.period instead of needle.len() to have overlapping matches
+ self.position += needle.len();
+ if !long_period {
+ self.memory = 0; // set to needle.len() - self.period for overlapping matches
+ }
+
+ return Some((match_pos, match_pos + needle.len()));
+ }
+ }
+
+ // Follows the ideas in `next()`.
+ //
+ // All the definitions are completely symmetrical, with period(x) = period(reverse(x))
+ // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
+ // is a critical factorization, so is (reverse(v), reverse(u)). Similarly,
+ // the "period" stored in self.period is the real period if long_period is
+ // false, and so is still valid for a reversed needle, and if long_period is
+ // true, all the algorithm requires is that self.period is less than or
+ // equal to the real period, which must be true for the forward case anyway.
+ //
+ // To search in reverse through the haystack, we search forward through
+ // a reversed haystack with a reversed needle, and the above paragraph shows
+ // that the precomputed parameters can be left alone.
+ #[inline]
+ fn next_back(&mut self, haystack: &[u8], needle: &[u8]) -> Option<(usize, usize)> {
+ // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
+ // are independent.
+ 'search: loop {
+ // Check that we have room to search in
+ if needle.len() > self.end {
+ self.end = 0;
+ return None;
+ }
+
+ // Quickly skip by large portions unrelated to our substring
+ if !self.byteset_contains(haystack[self.end - needle.len()]) {
+ self.end -= needle.len();
+ continue 'search;
+ }
+
+ // See if the left part of the needle matches
+ for i in (0..self.crit_pos).rev() {
+ if needle[i] != haystack[self.end - needle.len() + i] {
+ self.end -= self.crit_pos - i;
+ continue 'search;
+ }
+ }
+
+ // See if the right part of the needle matches
+ for i in self.crit_pos..needle.len() {
+ if needle[i] != haystack[self.end - needle.len() + i] {
+ self.end -= self.period;
+ continue 'search;
+ }
+ }
+
+ // We have found a match!
+ let match_pos = self.end - needle.len();
+ // Note: sub self.period instead of needle.len() to have overlapping matches
+ self.end -= needle.len();
+
+ return Some((match_pos, match_pos + needle.len()));
+ }
+ }
+
+ // Computes a critical factorization (u, v) of `arr`.
+ // Specifically, returns (i, p), where i is the starting index of v in some
+ // critical factorization (u, v) and p = period(v)
+ #[inline]
+ fn maximal_suffix(arr: &[u8], reversed: bool) -> (usize, usize) {
+ let mut left: usize = !0; // Corresponds to i in the paper
+ let mut right = 0; // Corresponds to j in the paper
+ let mut offset = 1; // Corresponds to k in the paper
+ let mut period = 1; // Corresponds to p in the paper
+
+ while right + offset < arr.len() {
+ let a;
+ let b;
+ if reversed {
+ a = arr[left.wrapping_add(offset)];
+ b = arr[right + offset];
+ } else {
+ a = arr[right + offset];
+ b = arr[left.wrapping_add(offset)];
+ }
+ if a < b {
+ // Suffix is smaller, period is entire prefix so far.
+ right += offset;
+ offset = 1;
+ period = right.wrapping_sub(left);
+ } else if a == b {
+ // Advance through repetition of the current period.
+ if offset == period {
+ right += offset;
+ offset = 1;
+ } else {
+ offset += 1;
+ }
+ } else {
+ // Suffix is larger, start over from current location.
+ left = right;
+ right += 1;
+ offset = 1;
+ period = 1;
+ }
+ }
+ (left.wrapping_add(1), period)
+ }
+}