1 //! The string Pattern API.
3 //! For more details, see the traits [`Pattern`], [`Searcher`],
4 //! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
6 // ignore-tidy-undocumented-unsafe
10 reason = "API not fully fleshed out and ready to be stabilized",
16 use crate::slice::memchr;
23 /// A `Pattern<'a>` expresses that the implementing type
24 /// can be used as a string pattern for searching in a `&'a str`.
26 /// For example, both `'a'` and `"aa"` are patterns that
27 /// would match at index `1` in the string `"baaaab"`.
29 /// The trait itself acts as a builder for an associated
30 /// `Searcher` type, which does the actual work of finding
31 /// occurrences of the pattern in a string.
32 pub trait Pattern<'a>: Sized {
33 /// Associated searcher for this pattern
34 type Searcher: Searcher<'a>;
36 /// Constructs the associated searcher from
37 /// `self` and the `haystack` to search in.
38 fn into_searcher(self, haystack: &'a str) -> Self::Searcher;
40 /// Checks whether the pattern matches anywhere in the haystack
42 fn is_contained_in(self, haystack: &'a str) -> bool {
43 self.into_searcher(haystack).next_match().is_some()
46 /// Checks whether the pattern matches at the front of the haystack
48 fn is_prefix_of(self, haystack: &'a str) -> bool {
49 matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _))
52 /// Checks whether the pattern matches at the back of the haystack
54 fn is_suffix_of(self, haystack: &'a str) -> bool
56 Self::Searcher: ReverseSearcher<'a>,
58 matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j)
64 /// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`.
65 #[derive(Copy, Clone, Eq, PartialEq, Debug)]
67 /// Expresses that a match of the pattern has been found at
70 /// Expresses that `haystack[a..b]` has been rejected as a possible match
73 /// Note that there might be more than one `Reject` between two `Match`es,
74 /// there is no requirement for them to be combined into one.
76 /// Expresses that every byte of the haystack has been visited, ending
81 /// A searcher for a string pattern.
83 /// This trait provides methods for searching for non-overlapping
84 /// matches of a pattern starting from the front (left) of a string.
86 /// It will be implemented by associated `Searcher`
87 /// types of the `Pattern` trait.
89 /// The trait is marked unsafe because the indices returned by the
90 /// `next()` methods are required to lie on valid utf8 boundaries in
91 /// the haystack. This enables consumers of this trait to
92 /// slice the haystack without additional runtime checks.
93 pub unsafe trait Searcher<'a> {
94 /// Getter for the underlying string to be searched in
96 /// Will always return the same `&str`
97 fn haystack(&self) -> &'a str;
99 /// Performs the next search step starting from the front.
101 /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
102 /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
103 /// pattern, even partially.
104 /// - Returns `Done` if every byte of the haystack has been visited
106 /// The stream of `Match` and `Reject` values up to a `Done`
107 /// will contain index ranges that are adjacent, non-overlapping,
108 /// covering the whole haystack, and laying on utf8 boundaries.
110 /// A `Match` result needs to contain the whole matched pattern,
111 /// however `Reject` results may be split up into arbitrary
112 /// many adjacent fragments. Both ranges may have zero length.
114 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
115 /// might produce the stream
116 /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]`
117 fn next(&mut self) -> SearchStep;
119 /// Finds the next `Match` result. See `next()`
121 /// Unlike next(), there is no guarantee that the returned ranges
122 /// of this and next_reject will overlap. This will return (start_match, end_match),
123 /// where start_match is the index of where the match begins, and end_match is
124 /// the index after the end of the match.
126 fn next_match(&mut self) -> Option<(usize, usize)> {
129 SearchStep::Match(a, b) => return Some((a, b)),
130 SearchStep::Done => return None,
136 /// Finds the next `Reject` result. See `next()` and `next_match()`
138 /// Unlike next(), there is no guarantee that the returned ranges
139 /// of this and next_match will overlap.
141 fn next_reject(&mut self) -> Option<(usize, usize)> {
144 SearchStep::Reject(a, b) => return Some((a, b)),
145 SearchStep::Done => return None,
152 /// A reverse searcher for a string pattern.
154 /// This trait provides methods for searching for non-overlapping
155 /// matches of a pattern starting from the back (right) of a string.
157 /// It will be implemented by associated `Searcher`
158 /// types of the `Pattern` trait if the pattern supports searching
159 /// for it from the back.
161 /// The index ranges returned by this trait are not required
162 /// to exactly match those of the forward search in reverse.
164 /// For the reason why this trait is marked unsafe, see them
165 /// parent trait `Searcher`.
166 pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
167 /// Performs the next search step starting from the back.
169 /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
170 /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
171 /// pattern, even partially.
172 /// - Returns `Done` if every byte of the haystack has been visited
174 /// The stream of `Match` and `Reject` values up to a `Done`
175 /// will contain index ranges that are adjacent, non-overlapping,
176 /// covering the whole haystack, and laying on utf8 boundaries.
178 /// A `Match` result needs to contain the whole matched pattern,
179 /// however `Reject` results may be split up into arbitrary
180 /// many adjacent fragments. Both ranges may have zero length.
182 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
183 /// might produce the stream
184 /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`
185 fn next_back(&mut self) -> SearchStep;
187 /// Finds the next `Match` result. See `next_back()`
189 fn next_match_back(&mut self) -> Option<(usize, usize)> {
191 match self.next_back() {
192 SearchStep::Match(a, b) => return Some((a, b)),
193 SearchStep::Done => return None,
199 /// Finds the next `Reject` result. See `next_back()`
201 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
203 match self.next_back() {
204 SearchStep::Reject(a, b) => return Some((a, b)),
205 SearchStep::Done => return None,
212 /// A marker trait to express that a `ReverseSearcher`
213 /// can be used for a `DoubleEndedIterator` implementation.
215 /// For this, the impl of `Searcher` and `ReverseSearcher` need
216 /// to follow these conditions:
218 /// - All results of `next()` need to be identical
219 /// to the results of `next_back()` in reverse order.
220 /// - `next()` and `next_back()` need to behave as
221 /// the two ends of a range of values, that is they
222 /// can not "walk past each other".
226 /// `char::Searcher` is a `DoubleEndedSearcher` because searching for a
227 /// `char` only requires looking at one at a time, which behaves the same
230 /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because
231 /// the pattern `"aa"` in the haystack `"aaa"` matches as either
232 /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
233 pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
235 /////////////////////////////////////////////////////////////////////////////
237 /////////////////////////////////////////////////////////////////////////////
239 /// Associated type for `<char as Pattern<'a>>::Searcher`.
240 #[derive(Clone, Debug)]
241 pub struct CharSearcher<'a> {
243 // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
244 // This invariant can be broken *within* next_match and next_match_back, however
245 // they must exit with fingers on valid code point boundaries.
246 /// `finger` is the current byte index of the forward search.
247 /// Imagine that it exists before the byte at its index, i.e.
248 /// `haystack[finger]` is the first byte of the slice we must inspect during
249 /// forward searching
251 /// `finger_back` is the current byte index of the reverse search.
252 /// Imagine that it exists after the byte at its index, i.e.
253 /// haystack[finger_back - 1] is the last byte of the slice we must inspect during
254 /// forward searching (and thus the first byte to be inspected when calling next_back())
256 /// The character being searched for
259 // safety invariant: `utf8_size` must be less than 5
260 /// The number of bytes `needle` takes up when encoded in utf8
262 /// A utf8 encoded copy of the `needle`
263 utf8_encoded: [u8; 4],
266 unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
268 fn haystack(&self) -> &'a str {
272 fn next(&mut self) -> SearchStep {
273 let old_finger = self.finger;
274 let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) };
275 let mut iter = slice.chars();
276 let old_len = iter.iter.len();
277 if let Some(ch) = iter.next() {
278 // add byte offset of current character
279 // without re-encoding as utf-8
280 self.finger += old_len - iter.iter.len();
281 if ch == self.needle {
282 SearchStep::Match(old_finger, self.finger)
284 SearchStep::Reject(old_finger, self.finger)
291 fn next_match(&mut self) -> Option<(usize, usize)> {
293 // get the haystack after the last character found
294 let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
295 // the last byte of the utf8 encoded needle
296 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
297 if let Some(index) = memchr::memchr(last_byte, bytes) {
298 // The new finger is the index of the byte we found,
299 // plus one, since we memchr'd for the last byte of the character.
301 // Note that this doesn't always give us a finger on a UTF8 boundary.
302 // If we *didn't* find our character
303 // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
304 // We can't just skip to the next valid starting byte because a character like
305 // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
306 // the second byte when searching for the third.
308 // However, this is totally okay. While we have the invariant that
309 // self.finger is on a UTF8 boundary, this invariant is not relied upon
310 // within this method (it is relied upon in CharSearcher::next()).
312 // We only exit this method when we reach the end of the string, or if we
313 // find something. When we find something the `finger` will be set
314 // to a UTF8 boundary.
315 self.finger += index + 1;
316 if self.finger >= self.utf8_size {
317 let found_char = self.finger - self.utf8_size;
318 if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
319 if slice == &self.utf8_encoded[0..self.utf8_size] {
320 return Some((found_char, self.finger));
325 // found nothing, exit
326 self.finger = self.finger_back;
332 // let next_reject use the default implementation from the Searcher trait
335 unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
337 fn next_back(&mut self) -> SearchStep {
338 let old_finger = self.finger_back;
339 let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) };
340 let mut iter = slice.chars();
341 let old_len = iter.iter.len();
342 if let Some(ch) = iter.next_back() {
343 // subtract byte offset of current character
344 // without re-encoding as utf-8
345 self.finger_back -= old_len - iter.iter.len();
346 if ch == self.needle {
347 SearchStep::Match(self.finger_back, old_finger)
349 SearchStep::Reject(self.finger_back, old_finger)
356 fn next_match_back(&mut self) -> Option<(usize, usize)> {
357 let haystack = self.haystack.as_bytes();
359 // get the haystack up to but not including the last character searched
360 let bytes = if let Some(slice) = haystack.get(self.finger..self.finger_back) {
365 // the last byte of the utf8 encoded needle
366 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
367 if let Some(index) = memchr::memrchr(last_byte, bytes) {
368 // we searched a slice that was offset by self.finger,
369 // add self.finger to recoup the original index
370 let index = self.finger + index;
371 // memrchr will return the index of the byte we wish to
372 // find. In case of an ASCII character, this is indeed
373 // were we wish our new finger to be ("after" the found
374 // char in the paradigm of reverse iteration). For
375 // multibyte chars we need to skip down by the number of more
376 // bytes they have than ASCII
377 let shift = self.utf8_size - 1;
379 let found_char = index - shift;
380 if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
381 if slice == &self.utf8_encoded[0..self.utf8_size] {
382 // move finger to before the character found (i.e., at its start index)
383 self.finger_back = found_char;
384 return Some((self.finger_back, self.finger_back + self.utf8_size));
388 // We can't use finger_back = index - size + 1 here. If we found the last char
389 // of a different-sized character (or the middle byte of a different character)
390 // we need to bump the finger_back down to `index`. This similarly makes
391 // `finger_back` have the potential to no longer be on a boundary,
392 // but this is OK since we only exit this function on a boundary
393 // or when the haystack has been searched completely.
395 // Unlike next_match this does not
396 // have the problem of repeated bytes in utf-8 because
397 // we're searching for the last byte, and we can only have
398 // found the last byte when searching in reverse.
399 self.finger_back = index;
401 self.finger_back = self.finger;
402 // found nothing, exit
408 // let next_reject_back use the default implementation from the Searcher trait
411 impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
413 /// Searches for chars that are equal to a given char
414 impl<'a> Pattern<'a> for char {
415 type Searcher = CharSearcher<'a>;
418 fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
419 let mut utf8_encoded = [0; 4];
420 let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
424 finger_back: haystack.len(),
432 fn is_contained_in(self, haystack: &'a str) -> bool {
433 if (self as u32) < 128 {
434 haystack.as_bytes().contains(&(self as u8))
436 let mut buffer = [0u8; 4];
437 self.encode_utf8(&mut buffer).is_contained_in(haystack)
442 fn is_prefix_of(self, haystack: &'a str) -> bool {
443 self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack)
447 fn is_suffix_of(self, haystack: &'a str) -> bool
449 Self::Searcher: ReverseSearcher<'a>,
451 self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack)
455 /////////////////////////////////////////////////////////////////////////////
456 // Impl for a MultiCharEq wrapper
457 /////////////////////////////////////////////////////////////////////////////
461 fn matches(&mut self, c: char) -> bool;
464 impl<F> MultiCharEq for F
466 F: FnMut(char) -> bool,
469 fn matches(&mut self, c: char) -> bool {
474 impl MultiCharEq for &[char] {
476 fn matches(&mut self, c: char) -> bool {
477 self.iter().any(|&m| m == c)
481 struct MultiCharEqPattern<C: MultiCharEq>(C);
483 #[derive(Clone, Debug)]
484 struct MultiCharEqSearcher<'a, C: MultiCharEq> {
487 char_indices: super::CharIndices<'a>,
490 impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
491 type Searcher = MultiCharEqSearcher<'a, C>;
494 fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
495 MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
499 unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
501 fn haystack(&self) -> &'a str {
506 fn next(&mut self) -> SearchStep {
507 let s = &mut self.char_indices;
508 // Compare lengths of the internal byte slice iterator
509 // to find length of current char
510 let pre_len = s.iter.iter.len();
511 if let Some((i, c)) = s.next() {
512 let len = s.iter.iter.len();
513 let char_len = pre_len - len;
514 if self.char_eq.matches(c) {
515 return SearchStep::Match(i, i + char_len);
517 return SearchStep::Reject(i, i + char_len);
524 unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
526 fn next_back(&mut self) -> SearchStep {
527 let s = &mut self.char_indices;
528 // Compare lengths of the internal byte slice iterator
529 // to find length of current char
530 let pre_len = s.iter.iter.len();
531 if let Some((i, c)) = s.next_back() {
532 let len = s.iter.iter.len();
533 let char_len = pre_len - len;
534 if self.char_eq.matches(c) {
535 return SearchStep::Match(i, i + char_len);
537 return SearchStep::Reject(i, i + char_len);
544 impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
546 /////////////////////////////////////////////////////////////////////////////
548 macro_rules! pattern_methods {
549 ($t:ty, $pmap:expr, $smap:expr) => {
553 fn into_searcher(self, haystack: &'a str) -> $t {
554 ($smap)(($pmap)(self).into_searcher(haystack))
558 fn is_contained_in(self, haystack: &'a str) -> bool {
559 ($pmap)(self).is_contained_in(haystack)
563 fn is_prefix_of(self, haystack: &'a str) -> bool {
564 ($pmap)(self).is_prefix_of(haystack)
568 fn is_suffix_of(self, haystack: &'a str) -> bool
569 where $t: ReverseSearcher<'a>
571 ($pmap)(self).is_suffix_of(haystack)
576 macro_rules! searcher_methods {
579 fn haystack(&self) -> &'a str {
583 fn next(&mut self) -> SearchStep {
587 fn next_match(&mut self) -> Option<(usize, usize)> {
591 fn next_reject(&mut self) -> Option<(usize, usize)> {
597 fn next_back(&mut self) -> SearchStep {
601 fn next_match_back(&mut self) -> Option<(usize, usize)> {
602 self.0.next_match_back()
605 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
606 self.0.next_reject_back()
611 /////////////////////////////////////////////////////////////////////////////
613 /////////////////////////////////////////////////////////////////////////////
615 // Todo: Change / Remove due to ambiguity in meaning.
617 /// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
618 #[derive(Clone, Debug)]
619 pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
621 unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
622 searcher_methods!(forward);
625 unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> {
626 searcher_methods!(reverse);
629 impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
631 /// Searches for chars that are equal to any of the chars in the array
632 impl<'a, 'b> Pattern<'a> for &'b [char] {
633 pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
636 /////////////////////////////////////////////////////////////////////////////
637 // Impl for F: FnMut(char) -> bool
638 /////////////////////////////////////////////////////////////////////////////
640 /// Associated type for `<F as Pattern<'a>>::Searcher`.
642 pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
644 F: FnMut(char) -> bool;
646 impl<F> fmt::Debug for CharPredicateSearcher<'_, F>
648 F: FnMut(char) -> bool,
650 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
651 f.debug_struct("CharPredicateSearcher")
652 .field("haystack", &self.0.haystack)
653 .field("char_indices", &self.0.char_indices)
657 unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F>
659 F: FnMut(char) -> bool,
661 searcher_methods!(forward);
664 unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F>
666 F: FnMut(char) -> bool,
668 searcher_methods!(reverse);
671 impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {}
673 /// Searches for chars that match the given predicate
674 impl<'a, F> Pattern<'a> for F
676 F: FnMut(char) -> bool,
678 pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
681 /////////////////////////////////////////////////////////////////////////////
683 /////////////////////////////////////////////////////////////////////////////
685 /// Delegates to the `&str` impl.
686 impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str {
687 pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
690 /////////////////////////////////////////////////////////////////////////////
692 /////////////////////////////////////////////////////////////////////////////
694 /// Non-allocating substring search.
696 /// Will handle the pattern `""` as returning empty matches at each character
698 impl<'a, 'b> Pattern<'a> for &'b str {
699 type Searcher = StrSearcher<'a, 'b>;
702 fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
703 StrSearcher::new(haystack, self)
706 /// Checks whether the pattern matches at the front of the haystack
708 fn is_prefix_of(self, haystack: &'a str) -> bool {
709 haystack.as_bytes().starts_with(self.as_bytes())
712 /// Checks whether the pattern matches at the back of the haystack
714 fn is_suffix_of(self, haystack: &'a str) -> bool {
715 haystack.as_bytes().ends_with(self.as_bytes())
719 /////////////////////////////////////////////////////////////////////////////
720 // Two Way substring searcher
721 /////////////////////////////////////////////////////////////////////////////
723 #[derive(Clone, Debug)]
724 /// Associated type for `<&str as Pattern<'a>>::Searcher`.
725 pub struct StrSearcher<'a, 'b> {
729 searcher: StrSearcherImpl,
732 #[derive(Clone, Debug)]
733 enum StrSearcherImpl {
735 TwoWay(TwoWaySearcher),
738 #[derive(Clone, Debug)]
746 impl<'a, 'b> StrSearcher<'a, 'b> {
747 fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
748 if needle.is_empty() {
752 searcher: StrSearcherImpl::Empty(EmptyNeedle {
763 searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new(
772 unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
774 fn haystack(&self) -> &'a str {
779 fn next(&mut self) -> SearchStep {
780 match self.searcher {
781 StrSearcherImpl::Empty(ref mut searcher) => {
782 // empty needle rejects every char and matches every empty string between them
783 let is_match = searcher.is_match_fw;
784 searcher.is_match_fw = !searcher.is_match_fw;
785 let pos = searcher.position;
786 match self.haystack[pos..].chars().next() {
787 _ if is_match => SearchStep::Match(pos, pos),
788 None => SearchStep::Done,
790 searcher.position += ch.len_utf8();
791 SearchStep::Reject(pos, searcher.position)
795 StrSearcherImpl::TwoWay(ref mut searcher) => {
796 // TwoWaySearcher produces valid *Match* indices that split at char boundaries
797 // as long as it does correct matching and that haystack and needle are
799 // *Rejects* from the algorithm can fall on any indices, but we will walk them
800 // manually to the next character boundary, so that they are utf-8 safe.
801 if searcher.position == self.haystack.len() {
802 return SearchStep::Done;
804 let is_long = searcher.memory == usize::MAX;
805 match searcher.next::<RejectAndMatch>(
806 self.haystack.as_bytes(),
807 self.needle.as_bytes(),
810 SearchStep::Reject(a, mut b) => {
811 // skip to next char boundary
812 while !self.haystack.is_char_boundary(b) {
815 searcher.position = cmp::max(b, searcher.position);
816 SearchStep::Reject(a, b)
818 otherwise => otherwise,
825 fn next_match(&mut self) -> Option<(usize, usize)> {
826 match self.searcher {
827 StrSearcherImpl::Empty(..) => loop {
829 SearchStep::Match(a, b) => return Some((a, b)),
830 SearchStep::Done => return None,
831 SearchStep::Reject(..) => {}
834 StrSearcherImpl::TwoWay(ref mut searcher) => {
835 let is_long = searcher.memory == usize::MAX;
836 // write out `true` and `false` cases to encourage the compiler
837 // to specialize the two cases separately.
839 searcher.next::<MatchOnly>(
840 self.haystack.as_bytes(),
841 self.needle.as_bytes(),
845 searcher.next::<MatchOnly>(
846 self.haystack.as_bytes(),
847 self.needle.as_bytes(),
856 unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
858 fn next_back(&mut self) -> SearchStep {
859 match self.searcher {
860 StrSearcherImpl::Empty(ref mut searcher) => {
861 let is_match = searcher.is_match_bw;
862 searcher.is_match_bw = !searcher.is_match_bw;
863 let end = searcher.end;
864 match self.haystack[..end].chars().next_back() {
865 _ if is_match => SearchStep::Match(end, end),
866 None => SearchStep::Done,
868 searcher.end -= ch.len_utf8();
869 SearchStep::Reject(searcher.end, end)
873 StrSearcherImpl::TwoWay(ref mut searcher) => {
874 if searcher.end == 0 {
875 return SearchStep::Done;
877 let is_long = searcher.memory == usize::MAX;
878 match searcher.next_back::<RejectAndMatch>(
879 self.haystack.as_bytes(),
880 self.needle.as_bytes(),
883 SearchStep::Reject(mut a, b) => {
884 // skip to next char boundary
885 while !self.haystack.is_char_boundary(a) {
888 searcher.end = cmp::min(a, searcher.end);
889 SearchStep::Reject(a, b)
891 otherwise => otherwise,
898 fn next_match_back(&mut self) -> Option<(usize, usize)> {
899 match self.searcher {
900 StrSearcherImpl::Empty(..) => loop {
901 match self.next_back() {
902 SearchStep::Match(a, b) => return Some((a, b)),
903 SearchStep::Done => return None,
904 SearchStep::Reject(..) => {}
907 StrSearcherImpl::TwoWay(ref mut searcher) => {
908 let is_long = searcher.memory == usize::MAX;
909 // write out `true` and `false`, like `next_match`
911 searcher.next_back::<MatchOnly>(
912 self.haystack.as_bytes(),
913 self.needle.as_bytes(),
917 searcher.next_back::<MatchOnly>(
918 self.haystack.as_bytes(),
919 self.needle.as_bytes(),
928 /// The internal state of the two-way substring search algorithm.
929 #[derive(Clone, Debug)]
930 struct TwoWaySearcher {
932 /// critical factorization index
934 /// critical factorization index for reversed needle
935 crit_pos_back: usize,
937 /// `byteset` is an extension (not part of the two way algorithm);
938 /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
939 /// to a (byte & 63) == j present in the needle.
945 /// index into needle before which we have already matched
947 /// index into needle after which we have already matched
952 This is the Two-Way search algorithm, which was introduced in the paper:
953 Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
955 Here's some background information.
957 A *word* is a string of symbols. The *length* of a word should be a familiar
958 notion, and here we denote it for any word x by |x|.
959 (We also allow for the possibility of the *empty word*, a word of length zero).
961 If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
962 *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
963 For example, both 1 and 2 are periods for the string "aa". As another example,
964 the only period of the string "abcd" is 4.
966 We denote by period(x) the *smallest* period of x (provided that x is non-empty).
967 This is always well-defined since every non-empty word x has at least one period,
968 |x|. We sometimes call this *the period* of x.
970 If u, v and x are words such that x = uv, where uv is the concatenation of u and
971 v, then we say that (u, v) is a *factorization* of x.
973 Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
974 that both of the following hold
976 - either w is a suffix of u or u is a suffix of w
977 - either w is a prefix of v or v is a prefix of w
979 then w is said to be a *repetition* for the factorization (u, v).
981 Just to unpack this, there are four possibilities here. Let w = "abc". Then we
984 - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
985 - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
986 - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
987 - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
989 Note that the word vu is a repetition for any factorization (u,v) of x = uv,
990 so every factorization has at least one repetition.
992 If x is a string and (u, v) is a factorization for x, then a *local period* for
993 (u, v) is an integer r such that there is some word w such that |w| = r and w is
994 a repetition for (u, v).
996 We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
997 call this *the local period* of (u, v). Provided that x = uv is non-empty, this
998 is well-defined (because each non-empty word has at least one factorization, as
1001 It can be proven that the following is an equivalent definition of a local period
1002 for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
1003 all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
1004 defined. (i.e., i > 0 and i + r < |x|).
1006 Using the above reformulation, it is easy to prove that
1008 1 <= local_period(u, v) <= period(uv)
1010 A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
1011 *critical factorization*.
1013 The algorithm hinges on the following theorem, which is stated without proof:
1015 **Critical Factorization Theorem** Any word x has at least one critical
1016 factorization (u, v) such that |u| < period(x).
1018 The purpose of maximal_suffix is to find such a critical factorization.
1020 If the period is short, compute another factorization x = u' v' to use
1021 for reverse search, chosen instead so that |v'| < period(x).
1024 impl TwoWaySearcher {
1025 fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
1026 let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
1027 let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
1029 let (crit_pos, period) = if crit_pos_false > crit_pos_true {
1030 (crit_pos_false, period_false)
1032 (crit_pos_true, period_true)
1035 // A particularly readable explanation of what's going on here can be found
1036 // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
1037 // see the code for "Algorithm CP" on p. 323.
1039 // What's going on is we have some critical factorization (u, v) of the
1040 // needle, and we want to determine whether u is a suffix of
1041 // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
1042 // "Algorithm CP2", which is optimized for when the period of the needle
1044 if &needle[..crit_pos] == &needle[period..period + crit_pos] {
1045 // short period case -- the period is exact
1046 // compute a separate critical factorization for the reversed needle
1047 // x = u' v' where |v'| < period(x).
1049 // This is sped up by the period being known already.
1050 // Note that a case like x = "acba" may be factored exactly forwards
1051 // (crit_pos = 1, period = 3) while being factored with approximate
1052 // period in reverse (crit_pos = 2, period = 2). We use the given
1053 // reverse factorization but keep the exact period.
1054 let crit_pos_back = needle.len()
1056 TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
1057 TwoWaySearcher::reverse_maximal_suffix(needle, period, true),
1064 byteset: Self::byteset_create(&needle[..period]),
1069 memory_back: needle.len(),
1072 // long period case -- we have an approximation to the actual period,
1073 // and don't use memorization.
1075 // Approximate the period by lower bound max(|u|, |v|) + 1.
1076 // The critical factorization is efficient to use for both forward and
1081 crit_pos_back: crit_pos,
1082 period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
1083 byteset: Self::byteset_create(needle),
1087 memory: usize::MAX, // Dummy value to signify that the period is long
1088 memory_back: usize::MAX,
1094 fn byteset_create(bytes: &[u8]) -> u64 {
1095 bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
1099 fn byteset_contains(&self, byte: u8) -> bool {
1100 (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
1103 // One of the main ideas of Two-Way is that we factorize the needle into
1104 // two halves, (u, v), and begin trying to find v in the haystack by scanning
1105 // left to right. If v matches, we try to match u by scanning right to left.
1106 // How far we can jump when we encounter a mismatch is all based on the fact
1107 // that (u, v) is a critical factorization for the needle.
1109 fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
1113 // `next()` uses `self.position` as its cursor
1114 let old_pos = self.position;
1115 let needle_last = needle.len() - 1;
1117 // Check that we have room to search in
1118 // position + needle_last can not overflow if we assume slices
1119 // are bounded by isize's range.
1120 let tail_byte = match haystack.get(self.position + needle_last) {
1123 self.position = haystack.len();
1124 return S::rejecting(old_pos, self.position);
1128 if S::use_early_reject() && old_pos != self.position {
1129 return S::rejecting(old_pos, self.position);
1132 // Quickly skip by large portions unrelated to our substring
1133 if !self.byteset_contains(tail_byte) {
1134 self.position += needle.len();
1141 // See if the right part of the needle matches
1143 if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) };
1144 for i in start..needle.len() {
1145 if needle[i] != haystack[self.position + i] {
1146 self.position += i - self.crit_pos + 1;
1154 // See if the left part of the needle matches
1155 let start = if long_period { 0 } else { self.memory };
1156 for i in (start..self.crit_pos).rev() {
1157 if needle[i] != haystack[self.position + i] {
1158 self.position += self.period;
1160 self.memory = needle.len() - self.period;
1166 // We have found a match!
1167 let match_pos = self.position;
1169 // Note: add self.period instead of needle.len() to have overlapping matches
1170 self.position += needle.len();
1172 self.memory = 0; // set to needle.len() - self.period for overlapping matches
1175 return S::matching(match_pos, match_pos + needle.len());
1179 // Follows the ideas in `next()`.
1181 // The definitions are symmetrical, with period(x) = period(reverse(x))
1182 // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
1183 // is a critical factorization, so is (reverse(v), reverse(u)).
1185 // For the reverse case we have computed a critical factorization x = u' v'
1186 // (field `crit_pos_back`). We need |u| < period(x) for the forward case and
1187 // thus |v'| < period(x) for the reverse.
1189 // To search in reverse through the haystack, we search forward through
1190 // a reversed haystack with a reversed needle, matching first u' and then v'.
1192 fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
1196 // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
1198 let old_end = self.end;
1200 // Check that we have room to search in
1201 // end - needle.len() will wrap around when there is no more room,
1202 // but due to slice length limits it can never wrap all the way back
1203 // into the length of haystack.
1204 let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) {
1208 return S::rejecting(0, old_end);
1212 if S::use_early_reject() && old_end != self.end {
1213 return S::rejecting(self.end, old_end);
1216 // Quickly skip by large portions unrelated to our substring
1217 if !self.byteset_contains(front_byte) {
1218 self.end -= needle.len();
1220 self.memory_back = needle.len();
1225 // See if the left part of the needle matches
1226 let crit = if long_period {
1229 cmp::min(self.crit_pos_back, self.memory_back)
1231 for i in (0..crit).rev() {
1232 if needle[i] != haystack[self.end - needle.len() + i] {
1233 self.end -= self.crit_pos_back - i;
1235 self.memory_back = needle.len();
1241 // See if the right part of the needle matches
1242 let needle_end = if long_period { needle.len() } else { self.memory_back };
1243 for i in self.crit_pos_back..needle_end {
1244 if needle[i] != haystack[self.end - needle.len() + i] {
1245 self.end -= self.period;
1247 self.memory_back = self.period;
1253 // We have found a match!
1254 let match_pos = self.end - needle.len();
1255 // Note: sub self.period instead of needle.len() to have overlapping matches
1256 self.end -= needle.len();
1258 self.memory_back = needle.len();
1261 return S::matching(match_pos, match_pos + needle.len());
1265 // Compute the maximal suffix of `arr`.
1267 // The maximal suffix is a possible critical factorization (u, v) of `arr`.
1269 // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1272 // `order_greater` determines if lexical order is `<` or `>`. Both
1273 // orders must be computed -- the ordering with the largest `i` gives
1274 // a critical factorization.
1276 // For long period cases, the resulting period is not exact (it is too short).
1278 fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1279 let mut left = 0; // Corresponds to i in the paper
1280 let mut right = 1; // Corresponds to j in the paper
1281 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1282 // to match 0-based indexing.
1283 let mut period = 1; // Corresponds to p in the paper
1285 while let Some(&a) = arr.get(right + offset) {
1286 // `left` will be inbounds when `right` is.
1287 let b = arr[left + offset];
1288 if (a < b && !order_greater) || (a > b && order_greater) {
1289 // Suffix is smaller, period is entire prefix so far.
1290 right += offset + 1;
1292 period = right - left;
1294 // Advance through repetition of the current period.
1295 if offset + 1 == period {
1296 right += offset + 1;
1302 // Suffix is larger, start over from current location.
1312 // Compute the maximal suffix of the reverse of `arr`.
1314 // The maximal suffix is a possible critical factorization (u', v') of `arr`.
1316 // Returns `i` where `i` is the starting index of v', from the back;
1317 // returns immediately when a period of `known_period` is reached.
1319 // `order_greater` determines if lexical order is `<` or `>`. Both
1320 // orders must be computed -- the ordering with the largest `i` gives
1321 // a critical factorization.
1323 // For long period cases, the resulting period is not exact (it is too short).
1324 fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize {
1325 let mut left = 0; // Corresponds to i in the paper
1326 let mut right = 1; // Corresponds to j in the paper
1327 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1328 // to match 0-based indexing.
1329 let mut period = 1; // Corresponds to p in the paper
1332 while right + offset < n {
1333 let a = arr[n - (1 + right + offset)];
1334 let b = arr[n - (1 + left + offset)];
1335 if (a < b && !order_greater) || (a > b && order_greater) {
1336 // Suffix is smaller, period is entire prefix so far.
1337 right += offset + 1;
1339 period = right - left;
1341 // Advance through repetition of the current period.
1342 if offset + 1 == period {
1343 right += offset + 1;
1349 // Suffix is larger, start over from current location.
1355 if period == known_period {
1359 debug_assert!(period <= known_period);
1364 // TwoWayStrategy allows the algorithm to either skip non-matches as quickly
1365 // as possible, or to work in a mode where it emits Rejects relatively quickly.
1366 trait TwoWayStrategy {
1368 fn use_early_reject() -> bool;
1369 fn rejecting(a: usize, b: usize) -> Self::Output;
1370 fn matching(a: usize, b: usize) -> Self::Output;
1373 /// Skip to match intervals as quickly as possible
1376 impl TwoWayStrategy for MatchOnly {
1377 type Output = Option<(usize, usize)>;
1380 fn use_early_reject() -> bool {
1384 fn rejecting(_a: usize, _b: usize) -> Self::Output {
1388 fn matching(a: usize, b: usize) -> Self::Output {
1393 /// Emit Rejects regularly
1394 enum RejectAndMatch {}
1396 impl TwoWayStrategy for RejectAndMatch {
1397 type Output = SearchStep;
1400 fn use_early_reject() -> bool {
1404 fn rejecting(a: usize, b: usize) -> Self::Output {
1405 SearchStep::Reject(a, b)
1408 fn matching(a: usize, b: usize) -> Self::Output {
1409 SearchStep::Match(a, b)