1 //! The string Pattern API.
3 //! For more details, see the traits [`Pattern`], [`Searcher`],
4 //! [`ReverseSearcher`], and [`DoubleEndedSearcher`].
8 reason = "API not fully fleshed out and ready to be stabilized",
14 use crate::slice::memchr;
21 /// A `Pattern<'a>` expresses that the implementing type
22 /// can be used as a string pattern for searching in a `&'a str`.
24 /// For example, both `'a'` and `"aa"` are patterns that
25 /// would match at index `1` in the string `"baaaab"`.
27 /// The trait itself acts as a builder for an associated
28 /// `Searcher` type, which does the actual work of finding
29 /// occurrences of the pattern in a string.
30 pub trait Pattern<'a>: Sized {
31 /// Associated searcher for this pattern
32 type Searcher: Searcher<'a>;
34 /// Constructs the associated searcher from
35 /// `self` and the `haystack` to search in.
36 fn into_searcher(self, haystack: &'a str) -> Self::Searcher;
38 /// Checks whether the pattern matches anywhere in the haystack
40 fn is_contained_in(self, haystack: &'a str) -> bool {
41 self.into_searcher(haystack).next_match().is_some()
44 /// Checks whether the pattern matches at the front of the haystack
46 fn is_prefix_of(self, haystack: &'a str) -> bool {
47 matches!(self.into_searcher(haystack).next(), SearchStep::Match(0, _))
50 /// Removes the pattern from the front of haystack, if it matches.
52 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
53 if let SearchStep::Match(start, len) = self.into_searcher(haystack).next() {
56 "The first search step from Searcher \
57 must include the first character"
59 // SAFETY: `Searcher` is known to return valid indices.
60 unsafe { Some(haystack.get_unchecked(len..)) }
66 /// Checks whether the pattern matches at the back of the haystack
68 fn is_suffix_of(self, haystack: &'a str) -> bool
70 Self::Searcher: ReverseSearcher<'a>,
72 matches!(self.into_searcher(haystack).next_back(), SearchStep::Match(_, j) if haystack.len() == j)
75 /// Removes the pattern from the back of haystack, if it matches.
77 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
79 Self::Searcher: ReverseSearcher<'a>,
81 if let SearchStep::Match(start, end) = self.into_searcher(haystack).next_back() {
85 "The first search step from ReverseSearcher \
86 must include the last character"
88 // SAFETY: `Searcher` is known to return valid indices.
89 unsafe { Some(haystack.get_unchecked(..start)) }
98 /// Result of calling `Searcher::next()` or `ReverseSearcher::next_back()`.
99 #[derive(Copy, Clone, Eq, PartialEq, Debug)]
100 pub enum SearchStep {
101 /// Expresses that a match of the pattern has been found at
102 /// `haystack[a..b]`.
104 /// Expresses that `haystack[a..b]` has been rejected as a possible match
107 /// Note that there might be more than one `Reject` between two `Match`es,
108 /// there is no requirement for them to be combined into one.
109 Reject(usize, usize),
110 /// Expresses that every byte of the haystack has been visited, ending
115 /// A searcher for a string pattern.
117 /// This trait provides methods for searching for non-overlapping
118 /// matches of a pattern starting from the front (left) of a string.
120 /// It will be implemented by associated `Searcher`
121 /// types of the `Pattern` trait.
123 /// The trait is marked unsafe because the indices returned by the
124 /// `next()` methods are required to lie on valid utf8 boundaries in
125 /// the haystack. This enables consumers of this trait to
126 /// slice the haystack without additional runtime checks.
127 pub unsafe trait Searcher<'a> {
128 /// Getter for the underlying string to be searched in
130 /// Will always return the same `&str`
131 fn haystack(&self) -> &'a str;
133 /// Performs the next search step starting from the front.
135 /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
136 /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
137 /// pattern, even partially.
138 /// - Returns `Done` if every byte of the haystack has been visited
140 /// The stream of `Match` and `Reject` values up to a `Done`
141 /// will contain index ranges that are adjacent, non-overlapping,
142 /// covering the whole haystack, and laying on utf8 boundaries.
144 /// A `Match` result needs to contain the whole matched pattern,
145 /// however `Reject` results may be split up into arbitrary
146 /// many adjacent fragments. Both ranges may have zero length.
148 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
149 /// might produce the stream
150 /// `[Reject(0, 1), Reject(1, 2), Match(2, 5), Reject(5, 8)]`
151 fn next(&mut self) -> SearchStep;
153 /// Finds the next `Match` result. See `next()`
155 /// Unlike next(), there is no guarantee that the returned ranges
156 /// of this and next_reject will overlap. This will return (start_match, end_match),
157 /// where start_match is the index of where the match begins, and end_match is
158 /// the index after the end of the match.
160 fn next_match(&mut self) -> Option<(usize, usize)> {
163 SearchStep::Match(a, b) => return Some((a, b)),
164 SearchStep::Done => return None,
170 /// Finds the next `Reject` result. See `next()` and `next_match()`
172 /// Unlike next(), there is no guarantee that the returned ranges
173 /// of this and next_match will overlap.
175 fn next_reject(&mut self) -> Option<(usize, usize)> {
178 SearchStep::Reject(a, b) => return Some((a, b)),
179 SearchStep::Done => return None,
186 /// A reverse searcher for a string pattern.
188 /// This trait provides methods for searching for non-overlapping
189 /// matches of a pattern starting from the back (right) of a string.
191 /// It will be implemented by associated `Searcher`
192 /// types of the `Pattern` trait if the pattern supports searching
193 /// for it from the back.
195 /// The index ranges returned by this trait are not required
196 /// to exactly match those of the forward search in reverse.
198 /// For the reason why this trait is marked unsafe, see them
199 /// parent trait `Searcher`.
200 pub unsafe trait ReverseSearcher<'a>: Searcher<'a> {
201 /// Performs the next search step starting from the back.
203 /// - Returns `Match(a, b)` if `haystack[a..b]` matches the pattern.
204 /// - Returns `Reject(a, b)` if `haystack[a..b]` can not match the
205 /// pattern, even partially.
206 /// - Returns `Done` if every byte of the haystack has been visited
208 /// The stream of `Match` and `Reject` values up to a `Done`
209 /// will contain index ranges that are adjacent, non-overlapping,
210 /// covering the whole haystack, and laying on utf8 boundaries.
212 /// A `Match` result needs to contain the whole matched pattern,
213 /// however `Reject` results may be split up into arbitrary
214 /// many adjacent fragments. Both ranges may have zero length.
216 /// As an example, the pattern `"aaa"` and the haystack `"cbaaaaab"`
217 /// might produce the stream
218 /// `[Reject(7, 8), Match(4, 7), Reject(1, 4), Reject(0, 1)]`
219 fn next_back(&mut self) -> SearchStep;
221 /// Finds the next `Match` result. See `next_back()`
223 fn next_match_back(&mut self) -> Option<(usize, usize)> {
225 match self.next_back() {
226 SearchStep::Match(a, b) => return Some((a, b)),
227 SearchStep::Done => return None,
233 /// Finds the next `Reject` result. See `next_back()`
235 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
237 match self.next_back() {
238 SearchStep::Reject(a, b) => return Some((a, b)),
239 SearchStep::Done => return None,
246 /// A marker trait to express that a `ReverseSearcher`
247 /// can be used for a `DoubleEndedIterator` implementation.
249 /// For this, the impl of `Searcher` and `ReverseSearcher` need
250 /// to follow these conditions:
252 /// - All results of `next()` need to be identical
253 /// to the results of `next_back()` in reverse order.
254 /// - `next()` and `next_back()` need to behave as
255 /// the two ends of a range of values, that is they
256 /// can not "walk past each other".
260 /// `char::Searcher` is a `DoubleEndedSearcher` because searching for a
261 /// `char` only requires looking at one at a time, which behaves the same
264 /// `(&str)::Searcher` is not a `DoubleEndedSearcher` because
265 /// the pattern `"aa"` in the haystack `"aaa"` matches as either
266 /// `"[aa]a"` or `"a[aa]"`, depending from which side it is searched.
267 pub trait DoubleEndedSearcher<'a>: ReverseSearcher<'a> {}
269 /////////////////////////////////////////////////////////////////////////////
271 /////////////////////////////////////////////////////////////////////////////
273 /// Associated type for `<char as Pattern<'a>>::Searcher`.
274 #[derive(Clone, Debug)]
275 pub struct CharSearcher<'a> {
277 // safety invariant: `finger`/`finger_back` must be a valid utf8 byte index of `haystack`
278 // This invariant can be broken *within* next_match and next_match_back, however
279 // they must exit with fingers on valid code point boundaries.
280 /// `finger` is the current byte index of the forward search.
281 /// Imagine that it exists before the byte at its index, i.e.
282 /// `haystack[finger]` is the first byte of the slice we must inspect during
283 /// forward searching
285 /// `finger_back` is the current byte index of the reverse search.
286 /// Imagine that it exists after the byte at its index, i.e.
287 /// haystack[finger_back - 1] is the last byte of the slice we must inspect during
288 /// forward searching (and thus the first byte to be inspected when calling next_back())
290 /// The character being searched for
293 // safety invariant: `utf8_size` must be less than 5
294 /// The number of bytes `needle` takes up when encoded in utf8
296 /// A utf8 encoded copy of the `needle`
297 utf8_encoded: [u8; 4],
300 unsafe impl<'a> Searcher<'a> for CharSearcher<'a> {
302 fn haystack(&self) -> &'a str {
306 fn next(&mut self) -> SearchStep {
307 let old_finger = self.finger;
308 // SAFETY: 1-4 guarantee safety of `get_unchecked`
309 // 1. `self.finger` and `self.finger_back` are kept on unicode boundaries
310 // (this is invariant)
311 // 2. `self.finger >= 0` since it starts at 0 and only increases
312 // 3. `self.finger < self.finger_back` because otherwise the char `iter`
313 // would return `SearchStep::Done`
314 // 4. `self.finger` comes before the end of the haystack because `self.finger_back`
315 // starts at the end and only decreases
316 let slice = unsafe { self.haystack.get_unchecked(old_finger..self.finger_back) };
317 let mut iter = slice.chars();
318 let old_len = iter.iter.len();
319 if let Some(ch) = iter.next() {
320 // add byte offset of current character
321 // without re-encoding as utf-8
322 self.finger += old_len - iter.iter.len();
323 if ch == self.needle {
324 SearchStep::Match(old_finger, self.finger)
326 SearchStep::Reject(old_finger, self.finger)
333 fn next_match(&mut self) -> Option<(usize, usize)> {
335 // get the haystack after the last character found
336 let bytes = self.haystack.as_bytes().get(self.finger..self.finger_back)?;
337 // the last byte of the utf8 encoded needle
338 // SAFETY: we have an invariant that `utf8_size < 5`
339 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
340 if let Some(index) = memchr::memchr(last_byte, bytes) {
341 // The new finger is the index of the byte we found,
342 // plus one, since we memchr'd for the last byte of the character.
344 // Note that this doesn't always give us a finger on a UTF8 boundary.
345 // If we *didn't* find our character
346 // we may have indexed to the non-last byte of a 3-byte or 4-byte character.
347 // We can't just skip to the next valid starting byte because a character like
348 // ꁁ (U+A041 YI SYLLABLE PA), utf-8 `EA 81 81` will have us always find
349 // the second byte when searching for the third.
351 // However, this is totally okay. While we have the invariant that
352 // self.finger is on a UTF8 boundary, this invariant is not relied upon
353 // within this method (it is relied upon in CharSearcher::next()).
355 // We only exit this method when we reach the end of the string, or if we
356 // find something. When we find something the `finger` will be set
357 // to a UTF8 boundary.
358 self.finger += index + 1;
359 if self.finger >= self.utf8_size {
360 let found_char = self.finger - self.utf8_size;
361 if let Some(slice) = self.haystack.as_bytes().get(found_char..self.finger) {
362 if slice == &self.utf8_encoded[0..self.utf8_size] {
363 return Some((found_char, self.finger));
368 // found nothing, exit
369 self.finger = self.finger_back;
375 // let next_reject use the default implementation from the Searcher trait
378 unsafe impl<'a> ReverseSearcher<'a> for CharSearcher<'a> {
380 fn next_back(&mut self) -> SearchStep {
381 let old_finger = self.finger_back;
382 // SAFETY: see the comment for next() above
383 let slice = unsafe { self.haystack.get_unchecked(self.finger..old_finger) };
384 let mut iter = slice.chars();
385 let old_len = iter.iter.len();
386 if let Some(ch) = iter.next_back() {
387 // subtract byte offset of current character
388 // without re-encoding as utf-8
389 self.finger_back -= old_len - iter.iter.len();
390 if ch == self.needle {
391 SearchStep::Match(self.finger_back, old_finger)
393 SearchStep::Reject(self.finger_back, old_finger)
400 fn next_match_back(&mut self) -> Option<(usize, usize)> {
401 let haystack = self.haystack.as_bytes();
403 // get the haystack up to but not including the last character searched
404 let bytes = haystack.get(self.finger..self.finger_back)?;
405 // the last byte of the utf8 encoded needle
406 // SAFETY: we have an invariant that `utf8_size < 5`
407 let last_byte = unsafe { *self.utf8_encoded.get_unchecked(self.utf8_size - 1) };
408 if let Some(index) = memchr::memrchr(last_byte, bytes) {
409 // we searched a slice that was offset by self.finger,
410 // add self.finger to recoup the original index
411 let index = self.finger + index;
412 // memrchr will return the index of the byte we wish to
413 // find. In case of an ASCII character, this is indeed
414 // were we wish our new finger to be ("after" the found
415 // char in the paradigm of reverse iteration). For
416 // multibyte chars we need to skip down by the number of more
417 // bytes they have than ASCII
418 let shift = self.utf8_size - 1;
420 let found_char = index - shift;
421 if let Some(slice) = haystack.get(found_char..(found_char + self.utf8_size)) {
422 if slice == &self.utf8_encoded[0..self.utf8_size] {
423 // move finger to before the character found (i.e., at its start index)
424 self.finger_back = found_char;
425 return Some((self.finger_back, self.finger_back + self.utf8_size));
429 // We can't use finger_back = index - size + 1 here. If we found the last char
430 // of a different-sized character (or the middle byte of a different character)
431 // we need to bump the finger_back down to `index`. This similarly makes
432 // `finger_back` have the potential to no longer be on a boundary,
433 // but this is OK since we only exit this function on a boundary
434 // or when the haystack has been searched completely.
436 // Unlike next_match this does not
437 // have the problem of repeated bytes in utf-8 because
438 // we're searching for the last byte, and we can only have
439 // found the last byte when searching in reverse.
440 self.finger_back = index;
442 self.finger_back = self.finger;
443 // found nothing, exit
449 // let next_reject_back use the default implementation from the Searcher trait
452 impl<'a> DoubleEndedSearcher<'a> for CharSearcher<'a> {}
454 /// Searches for chars that are equal to a given `char`.
455 impl<'a> Pattern<'a> for char {
456 type Searcher = CharSearcher<'a>;
459 fn into_searcher(self, haystack: &'a str) -> Self::Searcher {
460 let mut utf8_encoded = [0; 4];
461 let utf8_size = self.encode_utf8(&mut utf8_encoded).len();
465 finger_back: haystack.len(),
473 fn is_contained_in(self, haystack: &'a str) -> bool {
474 if (self as u32) < 128 {
475 haystack.as_bytes().contains(&(self as u8))
477 let mut buffer = [0u8; 4];
478 self.encode_utf8(&mut buffer).is_contained_in(haystack)
483 fn is_prefix_of(self, haystack: &'a str) -> bool {
484 self.encode_utf8(&mut [0u8; 4]).is_prefix_of(haystack)
488 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
489 self.encode_utf8(&mut [0u8; 4]).strip_prefix_of(haystack)
493 fn is_suffix_of(self, haystack: &'a str) -> bool
495 Self::Searcher: ReverseSearcher<'a>,
497 self.encode_utf8(&mut [0u8; 4]).is_suffix_of(haystack)
501 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
503 Self::Searcher: ReverseSearcher<'a>,
505 self.encode_utf8(&mut [0u8; 4]).strip_suffix_of(haystack)
509 /////////////////////////////////////////////////////////////////////////////
510 // Impl for a MultiCharEq wrapper
511 /////////////////////////////////////////////////////////////////////////////
515 fn matches(&mut self, c: char) -> bool;
518 impl<F> MultiCharEq for F
520 F: FnMut(char) -> bool,
523 fn matches(&mut self, c: char) -> bool {
528 impl MultiCharEq for &[char] {
530 fn matches(&mut self, c: char) -> bool {
531 self.iter().any(|&m| m == c)
535 struct MultiCharEqPattern<C: MultiCharEq>(C);
537 #[derive(Clone, Debug)]
538 struct MultiCharEqSearcher<'a, C: MultiCharEq> {
541 char_indices: super::CharIndices<'a>,
544 impl<'a, C: MultiCharEq> Pattern<'a> for MultiCharEqPattern<C> {
545 type Searcher = MultiCharEqSearcher<'a, C>;
548 fn into_searcher(self, haystack: &'a str) -> MultiCharEqSearcher<'a, C> {
549 MultiCharEqSearcher { haystack, char_eq: self.0, char_indices: haystack.char_indices() }
553 unsafe impl<'a, C: MultiCharEq> Searcher<'a> for MultiCharEqSearcher<'a, C> {
555 fn haystack(&self) -> &'a str {
560 fn next(&mut self) -> SearchStep {
561 let s = &mut self.char_indices;
562 // Compare lengths of the internal byte slice iterator
563 // to find length of current char
564 let pre_len = s.iter.iter.len();
565 if let Some((i, c)) = s.next() {
566 let len = s.iter.iter.len();
567 let char_len = pre_len - len;
568 if self.char_eq.matches(c) {
569 return SearchStep::Match(i, i + char_len);
571 return SearchStep::Reject(i, i + char_len);
578 unsafe impl<'a, C: MultiCharEq> ReverseSearcher<'a> for MultiCharEqSearcher<'a, C> {
580 fn next_back(&mut self) -> SearchStep {
581 let s = &mut self.char_indices;
582 // Compare lengths of the internal byte slice iterator
583 // to find length of current char
584 let pre_len = s.iter.iter.len();
585 if let Some((i, c)) = s.next_back() {
586 let len = s.iter.iter.len();
587 let char_len = pre_len - len;
588 if self.char_eq.matches(c) {
589 return SearchStep::Match(i, i + char_len);
591 return SearchStep::Reject(i, i + char_len);
598 impl<'a, C: MultiCharEq> DoubleEndedSearcher<'a> for MultiCharEqSearcher<'a, C> {}
600 /////////////////////////////////////////////////////////////////////////////
602 macro_rules! pattern_methods {
603 ($t:ty, $pmap:expr, $smap:expr) => {
607 fn into_searcher(self, haystack: &'a str) -> $t {
608 ($smap)(($pmap)(self).into_searcher(haystack))
612 fn is_contained_in(self, haystack: &'a str) -> bool {
613 ($pmap)(self).is_contained_in(haystack)
617 fn is_prefix_of(self, haystack: &'a str) -> bool {
618 ($pmap)(self).is_prefix_of(haystack)
622 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
623 ($pmap)(self).strip_prefix_of(haystack)
627 fn is_suffix_of(self, haystack: &'a str) -> bool
629 $t: ReverseSearcher<'a>,
631 ($pmap)(self).is_suffix_of(haystack)
635 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str>
637 $t: ReverseSearcher<'a>,
639 ($pmap)(self).strip_suffix_of(haystack)
644 macro_rules! searcher_methods {
647 fn haystack(&self) -> &'a str {
651 fn next(&mut self) -> SearchStep {
655 fn next_match(&mut self) -> Option<(usize, usize)> {
659 fn next_reject(&mut self) -> Option<(usize, usize)> {
665 fn next_back(&mut self) -> SearchStep {
669 fn next_match_back(&mut self) -> Option<(usize, usize)> {
670 self.0.next_match_back()
673 fn next_reject_back(&mut self) -> Option<(usize, usize)> {
674 self.0.next_reject_back()
679 /////////////////////////////////////////////////////////////////////////////
681 /////////////////////////////////////////////////////////////////////////////
683 // Todo: Change / Remove due to ambiguity in meaning.
685 /// Associated type for `<&[char] as Pattern<'a>>::Searcher`.
686 #[derive(Clone, Debug)]
687 pub struct CharSliceSearcher<'a, 'b>(<MultiCharEqPattern<&'b [char]> as Pattern<'a>>::Searcher);
689 unsafe impl<'a, 'b> Searcher<'a> for CharSliceSearcher<'a, 'b> {
690 searcher_methods!(forward);
693 unsafe impl<'a, 'b> ReverseSearcher<'a> for CharSliceSearcher<'a, 'b> {
694 searcher_methods!(reverse);
697 impl<'a, 'b> DoubleEndedSearcher<'a> for CharSliceSearcher<'a, 'b> {}
699 /// Searches for chars that are equal to any of the chars in the array.
700 impl<'a, 'b> Pattern<'a> for &'b [char] {
701 pattern_methods!(CharSliceSearcher<'a, 'b>, MultiCharEqPattern, CharSliceSearcher);
704 /////////////////////////////////////////////////////////////////////////////
705 // Impl for F: FnMut(char) -> bool
706 /////////////////////////////////////////////////////////////////////////////
708 /// Associated type for `<F as Pattern<'a>>::Searcher`.
710 pub struct CharPredicateSearcher<'a, F>(<MultiCharEqPattern<F> as Pattern<'a>>::Searcher)
712 F: FnMut(char) -> bool;
714 impl<F> fmt::Debug for CharPredicateSearcher<'_, F>
716 F: FnMut(char) -> bool,
718 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
719 f.debug_struct("CharPredicateSearcher")
720 .field("haystack", &self.0.haystack)
721 .field("char_indices", &self.0.char_indices)
725 unsafe impl<'a, F> Searcher<'a> for CharPredicateSearcher<'a, F>
727 F: FnMut(char) -> bool,
729 searcher_methods!(forward);
732 unsafe impl<'a, F> ReverseSearcher<'a> for CharPredicateSearcher<'a, F>
734 F: FnMut(char) -> bool,
736 searcher_methods!(reverse);
739 impl<'a, F> DoubleEndedSearcher<'a> for CharPredicateSearcher<'a, F> where F: FnMut(char) -> bool {}
741 /// Searches for chars that match the given predicate.
742 impl<'a, F> Pattern<'a> for F
744 F: FnMut(char) -> bool,
746 pattern_methods!(CharPredicateSearcher<'a, F>, MultiCharEqPattern, CharPredicateSearcher);
749 /////////////////////////////////////////////////////////////////////////////
751 /////////////////////////////////////////////////////////////////////////////
753 /// Delegates to the `&str` impl.
754 impl<'a, 'b, 'c> Pattern<'a> for &'c &'b str {
755 pattern_methods!(StrSearcher<'a, 'b>, |&s| s, |s| s);
758 /////////////////////////////////////////////////////////////////////////////
760 /////////////////////////////////////////////////////////////////////////////
762 /// Non-allocating substring search.
764 /// Will handle the pattern `""` as returning empty matches at each character
766 impl<'a, 'b> Pattern<'a> for &'b str {
767 type Searcher = StrSearcher<'a, 'b>;
770 fn into_searcher(self, haystack: &'a str) -> StrSearcher<'a, 'b> {
771 StrSearcher::new(haystack, self)
774 /// Checks whether the pattern matches at the front of the haystack.
776 fn is_prefix_of(self, haystack: &'a str) -> bool {
777 haystack.as_bytes().starts_with(self.as_bytes())
780 /// Removes the pattern from the front of haystack, if it matches.
782 fn strip_prefix_of(self, haystack: &'a str) -> Option<&'a str> {
783 if self.is_prefix_of(haystack) {
784 // SAFETY: prefix was just verified to exist.
785 unsafe { Some(haystack.get_unchecked(self.as_bytes().len()..)) }
791 /// Checks whether the pattern matches at the back of the haystack.
793 fn is_suffix_of(self, haystack: &'a str) -> bool {
794 haystack.as_bytes().ends_with(self.as_bytes())
797 /// Removes the pattern from the back of haystack, if it matches.
799 fn strip_suffix_of(self, haystack: &'a str) -> Option<&'a str> {
800 if self.is_suffix_of(haystack) {
801 let i = haystack.len() - self.as_bytes().len();
802 // SAFETY: suffix was just verified to exist.
803 unsafe { Some(haystack.get_unchecked(..i)) }
810 /////////////////////////////////////////////////////////////////////////////
811 // Two Way substring searcher
812 /////////////////////////////////////////////////////////////////////////////
814 #[derive(Clone, Debug)]
815 /// Associated type for `<&str as Pattern<'a>>::Searcher`.
816 pub struct StrSearcher<'a, 'b> {
820 searcher: StrSearcherImpl,
823 #[derive(Clone, Debug)]
824 enum StrSearcherImpl {
826 TwoWay(TwoWaySearcher),
829 #[derive(Clone, Debug)]
837 impl<'a, 'b> StrSearcher<'a, 'b> {
838 fn new(haystack: &'a str, needle: &'b str) -> StrSearcher<'a, 'b> {
839 if needle.is_empty() {
843 searcher: StrSearcherImpl::Empty(EmptyNeedle {
854 searcher: StrSearcherImpl::TwoWay(TwoWaySearcher::new(
863 unsafe impl<'a, 'b> Searcher<'a> for StrSearcher<'a, 'b> {
865 fn haystack(&self) -> &'a str {
870 fn next(&mut self) -> SearchStep {
871 match self.searcher {
872 StrSearcherImpl::Empty(ref mut searcher) => {
873 // empty needle rejects every char and matches every empty string between them
874 let is_match = searcher.is_match_fw;
875 searcher.is_match_fw = !searcher.is_match_fw;
876 let pos = searcher.position;
877 match self.haystack[pos..].chars().next() {
878 _ if is_match => SearchStep::Match(pos, pos),
879 None => SearchStep::Done,
881 searcher.position += ch.len_utf8();
882 SearchStep::Reject(pos, searcher.position)
886 StrSearcherImpl::TwoWay(ref mut searcher) => {
887 // TwoWaySearcher produces valid *Match* indices that split at char boundaries
888 // as long as it does correct matching and that haystack and needle are
890 // *Rejects* from the algorithm can fall on any indices, but we will walk them
891 // manually to the next character boundary, so that they are utf-8 safe.
892 if searcher.position == self.haystack.len() {
893 return SearchStep::Done;
895 let is_long = searcher.memory == usize::MAX;
896 match searcher.next::<RejectAndMatch>(
897 self.haystack.as_bytes(),
898 self.needle.as_bytes(),
901 SearchStep::Reject(a, mut b) => {
902 // skip to next char boundary
903 while !self.haystack.is_char_boundary(b) {
906 searcher.position = cmp::max(b, searcher.position);
907 SearchStep::Reject(a, b)
909 otherwise => otherwise,
916 fn next_match(&mut self) -> Option<(usize, usize)> {
917 match self.searcher {
918 StrSearcherImpl::Empty(..) => loop {
920 SearchStep::Match(a, b) => return Some((a, b)),
921 SearchStep::Done => return None,
922 SearchStep::Reject(..) => {}
925 StrSearcherImpl::TwoWay(ref mut searcher) => {
926 let is_long = searcher.memory == usize::MAX;
927 // write out `true` and `false` cases to encourage the compiler
928 // to specialize the two cases separately.
930 searcher.next::<MatchOnly>(
931 self.haystack.as_bytes(),
932 self.needle.as_bytes(),
936 searcher.next::<MatchOnly>(
937 self.haystack.as_bytes(),
938 self.needle.as_bytes(),
947 unsafe impl<'a, 'b> ReverseSearcher<'a> for StrSearcher<'a, 'b> {
949 fn next_back(&mut self) -> SearchStep {
950 match self.searcher {
951 StrSearcherImpl::Empty(ref mut searcher) => {
952 let is_match = searcher.is_match_bw;
953 searcher.is_match_bw = !searcher.is_match_bw;
954 let end = searcher.end;
955 match self.haystack[..end].chars().next_back() {
956 _ if is_match => SearchStep::Match(end, end),
957 None => SearchStep::Done,
959 searcher.end -= ch.len_utf8();
960 SearchStep::Reject(searcher.end, end)
964 StrSearcherImpl::TwoWay(ref mut searcher) => {
965 if searcher.end == 0 {
966 return SearchStep::Done;
968 let is_long = searcher.memory == usize::MAX;
969 match searcher.next_back::<RejectAndMatch>(
970 self.haystack.as_bytes(),
971 self.needle.as_bytes(),
974 SearchStep::Reject(mut a, b) => {
975 // skip to next char boundary
976 while !self.haystack.is_char_boundary(a) {
979 searcher.end = cmp::min(a, searcher.end);
980 SearchStep::Reject(a, b)
982 otherwise => otherwise,
989 fn next_match_back(&mut self) -> Option<(usize, usize)> {
990 match self.searcher {
991 StrSearcherImpl::Empty(..) => loop {
992 match self.next_back() {
993 SearchStep::Match(a, b) => return Some((a, b)),
994 SearchStep::Done => return None,
995 SearchStep::Reject(..) => {}
998 StrSearcherImpl::TwoWay(ref mut searcher) => {
999 let is_long = searcher.memory == usize::MAX;
1000 // write out `true` and `false`, like `next_match`
1002 searcher.next_back::<MatchOnly>(
1003 self.haystack.as_bytes(),
1004 self.needle.as_bytes(),
1008 searcher.next_back::<MatchOnly>(
1009 self.haystack.as_bytes(),
1010 self.needle.as_bytes(),
1019 /// The internal state of the two-way substring search algorithm.
1020 #[derive(Clone, Debug)]
1021 struct TwoWaySearcher {
1023 /// critical factorization index
1025 /// critical factorization index for reversed needle
1026 crit_pos_back: usize,
1028 /// `byteset` is an extension (not part of the two way algorithm);
1029 /// it's a 64-bit "fingerprint" where each set bit `j` corresponds
1030 /// to a (byte & 63) == j present in the needle.
1036 /// index into needle before which we have already matched
1038 /// index into needle after which we have already matched
1043 This is the Two-Way search algorithm, which was introduced in the paper:
1044 Crochemore, M., Perrin, D., 1991, Two-way string-matching, Journal of the ACM 38(3):651-675.
1046 Here's some background information.
1048 A *word* is a string of symbols. The *length* of a word should be a familiar
1049 notion, and here we denote it for any word x by |x|.
1050 (We also allow for the possibility of the *empty word*, a word of length zero).
1052 If x is any non-empty word, then an integer p with 0 < p <= |x| is said to be a
1053 *period* for x iff for all i with 0 <= i <= |x| - p - 1, we have x[i] == x[i+p].
1054 For example, both 1 and 2 are periods for the string "aa". As another example,
1055 the only period of the string "abcd" is 4.
1057 We denote by period(x) the *smallest* period of x (provided that x is non-empty).
1058 This is always well-defined since every non-empty word x has at least one period,
1059 |x|. We sometimes call this *the period* of x.
1061 If u, v and x are words such that x = uv, where uv is the concatenation of u and
1062 v, then we say that (u, v) is a *factorization* of x.
1064 Let (u, v) be a factorization for a word x. Then if w is a non-empty word such
1065 that both of the following hold
1067 - either w is a suffix of u or u is a suffix of w
1068 - either w is a prefix of v or v is a prefix of w
1070 then w is said to be a *repetition* for the factorization (u, v).
1072 Just to unpack this, there are four possibilities here. Let w = "abc". Then we
1075 - w is a suffix of u and w is a prefix of v. ex: ("lolabc", "abcde")
1076 - w is a suffix of u and v is a prefix of w. ex: ("lolabc", "ab")
1077 - u is a suffix of w and w is a prefix of v. ex: ("bc", "abchi")
1078 - u is a suffix of w and v is a prefix of w. ex: ("bc", "a")
1080 Note that the word vu is a repetition for any factorization (u,v) of x = uv,
1081 so every factorization has at least one repetition.
1083 If x is a string and (u, v) is a factorization for x, then a *local period* for
1084 (u, v) is an integer r such that there is some word w such that |w| = r and w is
1085 a repetition for (u, v).
1087 We denote by local_period(u, v) the smallest local period of (u, v). We sometimes
1088 call this *the local period* of (u, v). Provided that x = uv is non-empty, this
1089 is well-defined (because each non-empty word has at least one factorization, as
1092 It can be proven that the following is an equivalent definition of a local period
1093 for a factorization (u, v): any positive integer r such that x[i] == x[i+r] for
1094 all i such that |u| - r <= i <= |u| - 1 and such that both x[i] and x[i+r] are
1095 defined. (i.e., i > 0 and i + r < |x|).
1097 Using the above reformulation, it is easy to prove that
1099 1 <= local_period(u, v) <= period(uv)
1101 A factorization (u, v) of x such that local_period(u,v) = period(x) is called a
1102 *critical factorization*.
1104 The algorithm hinges on the following theorem, which is stated without proof:
1106 **Critical Factorization Theorem** Any word x has at least one critical
1107 factorization (u, v) such that |u| < period(x).
1109 The purpose of maximal_suffix is to find such a critical factorization.
1111 If the period is short, compute another factorization x = u' v' to use
1112 for reverse search, chosen instead so that |v'| < period(x).
1115 impl TwoWaySearcher {
1116 fn new(needle: &[u8], end: usize) -> TwoWaySearcher {
1117 let (crit_pos_false, period_false) = TwoWaySearcher::maximal_suffix(needle, false);
1118 let (crit_pos_true, period_true) = TwoWaySearcher::maximal_suffix(needle, true);
1120 let (crit_pos, period) = if crit_pos_false > crit_pos_true {
1121 (crit_pos_false, period_false)
1123 (crit_pos_true, period_true)
1126 // A particularly readable explanation of what's going on here can be found
1127 // in Crochemore and Rytter's book "Text Algorithms", ch 13. Specifically
1128 // see the code for "Algorithm CP" on p. 323.
1130 // What's going on is we have some critical factorization (u, v) of the
1131 // needle, and we want to determine whether u is a suffix of
1132 // &v[..period]. If it is, we use "Algorithm CP1". Otherwise we use
1133 // "Algorithm CP2", which is optimized for when the period of the needle
1135 if needle[..crit_pos] == needle[period..period + crit_pos] {
1136 // short period case -- the period is exact
1137 // compute a separate critical factorization for the reversed needle
1138 // x = u' v' where |v'| < period(x).
1140 // This is sped up by the period being known already.
1141 // Note that a case like x = "acba" may be factored exactly forwards
1142 // (crit_pos = 1, period = 3) while being factored with approximate
1143 // period in reverse (crit_pos = 2, period = 2). We use the given
1144 // reverse factorization but keep the exact period.
1145 let crit_pos_back = needle.len()
1147 TwoWaySearcher::reverse_maximal_suffix(needle, period, false),
1148 TwoWaySearcher::reverse_maximal_suffix(needle, period, true),
1155 byteset: Self::byteset_create(&needle[..period]),
1160 memory_back: needle.len(),
1163 // long period case -- we have an approximation to the actual period,
1164 // and don't use memorization.
1166 // Approximate the period by lower bound max(|u|, |v|) + 1.
1167 // The critical factorization is efficient to use for both forward and
1172 crit_pos_back: crit_pos,
1173 period: cmp::max(crit_pos, needle.len() - crit_pos) + 1,
1174 byteset: Self::byteset_create(needle),
1178 memory: usize::MAX, // Dummy value to signify that the period is long
1179 memory_back: usize::MAX,
1185 fn byteset_create(bytes: &[u8]) -> u64 {
1186 bytes.iter().fold(0, |a, &b| (1 << (b & 0x3f)) | a)
1190 fn byteset_contains(&self, byte: u8) -> bool {
1191 (self.byteset >> ((byte & 0x3f) as usize)) & 1 != 0
1194 // One of the main ideas of Two-Way is that we factorize the needle into
1195 // two halves, (u, v), and begin trying to find v in the haystack by scanning
1196 // left to right. If v matches, we try to match u by scanning right to left.
1197 // How far we can jump when we encounter a mismatch is all based on the fact
1198 // that (u, v) is a critical factorization for the needle.
1200 fn next<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
1204 // `next()` uses `self.position` as its cursor
1205 let old_pos = self.position;
1206 let needle_last = needle.len() - 1;
1208 // Check that we have room to search in
1209 // position + needle_last can not overflow if we assume slices
1210 // are bounded by isize's range.
1211 let tail_byte = match haystack.get(self.position + needle_last) {
1214 self.position = haystack.len();
1215 return S::rejecting(old_pos, self.position);
1219 if S::use_early_reject() && old_pos != self.position {
1220 return S::rejecting(old_pos, self.position);
1223 // Quickly skip by large portions unrelated to our substring
1224 if !self.byteset_contains(tail_byte) {
1225 self.position += needle.len();
1232 // See if the right part of the needle matches
1234 if long_period { self.crit_pos } else { cmp::max(self.crit_pos, self.memory) };
1235 for i in start..needle.len() {
1236 if needle[i] != haystack[self.position + i] {
1237 self.position += i - self.crit_pos + 1;
1245 // See if the left part of the needle matches
1246 let start = if long_period { 0 } else { self.memory };
1247 for i in (start..self.crit_pos).rev() {
1248 if needle[i] != haystack[self.position + i] {
1249 self.position += self.period;
1251 self.memory = needle.len() - self.period;
1257 // We have found a match!
1258 let match_pos = self.position;
1260 // Note: add self.period instead of needle.len() to have overlapping matches
1261 self.position += needle.len();
1263 self.memory = 0; // set to needle.len() - self.period for overlapping matches
1266 return S::matching(match_pos, match_pos + needle.len());
1270 // Follows the ideas in `next()`.
1272 // The definitions are symmetrical, with period(x) = period(reverse(x))
1273 // and local_period(u, v) = local_period(reverse(v), reverse(u)), so if (u, v)
1274 // is a critical factorization, so is (reverse(v), reverse(u)).
1276 // For the reverse case we have computed a critical factorization x = u' v'
1277 // (field `crit_pos_back`). We need |u| < period(x) for the forward case and
1278 // thus |v'| < period(x) for the reverse.
1280 // To search in reverse through the haystack, we search forward through
1281 // a reversed haystack with a reversed needle, matching first u' and then v'.
1283 fn next_back<S>(&mut self, haystack: &[u8], needle: &[u8], long_period: bool) -> S::Output
1287 // `next_back()` uses `self.end` as its cursor -- so that `next()` and `next_back()`
1289 let old_end = self.end;
1291 // Check that we have room to search in
1292 // end - needle.len() will wrap around when there is no more room,
1293 // but due to slice length limits it can never wrap all the way back
1294 // into the length of haystack.
1295 let front_byte = match haystack.get(self.end.wrapping_sub(needle.len())) {
1299 return S::rejecting(0, old_end);
1303 if S::use_early_reject() && old_end != self.end {
1304 return S::rejecting(self.end, old_end);
1307 // Quickly skip by large portions unrelated to our substring
1308 if !self.byteset_contains(front_byte) {
1309 self.end -= needle.len();
1311 self.memory_back = needle.len();
1316 // See if the left part of the needle matches
1317 let crit = if long_period {
1320 cmp::min(self.crit_pos_back, self.memory_back)
1322 for i in (0..crit).rev() {
1323 if needle[i] != haystack[self.end - needle.len() + i] {
1324 self.end -= self.crit_pos_back - i;
1326 self.memory_back = needle.len();
1332 // See if the right part of the needle matches
1333 let needle_end = if long_period { needle.len() } else { self.memory_back };
1334 for i in self.crit_pos_back..needle_end {
1335 if needle[i] != haystack[self.end - needle.len() + i] {
1336 self.end -= self.period;
1338 self.memory_back = self.period;
1344 // We have found a match!
1345 let match_pos = self.end - needle.len();
1346 // Note: sub self.period instead of needle.len() to have overlapping matches
1347 self.end -= needle.len();
1349 self.memory_back = needle.len();
1352 return S::matching(match_pos, match_pos + needle.len());
1356 // Compute the maximal suffix of `arr`.
1358 // The maximal suffix is a possible critical factorization (u, v) of `arr`.
1360 // Returns (`i`, `p`) where `i` is the starting index of v and `p` is the
1363 // `order_greater` determines if lexical order is `<` or `>`. Both
1364 // orders must be computed -- the ordering with the largest `i` gives
1365 // a critical factorization.
1367 // For long period cases, the resulting period is not exact (it is too short).
1369 fn maximal_suffix(arr: &[u8], order_greater: bool) -> (usize, usize) {
1370 let mut left = 0; // Corresponds to i in the paper
1371 let mut right = 1; // Corresponds to j in the paper
1372 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1373 // to match 0-based indexing.
1374 let mut period = 1; // Corresponds to p in the paper
1376 while let Some(&a) = arr.get(right + offset) {
1377 // `left` will be inbounds when `right` is.
1378 let b = arr[left + offset];
1379 if (a < b && !order_greater) || (a > b && order_greater) {
1380 // Suffix is smaller, period is entire prefix so far.
1381 right += offset + 1;
1383 period = right - left;
1385 // Advance through repetition of the current period.
1386 if offset + 1 == period {
1387 right += offset + 1;
1393 // Suffix is larger, start over from current location.
1403 // Compute the maximal suffix of the reverse of `arr`.
1405 // The maximal suffix is a possible critical factorization (u', v') of `arr`.
1407 // Returns `i` where `i` is the starting index of v', from the back;
1408 // returns immediately when a period of `known_period` is reached.
1410 // `order_greater` determines if lexical order is `<` or `>`. Both
1411 // orders must be computed -- the ordering with the largest `i` gives
1412 // a critical factorization.
1414 // For long period cases, the resulting period is not exact (it is too short).
1415 fn reverse_maximal_suffix(arr: &[u8], known_period: usize, order_greater: bool) -> usize {
1416 let mut left = 0; // Corresponds to i in the paper
1417 let mut right = 1; // Corresponds to j in the paper
1418 let mut offset = 0; // Corresponds to k in the paper, but starting at 0
1419 // to match 0-based indexing.
1420 let mut period = 1; // Corresponds to p in the paper
1423 while right + offset < n {
1424 let a = arr[n - (1 + right + offset)];
1425 let b = arr[n - (1 + left + offset)];
1426 if (a < b && !order_greater) || (a > b && order_greater) {
1427 // Suffix is smaller, period is entire prefix so far.
1428 right += offset + 1;
1430 period = right - left;
1432 // Advance through repetition of the current period.
1433 if offset + 1 == period {
1434 right += offset + 1;
1440 // Suffix is larger, start over from current location.
1446 if period == known_period {
1450 debug_assert!(period <= known_period);
1455 // TwoWayStrategy allows the algorithm to either skip non-matches as quickly
1456 // as possible, or to work in a mode where it emits Rejects relatively quickly.
1457 trait TwoWayStrategy {
1459 fn use_early_reject() -> bool;
1460 fn rejecting(a: usize, b: usize) -> Self::Output;
1461 fn matching(a: usize, b: usize) -> Self::Output;
1464 /// Skip to match intervals as quickly as possible
1467 impl TwoWayStrategy for MatchOnly {
1468 type Output = Option<(usize, usize)>;
1471 fn use_early_reject() -> bool {
1475 fn rejecting(_a: usize, _b: usize) -> Self::Output {
1479 fn matching(a: usize, b: usize) -> Self::Output {
1484 /// Emit Rejects regularly
1485 enum RejectAndMatch {}
1487 impl TwoWayStrategy for RejectAndMatch {
1488 type Output = SearchStep;
1491 fn use_early_reject() -> bool {
1495 fn rejecting(a: usize, b: usize) -> Self::Output {
1496 SearchStep::Reject(a, b)
1499 fn matching(a: usize, b: usize) -> Self::Output {
1500 SearchStep::Match(a, b)