grep_matcher/
lib.rs

1/*!
2This crate provides an interface for regular expressions, with a focus on line
3oriented search. The purpose of this crate is to provide a low level matching
4interface that permits any kind of substring or regex implementation to power
5the search routines provided by the
6[`grep-searcher`](https://docs.rs/grep-searcher)
7crate.
8
9The primary thing provided by this crate is the [`Matcher`] trait. The trait
10defines an abstract interface for text search. It is robust enough to support
11everything from basic substring search all the way to arbitrarily complex
12regular expression implementations without sacrificing performance.
13
14A key design decision made in this crate is the use of *internal iteration*,
15or otherwise known as the "push" model of searching. In this paradigm,
16implementations of the `Matcher` trait will drive search and execute callbacks
17provided by the caller when a match is found. This is in contrast to the
18usual style of *external iteration* (the "pull" model) found throughout the
19Rust ecosystem. There are two primary reasons why internal iteration was
20chosen:
21
22* Some search implementations may themselves require internal iteration.
23  Converting an internal iterator to an external iterator can be non-trivial
24  and sometimes even practically impossible.
25* Rust's type system isn't quite expressive enough to write a generic interface
26  using external iteration without giving something else up (namely, ease of
27  use and/or performance).
28
29In other words, internal iteration was chosen because it is the lowest common
30denominator and because it is probably the least bad way of expressing the
31interface in today's Rust. As a result, this trait isn't specifically intended
32for everyday use, although, you might find it to be a happy price to pay if you
33want to write code that is generic over multiple different regex
34implementations.
35*/
36
37#![deny(missing_docs)]
38
39use crate::interpolate::interpolate;
40
41mod interpolate;
42
43/// The type of a match.
44///
45/// The type of a match is a possibly empty range pointing to a contiguous
46/// block of addressable memory.
47///
48/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`.
49///
50/// # Indexing
51///
52/// This type is structurally identical to `std::ops::Range<usize>`, but
53/// is a bit more ergonomic for dealing with match indices. In particular,
54/// this type implements `Copy` and provides methods for building new `Match`
55/// values based on old `Match` values. Finally, the invariant that `start`
56/// is always less than or equal to `end` is enforced.
57///
58/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using
59/// range notation. e.g.,
60///
61/// ```
62/// use grep_matcher::Match;
63///
64/// let m = Match::new(2, 5);
65/// let bytes = b"abcdefghi";
66/// assert_eq!(b"cde", &bytes[m]);
67/// ```
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub struct Match {
70    start: usize,
71    end: usize,
72}
73
74impl Match {
75    /// Create a new match.
76    ///
77    /// # Panics
78    ///
79    /// This function panics if `start > end`.
80    #[inline]
81    pub fn new(start: usize, end: usize) -> Match {
82        assert!(start <= end);
83        Match { start, end }
84    }
85
86    /// Creates a zero width match at the given offset.
87    #[inline]
88    pub fn zero(offset: usize) -> Match {
89        Match { start: offset, end: offset }
90    }
91
92    /// Return the start offset of this match.
93    #[inline]
94    pub fn start(&self) -> usize {
95        self.start
96    }
97
98    /// Return the end offset of this match.
99    #[inline]
100    pub fn end(&self) -> usize {
101        self.end
102    }
103
104    /// Return a new match with the start offset replaced with the given
105    /// value.
106    ///
107    /// # Panics
108    ///
109    /// This method panics if `start > self.end`.
110    #[inline]
111    pub fn with_start(&self, start: usize) -> Match {
112        assert!(start <= self.end, "{} is not <= {}", start, self.end);
113        Match { start, ..*self }
114    }
115
116    /// Return a new match with the end offset replaced with the given
117    /// value.
118    ///
119    /// # Panics
120    ///
121    /// This method panics if `self.start > end`.
122    #[inline]
123    pub fn with_end(&self, end: usize) -> Match {
124        assert!(self.start <= end, "{} is not <= {}", self.start, end);
125        Match { end, ..*self }
126    }
127
128    /// Offset this match by the given amount and return a new match.
129    ///
130    /// This adds the given offset to the start and end of this match, and
131    /// returns the resulting match.
132    ///
133    /// # Panics
134    ///
135    /// This panics if adding the given amount to either the start or end
136    /// offset would result in an overflow.
137    #[inline]
138    pub fn offset(&self, amount: usize) -> Match {
139        Match {
140            start: self.start.checked_add(amount).unwrap(),
141            end: self.end.checked_add(amount).unwrap(),
142        }
143    }
144
145    /// Returns the number of bytes in this match.
146    #[inline]
147    pub fn len(&self) -> usize {
148        self.end - self.start
149    }
150
151    /// Returns true if and only if this match is empty.
152    #[inline]
153    pub fn is_empty(&self) -> bool {
154        self.len() == 0
155    }
156}
157
158impl std::ops::Index<Match> for [u8] {
159    type Output = [u8];
160
161    #[inline]
162    fn index(&self, index: Match) -> &[u8] {
163        &self[index.start..index.end]
164    }
165}
166
167impl std::ops::IndexMut<Match> for [u8] {
168    #[inline]
169    fn index_mut(&mut self, index: Match) -> &mut [u8] {
170        &mut self[index.start..index.end]
171    }
172}
173
174impl std::ops::Index<Match> for str {
175    type Output = str;
176
177    #[inline]
178    fn index(&self, index: Match) -> &str {
179        &self[index.start..index.end]
180    }
181}
182
183/// A line terminator.
184///
185/// A line terminator represents the end of a line. Generally, every line is
186/// either "terminated" by the end of a stream or a specific byte (or sequence
187/// of bytes).
188///
189/// Generally, a line terminator is a single byte, specifically, `\n`, on
190/// Unix-like systems. On Windows, a line terminator is `\r\n` (referred to
191/// as `CRLF` for `Carriage Return; Line Feed`).
192///
193/// The default line terminator is `\n` on all platforms.
194#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
195pub struct LineTerminator(LineTerminatorImp);
196
197#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
198enum LineTerminatorImp {
199    /// Any single byte representing a line terminator.
200    Byte(u8),
201    /// A line terminator represented by `\r\n`.
202    ///
203    /// When this option is used, consumers may generally treat a lone `\n` as
204    /// a line terminator in addition to `\r\n`.
205    CRLF,
206}
207
208impl LineTerminator {
209    /// Return a new single-byte line terminator. Any byte is valid.
210    #[inline]
211    pub fn byte(byte: u8) -> LineTerminator {
212        LineTerminator(LineTerminatorImp::Byte(byte))
213    }
214
215    /// Return a new line terminator represented by `\r\n`.
216    ///
217    /// When this option is used, consumers may generally treat a lone `\n` as
218    /// a line terminator in addition to `\r\n`.
219    #[inline]
220    pub fn crlf() -> LineTerminator {
221        LineTerminator(LineTerminatorImp::CRLF)
222    }
223
224    /// Returns true if and only if this line terminator is CRLF.
225    #[inline]
226    pub fn is_crlf(&self) -> bool {
227        self.0 == LineTerminatorImp::CRLF
228    }
229
230    /// Returns this line terminator as a single byte.
231    ///
232    /// If the line terminator is CRLF, then this returns `\n`. This is
233    /// useful for routines that, for example, find line boundaries by treating
234    /// `\n` as a line terminator even when it isn't preceded by `\r`.
235    #[inline]
236    pub fn as_byte(&self) -> u8 {
237        match self.0 {
238            LineTerminatorImp::Byte(byte) => byte,
239            LineTerminatorImp::CRLF => b'\n',
240        }
241    }
242
243    /// Returns this line terminator as a sequence of bytes.
244    ///
245    /// This returns a singleton sequence for all line terminators except for
246    /// `CRLF`, in which case, it returns `\r\n`.
247    ///
248    /// The slice returned is guaranteed to have length at least `1`.
249    #[inline]
250    pub fn as_bytes(&self) -> &[u8] {
251        match self.0 {
252            LineTerminatorImp::Byte(ref byte) => std::slice::from_ref(byte),
253            LineTerminatorImp::CRLF => &[b'\r', b'\n'],
254        }
255    }
256
257    /// Returns true if and only if the given slice ends with this line
258    /// terminator.
259    ///
260    /// If this line terminator is `CRLF`, then this only checks whether the
261    /// last byte is `\n`.
262    #[inline]
263    pub fn is_suffix(&self, slice: &[u8]) -> bool {
264        slice.last().map_or(false, |&b| b == self.as_byte())
265    }
266}
267
268impl Default for LineTerminator {
269    #[inline]
270    fn default() -> LineTerminator {
271        LineTerminator::byte(b'\n')
272    }
273}
274
275/// A set of bytes.
276///
277/// In this crate, byte sets are used to express bytes that can never appear
278/// anywhere in a match for a particular implementation of the `Matcher` trait.
279/// Specifically, if such a set can be determined, then it's possible for
280/// callers to perform additional operations on the basis that certain bytes
281/// may never match.
282///
283/// For example, if a search is configured to possibly produce results that
284/// span multiple lines but a caller provided pattern can never match across
285/// multiple lines, then it may make sense to divert to more optimized line
286/// oriented routines that don't need to handle the multi-line match case.
287#[derive(Clone, Debug)]
288pub struct ByteSet(BitSet);
289
290#[derive(Clone, Copy)]
291struct BitSet([u64; 4]);
292
293impl std::fmt::Debug for BitSet {
294    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
295        let mut fmtd = f.debug_set();
296        for b in 0..=255 {
297            if ByteSet(*self).contains(b) {
298                fmtd.entry(&b);
299            }
300        }
301        fmtd.finish()
302    }
303}
304
305impl ByteSet {
306    /// Create an empty set of bytes.
307    #[inline]
308    pub fn empty() -> ByteSet {
309        ByteSet(BitSet([0; 4]))
310    }
311
312    /// Create a full set of bytes such that every possible byte is in the set
313    /// returned.
314    #[inline]
315    pub fn full() -> ByteSet {
316        ByteSet(BitSet([u64::MAX; 4]))
317    }
318
319    /// Add a byte to this set.
320    ///
321    /// If the given byte already belongs to this set, then this is a no-op.
322    #[inline]
323    pub fn add(&mut self, byte: u8) {
324        let bucket = byte / 64;
325        let bit = byte % 64;
326        (self.0).0[usize::from(bucket)] |= 1 << bit;
327    }
328
329    /// Add an inclusive range of bytes.
330    #[inline]
331    pub fn add_all(&mut self, start: u8, end: u8) {
332        for b in start..=end {
333            self.add(b);
334        }
335    }
336
337    /// Remove a byte from this set.
338    ///
339    /// If the given byte is not in this set, then this is a no-op.
340    #[inline]
341    pub fn remove(&mut self, byte: u8) {
342        let bucket = byte / 64;
343        let bit = byte % 64;
344        (self.0).0[usize::from(bucket)] &= !(1 << bit);
345    }
346
347    /// Remove an inclusive range of bytes.
348    #[inline]
349    pub fn remove_all(&mut self, start: u8, end: u8) {
350        for b in start..=end {
351            self.remove(b);
352        }
353    }
354
355    /// Return true if and only if the given byte is in this set.
356    #[inline]
357    pub fn contains(&self, byte: u8) -> bool {
358        let bucket = byte / 64;
359        let bit = byte % 64;
360        (self.0).0[usize::from(bucket)] & (1 << bit) > 0
361    }
362}
363
364/// A trait that describes implementations of capturing groups.
365///
366/// When a matcher supports capturing group extraction, then it is the
367/// matcher's responsibility to provide an implementation of this trait.
368///
369/// Principally, this trait provides a way to access capturing groups
370/// in a uniform way that does not require any specific representation.
371/// Namely, different matcher implementations may require different in-memory
372/// representations of capturing groups. This trait permits matchers to
373/// maintain their specific in-memory representation.
374///
375/// Note that this trait explicitly does not provide a way to construct a new
376/// capture value. Instead, it is the responsibility of a `Matcher` to build
377/// one, which might require knowledge of the matcher's internal implementation
378/// details.
379pub trait Captures {
380    /// Return the total number of capturing groups. This includes capturing
381    /// groups that have not matched anything.
382    fn len(&self) -> usize;
383
384    /// Return the capturing group match at the given index. If no match of
385    /// that capturing group exists, then this returns `None`.
386    ///
387    /// When a matcher reports a match with capturing groups, then the first
388    /// capturing group (at index `0`) must always correspond to the offsets
389    /// for the overall match.
390    fn get(&self, i: usize) -> Option<Match>;
391
392    /// Returns true if and only if these captures are empty. This occurs
393    /// when `len` is `0`.
394    ///
395    /// Note that capturing groups that have non-zero length but otherwise
396    /// contain no matching groups are *not* empty.
397    #[inline]
398    fn is_empty(&self) -> bool {
399        self.len() == 0
400    }
401
402    /// Expands all instances of `$name` in `replacement` to the corresponding
403    /// capture group `name`, and writes them to the `dst` buffer given.
404    ///
405    /// (Note: If you're looking for a convenient way to perform replacements
406    /// with interpolation, then you'll want to use the `replace_with_captures`
407    /// method on the `Matcher` trait.)
408    ///
409    /// `name` may be an integer corresponding to the index of the
410    /// capture group (counted by order of opening parenthesis where `0` is the
411    /// entire match) or it can be a name (consisting of letters, digits or
412    /// underscores) corresponding to a named capture group.
413    ///
414    /// A `name` is translated to a capture group index via the given
415    /// `name_to_index` function. If `name` isn't a valid capture group
416    /// (whether the name doesn't exist or isn't a valid index), then it is
417    /// replaced with the empty string.
418    ///
419    /// The longest possible name is used. e.g., `$1a` looks up the capture
420    /// group named `1a` and not the capture group at index `1`. To exert
421    /// more precise control over the name, use braces, e.g., `${1}a`. In all
422    /// cases, capture group names are limited to ASCII letters, numbers and
423    /// underscores.
424    ///
425    /// To write a literal `$` use `$$`.
426    ///
427    /// Note that the capture group match indices are resolved by slicing
428    /// the given `haystack`. Generally, this means that `haystack` should be
429    /// the same slice that was searched to get the current capture group
430    /// matches.
431    #[inline]
432    fn interpolate<F>(
433        &self,
434        name_to_index: F,
435        haystack: &[u8],
436        replacement: &[u8],
437        dst: &mut Vec<u8>,
438    ) where
439        F: FnMut(&str) -> Option<usize>,
440    {
441        interpolate(
442            replacement,
443            |i, dst| {
444                if let Some(range) = self.get(i) {
445                    dst.extend(&haystack[range]);
446                }
447            },
448            name_to_index,
449            dst,
450        )
451    }
452}
453
454/// NoCaptures provides an always-empty implementation of the `Captures` trait.
455///
456/// This type is useful for implementations of `Matcher` that don't support
457/// capturing groups.
458#[derive(Clone, Debug)]
459pub struct NoCaptures(());
460
461impl NoCaptures {
462    /// Create an empty set of capturing groups.
463    #[inline]
464    pub fn new() -> NoCaptures {
465        NoCaptures(())
466    }
467}
468
469impl Captures for NoCaptures {
470    #[inline]
471    fn len(&self) -> usize {
472        0
473    }
474
475    #[inline]
476    fn get(&self, _: usize) -> Option<Match> {
477        None
478    }
479}
480
481/// NoError provides an error type for matchers that never produce errors.
482///
483/// This error type implements the `std::error::Error` and `std::fmt::Display`
484/// traits for use in matcher implementations that can never produce errors.
485///
486/// The `std::fmt::Debug` and `std::fmt::Display` impls for this type panics.
487#[derive(Debug, Eq, PartialEq)]
488pub struct NoError(());
489
490impl std::error::Error for NoError {
491    fn description(&self) -> &str {
492        "no error"
493    }
494}
495
496impl std::fmt::Display for NoError {
497    fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
498        panic!("BUG for NoError: an impossible error occurred")
499    }
500}
501
502impl From<NoError> for std::io::Error {
503    fn from(_: NoError) -> std::io::Error {
504        panic!("BUG for NoError: an impossible error occurred")
505    }
506}
507
508/// The type of match for a line oriented matcher.
509#[derive(Clone, Copy, Debug)]
510pub enum LineMatchKind {
511    /// A position inside a line that is known to contain a match.
512    ///
513    /// This position can be anywhere in the line. It does not need to point
514    /// at the location of the match.
515    Confirmed(usize),
516    /// A position inside a line that may contain a match, and must be searched
517    /// for verification.
518    ///
519    /// This position can be anywhere in the line. It does not need to point
520    /// at the location of the match.
521    Candidate(usize),
522}
523
524/// A matcher defines an interface for regular expression implementations.
525///
526/// While this trait is large, there are only two required methods that
527/// implementors must provide: `find_at` and `new_captures`. If captures aren't
528/// supported by your implementation, then `new_captures` can be implemented
529/// with [`NoCaptures`]. If your implementation does support capture groups,
530/// then you should also implement the other capture related methods, as
531/// dictated by the documentation. Crucially, this includes `captures_at`.
532///
533/// The rest of the methods on this trait provide default implementations on
534/// top of `find_at` and `new_captures`. It is not uncommon for implementations
535/// to be able to provide faster variants of some methods; in those cases,
536/// simply override the default implementation.
537pub trait Matcher {
538    /// The concrete type of capturing groups used for this matcher.
539    ///
540    /// If this implementation does not support capturing groups, then set
541    /// this to `NoCaptures`.
542    type Captures: Captures;
543
544    /// The error type used by this matcher.
545    ///
546    /// For matchers in which an error is not possible, they are encouraged to
547    /// use the `NoError` type in this crate. In the future, when the "never"
548    /// (spelled `!`) type is stabilized, then it should probably be used
549    /// instead.
550    type Error: std::fmt::Display;
551
552    /// Returns the start and end byte range of the first match in `haystack`
553    /// after `at`, where the byte offsets are relative to that start of
554    /// `haystack` (and not `at`). If no match exists, then `None` is returned.
555    ///
556    /// The text encoding of `haystack` is not strictly specified. Matchers are
557    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
558    ///
559    /// The significance of the starting point is that it takes the surrounding
560    /// context into consideration. For example, the `\A` anchor can only
561    /// match when `at == 0`.
562    fn find_at(
563        &self,
564        haystack: &[u8],
565        at: usize,
566    ) -> Result<Option<Match>, Self::Error>;
567
568    /// Creates an empty group of captures suitable for use with the capturing
569    /// APIs of this trait.
570    ///
571    /// Implementations that don't support capturing groups should use
572    /// the `NoCaptures` type and implement this method by calling
573    /// `NoCaptures::new()`.
574    fn new_captures(&self) -> Result<Self::Captures, Self::Error>;
575
576    /// Returns the total number of capturing groups in this matcher.
577    ///
578    /// If a matcher supports capturing groups, then this value must always be
579    /// at least 1, where the first capturing group always corresponds to the
580    /// overall match.
581    ///
582    /// If a matcher does not support capturing groups, then this should
583    /// always return 0.
584    ///
585    /// By default, capturing groups are not supported, so this always
586    /// returns 0.
587    #[inline]
588    fn capture_count(&self) -> usize {
589        0
590    }
591
592    /// Maps the given capture group name to its corresponding capture group
593    /// index, if one exists. If one does not exist, then `None` is returned.
594    ///
595    /// If the given capture group name maps to multiple indices, then it is
596    /// not specified which one is returned. However, it is guaranteed that
597    /// one of them is returned.
598    ///
599    /// By default, capturing groups are not supported, so this always returns
600    /// `None`.
601    #[inline]
602    fn capture_index(&self, _name: &str) -> Option<usize> {
603        None
604    }
605
606    /// Returns the start and end byte range of the first match in `haystack`.
607    /// If no match exists, then `None` is returned.
608    ///
609    /// The text encoding of `haystack` is not strictly specified. Matchers are
610    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
611    #[inline]
612    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
613        self.find_at(haystack, 0)
614    }
615
616    /// Executes the given function over successive non-overlapping matches
617    /// in `haystack`. If no match exists, then the given function is never
618    /// called. If the function returns `false`, then iteration stops.
619    #[inline]
620    fn find_iter<F>(
621        &self,
622        haystack: &[u8],
623        matched: F,
624    ) -> Result<(), Self::Error>
625    where
626        F: FnMut(Match) -> bool,
627    {
628        self.find_iter_at(haystack, 0, matched)
629    }
630
631    /// Executes the given function over successive non-overlapping matches
632    /// in `haystack`. If no match exists, then the given function is never
633    /// called. If the function returns `false`, then iteration stops.
634    ///
635    /// The significance of the starting point is that it takes the surrounding
636    /// context into consideration. For example, the `\A` anchor can only
637    /// match when `at == 0`.
638    #[inline]
639    fn find_iter_at<F>(
640        &self,
641        haystack: &[u8],
642        at: usize,
643        mut matched: F,
644    ) -> Result<(), Self::Error>
645    where
646        F: FnMut(Match) -> bool,
647    {
648        self.try_find_iter_at(haystack, at, |m| Ok(matched(m)))
649            .map(|r: Result<(), ()>| r.unwrap())
650    }
651
652    /// Executes the given function over successive non-overlapping matches
653    /// in `haystack`. If no match exists, then the given function is never
654    /// called. If the function returns `false`, then iteration stops.
655    /// Similarly, if the function returns an error then iteration stops and
656    /// the error is yielded. If an error occurs while executing the search,
657    /// then it is converted to
658    /// `E`.
659    #[inline]
660    fn try_find_iter<F, E>(
661        &self,
662        haystack: &[u8],
663        matched: F,
664    ) -> Result<Result<(), E>, Self::Error>
665    where
666        F: FnMut(Match) -> Result<bool, E>,
667    {
668        self.try_find_iter_at(haystack, 0, matched)
669    }
670
671    /// Executes the given function over successive non-overlapping matches
672    /// in `haystack`. If no match exists, then the given function is never
673    /// called. If the function returns `false`, then iteration stops.
674    /// Similarly, if the function returns an error then iteration stops and
675    /// the error is yielded. If an error occurs while executing the search,
676    /// then it is converted to
677    /// `E`.
678    ///
679    /// The significance of the starting point is that it takes the surrounding
680    /// context into consideration. For example, the `\A` anchor can only
681    /// match when `at == 0`.
682    #[inline]
683    fn try_find_iter_at<F, E>(
684        &self,
685        haystack: &[u8],
686        at: usize,
687        mut matched: F,
688    ) -> Result<Result<(), E>, Self::Error>
689    where
690        F: FnMut(Match) -> Result<bool, E>,
691    {
692        let mut last_end = at;
693        let mut last_match = None;
694
695        loop {
696            if last_end > haystack.len() {
697                return Ok(Ok(()));
698            }
699            let m = match self.find_at(haystack, last_end)? {
700                None => return Ok(Ok(())),
701                Some(m) => m,
702            };
703            if m.start == m.end {
704                // This is an empty match. To ensure we make progress, start
705                // the next search at the smallest possible starting position
706                // of the next match following this one.
707                last_end = m.end + 1;
708                // Don't accept empty matches immediately following a match.
709                // Just move on to the next match.
710                if Some(m.end) == last_match {
711                    continue;
712                }
713            } else {
714                last_end = m.end;
715            }
716            last_match = Some(m.end);
717            match matched(m) {
718                Ok(true) => continue,
719                Ok(false) => return Ok(Ok(())),
720                Err(err) => return Ok(Err(err)),
721            }
722        }
723    }
724
725    /// Populates the first set of capture group matches from `haystack` into
726    /// `caps`. If no match exists, then `false` is returned.
727    ///
728    /// The text encoding of `haystack` is not strictly specified. Matchers are
729    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
730    #[inline]
731    fn captures(
732        &self,
733        haystack: &[u8],
734        caps: &mut Self::Captures,
735    ) -> Result<bool, Self::Error> {
736        self.captures_at(haystack, 0, caps)
737    }
738
739    /// Executes the given function over successive non-overlapping matches
740    /// in `haystack` with capture groups extracted from each match. If no
741    /// match exists, then the given function is never called. If the function
742    /// returns `false`, then iteration stops.
743    #[inline]
744    fn captures_iter<F>(
745        &self,
746        haystack: &[u8],
747        caps: &mut Self::Captures,
748        matched: F,
749    ) -> Result<(), Self::Error>
750    where
751        F: FnMut(&Self::Captures) -> bool,
752    {
753        self.captures_iter_at(haystack, 0, caps, matched)
754    }
755
756    /// Executes the given function over successive non-overlapping matches
757    /// in `haystack` with capture groups extracted from each match. If no
758    /// match exists, then the given function is never called. If the function
759    /// returns `false`, then iteration stops.
760    ///
761    /// The significance of the starting point is that it takes the surrounding
762    /// context into consideration. For example, the `\A` anchor can only
763    /// match when `at == 0`.
764    #[inline]
765    fn captures_iter_at<F>(
766        &self,
767        haystack: &[u8],
768        at: usize,
769        caps: &mut Self::Captures,
770        mut matched: F,
771    ) -> Result<(), Self::Error>
772    where
773        F: FnMut(&Self::Captures) -> bool,
774    {
775        self.try_captures_iter_at(haystack, at, caps, |caps| Ok(matched(caps)))
776            .map(|r: Result<(), ()>| r.unwrap())
777    }
778
779    /// Executes the given function over successive non-overlapping matches
780    /// in `haystack` with capture groups extracted from each match. If no
781    /// match exists, then the given function is never called. If the function
782    /// returns `false`, then iteration stops. Similarly, if the function
783    /// returns an error then iteration stops and the error is yielded. If
784    /// an error occurs while executing the search, then it is converted to
785    /// `E`.
786    #[inline]
787    fn try_captures_iter<F, E>(
788        &self,
789        haystack: &[u8],
790        caps: &mut Self::Captures,
791        matched: F,
792    ) -> Result<Result<(), E>, Self::Error>
793    where
794        F: FnMut(&Self::Captures) -> Result<bool, E>,
795    {
796        self.try_captures_iter_at(haystack, 0, caps, matched)
797    }
798
799    /// Executes the given function over successive non-overlapping matches
800    /// in `haystack` with capture groups extracted from each match. If no
801    /// match exists, then the given function is never called. If the function
802    /// returns `false`, then iteration stops. Similarly, if the function
803    /// returns an error then iteration stops and the error is yielded. If
804    /// an error occurs while executing the search, then it is converted to
805    /// `E`.
806    ///
807    /// The significance of the starting point is that it takes the surrounding
808    /// context into consideration. For example, the `\A` anchor can only
809    /// match when `at == 0`.
810    #[inline]
811    fn try_captures_iter_at<F, E>(
812        &self,
813        haystack: &[u8],
814        at: usize,
815        caps: &mut Self::Captures,
816        mut matched: F,
817    ) -> Result<Result<(), E>, Self::Error>
818    where
819        F: FnMut(&Self::Captures) -> Result<bool, E>,
820    {
821        let mut last_end = at;
822        let mut last_match = None;
823
824        loop {
825            if last_end > haystack.len() {
826                return Ok(Ok(()));
827            }
828            if !self.captures_at(haystack, last_end, caps)? {
829                return Ok(Ok(()));
830            }
831            let m = caps.get(0).unwrap();
832            if m.start == m.end {
833                // This is an empty match. To ensure we make progress, start
834                // the next search at the smallest possible starting position
835                // of the next match following this one.
836                last_end = m.end + 1;
837                // Don't accept empty matches immediately following a match.
838                // Just move on to the next match.
839                if Some(m.end) == last_match {
840                    continue;
841                }
842            } else {
843                last_end = m.end;
844            }
845            last_match = Some(m.end);
846            match matched(caps) {
847                Ok(true) => continue,
848                Ok(false) => return Ok(Ok(())),
849                Err(err) => return Ok(Err(err)),
850            }
851        }
852    }
853
854    /// Populates the first set of capture group matches from `haystack`
855    /// into `matches` after `at`, where the byte offsets in each capturing
856    /// group are relative to the start of `haystack` (and not `at`). If no
857    /// match exists, then `false` is returned and the contents of the given
858    /// capturing groups are unspecified.
859    ///
860    /// The text encoding of `haystack` is not strictly specified. Matchers are
861    /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
862    ///
863    /// The significance of the starting point is that it takes the surrounding
864    /// context into consideration. For example, the `\A` anchor can only
865    /// match when `at == 0`.
866    ///
867    /// By default, capturing groups aren't supported, and this implementation
868    /// will always behave as if a match were impossible.
869    ///
870    /// Implementors that provide support for capturing groups must guarantee
871    /// that when a match occurs, the first capture match (at index `0`) is
872    /// always set to the overall match offsets.
873    ///
874    /// Note that if implementors seek to support capturing groups, then they
875    /// should implement this method. Other methods that match based on
876    /// captures will then work automatically.
877    #[inline]
878    fn captures_at(
879        &self,
880        _haystack: &[u8],
881        _at: usize,
882        _caps: &mut Self::Captures,
883    ) -> Result<bool, Self::Error> {
884        Ok(false)
885    }
886
887    /// Replaces every match in the given haystack with the result of calling
888    /// `append`. `append` is given the start and end of a match, along with
889    /// a handle to the `dst` buffer provided.
890    ///
891    /// If the given `append` function returns `false`, then replacement stops.
892    #[inline]
893    fn replace<F>(
894        &self,
895        haystack: &[u8],
896        dst: &mut Vec<u8>,
897        mut append: F,
898    ) -> Result<(), Self::Error>
899    where
900        F: FnMut(Match, &mut Vec<u8>) -> bool,
901    {
902        let mut last_match = 0;
903        self.find_iter(haystack, |m| {
904            dst.extend(&haystack[last_match..m.start]);
905            last_match = m.end;
906            append(m, dst)
907        })?;
908        dst.extend(&haystack[last_match..]);
909        Ok(())
910    }
911
912    /// Replaces every match in the given haystack with the result of calling
913    /// `append` with the matching capture groups.
914    ///
915    /// If the given `append` function returns `false`, then replacement stops.
916    #[inline]
917    fn replace_with_captures<F>(
918        &self,
919        haystack: &[u8],
920        caps: &mut Self::Captures,
921        dst: &mut Vec<u8>,
922        append: F,
923    ) -> Result<(), Self::Error>
924    where
925        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
926    {
927        self.replace_with_captures_at(haystack, 0, caps, dst, append)
928    }
929
930    /// Replaces every match in the given haystack with the result of calling
931    /// `append` with the matching capture groups.
932    ///
933    /// If the given `append` function returns `false`, then replacement stops.
934    ///
935    /// The significance of the starting point is that it takes the surrounding
936    /// context into consideration. For example, the `\A` anchor can only
937    /// match when `at == 0`.
938    #[inline]
939    fn replace_with_captures_at<F>(
940        &self,
941        haystack: &[u8],
942        at: usize,
943        caps: &mut Self::Captures,
944        dst: &mut Vec<u8>,
945        mut append: F,
946    ) -> Result<(), Self::Error>
947    where
948        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
949    {
950        let mut last_match = at;
951        self.captures_iter_at(haystack, at, caps, |caps| {
952            let m = caps.get(0).unwrap();
953            dst.extend(&haystack[last_match..m.start]);
954            last_match = m.end;
955            append(caps, dst)
956        })?;
957        dst.extend(&haystack[last_match..]);
958        Ok(())
959    }
960
961    /// Returns true if and only if the matcher matches the given haystack.
962    ///
963    /// By default, this method is implemented by calling `shortest_match`.
964    #[inline]
965    fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
966        self.is_match_at(haystack, 0)
967    }
968
969    /// Returns true if and only if the matcher matches the given haystack
970    /// starting at the given position.
971    ///
972    /// By default, this method is implemented by calling `shortest_match_at`.
973    ///
974    /// The significance of the starting point is that it takes the surrounding
975    /// context into consideration. For example, the `\A` anchor can only
976    /// match when `at == 0`.
977    #[inline]
978    fn is_match_at(
979        &self,
980        haystack: &[u8],
981        at: usize,
982    ) -> Result<bool, Self::Error> {
983        Ok(self.shortest_match_at(haystack, at)?.is_some())
984    }
985
986    /// Returns an end location of the first match in `haystack`. If no match
987    /// exists, then `None` is returned.
988    ///
989    /// Note that the end location reported by this method may be less than the
990    /// same end location reported by `find`. For example, running `find` with
991    /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
992    /// 3)`, but `shortest_match` may report `1` as the ending location since
993    /// that is the place at which a match is guaranteed to occur.
994    ///
995    /// This method should never report false positives or false negatives. The
996    /// point of this method is that some implementors may be able to provide
997    /// a faster implementation of this than what `find` does.
998    ///
999    /// By default, this method is implemented by calling `find`.
1000    #[inline]
1001    fn shortest_match(
1002        &self,
1003        haystack: &[u8],
1004    ) -> Result<Option<usize>, Self::Error> {
1005        self.shortest_match_at(haystack, 0)
1006    }
1007
1008    /// Returns an end location of the first match in `haystack` starting at
1009    /// the given position. If no match exists, then `None` is returned.
1010    ///
1011    /// Note that the end location reported by this method may be less than the
1012    /// same end location reported by `find`. For example, running `find` with
1013    /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
1014    /// 3)`, but `shortest_match` may report `1` as the ending location since
1015    /// that is the place at which a match is guaranteed to occur.
1016    ///
1017    /// This method should never report false positives or false negatives. The
1018    /// point of this method is that some implementors may be able to provide
1019    /// a faster implementation of this than what `find` does.
1020    ///
1021    /// By default, this method is implemented by calling `find_at`.
1022    ///
1023    /// The significance of the starting point is that it takes the surrounding
1024    /// context into consideration. For example, the `\A` anchor can only
1025    /// match when `at == 0`.
1026    #[inline]
1027    fn shortest_match_at(
1028        &self,
1029        haystack: &[u8],
1030        at: usize,
1031    ) -> Result<Option<usize>, Self::Error> {
1032        Ok(self.find_at(haystack, at)?.map(|m| m.end))
1033    }
1034
1035    /// If available, return a set of bytes that will never appear in a match
1036    /// produced by an implementation.
1037    ///
1038    /// Specifically, if such a set can be determined, then it's possible for
1039    /// callers to perform additional operations on the basis that certain
1040    /// bytes may never match.
1041    ///
1042    /// For example, if a search is configured to possibly produce results
1043    /// that span multiple lines but a caller provided pattern can never
1044    /// match across multiple lines, then it may make sense to divert to
1045    /// more optimized line oriented routines that don't need to handle the
1046    /// multi-line match case.
1047    ///
1048    /// Implementations that produce this set must never report false
1049    /// positives, but may produce false negatives. That is, is a byte is in
1050    /// this set then it must be guaranteed that it is never in a match. But,
1051    /// if a byte is not in this set, then callers cannot assume that a match
1052    /// exists with that byte.
1053    ///
1054    /// By default, this returns `None`.
1055    #[inline]
1056    fn non_matching_bytes(&self) -> Option<&ByteSet> {
1057        None
1058    }
1059
1060    /// If this matcher was compiled as a line oriented matcher, then this
1061    /// method returns the line terminator if and only if the line terminator
1062    /// never appears in any match produced by this matcher. If this wasn't
1063    /// compiled as a line oriented matcher, or if the aforementioned guarantee
1064    /// cannot be made, then this must return `None`, which is the default.
1065    /// It is **never wrong** to return `None`, but returning a line terminator
1066    /// when it can appear in a match results in unspecified behavior.
1067    ///
1068    /// The line terminator is typically `b'\n'`, but can be any single byte or
1069    /// `CRLF`.
1070    ///
1071    /// By default, this returns `None`.
1072    #[inline]
1073    fn line_terminator(&self) -> Option<LineTerminator> {
1074        None
1075    }
1076
1077    /// Return one of the following: a confirmed line match, a candidate line
1078    /// match (which may be a false positive) or no match at all (which **must
1079    /// not** be a false negative). When reporting a confirmed or candidate
1080    /// match, the position returned can be any position in the line.
1081    ///
1082    /// By default, this never returns a candidate match, and always either
1083    /// returns a confirmed match or no match at all.
1084    ///
1085    /// When a matcher can match spans over multiple lines, then the behavior
1086    /// of this method is unspecified. Namely, use of this method only
1087    /// makes sense in a context where the caller is looking for the next
1088    /// matching line. That is, callers should only use this method when
1089    /// `line_terminator` does not return `None`.
1090    ///
1091    /// # Design rationale
1092    ///
1093    /// A line matcher is, fundamentally, a normal matcher with the addition
1094    /// of one optional method: finding a line. By default, this routine
1095    /// is implemented via the matcher's `shortest_match` method, which
1096    /// always yields either no match or a `LineMatchKind::Confirmed`. However,
1097    /// implementors may provide a routine for this that can return candidate
1098    /// lines that need subsequent verification to be confirmed as a match.
1099    /// This can be useful in cases where it may be quicker to find candidate
1100    /// lines via some other means instead of relying on the more general
1101    /// implementations for `find` and `shortest_match`.
1102    ///
1103    /// For example, consider the regex `\w+foo\s+`. Both `find` and
1104    /// `shortest_match` must consider the entire regex, including the `\w+`
1105    /// and `\s+`, while searching. However, this method could look for lines
1106    /// containing `foo` and return them as candidates. Finding `foo` might
1107    /// be implemented as a highly optimized substring search routine (like
1108    /// `memmem`), which is likely to be faster than whatever more generalized
1109    /// routine is required for resolving `\w+foo\s+`. The caller is then
1110    /// responsible for confirming whether a match exists or not.
1111    ///
1112    /// Note that while this method may report false positives, it must never
1113    /// report false negatives. That is, it can never skip over lines that
1114    /// contain a match.
1115    #[inline]
1116    fn find_candidate_line(
1117        &self,
1118        haystack: &[u8],
1119    ) -> Result<Option<LineMatchKind>, Self::Error> {
1120        Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
1121    }
1122}
1123
1124impl<'a, M: Matcher> Matcher for &'a M {
1125    type Captures = M::Captures;
1126    type Error = M::Error;
1127
1128    #[inline]
1129    fn find_at(
1130        &self,
1131        haystack: &[u8],
1132        at: usize,
1133    ) -> Result<Option<Match>, Self::Error> {
1134        (*self).find_at(haystack, at)
1135    }
1136
1137    #[inline]
1138    fn new_captures(&self) -> Result<Self::Captures, Self::Error> {
1139        (*self).new_captures()
1140    }
1141
1142    #[inline]
1143    fn captures_at(
1144        &self,
1145        haystack: &[u8],
1146        at: usize,
1147        caps: &mut Self::Captures,
1148    ) -> Result<bool, Self::Error> {
1149        (*self).captures_at(haystack, at, caps)
1150    }
1151
1152    #[inline]
1153    fn capture_index(&self, name: &str) -> Option<usize> {
1154        (*self).capture_index(name)
1155    }
1156
1157    #[inline]
1158    fn capture_count(&self) -> usize {
1159        (*self).capture_count()
1160    }
1161
1162    #[inline]
1163    fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
1164        (*self).find(haystack)
1165    }
1166
1167    #[inline]
1168    fn find_iter<F>(
1169        &self,
1170        haystack: &[u8],
1171        matched: F,
1172    ) -> Result<(), Self::Error>
1173    where
1174        F: FnMut(Match) -> bool,
1175    {
1176        (*self).find_iter(haystack, matched)
1177    }
1178
1179    #[inline]
1180    fn find_iter_at<F>(
1181        &self,
1182        haystack: &[u8],
1183        at: usize,
1184        matched: F,
1185    ) -> Result<(), Self::Error>
1186    where
1187        F: FnMut(Match) -> bool,
1188    {
1189        (*self).find_iter_at(haystack, at, matched)
1190    }
1191
1192    #[inline]
1193    fn try_find_iter<F, E>(
1194        &self,
1195        haystack: &[u8],
1196        matched: F,
1197    ) -> Result<Result<(), E>, Self::Error>
1198    where
1199        F: FnMut(Match) -> Result<bool, E>,
1200    {
1201        (*self).try_find_iter(haystack, matched)
1202    }
1203
1204    #[inline]
1205    fn try_find_iter_at<F, E>(
1206        &self,
1207        haystack: &[u8],
1208        at: usize,
1209        matched: F,
1210    ) -> Result<Result<(), E>, Self::Error>
1211    where
1212        F: FnMut(Match) -> Result<bool, E>,
1213    {
1214        (*self).try_find_iter_at(haystack, at, matched)
1215    }
1216
1217    #[inline]
1218    fn captures(
1219        &self,
1220        haystack: &[u8],
1221        caps: &mut Self::Captures,
1222    ) -> Result<bool, Self::Error> {
1223        (*self).captures(haystack, caps)
1224    }
1225
1226    #[inline]
1227    fn captures_iter<F>(
1228        &self,
1229        haystack: &[u8],
1230        caps: &mut Self::Captures,
1231        matched: F,
1232    ) -> Result<(), Self::Error>
1233    where
1234        F: FnMut(&Self::Captures) -> bool,
1235    {
1236        (*self).captures_iter(haystack, caps, matched)
1237    }
1238
1239    #[inline]
1240    fn captures_iter_at<F>(
1241        &self,
1242        haystack: &[u8],
1243        at: usize,
1244        caps: &mut Self::Captures,
1245        matched: F,
1246    ) -> Result<(), Self::Error>
1247    where
1248        F: FnMut(&Self::Captures) -> bool,
1249    {
1250        (*self).captures_iter_at(haystack, at, caps, matched)
1251    }
1252
1253    #[inline]
1254    fn try_captures_iter<F, E>(
1255        &self,
1256        haystack: &[u8],
1257        caps: &mut Self::Captures,
1258        matched: F,
1259    ) -> Result<Result<(), E>, Self::Error>
1260    where
1261        F: FnMut(&Self::Captures) -> Result<bool, E>,
1262    {
1263        (*self).try_captures_iter(haystack, caps, matched)
1264    }
1265
1266    #[inline]
1267    fn try_captures_iter_at<F, E>(
1268        &self,
1269        haystack: &[u8],
1270        at: usize,
1271        caps: &mut Self::Captures,
1272        matched: F,
1273    ) -> Result<Result<(), E>, Self::Error>
1274    where
1275        F: FnMut(&Self::Captures) -> Result<bool, E>,
1276    {
1277        (*self).try_captures_iter_at(haystack, at, caps, matched)
1278    }
1279
1280    #[inline]
1281    fn replace<F>(
1282        &self,
1283        haystack: &[u8],
1284        dst: &mut Vec<u8>,
1285        append: F,
1286    ) -> Result<(), Self::Error>
1287    where
1288        F: FnMut(Match, &mut Vec<u8>) -> bool,
1289    {
1290        (*self).replace(haystack, dst, append)
1291    }
1292
1293    #[inline]
1294    fn replace_with_captures<F>(
1295        &self,
1296        haystack: &[u8],
1297        caps: &mut Self::Captures,
1298        dst: &mut Vec<u8>,
1299        append: F,
1300    ) -> Result<(), Self::Error>
1301    where
1302        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1303    {
1304        (*self).replace_with_captures(haystack, caps, dst, append)
1305    }
1306
1307    #[inline]
1308    fn replace_with_captures_at<F>(
1309        &self,
1310        haystack: &[u8],
1311        at: usize,
1312        caps: &mut Self::Captures,
1313        dst: &mut Vec<u8>,
1314        append: F,
1315    ) -> Result<(), Self::Error>
1316    where
1317        F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1318    {
1319        (*self).replace_with_captures_at(haystack, at, caps, dst, append)
1320    }
1321
1322    #[inline]
1323    fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
1324        (*self).is_match(haystack)
1325    }
1326
1327    #[inline]
1328    fn is_match_at(
1329        &self,
1330        haystack: &[u8],
1331        at: usize,
1332    ) -> Result<bool, Self::Error> {
1333        (*self).is_match_at(haystack, at)
1334    }
1335
1336    #[inline]
1337    fn shortest_match(
1338        &self,
1339        haystack: &[u8],
1340    ) -> Result<Option<usize>, Self::Error> {
1341        (*self).shortest_match(haystack)
1342    }
1343
1344    #[inline]
1345    fn shortest_match_at(
1346        &self,
1347        haystack: &[u8],
1348        at: usize,
1349    ) -> Result<Option<usize>, Self::Error> {
1350        (*self).shortest_match_at(haystack, at)
1351    }
1352
1353    #[inline]
1354    fn non_matching_bytes(&self) -> Option<&ByteSet> {
1355        (*self).non_matching_bytes()
1356    }
1357
1358    #[inline]
1359    fn line_terminator(&self) -> Option<LineTerminator> {
1360        (*self).line_terminator()
1361    }
1362
1363    #[inline]
1364    fn find_candidate_line(
1365        &self,
1366        haystack: &[u8],
1367    ) -> Result<Option<LineMatchKind>, Self::Error> {
1368        (*self).find_candidate_line(haystack)
1369    }
1370}