grep_matcher/lib.rs
1/*!
2This crate provides an interface for regular expressions, with a focus on line
3oriented search. The purpose of this crate is to provide a low level matching
4interface that permits any kind of substring or regex implementation to power
5the search routines provided by the
6[`grep-searcher`](https://docs.rs/grep-searcher)
7crate.
8
9The primary thing provided by this crate is the [`Matcher`] trait. The trait
10defines an abstract interface for text search. It is robust enough to support
11everything from basic substring search all the way to arbitrarily complex
12regular expression implementations without sacrificing performance.
13
14A key design decision made in this crate is the use of *internal iteration*,
15or otherwise known as the "push" model of searching. In this paradigm,
16implementations of the `Matcher` trait will drive search and execute callbacks
17provided by the caller when a match is found. This is in contrast to the
18usual style of *external iteration* (the "pull" model) found throughout the
19Rust ecosystem. There are two primary reasons why internal iteration was
20chosen:
21
22* Some search implementations may themselves require internal iteration.
23 Converting an internal iterator to an external iterator can be non-trivial
24 and sometimes even practically impossible.
25* Rust's type system isn't quite expressive enough to write a generic interface
26 using external iteration without giving something else up (namely, ease of
27 use and/or performance).
28
29In other words, internal iteration was chosen because it is the lowest common
30denominator and because it is probably the least bad way of expressing the
31interface in today's Rust. As a result, this trait isn't specifically intended
32for everyday use, although, you might find it to be a happy price to pay if you
33want to write code that is generic over multiple different regex
34implementations.
35*/
36
37#![deny(missing_docs)]
38
39use crate::interpolate::interpolate;
40
41mod interpolate;
42
43/// The type of a match.
44///
45/// The type of a match is a possibly empty range pointing to a contiguous
46/// block of addressable memory.
47///
48/// Every `Match` is guaranteed to satisfy the invariant that `start <= end`.
49///
50/// # Indexing
51///
52/// This type is structurally identical to `std::ops::Range<usize>`, but
53/// is a bit more ergonomic for dealing with match indices. In particular,
54/// this type implements `Copy` and provides methods for building new `Match`
55/// values based on old `Match` values. Finally, the invariant that `start`
56/// is always less than or equal to `end` is enforced.
57///
58/// A `Match` can be used to slice a `&[u8]`, `&mut [u8]` or `&str` using
59/// range notation. e.g.,
60///
61/// ```
62/// use grep_matcher::Match;
63///
64/// let m = Match::new(2, 5);
65/// let bytes = b"abcdefghi";
66/// assert_eq!(b"cde", &bytes[m]);
67/// ```
68#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
69pub struct Match {
70 start: usize,
71 end: usize,
72}
73
74impl Match {
75 /// Create a new match.
76 ///
77 /// # Panics
78 ///
79 /// This function panics if `start > end`.
80 #[inline]
81 pub fn new(start: usize, end: usize) -> Match {
82 assert!(start <= end);
83 Match { start, end }
84 }
85
86 /// Creates a zero width match at the given offset.
87 #[inline]
88 pub fn zero(offset: usize) -> Match {
89 Match { start: offset, end: offset }
90 }
91
92 /// Return the start offset of this match.
93 #[inline]
94 pub fn start(&self) -> usize {
95 self.start
96 }
97
98 /// Return the end offset of this match.
99 #[inline]
100 pub fn end(&self) -> usize {
101 self.end
102 }
103
104 /// Return a new match with the start offset replaced with the given
105 /// value.
106 ///
107 /// # Panics
108 ///
109 /// This method panics if `start > self.end`.
110 #[inline]
111 pub fn with_start(&self, start: usize) -> Match {
112 assert!(start <= self.end, "{} is not <= {}", start, self.end);
113 Match { start, ..*self }
114 }
115
116 /// Return a new match with the end offset replaced with the given
117 /// value.
118 ///
119 /// # Panics
120 ///
121 /// This method panics if `self.start > end`.
122 #[inline]
123 pub fn with_end(&self, end: usize) -> Match {
124 assert!(self.start <= end, "{} is not <= {}", self.start, end);
125 Match { end, ..*self }
126 }
127
128 /// Offset this match by the given amount and return a new match.
129 ///
130 /// This adds the given offset to the start and end of this match, and
131 /// returns the resulting match.
132 ///
133 /// # Panics
134 ///
135 /// This panics if adding the given amount to either the start or end
136 /// offset would result in an overflow.
137 #[inline]
138 pub fn offset(&self, amount: usize) -> Match {
139 Match {
140 start: self.start.checked_add(amount).unwrap(),
141 end: self.end.checked_add(amount).unwrap(),
142 }
143 }
144
145 /// Returns the number of bytes in this match.
146 #[inline]
147 pub fn len(&self) -> usize {
148 self.end - self.start
149 }
150
151 /// Returns true if and only if this match is empty.
152 #[inline]
153 pub fn is_empty(&self) -> bool {
154 self.len() == 0
155 }
156}
157
158impl std::ops::Index<Match> for [u8] {
159 type Output = [u8];
160
161 #[inline]
162 fn index(&self, index: Match) -> &[u8] {
163 &self[index.start..index.end]
164 }
165}
166
167impl std::ops::IndexMut<Match> for [u8] {
168 #[inline]
169 fn index_mut(&mut self, index: Match) -> &mut [u8] {
170 &mut self[index.start..index.end]
171 }
172}
173
174impl std::ops::Index<Match> for str {
175 type Output = str;
176
177 #[inline]
178 fn index(&self, index: Match) -> &str {
179 &self[index.start..index.end]
180 }
181}
182
183/// A line terminator.
184///
185/// A line terminator represents the end of a line. Generally, every line is
186/// either "terminated" by the end of a stream or a specific byte (or sequence
187/// of bytes).
188///
189/// Generally, a line terminator is a single byte, specifically, `\n`, on
190/// Unix-like systems. On Windows, a line terminator is `\r\n` (referred to
191/// as `CRLF` for `Carriage Return; Line Feed`).
192///
193/// The default line terminator is `\n` on all platforms.
194#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
195pub struct LineTerminator(LineTerminatorImp);
196
197#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
198enum LineTerminatorImp {
199 /// Any single byte representing a line terminator.
200 Byte(u8),
201 /// A line terminator represented by `\r\n`.
202 ///
203 /// When this option is used, consumers may generally treat a lone `\n` as
204 /// a line terminator in addition to `\r\n`.
205 CRLF,
206}
207
208impl LineTerminator {
209 /// Return a new single-byte line terminator. Any byte is valid.
210 #[inline]
211 pub fn byte(byte: u8) -> LineTerminator {
212 LineTerminator(LineTerminatorImp::Byte(byte))
213 }
214
215 /// Return a new line terminator represented by `\r\n`.
216 ///
217 /// When this option is used, consumers may generally treat a lone `\n` as
218 /// a line terminator in addition to `\r\n`.
219 #[inline]
220 pub fn crlf() -> LineTerminator {
221 LineTerminator(LineTerminatorImp::CRLF)
222 }
223
224 /// Returns true if and only if this line terminator is CRLF.
225 #[inline]
226 pub fn is_crlf(&self) -> bool {
227 self.0 == LineTerminatorImp::CRLF
228 }
229
230 /// Returns this line terminator as a single byte.
231 ///
232 /// If the line terminator is CRLF, then this returns `\n`. This is
233 /// useful for routines that, for example, find line boundaries by treating
234 /// `\n` as a line terminator even when it isn't preceded by `\r`.
235 #[inline]
236 pub fn as_byte(&self) -> u8 {
237 match self.0 {
238 LineTerminatorImp::Byte(byte) => byte,
239 LineTerminatorImp::CRLF => b'\n',
240 }
241 }
242
243 /// Returns this line terminator as a sequence of bytes.
244 ///
245 /// This returns a singleton sequence for all line terminators except for
246 /// `CRLF`, in which case, it returns `\r\n`.
247 ///
248 /// The slice returned is guaranteed to have length at least `1`.
249 #[inline]
250 pub fn as_bytes(&self) -> &[u8] {
251 match self.0 {
252 LineTerminatorImp::Byte(ref byte) => std::slice::from_ref(byte),
253 LineTerminatorImp::CRLF => &[b'\r', b'\n'],
254 }
255 }
256
257 /// Returns true if and only if the given slice ends with this line
258 /// terminator.
259 ///
260 /// If this line terminator is `CRLF`, then this only checks whether the
261 /// last byte is `\n`.
262 #[inline]
263 pub fn is_suffix(&self, slice: &[u8]) -> bool {
264 slice.last().map_or(false, |&b| b == self.as_byte())
265 }
266}
267
268impl Default for LineTerminator {
269 #[inline]
270 fn default() -> LineTerminator {
271 LineTerminator::byte(b'\n')
272 }
273}
274
275/// A set of bytes.
276///
277/// In this crate, byte sets are used to express bytes that can never appear
278/// anywhere in a match for a particular implementation of the `Matcher` trait.
279/// Specifically, if such a set can be determined, then it's possible for
280/// callers to perform additional operations on the basis that certain bytes
281/// may never match.
282///
283/// For example, if a search is configured to possibly produce results that
284/// span multiple lines but a caller provided pattern can never match across
285/// multiple lines, then it may make sense to divert to more optimized line
286/// oriented routines that don't need to handle the multi-line match case.
287#[derive(Clone, Debug)]
288pub struct ByteSet(BitSet);
289
290#[derive(Clone, Copy)]
291struct BitSet([u64; 4]);
292
293impl std::fmt::Debug for BitSet {
294 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
295 let mut fmtd = f.debug_set();
296 for b in 0..=255 {
297 if ByteSet(*self).contains(b) {
298 fmtd.entry(&b);
299 }
300 }
301 fmtd.finish()
302 }
303}
304
305impl ByteSet {
306 /// Create an empty set of bytes.
307 #[inline]
308 pub fn empty() -> ByteSet {
309 ByteSet(BitSet([0; 4]))
310 }
311
312 /// Create a full set of bytes such that every possible byte is in the set
313 /// returned.
314 #[inline]
315 pub fn full() -> ByteSet {
316 ByteSet(BitSet([u64::MAX; 4]))
317 }
318
319 /// Add a byte to this set.
320 ///
321 /// If the given byte already belongs to this set, then this is a no-op.
322 #[inline]
323 pub fn add(&mut self, byte: u8) {
324 let bucket = byte / 64;
325 let bit = byte % 64;
326 (self.0).0[usize::from(bucket)] |= 1 << bit;
327 }
328
329 /// Add an inclusive range of bytes.
330 #[inline]
331 pub fn add_all(&mut self, start: u8, end: u8) {
332 for b in start..=end {
333 self.add(b);
334 }
335 }
336
337 /// Remove a byte from this set.
338 ///
339 /// If the given byte is not in this set, then this is a no-op.
340 #[inline]
341 pub fn remove(&mut self, byte: u8) {
342 let bucket = byte / 64;
343 let bit = byte % 64;
344 (self.0).0[usize::from(bucket)] &= !(1 << bit);
345 }
346
347 /// Remove an inclusive range of bytes.
348 #[inline]
349 pub fn remove_all(&mut self, start: u8, end: u8) {
350 for b in start..=end {
351 self.remove(b);
352 }
353 }
354
355 /// Return true if and only if the given byte is in this set.
356 #[inline]
357 pub fn contains(&self, byte: u8) -> bool {
358 let bucket = byte / 64;
359 let bit = byte % 64;
360 (self.0).0[usize::from(bucket)] & (1 << bit) > 0
361 }
362}
363
364/// A trait that describes implementations of capturing groups.
365///
366/// When a matcher supports capturing group extraction, then it is the
367/// matcher's responsibility to provide an implementation of this trait.
368///
369/// Principally, this trait provides a way to access capturing groups
370/// in a uniform way that does not require any specific representation.
371/// Namely, different matcher implementations may require different in-memory
372/// representations of capturing groups. This trait permits matchers to
373/// maintain their specific in-memory representation.
374///
375/// Note that this trait explicitly does not provide a way to construct a new
376/// capture value. Instead, it is the responsibility of a `Matcher` to build
377/// one, which might require knowledge of the matcher's internal implementation
378/// details.
379pub trait Captures {
380 /// Return the total number of capturing groups. This includes capturing
381 /// groups that have not matched anything.
382 fn len(&self) -> usize;
383
384 /// Return the capturing group match at the given index. If no match of
385 /// that capturing group exists, then this returns `None`.
386 ///
387 /// When a matcher reports a match with capturing groups, then the first
388 /// capturing group (at index `0`) must always correspond to the offsets
389 /// for the overall match.
390 fn get(&self, i: usize) -> Option<Match>;
391
392 /// Returns true if and only if these captures are empty. This occurs
393 /// when `len` is `0`.
394 ///
395 /// Note that capturing groups that have non-zero length but otherwise
396 /// contain no matching groups are *not* empty.
397 #[inline]
398 fn is_empty(&self) -> bool {
399 self.len() == 0
400 }
401
402 /// Expands all instances of `$name` in `replacement` to the corresponding
403 /// capture group `name`, and writes them to the `dst` buffer given.
404 ///
405 /// (Note: If you're looking for a convenient way to perform replacements
406 /// with interpolation, then you'll want to use the `replace_with_captures`
407 /// method on the `Matcher` trait.)
408 ///
409 /// `name` may be an integer corresponding to the index of the
410 /// capture group (counted by order of opening parenthesis where `0` is the
411 /// entire match) or it can be a name (consisting of letters, digits or
412 /// underscores) corresponding to a named capture group.
413 ///
414 /// A `name` is translated to a capture group index via the given
415 /// `name_to_index` function. If `name` isn't a valid capture group
416 /// (whether the name doesn't exist or isn't a valid index), then it is
417 /// replaced with the empty string.
418 ///
419 /// The longest possible name is used. e.g., `$1a` looks up the capture
420 /// group named `1a` and not the capture group at index `1`. To exert
421 /// more precise control over the name, use braces, e.g., `${1}a`. In all
422 /// cases, capture group names are limited to ASCII letters, numbers and
423 /// underscores.
424 ///
425 /// To write a literal `$` use `$$`.
426 ///
427 /// Note that the capture group match indices are resolved by slicing
428 /// the given `haystack`. Generally, this means that `haystack` should be
429 /// the same slice that was searched to get the current capture group
430 /// matches.
431 #[inline]
432 fn interpolate<F>(
433 &self,
434 name_to_index: F,
435 haystack: &[u8],
436 replacement: &[u8],
437 dst: &mut Vec<u8>,
438 ) where
439 F: FnMut(&str) -> Option<usize>,
440 {
441 interpolate(
442 replacement,
443 |i, dst| {
444 if let Some(range) = self.get(i) {
445 dst.extend(&haystack[range]);
446 }
447 },
448 name_to_index,
449 dst,
450 )
451 }
452}
453
454/// NoCaptures provides an always-empty implementation of the `Captures` trait.
455///
456/// This type is useful for implementations of `Matcher` that don't support
457/// capturing groups.
458#[derive(Clone, Debug)]
459pub struct NoCaptures(());
460
461impl NoCaptures {
462 /// Create an empty set of capturing groups.
463 #[inline]
464 pub fn new() -> NoCaptures {
465 NoCaptures(())
466 }
467}
468
469impl Captures for NoCaptures {
470 #[inline]
471 fn len(&self) -> usize {
472 0
473 }
474
475 #[inline]
476 fn get(&self, _: usize) -> Option<Match> {
477 None
478 }
479}
480
481/// NoError provides an error type for matchers that never produce errors.
482///
483/// This error type implements the `std::error::Error` and `std::fmt::Display`
484/// traits for use in matcher implementations that can never produce errors.
485///
486/// The `std::fmt::Debug` and `std::fmt::Display` impls for this type panics.
487#[derive(Debug, Eq, PartialEq)]
488pub struct NoError(());
489
490impl std::error::Error for NoError {
491 fn description(&self) -> &str {
492 "no error"
493 }
494}
495
496impl std::fmt::Display for NoError {
497 fn fmt(&self, _: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
498 panic!("BUG for NoError: an impossible error occurred")
499 }
500}
501
502impl From<NoError> for std::io::Error {
503 fn from(_: NoError) -> std::io::Error {
504 panic!("BUG for NoError: an impossible error occurred")
505 }
506}
507
508/// The type of match for a line oriented matcher.
509#[derive(Clone, Copy, Debug)]
510pub enum LineMatchKind {
511 /// A position inside a line that is known to contain a match.
512 ///
513 /// This position can be anywhere in the line. It does not need to point
514 /// at the location of the match.
515 Confirmed(usize),
516 /// A position inside a line that may contain a match, and must be searched
517 /// for verification.
518 ///
519 /// This position can be anywhere in the line. It does not need to point
520 /// at the location of the match.
521 Candidate(usize),
522}
523
524/// A matcher defines an interface for regular expression implementations.
525///
526/// While this trait is large, there are only two required methods that
527/// implementors must provide: `find_at` and `new_captures`. If captures aren't
528/// supported by your implementation, then `new_captures` can be implemented
529/// with [`NoCaptures`]. If your implementation does support capture groups,
530/// then you should also implement the other capture related methods, as
531/// dictated by the documentation. Crucially, this includes `captures_at`.
532///
533/// The rest of the methods on this trait provide default implementations on
534/// top of `find_at` and `new_captures`. It is not uncommon for implementations
535/// to be able to provide faster variants of some methods; in those cases,
536/// simply override the default implementation.
537pub trait Matcher {
538 /// The concrete type of capturing groups used for this matcher.
539 ///
540 /// If this implementation does not support capturing groups, then set
541 /// this to `NoCaptures`.
542 type Captures: Captures;
543
544 /// The error type used by this matcher.
545 ///
546 /// For matchers in which an error is not possible, they are encouraged to
547 /// use the `NoError` type in this crate. In the future, when the "never"
548 /// (spelled `!`) type is stabilized, then it should probably be used
549 /// instead.
550 type Error: std::fmt::Display;
551
552 /// Returns the start and end byte range of the first match in `haystack`
553 /// after `at`, where the byte offsets are relative to that start of
554 /// `haystack` (and not `at`). If no match exists, then `None` is returned.
555 ///
556 /// The text encoding of `haystack` is not strictly specified. Matchers are
557 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
558 ///
559 /// The significance of the starting point is that it takes the surrounding
560 /// context into consideration. For example, the `\A` anchor can only
561 /// match when `at == 0`.
562 fn find_at(
563 &self,
564 haystack: &[u8],
565 at: usize,
566 ) -> Result<Option<Match>, Self::Error>;
567
568 /// Creates an empty group of captures suitable for use with the capturing
569 /// APIs of this trait.
570 ///
571 /// Implementations that don't support capturing groups should use
572 /// the `NoCaptures` type and implement this method by calling
573 /// `NoCaptures::new()`.
574 fn new_captures(&self) -> Result<Self::Captures, Self::Error>;
575
576 /// Returns the total number of capturing groups in this matcher.
577 ///
578 /// If a matcher supports capturing groups, then this value must always be
579 /// at least 1, where the first capturing group always corresponds to the
580 /// overall match.
581 ///
582 /// If a matcher does not support capturing groups, then this should
583 /// always return 0.
584 ///
585 /// By default, capturing groups are not supported, so this always
586 /// returns 0.
587 #[inline]
588 fn capture_count(&self) -> usize {
589 0
590 }
591
592 /// Maps the given capture group name to its corresponding capture group
593 /// index, if one exists. If one does not exist, then `None` is returned.
594 ///
595 /// If the given capture group name maps to multiple indices, then it is
596 /// not specified which one is returned. However, it is guaranteed that
597 /// one of them is returned.
598 ///
599 /// By default, capturing groups are not supported, so this always returns
600 /// `None`.
601 #[inline]
602 fn capture_index(&self, _name: &str) -> Option<usize> {
603 None
604 }
605
606 /// Returns the start and end byte range of the first match in `haystack`.
607 /// If no match exists, then `None` is returned.
608 ///
609 /// The text encoding of `haystack` is not strictly specified. Matchers are
610 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
611 #[inline]
612 fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
613 self.find_at(haystack, 0)
614 }
615
616 /// Executes the given function over successive non-overlapping matches
617 /// in `haystack`. If no match exists, then the given function is never
618 /// called. If the function returns `false`, then iteration stops.
619 #[inline]
620 fn find_iter<F>(
621 &self,
622 haystack: &[u8],
623 matched: F,
624 ) -> Result<(), Self::Error>
625 where
626 F: FnMut(Match) -> bool,
627 {
628 self.find_iter_at(haystack, 0, matched)
629 }
630
631 /// Executes the given function over successive non-overlapping matches
632 /// in `haystack`. If no match exists, then the given function is never
633 /// called. If the function returns `false`, then iteration stops.
634 ///
635 /// The significance of the starting point is that it takes the surrounding
636 /// context into consideration. For example, the `\A` anchor can only
637 /// match when `at == 0`.
638 #[inline]
639 fn find_iter_at<F>(
640 &self,
641 haystack: &[u8],
642 at: usize,
643 mut matched: F,
644 ) -> Result<(), Self::Error>
645 where
646 F: FnMut(Match) -> bool,
647 {
648 self.try_find_iter_at(haystack, at, |m| Ok(matched(m)))
649 .map(|r: Result<(), ()>| r.unwrap())
650 }
651
652 /// Executes the given function over successive non-overlapping matches
653 /// in `haystack`. If no match exists, then the given function is never
654 /// called. If the function returns `false`, then iteration stops.
655 /// Similarly, if the function returns an error then iteration stops and
656 /// the error is yielded. If an error occurs while executing the search,
657 /// then it is converted to
658 /// `E`.
659 #[inline]
660 fn try_find_iter<F, E>(
661 &self,
662 haystack: &[u8],
663 matched: F,
664 ) -> Result<Result<(), E>, Self::Error>
665 where
666 F: FnMut(Match) -> Result<bool, E>,
667 {
668 self.try_find_iter_at(haystack, 0, matched)
669 }
670
671 /// Executes the given function over successive non-overlapping matches
672 /// in `haystack`. If no match exists, then the given function is never
673 /// called. If the function returns `false`, then iteration stops.
674 /// Similarly, if the function returns an error then iteration stops and
675 /// the error is yielded. If an error occurs while executing the search,
676 /// then it is converted to
677 /// `E`.
678 ///
679 /// The significance of the starting point is that it takes the surrounding
680 /// context into consideration. For example, the `\A` anchor can only
681 /// match when `at == 0`.
682 #[inline]
683 fn try_find_iter_at<F, E>(
684 &self,
685 haystack: &[u8],
686 at: usize,
687 mut matched: F,
688 ) -> Result<Result<(), E>, Self::Error>
689 where
690 F: FnMut(Match) -> Result<bool, E>,
691 {
692 let mut last_end = at;
693 let mut last_match = None;
694
695 loop {
696 if last_end > haystack.len() {
697 return Ok(Ok(()));
698 }
699 let m = match self.find_at(haystack, last_end)? {
700 None => return Ok(Ok(())),
701 Some(m) => m,
702 };
703 if m.start == m.end {
704 // This is an empty match. To ensure we make progress, start
705 // the next search at the smallest possible starting position
706 // of the next match following this one.
707 last_end = m.end + 1;
708 // Don't accept empty matches immediately following a match.
709 // Just move on to the next match.
710 if Some(m.end) == last_match {
711 continue;
712 }
713 } else {
714 last_end = m.end;
715 }
716 last_match = Some(m.end);
717 match matched(m) {
718 Ok(true) => continue,
719 Ok(false) => return Ok(Ok(())),
720 Err(err) => return Ok(Err(err)),
721 }
722 }
723 }
724
725 /// Populates the first set of capture group matches from `haystack` into
726 /// `caps`. If no match exists, then `false` is returned.
727 ///
728 /// The text encoding of `haystack` is not strictly specified. Matchers are
729 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
730 #[inline]
731 fn captures(
732 &self,
733 haystack: &[u8],
734 caps: &mut Self::Captures,
735 ) -> Result<bool, Self::Error> {
736 self.captures_at(haystack, 0, caps)
737 }
738
739 /// Executes the given function over successive non-overlapping matches
740 /// in `haystack` with capture groups extracted from each match. If no
741 /// match exists, then the given function is never called. If the function
742 /// returns `false`, then iteration stops.
743 #[inline]
744 fn captures_iter<F>(
745 &self,
746 haystack: &[u8],
747 caps: &mut Self::Captures,
748 matched: F,
749 ) -> Result<(), Self::Error>
750 where
751 F: FnMut(&Self::Captures) -> bool,
752 {
753 self.captures_iter_at(haystack, 0, caps, matched)
754 }
755
756 /// Executes the given function over successive non-overlapping matches
757 /// in `haystack` with capture groups extracted from each match. If no
758 /// match exists, then the given function is never called. If the function
759 /// returns `false`, then iteration stops.
760 ///
761 /// The significance of the starting point is that it takes the surrounding
762 /// context into consideration. For example, the `\A` anchor can only
763 /// match when `at == 0`.
764 #[inline]
765 fn captures_iter_at<F>(
766 &self,
767 haystack: &[u8],
768 at: usize,
769 caps: &mut Self::Captures,
770 mut matched: F,
771 ) -> Result<(), Self::Error>
772 where
773 F: FnMut(&Self::Captures) -> bool,
774 {
775 self.try_captures_iter_at(haystack, at, caps, |caps| Ok(matched(caps)))
776 .map(|r: Result<(), ()>| r.unwrap())
777 }
778
779 /// Executes the given function over successive non-overlapping matches
780 /// in `haystack` with capture groups extracted from each match. If no
781 /// match exists, then the given function is never called. If the function
782 /// returns `false`, then iteration stops. Similarly, if the function
783 /// returns an error then iteration stops and the error is yielded. If
784 /// an error occurs while executing the search, then it is converted to
785 /// `E`.
786 #[inline]
787 fn try_captures_iter<F, E>(
788 &self,
789 haystack: &[u8],
790 caps: &mut Self::Captures,
791 matched: F,
792 ) -> Result<Result<(), E>, Self::Error>
793 where
794 F: FnMut(&Self::Captures) -> Result<bool, E>,
795 {
796 self.try_captures_iter_at(haystack, 0, caps, matched)
797 }
798
799 /// Executes the given function over successive non-overlapping matches
800 /// in `haystack` with capture groups extracted from each match. If no
801 /// match exists, then the given function is never called. If the function
802 /// returns `false`, then iteration stops. Similarly, if the function
803 /// returns an error then iteration stops and the error is yielded. If
804 /// an error occurs while executing the search, then it is converted to
805 /// `E`.
806 ///
807 /// The significance of the starting point is that it takes the surrounding
808 /// context into consideration. For example, the `\A` anchor can only
809 /// match when `at == 0`.
810 #[inline]
811 fn try_captures_iter_at<F, E>(
812 &self,
813 haystack: &[u8],
814 at: usize,
815 caps: &mut Self::Captures,
816 mut matched: F,
817 ) -> Result<Result<(), E>, Self::Error>
818 where
819 F: FnMut(&Self::Captures) -> Result<bool, E>,
820 {
821 let mut last_end = at;
822 let mut last_match = None;
823
824 loop {
825 if last_end > haystack.len() {
826 return Ok(Ok(()));
827 }
828 if !self.captures_at(haystack, last_end, caps)? {
829 return Ok(Ok(()));
830 }
831 let m = caps.get(0).unwrap();
832 if m.start == m.end {
833 // This is an empty match. To ensure we make progress, start
834 // the next search at the smallest possible starting position
835 // of the next match following this one.
836 last_end = m.end + 1;
837 // Don't accept empty matches immediately following a match.
838 // Just move on to the next match.
839 if Some(m.end) == last_match {
840 continue;
841 }
842 } else {
843 last_end = m.end;
844 }
845 last_match = Some(m.end);
846 match matched(caps) {
847 Ok(true) => continue,
848 Ok(false) => return Ok(Ok(())),
849 Err(err) => return Ok(Err(err)),
850 }
851 }
852 }
853
854 /// Populates the first set of capture group matches from `haystack`
855 /// into `matches` after `at`, where the byte offsets in each capturing
856 /// group are relative to the start of `haystack` (and not `at`). If no
857 /// match exists, then `false` is returned and the contents of the given
858 /// capturing groups are unspecified.
859 ///
860 /// The text encoding of `haystack` is not strictly specified. Matchers are
861 /// advised to assume UTF-8, or at worst, some ASCII compatible encoding.
862 ///
863 /// The significance of the starting point is that it takes the surrounding
864 /// context into consideration. For example, the `\A` anchor can only
865 /// match when `at == 0`.
866 ///
867 /// By default, capturing groups aren't supported, and this implementation
868 /// will always behave as if a match were impossible.
869 ///
870 /// Implementors that provide support for capturing groups must guarantee
871 /// that when a match occurs, the first capture match (at index `0`) is
872 /// always set to the overall match offsets.
873 ///
874 /// Note that if implementors seek to support capturing groups, then they
875 /// should implement this method. Other methods that match based on
876 /// captures will then work automatically.
877 #[inline]
878 fn captures_at(
879 &self,
880 _haystack: &[u8],
881 _at: usize,
882 _caps: &mut Self::Captures,
883 ) -> Result<bool, Self::Error> {
884 Ok(false)
885 }
886
887 /// Replaces every match in the given haystack with the result of calling
888 /// `append`. `append` is given the start and end of a match, along with
889 /// a handle to the `dst` buffer provided.
890 ///
891 /// If the given `append` function returns `false`, then replacement stops.
892 #[inline]
893 fn replace<F>(
894 &self,
895 haystack: &[u8],
896 dst: &mut Vec<u8>,
897 mut append: F,
898 ) -> Result<(), Self::Error>
899 where
900 F: FnMut(Match, &mut Vec<u8>) -> bool,
901 {
902 let mut last_match = 0;
903 self.find_iter(haystack, |m| {
904 dst.extend(&haystack[last_match..m.start]);
905 last_match = m.end;
906 append(m, dst)
907 })?;
908 dst.extend(&haystack[last_match..]);
909 Ok(())
910 }
911
912 /// Replaces every match in the given haystack with the result of calling
913 /// `append` with the matching capture groups.
914 ///
915 /// If the given `append` function returns `false`, then replacement stops.
916 #[inline]
917 fn replace_with_captures<F>(
918 &self,
919 haystack: &[u8],
920 caps: &mut Self::Captures,
921 dst: &mut Vec<u8>,
922 append: F,
923 ) -> Result<(), Self::Error>
924 where
925 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
926 {
927 self.replace_with_captures_at(haystack, 0, caps, dst, append)
928 }
929
930 /// Replaces every match in the given haystack with the result of calling
931 /// `append` with the matching capture groups.
932 ///
933 /// If the given `append` function returns `false`, then replacement stops.
934 ///
935 /// The significance of the starting point is that it takes the surrounding
936 /// context into consideration. For example, the `\A` anchor can only
937 /// match when `at == 0`.
938 #[inline]
939 fn replace_with_captures_at<F>(
940 &self,
941 haystack: &[u8],
942 at: usize,
943 caps: &mut Self::Captures,
944 dst: &mut Vec<u8>,
945 mut append: F,
946 ) -> Result<(), Self::Error>
947 where
948 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
949 {
950 let mut last_match = at;
951 self.captures_iter_at(haystack, at, caps, |caps| {
952 let m = caps.get(0).unwrap();
953 dst.extend(&haystack[last_match..m.start]);
954 last_match = m.end;
955 append(caps, dst)
956 })?;
957 dst.extend(&haystack[last_match..]);
958 Ok(())
959 }
960
961 /// Returns true if and only if the matcher matches the given haystack.
962 ///
963 /// By default, this method is implemented by calling `shortest_match`.
964 #[inline]
965 fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
966 self.is_match_at(haystack, 0)
967 }
968
969 /// Returns true if and only if the matcher matches the given haystack
970 /// starting at the given position.
971 ///
972 /// By default, this method is implemented by calling `shortest_match_at`.
973 ///
974 /// The significance of the starting point is that it takes the surrounding
975 /// context into consideration. For example, the `\A` anchor can only
976 /// match when `at == 0`.
977 #[inline]
978 fn is_match_at(
979 &self,
980 haystack: &[u8],
981 at: usize,
982 ) -> Result<bool, Self::Error> {
983 Ok(self.shortest_match_at(haystack, at)?.is_some())
984 }
985
986 /// Returns an end location of the first match in `haystack`. If no match
987 /// exists, then `None` is returned.
988 ///
989 /// Note that the end location reported by this method may be less than the
990 /// same end location reported by `find`. For example, running `find` with
991 /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
992 /// 3)`, but `shortest_match` may report `1` as the ending location since
993 /// that is the place at which a match is guaranteed to occur.
994 ///
995 /// This method should never report false positives or false negatives. The
996 /// point of this method is that some implementors may be able to provide
997 /// a faster implementation of this than what `find` does.
998 ///
999 /// By default, this method is implemented by calling `find`.
1000 #[inline]
1001 fn shortest_match(
1002 &self,
1003 haystack: &[u8],
1004 ) -> Result<Option<usize>, Self::Error> {
1005 self.shortest_match_at(haystack, 0)
1006 }
1007
1008 /// Returns an end location of the first match in `haystack` starting at
1009 /// the given position. If no match exists, then `None` is returned.
1010 ///
1011 /// Note that the end location reported by this method may be less than the
1012 /// same end location reported by `find`. For example, running `find` with
1013 /// the pattern `a+` on the haystack `aaa` should report a range of `[0,
1014 /// 3)`, but `shortest_match` may report `1` as the ending location since
1015 /// that is the place at which a match is guaranteed to occur.
1016 ///
1017 /// This method should never report false positives or false negatives. The
1018 /// point of this method is that some implementors may be able to provide
1019 /// a faster implementation of this than what `find` does.
1020 ///
1021 /// By default, this method is implemented by calling `find_at`.
1022 ///
1023 /// The significance of the starting point is that it takes the surrounding
1024 /// context into consideration. For example, the `\A` anchor can only
1025 /// match when `at == 0`.
1026 #[inline]
1027 fn shortest_match_at(
1028 &self,
1029 haystack: &[u8],
1030 at: usize,
1031 ) -> Result<Option<usize>, Self::Error> {
1032 Ok(self.find_at(haystack, at)?.map(|m| m.end))
1033 }
1034
1035 /// If available, return a set of bytes that will never appear in a match
1036 /// produced by an implementation.
1037 ///
1038 /// Specifically, if such a set can be determined, then it's possible for
1039 /// callers to perform additional operations on the basis that certain
1040 /// bytes may never match.
1041 ///
1042 /// For example, if a search is configured to possibly produce results
1043 /// that span multiple lines but a caller provided pattern can never
1044 /// match across multiple lines, then it may make sense to divert to
1045 /// more optimized line oriented routines that don't need to handle the
1046 /// multi-line match case.
1047 ///
1048 /// Implementations that produce this set must never report false
1049 /// positives, but may produce false negatives. That is, is a byte is in
1050 /// this set then it must be guaranteed that it is never in a match. But,
1051 /// if a byte is not in this set, then callers cannot assume that a match
1052 /// exists with that byte.
1053 ///
1054 /// By default, this returns `None`.
1055 #[inline]
1056 fn non_matching_bytes(&self) -> Option<&ByteSet> {
1057 None
1058 }
1059
1060 /// If this matcher was compiled as a line oriented matcher, then this
1061 /// method returns the line terminator if and only if the line terminator
1062 /// never appears in any match produced by this matcher. If this wasn't
1063 /// compiled as a line oriented matcher, or if the aforementioned guarantee
1064 /// cannot be made, then this must return `None`, which is the default.
1065 /// It is **never wrong** to return `None`, but returning a line terminator
1066 /// when it can appear in a match results in unspecified behavior.
1067 ///
1068 /// The line terminator is typically `b'\n'`, but can be any single byte or
1069 /// `CRLF`.
1070 ///
1071 /// By default, this returns `None`.
1072 #[inline]
1073 fn line_terminator(&self) -> Option<LineTerminator> {
1074 None
1075 }
1076
1077 /// Return one of the following: a confirmed line match, a candidate line
1078 /// match (which may be a false positive) or no match at all (which **must
1079 /// not** be a false negative). When reporting a confirmed or candidate
1080 /// match, the position returned can be any position in the line.
1081 ///
1082 /// By default, this never returns a candidate match, and always either
1083 /// returns a confirmed match or no match at all.
1084 ///
1085 /// When a matcher can match spans over multiple lines, then the behavior
1086 /// of this method is unspecified. Namely, use of this method only
1087 /// makes sense in a context where the caller is looking for the next
1088 /// matching line. That is, callers should only use this method when
1089 /// `line_terminator` does not return `None`.
1090 ///
1091 /// # Design rationale
1092 ///
1093 /// A line matcher is, fundamentally, a normal matcher with the addition
1094 /// of one optional method: finding a line. By default, this routine
1095 /// is implemented via the matcher's `shortest_match` method, which
1096 /// always yields either no match or a `LineMatchKind::Confirmed`. However,
1097 /// implementors may provide a routine for this that can return candidate
1098 /// lines that need subsequent verification to be confirmed as a match.
1099 /// This can be useful in cases where it may be quicker to find candidate
1100 /// lines via some other means instead of relying on the more general
1101 /// implementations for `find` and `shortest_match`.
1102 ///
1103 /// For example, consider the regex `\w+foo\s+`. Both `find` and
1104 /// `shortest_match` must consider the entire regex, including the `\w+`
1105 /// and `\s+`, while searching. However, this method could look for lines
1106 /// containing `foo` and return them as candidates. Finding `foo` might
1107 /// be implemented as a highly optimized substring search routine (like
1108 /// `memmem`), which is likely to be faster than whatever more generalized
1109 /// routine is required for resolving `\w+foo\s+`. The caller is then
1110 /// responsible for confirming whether a match exists or not.
1111 ///
1112 /// Note that while this method may report false positives, it must never
1113 /// report false negatives. That is, it can never skip over lines that
1114 /// contain a match.
1115 #[inline]
1116 fn find_candidate_line(
1117 &self,
1118 haystack: &[u8],
1119 ) -> Result<Option<LineMatchKind>, Self::Error> {
1120 Ok(self.shortest_match(haystack)?.map(LineMatchKind::Confirmed))
1121 }
1122}
1123
1124impl<'a, M: Matcher> Matcher for &'a M {
1125 type Captures = M::Captures;
1126 type Error = M::Error;
1127
1128 #[inline]
1129 fn find_at(
1130 &self,
1131 haystack: &[u8],
1132 at: usize,
1133 ) -> Result<Option<Match>, Self::Error> {
1134 (*self).find_at(haystack, at)
1135 }
1136
1137 #[inline]
1138 fn new_captures(&self) -> Result<Self::Captures, Self::Error> {
1139 (*self).new_captures()
1140 }
1141
1142 #[inline]
1143 fn captures_at(
1144 &self,
1145 haystack: &[u8],
1146 at: usize,
1147 caps: &mut Self::Captures,
1148 ) -> Result<bool, Self::Error> {
1149 (*self).captures_at(haystack, at, caps)
1150 }
1151
1152 #[inline]
1153 fn capture_index(&self, name: &str) -> Option<usize> {
1154 (*self).capture_index(name)
1155 }
1156
1157 #[inline]
1158 fn capture_count(&self) -> usize {
1159 (*self).capture_count()
1160 }
1161
1162 #[inline]
1163 fn find(&self, haystack: &[u8]) -> Result<Option<Match>, Self::Error> {
1164 (*self).find(haystack)
1165 }
1166
1167 #[inline]
1168 fn find_iter<F>(
1169 &self,
1170 haystack: &[u8],
1171 matched: F,
1172 ) -> Result<(), Self::Error>
1173 where
1174 F: FnMut(Match) -> bool,
1175 {
1176 (*self).find_iter(haystack, matched)
1177 }
1178
1179 #[inline]
1180 fn find_iter_at<F>(
1181 &self,
1182 haystack: &[u8],
1183 at: usize,
1184 matched: F,
1185 ) -> Result<(), Self::Error>
1186 where
1187 F: FnMut(Match) -> bool,
1188 {
1189 (*self).find_iter_at(haystack, at, matched)
1190 }
1191
1192 #[inline]
1193 fn try_find_iter<F, E>(
1194 &self,
1195 haystack: &[u8],
1196 matched: F,
1197 ) -> Result<Result<(), E>, Self::Error>
1198 where
1199 F: FnMut(Match) -> Result<bool, E>,
1200 {
1201 (*self).try_find_iter(haystack, matched)
1202 }
1203
1204 #[inline]
1205 fn try_find_iter_at<F, E>(
1206 &self,
1207 haystack: &[u8],
1208 at: usize,
1209 matched: F,
1210 ) -> Result<Result<(), E>, Self::Error>
1211 where
1212 F: FnMut(Match) -> Result<bool, E>,
1213 {
1214 (*self).try_find_iter_at(haystack, at, matched)
1215 }
1216
1217 #[inline]
1218 fn captures(
1219 &self,
1220 haystack: &[u8],
1221 caps: &mut Self::Captures,
1222 ) -> Result<bool, Self::Error> {
1223 (*self).captures(haystack, caps)
1224 }
1225
1226 #[inline]
1227 fn captures_iter<F>(
1228 &self,
1229 haystack: &[u8],
1230 caps: &mut Self::Captures,
1231 matched: F,
1232 ) -> Result<(), Self::Error>
1233 where
1234 F: FnMut(&Self::Captures) -> bool,
1235 {
1236 (*self).captures_iter(haystack, caps, matched)
1237 }
1238
1239 #[inline]
1240 fn captures_iter_at<F>(
1241 &self,
1242 haystack: &[u8],
1243 at: usize,
1244 caps: &mut Self::Captures,
1245 matched: F,
1246 ) -> Result<(), Self::Error>
1247 where
1248 F: FnMut(&Self::Captures) -> bool,
1249 {
1250 (*self).captures_iter_at(haystack, at, caps, matched)
1251 }
1252
1253 #[inline]
1254 fn try_captures_iter<F, E>(
1255 &self,
1256 haystack: &[u8],
1257 caps: &mut Self::Captures,
1258 matched: F,
1259 ) -> Result<Result<(), E>, Self::Error>
1260 where
1261 F: FnMut(&Self::Captures) -> Result<bool, E>,
1262 {
1263 (*self).try_captures_iter(haystack, caps, matched)
1264 }
1265
1266 #[inline]
1267 fn try_captures_iter_at<F, E>(
1268 &self,
1269 haystack: &[u8],
1270 at: usize,
1271 caps: &mut Self::Captures,
1272 matched: F,
1273 ) -> Result<Result<(), E>, Self::Error>
1274 where
1275 F: FnMut(&Self::Captures) -> Result<bool, E>,
1276 {
1277 (*self).try_captures_iter_at(haystack, at, caps, matched)
1278 }
1279
1280 #[inline]
1281 fn replace<F>(
1282 &self,
1283 haystack: &[u8],
1284 dst: &mut Vec<u8>,
1285 append: F,
1286 ) -> Result<(), Self::Error>
1287 where
1288 F: FnMut(Match, &mut Vec<u8>) -> bool,
1289 {
1290 (*self).replace(haystack, dst, append)
1291 }
1292
1293 #[inline]
1294 fn replace_with_captures<F>(
1295 &self,
1296 haystack: &[u8],
1297 caps: &mut Self::Captures,
1298 dst: &mut Vec<u8>,
1299 append: F,
1300 ) -> Result<(), Self::Error>
1301 where
1302 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1303 {
1304 (*self).replace_with_captures(haystack, caps, dst, append)
1305 }
1306
1307 #[inline]
1308 fn replace_with_captures_at<F>(
1309 &self,
1310 haystack: &[u8],
1311 at: usize,
1312 caps: &mut Self::Captures,
1313 dst: &mut Vec<u8>,
1314 append: F,
1315 ) -> Result<(), Self::Error>
1316 where
1317 F: FnMut(&Self::Captures, &mut Vec<u8>) -> bool,
1318 {
1319 (*self).replace_with_captures_at(haystack, at, caps, dst, append)
1320 }
1321
1322 #[inline]
1323 fn is_match(&self, haystack: &[u8]) -> Result<bool, Self::Error> {
1324 (*self).is_match(haystack)
1325 }
1326
1327 #[inline]
1328 fn is_match_at(
1329 &self,
1330 haystack: &[u8],
1331 at: usize,
1332 ) -> Result<bool, Self::Error> {
1333 (*self).is_match_at(haystack, at)
1334 }
1335
1336 #[inline]
1337 fn shortest_match(
1338 &self,
1339 haystack: &[u8],
1340 ) -> Result<Option<usize>, Self::Error> {
1341 (*self).shortest_match(haystack)
1342 }
1343
1344 #[inline]
1345 fn shortest_match_at(
1346 &self,
1347 haystack: &[u8],
1348 at: usize,
1349 ) -> Result<Option<usize>, Self::Error> {
1350 (*self).shortest_match_at(haystack, at)
1351 }
1352
1353 #[inline]
1354 fn non_matching_bytes(&self) -> Option<&ByteSet> {
1355 (*self).non_matching_bytes()
1356 }
1357
1358 #[inline]
1359 fn line_terminator(&self) -> Option<LineTerminator> {
1360 (*self).line_terminator()
1361 }
1362
1363 #[inline]
1364 fn find_candidate_line(
1365 &self,
1366 haystack: &[u8],
1367 ) -> Result<Option<LineMatchKind>, Self::Error> {
1368 (*self).find_candidate_line(haystack)
1369 }
1370}