regex_automata/util/
search.rs

1/*!
2Types and routines that support the search APIs of most regex engines.
3
4This sub-module isn't exposed directly, but rather, its contents are exported
5at the crate root due to the universality of most of the types and routines in
6this module.
7*/
8
9use core::ops::{Range, RangeBounds};
10
11use crate::util::{escape::DebugByte, primitives::PatternID, utf8};
12
13/// The parameters for a regex search including the haystack to search.
14///
15/// It turns out that regex searches have a few parameters, and in most cases,
16/// those parameters have defaults that work in the vast majority of cases.
17/// This `Input` type exists to make that common case seamnless while also
18/// providing an avenue for changing the parameters of a search. In particular,
19/// this type enables doing so without a combinatorial explosion of different
20/// methods and/or superfluous parameters in the common cases.
21///
22/// An `Input` permits configuring the following things:
23///
24/// * Search only a substring of a haystack, while taking the broader context
25/// into account for resolving look-around assertions.
26/// * Indicating whether to search for all patterns in a regex, or to
27/// only search for one pattern in particular.
28/// * Whether to perform an anchored on unanchored search.
29/// * Whether to report a match as early as possible.
30///
31/// All of these parameters, except for the haystack, have sensible default
32/// values. This means that the minimal search configuration is simply a call
33/// to [`Input::new`] with your haystack. Setting any other parameter is
34/// optional.
35///
36/// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a
37/// `From<H> for Input` implementation. This is useful because many of the
38/// search APIs in this crate accept an `Into<Input>`. This means you can
39/// provide string or byte strings to these routines directly, and they'll
40/// automatically get converted into an `Input` for you.
41///
42/// The lifetime parameter `'h` refers to the lifetime of the haystack.
43///
44/// # Organization
45///
46/// The API of `Input` is split into a few different parts:
47///
48/// * A builder-like API that transforms a `Input` by value. Examples:
49/// [`Input::span`] and [`Input::anchored`].
50/// * A setter API that permits mutating parameters in place. Examples:
51/// [`Input::set_span`] and [`Input::set_anchored`].
52/// * A getter API that permits retrieving any of the search parameters.
53/// Examples: [`Input::get_span`] and [`Input::get_anchored`].
54/// * A few convenience getter routines that don't conform to the above naming
55/// pattern due to how common they are. Examples: [`Input::haystack`],
56/// [`Input::start`] and [`Input::end`].
57/// * Miscellaneous predicates and other helper routines that are useful
58/// in some contexts. Examples: [`Input::is_char_boundary`].
59///
60/// A `Input` exposes so much because it is meant to be used by both callers of
61/// regex engines _and_ implementors of regex engines. A constraining factor is
62/// that regex engines should accept a `&Input` as its lowest level API, which
63/// means that implementors should only use the "getter" APIs of a `Input`.
64///
65/// # Valid bounds and search termination
66///
67/// An `Input` permits setting the bounds of a search via either
68/// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or
69/// else a panic will occur. Bounds are valid if and only if:
70///
71/// * The bounds represent a valid range into the input's haystack.
72/// * **or** the end bound is a valid ending bound for the haystack *and*
73/// the start bound is exactly one greater than the start bound.
74///
75/// In the latter case, [`Input::is_done`] will return true and indicates any
76/// search receiving such an input should immediately return with no match.
77///
78/// Note that while `Input` is used for reverse searches in this crate, the
79/// `Input::is_done` predicate assumes a forward search. Because unsigned
80/// offsets are used internally, there is no way to tell from only the offsets
81/// whether a reverse search is done or not.
82///
83/// # Regex engine support
84///
85/// Any regex engine accepting an `Input` must support at least the following
86/// things:
87///
88/// * Searching a `&[u8]` for matches.
89/// * Searching a substring of `&[u8]` for a match, such that any match
90/// reported must appear entirely within that substring.
91/// * For a forwards search, a match should never be reported when
92/// [`Input::is_done`] returns true. (For reverse searches, termination should
93/// be handled outside of `Input`.)
94///
95/// Supporting other aspects of an `Input` are optional, but regex engines
96/// should handle aspects they don't support gracefully. How this is done is
97/// generally up to the regex engine. This crate generally treats unsupported
98/// anchored modes as an error to report for example, but for simplicity, in
99/// the meta regex engine, trying to search with an invalid pattern ID just
100/// results in no match being reported.
101#[derive(Clone)]
102pub struct Input<'h> {
103    haystack: &'h [u8],
104    span: Span,
105    anchored: Anchored,
106    earliest: bool,
107}
108
109impl<'h> Input<'h> {
110    /// Create a new search configuration for the given haystack.
111    #[inline]
112    pub fn new<H: ?Sized + AsRef<[u8]>>(haystack: &'h H) -> Input<'h> {
113        // Perform only one call to `haystack.as_ref()` to protect from incorrect
114        // implementations that return different values from multiple calls.
115        // This is important because there's code that relies on `span` not being
116        // out of bounds with respect to the stored `haystack`.
117        let haystack = haystack.as_ref();
118        Input {
119            haystack,
120            span: Span { start: 0, end: haystack.len() },
121            anchored: Anchored::No,
122            earliest: false,
123        }
124    }
125
126    /// Set the span for this search.
127    ///
128    /// This routine does not panic if the span given is not a valid range for
129    /// this search's haystack. If this search is run with an invalid range,
130    /// then the most likely outcome is that the actual search execution will
131    /// panic.
132    ///
133    /// This routine is generic over how a span is provided. While
134    /// a [`Span`] may be given directly, one may also provide a
135    /// `std::ops::Range<usize>`. To provide anything supported by range
136    /// syntax, use the [`Input::range`] method.
137    ///
138    /// The default span is the entire haystack.
139    ///
140    /// Note that [`Input::range`] overrides this method and vice versa.
141    ///
142    /// # Panics
143    ///
144    /// This panics if the given span does not correspond to valid bounds in
145    /// the haystack or the termination of a search.
146    ///
147    /// # Example
148    ///
149    /// This example shows how the span of the search can impact whether a
150    /// match is reported or not. This is particularly relevant for look-around
151    /// operators, which might take things outside of the span into account
152    /// when determining whether they match.
153    ///
154    /// ```
155    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156    /// use regex_automata::{
157    ///     nfa::thompson::pikevm::PikeVM,
158    ///     Match, Input,
159    /// };
160    ///
161    /// // Look for 'at', but as a distinct word.
162    /// let re = PikeVM::new(r"\bat\b")?;
163    /// let mut cache = re.create_cache();
164    /// let mut caps = re.create_captures();
165    ///
166    /// // Our haystack contains 'at', but not as a distinct word.
167    /// let haystack = "batter";
168    ///
169    /// // A standard search finds nothing, as expected.
170    /// let input = Input::new(haystack);
171    /// re.search(&mut cache, &input, &mut caps);
172    /// assert_eq!(None, caps.get_match());
173    ///
174    /// // But if we wanted to search starting at position '1', we might
175    /// // slice the haystack. If we do this, it's impossible for the \b
176    /// // anchors to take the surrounding context into account! And thus,
177    /// // a match is produced.
178    /// let input = Input::new(&haystack[1..3]);
179    /// re.search(&mut cache, &input, &mut caps);
180    /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
181    ///
182    /// // But if we specify the span of the search instead of slicing the
183    /// // haystack, then the regex engine can "see" outside of the span
184    /// // and resolve the anchors correctly.
185    /// let input = Input::new(haystack).span(1..3);
186    /// re.search(&mut cache, &input, &mut caps);
187    /// assert_eq!(None, caps.get_match());
188    ///
189    /// # Ok::<(), Box<dyn std::error::Error>>(())
190    /// ```
191    ///
192    /// This may seem a little ham-fisted, but this scenario tends to come up
193    /// if some other regex engine found the match span and now you need to
194    /// re-process that span to look for capturing groups. (e.g., Run a faster
195    /// DFA first, find a match, then run the PikeVM on just the match span to
196    /// resolve capturing groups.) In order to implement that sort of logic
197    /// correctly, you need to set the span on the search instead of slicing
198    /// the haystack directly.
199    ///
200    /// The other advantage of using this routine to specify the bounds of the
201    /// search is that the match offsets are still reported in terms of the
202    /// original haystack. For example, the second search in the example above
203    /// reported a match at position `0`, even though `at` starts at offset
204    /// `1` because we sliced the haystack.
205    #[inline]
206    pub fn span<S: Into<Span>>(mut self, span: S) -> Input<'h> {
207        self.set_span(span);
208        self
209    }
210
211    /// Like `Input::span`, but accepts any range instead.
212    ///
213    /// This routine does not panic if the range given is not a valid range for
214    /// this search's haystack. If this search is run with an invalid range,
215    /// then the most likely outcome is that the actual search execution will
216    /// panic.
217    ///
218    /// The default range is the entire haystack.
219    ///
220    /// Note that [`Input::span`] overrides this method and vice versa.
221    ///
222    /// # Panics
223    ///
224    /// This routine will panic if the given range could not be converted
225    /// to a valid [`Range`]. For example, this would panic when given
226    /// `0..=usize::MAX` since it cannot be represented using a half-open
227    /// interval in terms of `usize`.
228    ///
229    /// This also panics if the given range does not correspond to valid bounds
230    /// in the haystack or the termination of a search.
231    ///
232    /// # Example
233    ///
234    /// ```
235    /// use regex_automata::Input;
236    ///
237    /// let input = Input::new("foobar");
238    /// assert_eq!(0..6, input.get_range());
239    ///
240    /// let input = Input::new("foobar").range(2..=4);
241    /// assert_eq!(2..5, input.get_range());
242    /// ```
243    #[inline]
244    pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<'h> {
245        self.set_range(range);
246        self
247    }
248
249    /// Sets the anchor mode of a search.
250    ///
251    /// When a search is anchored (so that's [`Anchored::Yes`] or
252    /// [`Anchored::Pattern`]), a match must begin at the start of a search.
253    /// When a search is not anchored (that's [`Anchored::No`]), regex engines
254    /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
255    /// permits a match to appear anywhere.
256    ///
257    /// By default, the anchored mode is [`Anchored::No`].
258    ///
259    /// **WARNING:** this is subtly different than using a `^` at the start of
260    /// your regex. A `^` forces a regex to match exclusively at the start of
261    /// a haystack, regardless of where you begin your search. In contrast,
262    /// anchoring a search will allow your regex to match anywhere in your
263    /// haystack, but the match must start at the beginning of a search.
264    ///
265    /// For example, consider the haystack `aba` and the following searches:
266    ///
267    /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
268    ///    starting at position `2`. Since `^` requires the match to start at
269    ///    the beginning of the haystack and `2 > 0`, no match is found.
270    /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
271    ///    starting at position `2`. This reports a match at `[2, 3]` since
272    ///    the match starts where the search started. Since there is no `^`,
273    ///    there is no requirement for the match to start at the beginning of
274    ///    the haystack.
275    /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
276    ///    starting at position `1`. Since `b` corresponds to position `1` and
277    ///    since the search is anchored, it finds no match. While the regex
278    ///    matches at other positions, configuring the search to be anchored
279    ///    requires that it only report a match that begins at the same offset
280    ///    as the beginning of the search.
281    /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
282    ///    starting at position `1`. Since the search is not anchored and
283    ///    the regex does not start with `^`, the search executes as if there
284    ///    is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
285    ///    reports a match at `[2, 3]`.
286    ///
287    /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
288    /// except it only reports matches for a particular pattern.
289    ///
290    /// # Example
291    ///
292    /// This demonstrates the differences between an anchored search and
293    /// a pattern that begins with `^` (as described in the above warning
294    /// message).
295    ///
296    /// ```
297    /// use regex_automata::{
298    ///     nfa::thompson::pikevm::PikeVM,
299    ///     Anchored, Match, Input,
300    /// };
301    ///
302    /// let haystack = "aba";
303    ///
304    /// let re = PikeVM::new(r"^a")?;
305    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
306    /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No);
307    /// re.search(&mut cache, &input, &mut caps);
308    /// // No match is found because 2 is not the beginning of the haystack,
309    /// // which is what ^ requires.
310    /// assert_eq!(None, caps.get_match());
311    ///
312    /// let re = PikeVM::new(r"a")?;
313    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
314    /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes);
315    /// re.search(&mut cache, &input, &mut caps);
316    /// // An anchored search can still match anywhere in the haystack, it just
317    /// // must begin at the start of the search which is '2' in this case.
318    /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
319    ///
320    /// let re = PikeVM::new(r"a")?;
321    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
322    /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes);
323    /// re.search(&mut cache, &input, &mut caps);
324    /// // No match is found since we start searching at offset 1 which
325    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
326    /// // is found.
327    /// assert_eq!(None, caps.get_match());
328    ///
329    /// let re = PikeVM::new(r"a")?;
330    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
331    /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No);
332    /// re.search(&mut cache, &input, &mut caps);
333    /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
334    /// // pattern. Even though the search starts at 'b', the 'match anything'
335    /// // prefix allows the search to match 'a'.
336    /// let expected = Some(Match::must(0, 2..3));
337    /// assert_eq!(expected, caps.get_match());
338    ///
339    /// # Ok::<(), Box<dyn std::error::Error>>(())
340    /// ```
341    #[inline]
342    pub fn anchored(mut self, mode: Anchored) -> Input<'h> {
343        self.set_anchored(mode);
344        self
345    }
346
347    /// Whether to execute an "earliest" search or not.
348    ///
349    /// When running a non-overlapping search, an "earliest" search will return
350    /// the match location as early as possible. For example, given a pattern
351    /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search
352    /// will return `foo12345` as a match. But an "earliest" search for regex
353    /// engines that support "earliest" semantics will return `foo1` as a
354    /// match, since as soon as the first digit following `foo` is seen, it is
355    /// known to have found a match.
356    ///
357    /// Note that "earliest" semantics generally depend on the regex engine.
358    /// Different regex engines may determine there is a match at different
359    /// points. So there is no guarantee that "earliest" matches will always
360    /// return the same offsets for all regex engines. The "earliest" notion
361    /// is really about when the particular regex engine determines there is
362    /// a match rather than a consistent semantic unto itself. This is often
363    /// useful for implementing "did a match occur or not" predicates, but
364    /// sometimes the offset is useful as well.
365    ///
366    /// This is disabled by default.
367    ///
368    /// # Example
369    ///
370    /// This example shows the difference between "earliest" searching and
371    /// normal searching.
372    ///
373    /// ```
374    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
375    ///
376    /// let re = PikeVM::new(r"foo[0-9]+")?;
377    /// let mut cache = re.create_cache();
378    /// let mut caps = re.create_captures();
379    ///
380    /// // A normal search implements greediness like you expect.
381    /// let input = Input::new("foo12345");
382    /// re.search(&mut cache, &input, &mut caps);
383    /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
384    ///
385    /// // When 'earliest' is enabled and the regex engine supports
386    /// // it, the search will bail once it knows a match has been
387    /// // found.
388    /// let input = Input::new("foo12345").earliest(true);
389    /// re.search(&mut cache, &input, &mut caps);
390    /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
391    /// # Ok::<(), Box<dyn std::error::Error>>(())
392    /// ```
393    #[inline]
394    pub fn earliest(mut self, yes: bool) -> Input<'h> {
395        self.set_earliest(yes);
396        self
397    }
398
399    /// Set the span for this search configuration.
400    ///
401    /// This is like the [`Input::span`] method, except this mutates the
402    /// span in place.
403    ///
404    /// This routine is generic over how a span is provided. While
405    /// a [`Span`] may be given directly, one may also provide a
406    /// `std::ops::Range<usize>`.
407    ///
408    /// # Panics
409    ///
410    /// This panics if the given span does not correspond to valid bounds in
411    /// the haystack or the termination of a search.
412    ///
413    /// # Example
414    ///
415    /// ```
416    /// use regex_automata::Input;
417    ///
418    /// let mut input = Input::new("foobar");
419    /// assert_eq!(0..6, input.get_range());
420    /// input.set_span(2..4);
421    /// assert_eq!(2..4, input.get_range());
422    /// ```
423    #[inline]
424    pub fn set_span<S: Into<Span>>(&mut self, span: S) {
425        let span = span.into();
426        assert!(
427            span.end <= self.haystack.len()
428                && span.start <= span.end.wrapping_add(1),
429            "invalid span {:?} for haystack of length {}",
430            span,
431            self.haystack.len(),
432        );
433        self.span = span;
434    }
435
436    /// Set the span for this search configuration given any range.
437    ///
438    /// This is like the [`Input::range`] method, except this mutates the
439    /// span in place.
440    ///
441    /// This routine does not panic if the range given is not a valid range for
442    /// this search's haystack. If this search is run with an invalid range,
443    /// then the most likely outcome is that the actual search execution will
444    /// panic.
445    ///
446    /// # Panics
447    ///
448    /// This routine will panic if the given range could not be converted
449    /// to a valid [`Range`]. For example, this would panic when given
450    /// `0..=usize::MAX` since it cannot be represented using a half-open
451    /// interval in terms of `usize`.
452    ///
453    /// This also panics if the given span does not correspond to valid bounds
454    /// in the haystack or the termination of a search.
455    ///
456    /// # Example
457    ///
458    /// ```
459    /// use regex_automata::Input;
460    ///
461    /// let mut input = Input::new("foobar");
462    /// assert_eq!(0..6, input.get_range());
463    /// input.set_range(2..=4);
464    /// assert_eq!(2..5, input.get_range());
465    /// ```
466    #[inline]
467    pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
468        use core::ops::Bound;
469
470        // It's a little weird to convert ranges into spans, and then spans
471        // back into ranges when we actually slice the haystack. Because
472        // of that process, we always represent everything as a half-open
473        // internal. Therefore, handling things like m..=n is a little awkward.
474        let start = match range.start_bound() {
475            Bound::Included(&i) => i,
476            // Can this case ever happen? Range syntax doesn't support it...
477            Bound::Excluded(&i) => i.checked_add(1).unwrap(),
478            Bound::Unbounded => 0,
479        };
480        let end = match range.end_bound() {
481            Bound::Included(&i) => i.checked_add(1).unwrap(),
482            Bound::Excluded(&i) => i,
483            Bound::Unbounded => self.haystack().len(),
484        };
485        self.set_span(Span { start, end });
486    }
487
488    /// Set the starting offset for the span for this search configuration.
489    ///
490    /// This is a convenience routine for only mutating the start of a span
491    /// without having to set the entire span.
492    ///
493    /// # Panics
494    ///
495    /// This panics if the span resulting from the new start position does not
496    /// correspond to valid bounds in the haystack or the termination of a
497    /// search.
498    ///
499    /// # Example
500    ///
501    /// ```
502    /// use regex_automata::Input;
503    ///
504    /// let mut input = Input::new("foobar");
505    /// assert_eq!(0..6, input.get_range());
506    /// input.set_start(5);
507    /// assert_eq!(5..6, input.get_range());
508    /// ```
509    #[inline]
510    pub fn set_start(&mut self, start: usize) {
511        self.set_span(Span { start, ..self.get_span() });
512    }
513
514    /// Set the ending offset for the span for this search configuration.
515    ///
516    /// This is a convenience routine for only mutating the end of a span
517    /// without having to set the entire span.
518    ///
519    /// # Panics
520    ///
521    /// This panics if the span resulting from the new end position does not
522    /// correspond to valid bounds in the haystack or the termination of a
523    /// search.
524    ///
525    /// # Example
526    ///
527    /// ```
528    /// use regex_automata::Input;
529    ///
530    /// let mut input = Input::new("foobar");
531    /// assert_eq!(0..6, input.get_range());
532    /// input.set_end(5);
533    /// assert_eq!(0..5, input.get_range());
534    /// ```
535    #[inline]
536    pub fn set_end(&mut self, end: usize) {
537        self.set_span(Span { end, ..self.get_span() });
538    }
539
540    /// Set the anchor mode of a search.
541    ///
542    /// This is like [`Input::anchored`], except it mutates the search
543    /// configuration in place.
544    ///
545    /// # Example
546    ///
547    /// ```
548    /// use regex_automata::{Anchored, Input, PatternID};
549    ///
550    /// let mut input = Input::new("foobar");
551    /// assert_eq!(Anchored::No, input.get_anchored());
552    ///
553    /// let pid = PatternID::must(5);
554    /// input.set_anchored(Anchored::Pattern(pid));
555    /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
556    /// ```
557    #[inline]
558    pub fn set_anchored(&mut self, mode: Anchored) {
559        self.anchored = mode;
560    }
561
562    /// Set whether the search should execute in "earliest" mode or not.
563    ///
564    /// This is like [`Input::earliest`], except it mutates the search
565    /// configuration in place.
566    ///
567    /// # Example
568    ///
569    /// ```
570    /// use regex_automata::Input;
571    ///
572    /// let mut input = Input::new("foobar");
573    /// assert!(!input.get_earliest());
574    /// input.set_earliest(true);
575    /// assert!(input.get_earliest());
576    /// ```
577    #[inline]
578    pub fn set_earliest(&mut self, yes: bool) {
579        self.earliest = yes;
580    }
581
582    /// Return a borrow of the underlying haystack as a slice of bytes.
583    ///
584    /// # Example
585    ///
586    /// ```
587    /// use regex_automata::Input;
588    ///
589    /// let input = Input::new("foobar");
590    /// assert_eq!(b"foobar", input.haystack());
591    /// ```
592    #[inline]
593    pub fn haystack(&self) -> &[u8] {
594        self.haystack
595    }
596
597    /// Return the start position of this search.
598    ///
599    /// This is a convenience routine for `search.get_span().start()`.
600    ///
601    /// When [`Input::is_done`] is `false`, this is guaranteed to return
602    /// an offset that is less than or equal to [`Input::end`]. Otherwise,
603    /// the offset is one greater than [`Input::end`].
604    ///
605    /// # Example
606    ///
607    /// ```
608    /// use regex_automata::Input;
609    ///
610    /// let input = Input::new("foobar");
611    /// assert_eq!(0, input.start());
612    ///
613    /// let input = Input::new("foobar").span(2..4);
614    /// assert_eq!(2, input.start());
615    /// ```
616    #[inline]
617    pub fn start(&self) -> usize {
618        self.get_span().start
619    }
620
621    /// Return the end position of this search.
622    ///
623    /// This is a convenience routine for `search.get_span().end()`.
624    ///
625    /// This is guaranteed to return an offset that is a valid exclusive end
626    /// bound for this input's haystack.
627    ///
628    /// # Example
629    ///
630    /// ```
631    /// use regex_automata::Input;
632    ///
633    /// let input = Input::new("foobar");
634    /// assert_eq!(6, input.end());
635    ///
636    /// let input = Input::new("foobar").span(2..4);
637    /// assert_eq!(4, input.end());
638    /// ```
639    #[inline]
640    pub fn end(&self) -> usize {
641        self.get_span().end
642    }
643
644    /// Return the span for this search configuration.
645    ///
646    /// If one was not explicitly set, then the span corresponds to the entire
647    /// range of the haystack.
648    ///
649    /// When [`Input::is_done`] is `false`, the span returned is guaranteed
650    /// to correspond to valid bounds for this input's haystack.
651    ///
652    /// # Example
653    ///
654    /// ```
655    /// use regex_automata::{Input, Span};
656    ///
657    /// let input = Input::new("foobar");
658    /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
659    /// ```
660    #[inline]
661    pub fn get_span(&self) -> Span {
662        self.span
663    }
664
665    /// Return the span as a range for this search configuration.
666    ///
667    /// If one was not explicitly set, then the span corresponds to the entire
668    /// range of the haystack.
669    ///
670    /// When [`Input::is_done`] is `false`, the range returned is guaranteed
671    /// to correspond to valid bounds for this input's haystack.
672    ///
673    /// # Example
674    ///
675    /// ```
676    /// use regex_automata::Input;
677    ///
678    /// let input = Input::new("foobar");
679    /// assert_eq!(0..6, input.get_range());
680    /// ```
681    #[inline]
682    pub fn get_range(&self) -> Range<usize> {
683        self.get_span().range()
684    }
685
686    /// Return the anchored mode for this search configuration.
687    ///
688    /// If no anchored mode was set, then it defaults to [`Anchored::No`].
689    ///
690    /// # Example
691    ///
692    /// ```
693    /// use regex_automata::{Anchored, Input, PatternID};
694    ///
695    /// let mut input = Input::new("foobar");
696    /// assert_eq!(Anchored::No, input.get_anchored());
697    ///
698    /// let pid = PatternID::must(5);
699    /// input.set_anchored(Anchored::Pattern(pid));
700    /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
701    /// ```
702    #[inline]
703    pub fn get_anchored(&self) -> Anchored {
704        self.anchored
705    }
706
707    /// Return whether this search should execute in "earliest" mode.
708    ///
709    /// # Example
710    ///
711    /// ```
712    /// use regex_automata::Input;
713    ///
714    /// let input = Input::new("foobar");
715    /// assert!(!input.get_earliest());
716    /// ```
717    #[inline]
718    pub fn get_earliest(&self) -> bool {
719        self.earliest
720    }
721
722    /// Return true if and only if this search can never return any other
723    /// matches.
724    ///
725    /// This occurs when the start position of this search is greater than the
726    /// end position of the search.
727    ///
728    /// # Example
729    ///
730    /// ```
731    /// use regex_automata::Input;
732    ///
733    /// let mut input = Input::new("foobar");
734    /// assert!(!input.is_done());
735    /// input.set_start(6);
736    /// assert!(!input.is_done());
737    /// input.set_start(7);
738    /// assert!(input.is_done());
739    /// ```
740    #[inline]
741    pub fn is_done(&self) -> bool {
742        self.get_span().start > self.get_span().end
743    }
744
745    /// Returns true if and only if the given offset in this search's haystack
746    /// falls on a valid UTF-8 encoded codepoint boundary.
747    ///
748    /// If the haystack is not valid UTF-8, then the behavior of this routine
749    /// is unspecified.
750    ///
751    /// # Example
752    ///
753    /// This shows where codepoint boundaries do and don't exist in valid
754    /// UTF-8.
755    ///
756    /// ```
757    /// use regex_automata::Input;
758    ///
759    /// let input = Input::new("☃");
760    /// assert!(input.is_char_boundary(0));
761    /// assert!(!input.is_char_boundary(1));
762    /// assert!(!input.is_char_boundary(2));
763    /// assert!(input.is_char_boundary(3));
764    /// assert!(!input.is_char_boundary(4));
765    /// ```
766    #[inline]
767    pub fn is_char_boundary(&self, offset: usize) -> bool {
768        utf8::is_boundary(self.haystack(), offset)
769    }
770}
771
772impl<'h> core::fmt::Debug for Input<'h> {
773    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
774        use crate::util::escape::DebugHaystack;
775
776        f.debug_struct("Input")
777            .field("haystack", &DebugHaystack(self.haystack()))
778            .field("span", &self.span)
779            .field("anchored", &self.anchored)
780            .field("earliest", &self.earliest)
781            .finish()
782    }
783}
784
785impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> {
786    fn from(haystack: &'h H) -> Input<'h> {
787        Input::new(haystack)
788    }
789}
790
791/// A representation of a span reported by a regex engine.
792///
793/// A span corresponds to the starting and ending _byte offsets_ of a
794/// contiguous region of bytes. The starting offset is inclusive while the
795/// ending offset is exclusive. That is, a span is a half-open interval.
796///
797/// A span is used to report the offsets of a match, but it is also used to
798/// convey which region of a haystack should be searched via routines like
799/// [`Input::span`].
800///
801/// This is basically equivalent to a `std::ops::Range<usize>`, except this
802/// type implements `Copy` which makes it more ergonomic to use in the context
803/// of this crate. Like a range, this implements `Index` for `[u8]` and `str`,
804/// and `IndexMut` for `[u8]`. For convenience, this also impls `From<Range>`,
805/// which means things like `Span::from(5..10)` work.
806#[derive(Clone, Copy, Eq, Hash, PartialEq)]
807pub struct Span {
808    /// The start offset of the span, inclusive.
809    pub start: usize,
810    /// The end offset of the span, exclusive.
811    pub end: usize,
812}
813
814impl Span {
815    /// Returns this span as a range.
816    #[inline]
817    pub fn range(&self) -> Range<usize> {
818        Range::from(*self)
819    }
820
821    /// Returns true when this span is empty. That is, when `start >= end`.
822    #[inline]
823    pub fn is_empty(&self) -> bool {
824        self.start >= self.end
825    }
826
827    /// Returns the length of this span.
828    ///
829    /// This returns `0` in precisely the cases that `is_empty` returns `true`.
830    #[inline]
831    pub fn len(&self) -> usize {
832        self.end.saturating_sub(self.start)
833    }
834
835    /// Returns true when the given offset is contained within this span.
836    ///
837    /// Note that an empty span contains no offsets and will always return
838    /// false.
839    #[inline]
840    pub fn contains(&self, offset: usize) -> bool {
841        !self.is_empty() && self.start <= offset && offset <= self.end
842    }
843
844    /// Returns a new span with `offset` added to this span's `start` and `end`
845    /// values.
846    #[inline]
847    pub fn offset(&self, offset: usize) -> Span {
848        Span { start: self.start + offset, end: self.end + offset }
849    }
850}
851
852impl core::fmt::Debug for Span {
853    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
854        write!(f, "{}..{}", self.start, self.end)
855    }
856}
857
858impl core::ops::Index<Span> for [u8] {
859    type Output = [u8];
860
861    #[inline]
862    fn index(&self, index: Span) -> &[u8] {
863        &self[index.range()]
864    }
865}
866
867impl core::ops::IndexMut<Span> for [u8] {
868    #[inline]
869    fn index_mut(&mut self, index: Span) -> &mut [u8] {
870        &mut self[index.range()]
871    }
872}
873
874impl core::ops::Index<Span> for str {
875    type Output = str;
876
877    #[inline]
878    fn index(&self, index: Span) -> &str {
879        &self[index.range()]
880    }
881}
882
883impl From<Range<usize>> for Span {
884    #[inline]
885    fn from(range: Range<usize>) -> Span {
886        Span { start: range.start, end: range.end }
887    }
888}
889
890impl From<Span> for Range<usize> {
891    #[inline]
892    fn from(span: Span) -> Range<usize> {
893        Range { start: span.start, end: span.end }
894    }
895}
896
897impl PartialEq<Range<usize>> for Span {
898    #[inline]
899    fn eq(&self, range: &Range<usize>) -> bool {
900        self.start == range.start && self.end == range.end
901    }
902}
903
904impl PartialEq<Span> for Range<usize> {
905    #[inline]
906    fn eq(&self, span: &Span) -> bool {
907        self.start == span.start && self.end == span.end
908    }
909}
910
911/// A representation of "half" of a match reported by a DFA.
912///
913/// This is called a "half" match because it only includes the end location (or
914/// start location for a reverse search) of a match. This corresponds to the
915/// information that a single DFA scan can report. Getting the other half of
916/// the match requires a second scan with a reversed DFA.
917///
918/// A half match also includes the pattern that matched. The pattern is
919/// identified by an ID, which corresponds to its position (starting from `0`)
920/// relative to other patterns used to construct the corresponding DFA. If only
921/// a single pattern is provided to the DFA, then all matches are guaranteed to
922/// have a pattern ID of `0`.
923#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
924pub struct HalfMatch {
925    /// The pattern ID.
926    pattern: PatternID,
927    /// The offset of the match.
928    ///
929    /// For forward searches, the offset is exclusive. For reverse searches,
930    /// the offset is inclusive.
931    offset: usize,
932}
933
934impl HalfMatch {
935    /// Create a new half match from a pattern ID and a byte offset.
936    #[inline]
937    pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
938        HalfMatch { pattern, offset }
939    }
940
941    /// Create a new half match from a pattern ID and a byte offset.
942    ///
943    /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
944    /// [`PatternID`]. This panics if the given `usize` is not representable
945    /// as a `PatternID`.
946    #[inline]
947    pub fn must(pattern: usize, offset: usize) -> HalfMatch {
948        HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
949    }
950
951    /// Returns the ID of the pattern that matched.
952    ///
953    /// The ID of a pattern is derived from the position in which it was
954    /// originally inserted into the corresponding DFA. The first pattern has
955    /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
956    #[inline]
957    pub fn pattern(&self) -> PatternID {
958        self.pattern
959    }
960
961    /// The position of the match.
962    ///
963    /// If this match was produced by a forward search, then the offset is
964    /// exclusive. If this match was produced by a reverse search, then the
965    /// offset is inclusive.
966    #[inline]
967    pub fn offset(&self) -> usize {
968        self.offset
969    }
970}
971
972/// A representation of a match reported by a regex engine.
973///
974/// A match has two essential pieces of information: the [`PatternID`] that
975/// matches, and the [`Span`] of the match in a haystack.
976///
977/// The pattern is identified by an ID, which corresponds to its position
978/// (starting from `0`) relative to other patterns used to construct the
979/// corresponding regex engine. If only a single pattern is provided, then all
980/// matches are guaranteed to have a pattern ID of `0`.
981///
982/// Every match reported by a regex engine guarantees that its span has its
983/// start offset as less than or equal to its end offset.
984#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
985pub struct Match {
986    /// The pattern ID.
987    pattern: PatternID,
988    /// The underlying match span.
989    span: Span,
990}
991
992impl Match {
993    /// Create a new match from a pattern ID and a span.
994    ///
995    /// This constructor is generic over how a span is provided. While
996    /// a [`Span`] may be given directly, one may also provide a
997    /// `std::ops::Range<usize>`.
998    ///
999    /// # Panics
1000    ///
1001    /// This panics if `end < start`.
1002    ///
1003    /// # Example
1004    ///
1005    /// This shows how to create a match for the first pattern in a regex
1006    /// object using convenient range syntax.
1007    ///
1008    /// ```
1009    /// use regex_automata::{Match, PatternID};
1010    ///
1011    /// let m = Match::new(PatternID::ZERO, 5..10);
1012    /// assert_eq!(0, m.pattern().as_usize());
1013    /// assert_eq!(5, m.start());
1014    /// assert_eq!(10, m.end());
1015    /// ```
1016    #[inline]
1017    pub fn new<S: Into<Span>>(pattern: PatternID, span: S) -> Match {
1018        let span: Span = span.into();
1019        assert!(span.start <= span.end, "invalid match span");
1020        Match { pattern, span }
1021    }
1022
1023    /// Create a new match from a pattern ID and a byte offset span.
1024    ///
1025    /// This constructor is generic over how a span is provided. While
1026    /// a [`Span`] may be given directly, one may also provide a
1027    /// `std::ops::Range<usize>`.
1028    ///
1029    /// This is like [`Match::new`], but accepts a `usize` instead of a
1030    /// [`PatternID`]. This panics if the given `usize` is not representable
1031    /// as a `PatternID`.
1032    ///
1033    /// # Panics
1034    ///
1035    /// This panics if `end < start` or if `pattern > PatternID::MAX`.
1036    ///
1037    /// # Example
1038    ///
1039    /// This shows how to create a match for the third pattern in a regex
1040    /// object using convenient range syntax.
1041    ///
1042    /// ```
1043    /// use regex_automata::Match;
1044    ///
1045    /// let m = Match::must(3, 5..10);
1046    /// assert_eq!(3, m.pattern().as_usize());
1047    /// assert_eq!(5, m.start());
1048    /// assert_eq!(10, m.end());
1049    /// ```
1050    #[inline]
1051    pub fn must<S: Into<Span>>(pattern: usize, span: S) -> Match {
1052        Match::new(PatternID::must(pattern), span)
1053    }
1054
1055    /// Returns the ID of the pattern that matched.
1056    ///
1057    /// The ID of a pattern is derived from the position in which it was
1058    /// originally inserted into the corresponding regex engine. The first
1059    /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
1060    /// so on.
1061    #[inline]
1062    pub fn pattern(&self) -> PatternID {
1063        self.pattern
1064    }
1065
1066    /// The starting position of the match.
1067    ///
1068    /// This is a convenience routine for `Match::span().start`.
1069    #[inline]
1070    pub fn start(&self) -> usize {
1071        self.span().start
1072    }
1073
1074    /// The ending position of the match.
1075    ///
1076    /// This is a convenience routine for `Match::span().end`.
1077    #[inline]
1078    pub fn end(&self) -> usize {
1079        self.span().end
1080    }
1081
1082    /// Returns the match span as a range.
1083    ///
1084    /// This is a convenience routine for `Match::span().range()`.
1085    #[inline]
1086    pub fn range(&self) -> core::ops::Range<usize> {
1087        self.span().range()
1088    }
1089
1090    /// Returns the span for this match.
1091    #[inline]
1092    pub fn span(&self) -> Span {
1093        self.span
1094    }
1095
1096    /// Returns true when the span in this match is empty.
1097    ///
1098    /// An empty match can only be returned when the regex itself can match
1099    /// the empty string.
1100    #[inline]
1101    pub fn is_empty(&self) -> bool {
1102        self.span().is_empty()
1103    }
1104
1105    /// Returns the length of this match.
1106    ///
1107    /// This returns `0` in precisely the cases that `is_empty` returns `true`.
1108    #[inline]
1109    pub fn len(&self) -> usize {
1110        self.span().len()
1111    }
1112}
1113
1114/// A set of `PatternID`s.
1115///
1116/// A set of pattern identifiers is useful for recording which patterns have
1117/// matched a particular haystack. A pattern set _only_ includes pattern
1118/// identifiers. It does not include offset information.
1119///
1120/// # Example
1121///
1122/// This shows basic usage of a set.
1123///
1124/// ```
1125/// use regex_automata::{PatternID, PatternSet};
1126///
1127/// let pid1 = PatternID::must(5);
1128/// let pid2 = PatternID::must(8);
1129/// // Create a new empty set.
1130/// let mut set = PatternSet::new(10);
1131/// // Insert pattern IDs.
1132/// set.insert(pid1);
1133/// set.insert(pid2);
1134/// // Test membership.
1135/// assert!(set.contains(pid1));
1136/// assert!(set.contains(pid2));
1137/// // Get all members.
1138/// assert_eq!(
1139///     vec![5, 8],
1140///     set.iter().map(|p| p.as_usize()).collect::<Vec<usize>>(),
1141/// );
1142/// // Clear the set.
1143/// set.clear();
1144/// // Test that it is indeed empty.
1145/// assert!(set.is_empty());
1146/// ```
1147#[cfg(feature = "alloc")]
1148#[derive(Clone, Debug, Eq, PartialEq)]
1149pub struct PatternSet {
1150    /// The number of patterns set to 'true' in this set.
1151    len: usize,
1152    /// A map from PatternID to boolean of whether a pattern matches or not.
1153    ///
1154    /// This should probably be a bitset, but it's probably unlikely to matter
1155    /// much in practice.
1156    ///
1157    /// The main downside of this representation (and similarly for a bitset)
1158    /// is that iteration scales with the capacity of the set instead of
1159    /// the length of the set. This doesn't seem likely to be a problem in
1160    /// practice.
1161    ///
1162    /// Another alternative is to just use a 'SparseSet' for this. It does use
1163    /// more memory (quite a bit more), but that seems fine I think compared
1164    /// to the memory being used by the regex engine. The real hiccup with
1165    /// it is that it yields pattern IDs in the order they were inserted.
1166    /// Which is actually kind of nice, but at the time of writing, pattern
1167    /// IDs are yielded in ascending order in the regex crate RegexSet API.
1168    /// If we did change to 'SparseSet', we could provide an additional
1169    /// 'iter_match_order' iterator, but keep the ascending order one for
1170    /// compatibility.
1171    which: alloc::boxed::Box<[bool]>,
1172}
1173
1174#[cfg(feature = "alloc")]
1175impl PatternSet {
1176    /// Create a new set of pattern identifiers with the given capacity.
1177    ///
1178    /// The given capacity typically corresponds to (at least) the number of
1179    /// patterns in a compiled regex object.
1180    ///
1181    /// # Panics
1182    ///
1183    /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is
1184    /// impossible if you use the `pattern_len()` method as defined on any of
1185    /// the regex engines in this crate. Namely, a regex will fail to build by
1186    /// returning an error if the number of patterns given to it exceeds the
1187    /// limit. Therefore, the number of patterns in a valid regex is always
1188    /// a correct capacity to provide here.
1189    pub fn new(capacity: usize) -> PatternSet {
1190        assert!(
1191            capacity <= PatternID::LIMIT,
1192            "pattern set capacity exceeds limit of {}",
1193            PatternID::LIMIT,
1194        );
1195        PatternSet {
1196            len: 0,
1197            which: alloc::vec![false; capacity].into_boxed_slice(),
1198        }
1199    }
1200
1201    /// Clear this set such that it contains no pattern IDs.
1202    pub fn clear(&mut self) {
1203        self.len = 0;
1204        for matched in self.which.iter_mut() {
1205            *matched = false;
1206        }
1207    }
1208
1209    /// Return true if and only if the given pattern identifier is in this set.
1210    pub fn contains(&self, pid: PatternID) -> bool {
1211        pid.as_usize() < self.capacity() && self.which[pid]
1212    }
1213
1214    /// Insert the given pattern identifier into this set and return `true` if
1215    /// the given pattern ID was not previously in this set.
1216    ///
1217    /// If the pattern identifier is already in this set, then this is a no-op.
1218    ///
1219    /// Use [`PatternSet::try_insert`] for a fallible version of this routine.
1220    ///
1221    /// # Panics
1222    ///
1223    /// This panics if this pattern set has insufficient capacity to
1224    /// store the given pattern ID.
1225    pub fn insert(&mut self, pid: PatternID) -> bool {
1226        self.try_insert(pid)
1227            .expect("PatternSet should have sufficient capacity")
1228    }
1229
1230    /// Insert the given pattern identifier into this set and return `true` if
1231    /// the given pattern ID was not previously in this set.
1232    ///
1233    /// If the pattern identifier is already in this set, then this is a no-op.
1234    ///
1235    /// # Errors
1236    ///
1237    /// This returns an error if this pattern set has insufficient capacity to
1238    /// store the given pattern ID.
1239    pub fn try_insert(
1240        &mut self,
1241        pid: PatternID,
1242    ) -> Result<bool, PatternSetInsertError> {
1243        if pid.as_usize() >= self.capacity() {
1244            return Err(PatternSetInsertError {
1245                attempted: pid,
1246                capacity: self.capacity(),
1247            });
1248        }
1249        if self.which[pid] {
1250            return Ok(false);
1251        }
1252        self.len += 1;
1253        self.which[pid] = true;
1254        Ok(true)
1255    }
1256
1257    /*
1258    // This is currently commented out because it is unused and it is unclear
1259    // whether it's useful or not. What's the harm in having it? When, if
1260    // we ever wanted to change our representation to a 'SparseSet', then
1261    // supporting this method would be a bit tricky. So in order to keep some
1262    // API evolution flexibility, we leave it out for now.
1263
1264    /// Remove the given pattern identifier from this set.
1265    ///
1266    /// If the pattern identifier was not previously in this set, then this
1267    /// does not change the set and returns `false`.
1268    ///
1269    /// # Panics
1270    ///
1271    /// This panics if `pid` exceeds the capacity of this set.
1272    pub fn remove(&mut self, pid: PatternID) -> bool {
1273        if !self.which[pid] {
1274            return false;
1275        }
1276        self.len -= 1;
1277        self.which[pid] = false;
1278        true
1279    }
1280    */
1281
1282    /// Return true if and only if this set has no pattern identifiers in it.
1283    pub fn is_empty(&self) -> bool {
1284        self.len() == 0
1285    }
1286
1287    /// Return true if and only if this set has the maximum number of pattern
1288    /// identifiers in the set. This occurs precisely when `PatternSet::len()
1289    /// == PatternSet::capacity()`.
1290    ///
1291    /// This particular property is useful to test because it may allow one to
1292    /// stop a search earlier than you might otherwise. Namely, if a search is
1293    /// only reporting which patterns match a haystack and if you know all of
1294    /// the patterns match at a given point, then there's no new information
1295    /// that can be learned by continuing the search. (Because a pattern set
1296    /// does not keep track of offset information.)
1297    pub fn is_full(&self) -> bool {
1298        self.len() == self.capacity()
1299    }
1300
1301    /// Returns the total number of pattern identifiers in this set.
1302    pub fn len(&self) -> usize {
1303        self.len
1304    }
1305
1306    /// Returns the total number of pattern identifiers that may be stored
1307    /// in this set.
1308    ///
1309    /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`].
1310    ///
1311    /// Typically, the capacity of a pattern set matches the number of patterns
1312    /// in a regex object with which you are searching.
1313    pub fn capacity(&self) -> usize {
1314        self.which.len()
1315    }
1316
1317    /// Returns an iterator over all pattern identifiers in this set.
1318    ///
1319    /// The iterator yields pattern identifiers in ascending order, starting
1320    /// at zero.
1321    pub fn iter(&self) -> PatternSetIter<'_> {
1322        PatternSetIter { it: self.which.iter().enumerate() }
1323    }
1324}
1325
1326/// An error that occurs when a `PatternID` failed to insert into a
1327/// `PatternSet`.
1328///
1329/// An insert fails when the given `PatternID` exceeds the configured capacity
1330/// of the `PatternSet`.
1331///
1332/// This error is created by the [`PatternSet::try_insert`] routine.
1333#[cfg(feature = "alloc")]
1334#[derive(Clone, Debug)]
1335pub struct PatternSetInsertError {
1336    attempted: PatternID,
1337    capacity: usize,
1338}
1339
1340#[cfg(feature = "std")]
1341impl std::error::Error for PatternSetInsertError {}
1342
1343#[cfg(feature = "alloc")]
1344impl core::fmt::Display for PatternSetInsertError {
1345    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1346        write!(
1347            f,
1348            "failed to insert pattern ID {} into pattern set \
1349             with insufficiet capacity of {}",
1350            self.attempted.as_usize(),
1351            self.capacity,
1352        )
1353    }
1354}
1355
1356/// An iterator over all pattern identifiers in a [`PatternSet`].
1357///
1358/// The lifetime parameter `'a` refers to the lifetime of the pattern set being
1359/// iterated over.
1360///
1361/// This iterator is created by the [`PatternSet::iter`] method.
1362#[cfg(feature = "alloc")]
1363#[derive(Clone, Debug)]
1364pub struct PatternSetIter<'a> {
1365    it: core::iter::Enumerate<core::slice::Iter<'a, bool>>,
1366}
1367
1368#[cfg(feature = "alloc")]
1369impl<'a> Iterator for PatternSetIter<'a> {
1370    type Item = PatternID;
1371
1372    fn next(&mut self) -> Option<PatternID> {
1373        while let Some((index, &yes)) = self.it.next() {
1374            if yes {
1375                // Only valid 'PatternID' values can be inserted into the set
1376                // and construction of the set panics if the capacity would
1377                // permit storing invalid pattern IDs. Thus, 'yes' is only true
1378                // precisely when 'index' corresponds to a valid 'PatternID'.
1379                return Some(PatternID::new_unchecked(index));
1380            }
1381        }
1382        None
1383    }
1384
1385    fn size_hint(&self) -> (usize, Option<usize>) {
1386        self.it.size_hint()
1387    }
1388}
1389
1390#[cfg(feature = "alloc")]
1391impl<'a> DoubleEndedIterator for PatternSetIter<'a> {
1392    fn next_back(&mut self) -> Option<PatternID> {
1393        while let Some((index, &yes)) = self.it.next_back() {
1394            if yes {
1395                // Only valid 'PatternID' values can be inserted into the set
1396                // and construction of the set panics if the capacity would
1397                // permit storing invalid pattern IDs. Thus, 'yes' is only true
1398                // precisely when 'index' corresponds to a valid 'PatternID'.
1399                return Some(PatternID::new_unchecked(index));
1400            }
1401        }
1402        None
1403    }
1404}
1405
1406/// The type of anchored search to perform.
1407///
1408/// This is *almost* a boolean option. That is, you can either do an unanchored
1409/// search for any pattern in a regex, or you can do an anchored search for any
1410/// pattern in a regex.
1411///
1412/// A third option exists that, assuming the regex engine supports it, permits
1413/// you to do an anchored search for a specific pattern.
1414///
1415/// Note that there is no way to run an unanchored search for a specific
1416/// pattern. If you need that, you'll need to build separate regexes for each
1417/// pattern.
1418///
1419/// # Errors
1420///
1421/// If a regex engine does not support the anchored mode selected, then the
1422/// regex engine will return an error. While any non-trivial regex engine
1423/// should support at least one of the available anchored modes, there is no
1424/// singular mode that is guaranteed to be universally supported. Some regex
1425/// engines might only support unanchored searches (DFAs compiled without
1426/// anchored starting states) and some regex engines might only support
1427/// anchored searches (like the one-pass DFA).
1428///
1429/// The specific error returned is a [`MatchError`] with a
1430/// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the
1431/// `Anchored` value given that is unsupported.
1432///
1433/// Note that regex engines should report "no match" if, for example, an
1434/// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where
1435/// anchored searches for a specific pattern are supported. This is smooths out
1436/// behavior such that it's possible to guarantee that an error never occurs
1437/// based on how the regex engine is configured. All regex engines in this
1438/// crate report "no match" when searching for an invalid pattern ID, but where
1439/// searching for a valid pattern ID is otherwise supported.
1440///
1441/// # Example
1442///
1443/// This example shows how to use the various `Anchored` modes to run a
1444/// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)
1445/// because it supports all modes unconditionally. Some regex engines, like
1446/// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored
1447/// searches.
1448///
1449/// ```
1450/// # if cfg!(miri) { return Ok(()); } // miri takes too long
1451/// use regex_automata::{
1452///     nfa::thompson::pikevm::PikeVM,
1453///     Anchored, Input, Match, PatternID,
1454/// };
1455///
1456/// let re = PikeVM::new_many(&[
1457///     r"Mrs. \w+",
1458///     r"Miss \w+",
1459///     r"Mr. \w+",
1460///     r"Ms. \w+",
1461/// ])?;
1462/// let mut cache = re.create_cache();
1463/// let hay = "Hello Mr. Springsteen!";
1464///
1465/// // The default is to do an unanchored search.
1466/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
1467/// // Explicitly ask for an unanchored search. Same as above.
1468/// let input = Input::new(hay).anchored(Anchored::No);
1469/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay));
1470///
1471/// // Now try an anchored search. Since the match doesn't start at the
1472/// // beginning of the haystack, no match is found!
1473/// let input = Input::new(hay).anchored(Anchored::Yes);
1474/// assert_eq!(None, re.find(&mut cache, input));
1475///
1476/// // We can try an anchored search again, but move the location of where
1477/// // we start the search. Note that the offsets reported are still in
1478/// // terms of the overall haystack and not relative to where we started
1479/// // the search.
1480/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
1481/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
1482///
1483/// // Now try an anchored search for a specific pattern. We specifically
1484/// // choose a pattern that we know doesn't match to prove that the search
1485/// // only looks for the pattern we provide.
1486/// let input = Input::new(hay)
1487///     .anchored(Anchored::Pattern(PatternID::must(1)))
1488///     .range(6..);
1489/// assert_eq!(None, re.find(&mut cache, input));
1490///
1491/// // But if we switch it to the pattern that we know matches, then we find
1492/// // the match.
1493/// let input = Input::new(hay)
1494///     .anchored(Anchored::Pattern(PatternID::must(2)))
1495///     .range(6..);
1496/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
1497///
1498/// # Ok::<(), Box<dyn std::error::Error>>(())
1499/// ```
1500#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1501pub enum Anchored {
1502    /// Run an unanchored search. This means a match may occur anywhere at or
1503    /// after the start position of the search.
1504    ///
1505    /// This search can return a match for any pattern in the regex.
1506    No,
1507    /// Run an anchored search. This means that a match must begin at the
1508    /// start position of the search.
1509    ///
1510    /// This search can return a match for any pattern in the regex.
1511    Yes,
1512    /// Run an anchored search for a specific pattern. This means that a match
1513    /// must be for the given pattern and must begin at the start position of
1514    /// the search.
1515    Pattern(PatternID),
1516}
1517
1518impl Anchored {
1519    /// Returns true if and only if this anchor mode corresponds to any kind of
1520    /// anchored search.
1521    ///
1522    /// # Example
1523    ///
1524    /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern`
1525    /// are considered anchored searches.
1526    ///
1527    /// ```
1528    /// use regex_automata::{Anchored, PatternID};
1529    ///
1530    /// assert!(!Anchored::No.is_anchored());
1531    /// assert!(Anchored::Yes.is_anchored());
1532    /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored());
1533    /// ```
1534    #[inline]
1535    pub fn is_anchored(&self) -> bool {
1536        matches!(*self, Anchored::Yes | Anchored::Pattern(_))
1537    }
1538
1539    /// Returns the pattern ID associated with this configuration if it is an
1540    /// anchored search for a specific pattern. Otherwise `None` is returned.
1541    ///
1542    /// # Example
1543    ///
1544    /// ```
1545    /// use regex_automata::{Anchored, PatternID};
1546    ///
1547    /// assert_eq!(None, Anchored::No.pattern());
1548    /// assert_eq!(None, Anchored::Yes.pattern());
1549    ///
1550    /// let pid = PatternID::must(5);
1551    /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern());
1552    /// ```
1553    #[inline]
1554    pub fn pattern(&self) -> Option<PatternID> {
1555        match *self {
1556            Anchored::Pattern(pid) => Some(pid),
1557            _ => None,
1558        }
1559    }
1560}
1561
1562/// The kind of match semantics to use for a regex pattern.
1563///
1564/// The default match kind is `LeftmostFirst`, and this corresponds to the
1565/// match semantics used by most backtracking engines, such as Perl.
1566///
1567/// # Leftmost first or "preference order" match semantics
1568///
1569/// Leftmost-first semantics determine which match to report when there are
1570/// multiple paths through a regex that match at the same position. The tie is
1571/// essentially broken by how a backtracker would behave. For example, consider
1572/// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this
1573/// case, both the `foofoo` and `foo` branches match at position `0`. So should
1574/// the end of the match be `3` or `6`?
1575///
1576/// A backtracker will conceptually work by trying `foofoofoo` and failing.
1577/// Then it will try `foofoo`, find the match and stop there. Thus, the
1578/// leftmost-first match position is `6`. This is called "leftmost-first" or
1579/// "preference order" because the order of the branches as written in the
1580/// regex pattern is what determines how to break the tie.
1581///
1582/// (Note that leftmost-longest match semantics, which break ties by always
1583/// taking the longest matching string, are not currently supported by this
1584/// crate. These match semantics tend to be found in POSIX regex engines.)
1585///
1586/// This example shows how leftmost-first semantics work, and how it even
1587/// applies to multi-pattern regexes:
1588///
1589/// ```
1590/// use regex_automata::{
1591///     nfa::thompson::pikevm::PikeVM,
1592///     Match,
1593/// };
1594///
1595/// let re = PikeVM::new_many(&[
1596///     r"foofoofoo",
1597///     r"foofoo",
1598///     r"foo",
1599/// ])?;
1600/// let mut cache = re.create_cache();
1601/// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect();
1602/// let expected = vec![Match::must(1, 0..6)];
1603/// assert_eq!(expected, got);
1604///
1605/// # Ok::<(), Box<dyn std::error::Error>>(())
1606/// ```
1607///
1608/// # All matches
1609///
1610/// The `All` match semantics report any and all matches, and generally will
1611/// attempt to match as much as possible. It doesn't respect any sort of match
1612/// priority at all, so things like non-greedy matching don't work in this
1613/// mode.
1614///
1615/// The fact that non-greedy matching doesn't work generally makes most forms
1616/// of unanchored non-overlapping searches have unintuitive behavior. Namely,
1617/// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the
1618/// beginning of the pattern, which is specifically non-greedy. Since it will
1619/// be treated as greedy in `All` match semantics, this generally means that
1620/// it will first attempt to consume all of the haystack and is likely to wind
1621/// up skipping matches.
1622///
1623/// Generally speaking, `All` should only be used in two circumstances:
1624///
1625/// * When running an anchored search and there is a desire to match as much as
1626/// possible. For example, when building a reverse regex matcher to find the
1627/// start of a match after finding the end. In this case, the reverse search
1628/// is anchored to the end of the match found by the forward search.
1629/// * When running overlapping searches. Since `All` encodes all possible
1630/// matches, this is generally what you want for an overlapping search. If you
1631/// try to use leftmost-first in an overlapping search, it is likely to produce
1632/// counter-intuitive results since leftmost-first specifically excludes some
1633/// matches from its underlying finite state machine.
1634///
1635/// This example demonstrates the counter-intuitive behavior of `All` semantics
1636/// when using a standard leftmost unanchored search:
1637///
1638/// ```
1639/// use regex_automata::{
1640///     nfa::thompson::pikevm::PikeVM,
1641///     Match, MatchKind,
1642/// };
1643///
1644/// let re = PikeVM::builder()
1645///     .configure(PikeVM::config().match_kind(MatchKind::All))
1646///     .build("foo")?;
1647/// let hay = "first foo second foo wat";
1648/// let mut cache = re.create_cache();
1649/// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect();
1650/// // Notice that it completely skips the first 'foo'!
1651/// let expected = vec![Match::must(0, 17..20)];
1652/// assert_eq!(expected, got);
1653///
1654/// # Ok::<(), Box<dyn std::error::Error>>(())
1655/// ```
1656///
1657/// This second example shows how `All` semantics are useful for an overlapping
1658/// search. Note that we use lower level lazy DFA APIs here since the NFA
1659/// engines only currently support a very limited form of overlapping search.
1660///
1661/// ```
1662/// use regex_automata::{
1663///     hybrid::dfa::{DFA, OverlappingState},
1664///     HalfMatch, Input, MatchKind,
1665/// };
1666///
1667/// let re = DFA::builder()
1668///     // If we didn't set 'All' semantics here, then the regex would only
1669///     // match 'foo' at offset 3 and nothing else. Why? Because the state
1670///     // machine implements preference order and knows that the 'foofoo' and
1671///     // 'foofoofoo' branches can never match since 'foo' will always match
1672///     // when they match and take priority.
1673///     .configure(DFA::config().match_kind(MatchKind::All))
1674///     .build(r"foo|foofoo|foofoofoo")?;
1675/// let mut cache = re.create_cache();
1676/// let mut state = OverlappingState::start();
1677/// let input = Input::new("foofoofoo");
1678/// let mut got = vec![];
1679/// loop {
1680///     re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
1681///     let m = match state.get_match() {
1682///         None => break,
1683///         Some(m) => m,
1684///     };
1685///     got.push(m);
1686/// }
1687/// let expected = vec![
1688///     HalfMatch::must(0, 3),
1689///     HalfMatch::must(0, 6),
1690///     HalfMatch::must(0, 9),
1691/// ];
1692/// assert_eq!(expected, got);
1693///
1694/// # Ok::<(), Box<dyn std::error::Error>>(())
1695/// ```
1696#[non_exhaustive]
1697#[derive(Clone, Copy, Debug, Eq, PartialEq)]
1698pub enum MatchKind {
1699    /// Report all possible matches.
1700    All,
1701    /// Report only the leftmost matches. When multiple leftmost matches exist,
1702    /// report the match corresponding to the part of the regex that appears
1703    /// first in the syntax.
1704    LeftmostFirst,
1705    // There is prior art in RE2 that shows that we should be able to add
1706    // LeftmostLongest too. The tricky part of it is supporting ungreedy
1707    // repetitions. Instead of treating all NFA states as having equivalent
1708    // priority (as in 'All') or treating all NFA states as having distinct
1709    // priority based on order (as in 'LeftmostFirst'), we instead group NFA
1710    // states into sets, and treat members of each set as having equivalent
1711    // priority, but having greater priority than all following members
1712    // of different sets.
1713    //
1714    // However, it's not clear whether it's really worth adding this. After
1715    // all, leftmost-longest can be emulated when using literals by using
1716    // leftmost-first and sorting the literals by length in descending order.
1717    // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
1718    // always match `a` in `ab` when using leftmost-first, but leftmost-longest
1719    // would match `ab`.
1720}
1721
1722impl MatchKind {
1723    #[cfg(feature = "alloc")]
1724    pub(crate) fn continue_past_first_match(&self) -> bool {
1725        *self == MatchKind::All
1726    }
1727}
1728
1729impl Default for MatchKind {
1730    fn default() -> MatchKind {
1731        MatchKind::LeftmostFirst
1732    }
1733}
1734
1735/// An error indicating that a search stopped before reporting whether a
1736/// match exists or not.
1737///
1738/// To be very clear, this error type implies that one cannot assume that no
1739/// matches occur, since the search stopped before completing. That is, if
1740/// you're looking for information about where a search determined that no
1741/// match can occur, then this error type does *not* give you that. (Indeed, at
1742/// the time of writing, if you need such a thing, you have to write your own
1743/// search routine.)
1744///
1745/// Normally, when one searches for something, the response is either an
1746/// affirmative "it was found at this location" or a negative "not found at
1747/// all." However, in some cases, a regex engine can be configured to stop its
1748/// search before concluding whether a match exists or not. When this happens,
1749/// it may be important for the caller to know why the regex engine gave up and
1750/// where in the input it gave up at. This error type exposes the 'why' and the
1751/// 'where.'
1752///
1753/// For example, the DFAs provided by this library generally cannot correctly
1754/// implement Unicode word boundaries. Instead, they provide an option to
1755/// eagerly support them on ASCII text (since Unicode word boundaries are
1756/// equivalent to ASCII word boundaries when searching ASCII text), but will
1757/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
1758/// required to either report the failure to the caller (unergonomic) or
1759/// otherwise fall back to some other regex engine (ergonomic, but potentially
1760/// costly).
1761///
1762/// More generally, some regex engines offer the ability for callers to specify
1763/// certain bytes that will trigger the regex engine to automatically quit if
1764/// they are seen.
1765///
1766/// Still yet, there may be other reasons for a failed match. For example,
1767/// the hybrid DFA provided by this crate can be configured to give up if it
1768/// believes that it is not efficient. This in turn permits callers to choose a
1769/// different regex engine.
1770///
1771/// (Note that DFAs are configured by default to never quit or give up in this
1772/// fashion. For example, by default, a DFA will fail to build if the regex
1773/// pattern contains a Unicode word boundary. One needs to opt into the "quit"
1774/// behavior via options, like
1775/// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).)
1776///
1777/// There are a couple other ways a search
1778/// can fail. For example, when using the
1779/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
1780/// with a haystack that is too long, or trying to run an unanchored search
1781/// with a [one-pass DFA](crate::dfa::onepass).
1782#[derive(Clone, Debug, Eq, PartialEq)]
1783pub struct MatchError(
1784    #[cfg(feature = "alloc")] alloc::boxed::Box<MatchErrorKind>,
1785    #[cfg(not(feature = "alloc"))] MatchErrorKind,
1786);
1787
1788impl MatchError {
1789    /// Create a new error value with the given kind.
1790    ///
1791    /// This is a more verbose version of the kind-specific constructors,
1792    /// e.g., `MatchError::quit`.
1793    pub fn new(kind: MatchErrorKind) -> MatchError {
1794        #[cfg(feature = "alloc")]
1795        {
1796            MatchError(alloc::boxed::Box::new(kind))
1797        }
1798        #[cfg(not(feature = "alloc"))]
1799        {
1800            MatchError(kind)
1801        }
1802    }
1803
1804    /// Returns a reference to the underlying error kind.
1805    pub fn kind(&self) -> &MatchErrorKind {
1806        &self.0
1807    }
1808
1809    /// Create a new "quit" error. The given `byte` corresponds to the value
1810    /// that tripped a search's quit condition, and `offset` corresponds to the
1811    /// location in the haystack at which the search quit.
1812    ///
1813    /// This is the same as calling `MatchError::new` with a
1814    /// [`MatchErrorKind::Quit`] kind.
1815    pub fn quit(byte: u8, offset: usize) -> MatchError {
1816        MatchError::new(MatchErrorKind::Quit { byte, offset })
1817    }
1818
1819    /// Create a new "gave up" error. The given `offset` corresponds to the
1820    /// location in the haystack at which the search gave up.
1821    ///
1822    /// This is the same as calling `MatchError::new` with a
1823    /// [`MatchErrorKind::GaveUp`] kind.
1824    pub fn gave_up(offset: usize) -> MatchError {
1825        MatchError::new(MatchErrorKind::GaveUp { offset })
1826    }
1827
1828    /// Create a new "haystack too long" error. The given `len` corresponds to
1829    /// the length of the haystack that was problematic.
1830    ///
1831    /// This is the same as calling `MatchError::new` with a
1832    /// [`MatchErrorKind::HaystackTooLong`] kind.
1833    pub fn haystack_too_long(len: usize) -> MatchError {
1834        MatchError::new(MatchErrorKind::HaystackTooLong { len })
1835    }
1836
1837    /// Create a new "unsupported anchored" error. This occurs when the caller
1838    /// requests a search with an anchor mode that is not supported by the
1839    /// regex engine.
1840    ///
1841    /// This is the same as calling `MatchError::new` with a
1842    /// [`MatchErrorKind::UnsupportedAnchored`] kind.
1843    pub fn unsupported_anchored(mode: Anchored) -> MatchError {
1844        MatchError::new(MatchErrorKind::UnsupportedAnchored { mode })
1845    }
1846}
1847
1848/// The underlying kind of a [`MatchError`].
1849///
1850/// This is a **non-exhaustive** enum. That means new variants may be added in
1851/// a semver-compatible release.
1852#[non_exhaustive]
1853#[derive(Clone, Debug, Eq, PartialEq)]
1854pub enum MatchErrorKind {
1855    /// The search saw a "quit" byte at which it was instructed to stop
1856    /// searching.
1857    Quit {
1858        /// The "quit" byte that was observed that caused the search to stop.
1859        byte: u8,
1860        /// The offset at which the quit byte was observed.
1861        offset: usize,
1862    },
1863    /// The search, based on heuristics, determined that it would be better
1864    /// to stop, typically to provide the caller an opportunity to use an
1865    /// alternative regex engine.
1866    ///
1867    /// Currently, the only way for this to occur is via the lazy DFA and
1868    /// only when it is configured to do so (it will not return this error by
1869    /// default).
1870    GaveUp {
1871        /// The offset at which the search stopped. This corresponds to the
1872        /// position immediately following the last byte scanned.
1873        offset: usize,
1874    },
1875    /// This error occurs if the haystack given to the regex engine was too
1876    /// long to be searched. This occurs, for example, with regex engines
1877    /// like the bounded backtracker that have a configurable fixed amount of
1878    /// capacity that is tied to the length of the haystack. Anything beyond
1879    /// that configured limit will result in an error at search time.
1880    HaystackTooLong {
1881        /// The length of the haystack that exceeded the limit.
1882        len: usize,
1883    },
1884    /// An error indicating that a particular type of anchored search was
1885    /// requested, but that the regex engine does not support it.
1886    ///
1887    /// Note that this error should not be returned by a regex engine simply
1888    /// because the pattern ID is invalid (i.e., equal to or exceeds the number
1889    /// of patterns in the regex). In that case, the regex engine should report
1890    /// a non-match.
1891    UnsupportedAnchored {
1892        /// The anchored mode given that is unsupported.
1893        mode: Anchored,
1894    },
1895}
1896
1897#[cfg(feature = "std")]
1898impl std::error::Error for MatchError {}
1899
1900impl core::fmt::Display for MatchError {
1901    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
1902        match *self.kind() {
1903            MatchErrorKind::Quit { byte, offset } => write!(
1904                f,
1905                "quit search after observing byte {:?} at offset {}",
1906                DebugByte(byte),
1907                offset,
1908            ),
1909            MatchErrorKind::GaveUp { offset } => {
1910                write!(f, "gave up searching at offset {}", offset)
1911            }
1912            MatchErrorKind::HaystackTooLong { len } => {
1913                write!(f, "haystack of length {} is too long", len)
1914            }
1915            MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
1916                write!(f, "anchored searches are not supported or enabled")
1917            }
1918            MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
1919                write!(f, "unanchored searches are not supported or enabled")
1920            }
1921            MatchErrorKind::UnsupportedAnchored {
1922                mode: Anchored::Pattern(pid),
1923            } => {
1924                write!(
1925                    f,
1926                    "anchored searches for a specific pattern ({}) are \
1927                     not supported or enabled",
1928                    pid.as_usize(),
1929                )
1930            }
1931        }
1932    }
1933}
1934
1935#[cfg(test)]
1936mod tests {
1937    use super::*;
1938
1939    // We test that our 'MatchError' type is the size we expect. This isn't an
1940    // API guarantee, but if the size increases, we really want to make sure we
1941    // decide to do that intentionally. So this should be a speed bump. And in
1942    // general, we should not increase the size without a very good reason.
1943    //
1944    // Why? Because low level search APIs return Result<.., MatchError>. When
1945    // MatchError gets bigger, so to does the Result type.
1946    //
1947    // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
1948    // the importance of keeping a small error type. But without 'alloc', we
1949    // still want things to be small.
1950    #[test]
1951    fn match_error_size() {
1952        let expected_size = if cfg!(feature = "alloc") {
1953            core::mem::size_of::<usize>()
1954        } else {
1955            2 * core::mem::size_of::<usize>()
1956        };
1957        assert_eq!(expected_size, core::mem::size_of::<MatchError>());
1958    }
1959
1960    // Same as above, but for the underlying match error kind.
1961    #[cfg(target_pointer_width = "64")]
1962    #[test]
1963    fn match_error_kind_size() {
1964        let expected_size = 2 * core::mem::size_of::<usize>();
1965        assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
1966    }
1967
1968    #[cfg(target_pointer_width = "32")]
1969    #[test]
1970    fn match_error_kind_size() {
1971        let expected_size = 3 * core::mem::size_of::<usize>();
1972        assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
1973    }
1974
1975    #[test]
1976    fn incorrect_asref_guard() {
1977        struct Bad(std::cell::Cell<bool>);
1978
1979        impl AsRef<[u8]> for Bad {
1980            fn as_ref(&self) -> &[u8] {
1981                if self.0.replace(false) {
1982                    &[]
1983                } else {
1984                    &[0; 1000]
1985                }
1986            }
1987        }
1988
1989        let bad = Bad(std::cell::Cell::new(true));
1990        let input = Input::new(&bad);
1991        assert!(input.end() <= input.haystack().len());
1992    }
1993}