regex_automata/nfa/thompson/
backtrack.rs

1/*!
2An NFA backed bounded backtracker for executing regex searches with capturing
3groups.
4
5This module provides a [`BoundedBacktracker`] that works by simulating an NFA
6using the classical backtracking algorithm with a twist: it avoids redoing
7work that it has done before and thereby avoids worst case exponential time.
8In exchange, it can only be used on "short" haystacks. Its advantage is that
9is can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases
10because it does less book-keeping.
11*/
12
13use alloc::{vec, vec::Vec};
14
15use crate::{
16    nfa::thompson::{self, BuildError, State, NFA},
17    util::{
18        captures::Captures,
19        empty, iter,
20        prefilter::Prefilter,
21        primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
22        search::{Anchored, HalfMatch, Input, Match, MatchError, Span},
23    },
24};
25
26/// Returns the minimum visited capacity for the given haystack.
27///
28/// This function can be used as the argument to [`Config::visited_capacity`]
29/// in order to guarantee that a backtracking search for the given `input`
30/// won't return an error when using a [`BoundedBacktracker`] built from the
31/// given `NFA`.
32///
33/// This routine exists primarily as a way to test that the bounded backtracker
34/// works correctly when its capacity is set to the smallest possible amount.
35/// Still, it may be useful in cases where you know you want to use the bounded
36/// backtracker for a specific input, and just need to know what visited
37/// capacity to provide to make it work.
38///
39/// Be warned that this number could be quite large as it is multiplicative in
40/// the size the given NFA and haystack.
41pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize {
42    div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8)
43}
44
45/// The configuration used for building a bounded backtracker.
46///
47/// A bounded backtracker configuration is a simple data object that is
48/// typically used with [`Builder::configure`].
49#[derive(Clone, Debug, Default)]
50pub struct Config {
51    pre: Option<Option<Prefilter>>,
52    visited_capacity: Option<usize>,
53}
54
55impl Config {
56    /// Return a new default regex configuration.
57    pub fn new() -> Config {
58        Config::default()
59    }
60
61    /// Set a prefilter to be used whenever a start state is entered.
62    ///
63    /// A [`Prefilter`] in this context is meant to accelerate searches by
64    /// looking for literal prefixes that every match for the corresponding
65    /// pattern (or patterns) must start with. Once a prefilter produces a
66    /// match, the underlying search routine continues on to try and confirm
67    /// the match.
68    ///
69    /// Be warned that setting a prefilter does not guarantee that the search
70    /// will be faster. While it's usually a good bet, if the prefilter
71    /// produces a lot of false positive candidates (i.e., positions matched
72    /// by the prefilter but not by the regex), then the overall result can
73    /// be slower than if you had just executed the regex engine without any
74    /// prefilters.
75    ///
76    /// By default no prefilter is set.
77    ///
78    /// # Example
79    ///
80    /// ```
81    /// use regex_automata::{
82    ///     nfa::thompson::backtrack::BoundedBacktracker,
83    ///     util::prefilter::Prefilter,
84    ///     Input, Match, MatchKind,
85    /// };
86    ///
87    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
88    /// let re = BoundedBacktracker::builder()
89    ///     .configure(BoundedBacktracker::config().prefilter(pre))
90    ///     .build(r"(foo|bar)[a-z]+")?;
91    /// let mut cache = re.create_cache();
92    /// let input = Input::new("foo1 barfox bar");
93    /// assert_eq!(
94    ///     Some(Match::must(0, 5..11)),
95    ///     re.try_find(&mut cache, input)?,
96    /// );
97    ///
98    /// # Ok::<(), Box<dyn std::error::Error>>(())
99    /// ```
100    ///
101    /// Be warned though that an incorrect prefilter can lead to incorrect
102    /// results!
103    ///
104    /// ```
105    /// use regex_automata::{
106    ///     nfa::thompson::backtrack::BoundedBacktracker,
107    ///     util::prefilter::Prefilter,
108    ///     Input, HalfMatch, MatchKind,
109    /// };
110    ///
111    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
112    /// let re = BoundedBacktracker::builder()
113    ///     .configure(BoundedBacktracker::config().prefilter(pre))
114    ///     .build(r"(foo|bar)[a-z]+")?;
115    /// let mut cache = re.create_cache();
116    /// let input = Input::new("foo1 barfox bar");
117    /// // No match reported even though there clearly is one!
118    /// assert_eq!(None, re.try_find(&mut cache, input)?);
119    ///
120    /// # Ok::<(), Box<dyn std::error::Error>>(())
121    /// ```
122    pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
123        self.pre = Some(pre);
124        self
125    }
126
127    /// Set the visited capacity used to bound backtracking.
128    ///
129    /// The visited capacity represents the amount of heap memory (in bytes) to
130    /// allocate toward tracking which parts of the backtracking search have
131    /// been done before. The heap memory needed for any particular search is
132    /// proportional to `haystack.len() * nfa.states().len()`, which an be
133    /// quite large. Therefore, the bounded backtracker is typically only able
134    /// to run on shorter haystacks.
135    ///
136    /// For a given regex, increasing the visited capacity means that the
137    /// maximum haystack length that can be searched is increased. The
138    /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
139    ///
140    /// The default capacity is a reasonable but empirically chosen size.
141    ///
142    /// # Example
143    ///
144    /// As with other regex engines, Unicode is what tends to make the bounded
145    /// backtracker less useful by making the maximum haystack length quite
146    /// small. If necessary, increasing the visited capacity using this routine
147    /// will increase the maximum haystack length at the cost of using more
148    /// memory.
149    ///
150    /// Note though that the specific maximum values here are not an API
151    /// guarantee. The default visited capacity is subject to change and not
152    /// covered by semver.
153    ///
154    /// ```
155    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
156    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
157    ///
158    /// // Unicode inflates the size of the underlying NFA quite a bit, and
159    /// // thus means that the backtracker can only handle smaller haystacks,
160    /// // assuming that the visited capacity remains unchanged.
161    /// let re = BoundedBacktracker::new(r"\w+")?;
162    /// assert!(re.max_haystack_len() <= 7_000);
163    /// // But we can increase the visited capacity to handle bigger haystacks!
164    /// let re = BoundedBacktracker::builder()
165    ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
166    ///     .build(r"\w+")?;
167    /// assert!(re.max_haystack_len() >= 25_000);
168    /// assert!(re.max_haystack_len() <= 28_000);
169    /// # Ok::<(), Box<dyn std::error::Error>>(())
170    /// ```
171    pub fn visited_capacity(mut self, capacity: usize) -> Config {
172        self.visited_capacity = Some(capacity);
173        self
174    }
175
176    /// Returns the prefilter set in this configuration, if one at all.
177    pub fn get_prefilter(&self) -> Option<&Prefilter> {
178        self.pre.as_ref().unwrap_or(&None).as_ref()
179    }
180
181    /// Returns the configured visited capacity.
182    ///
183    /// Note that the actual capacity used may be slightly bigger than the
184    /// configured capacity.
185    pub fn get_visited_capacity(&self) -> usize {
186        const DEFAULT: usize = 256 * (1 << 10); // 256 KB
187        self.visited_capacity.unwrap_or(DEFAULT)
188    }
189
190    /// Overwrite the default configuration such that the options in `o` are
191    /// always used. If an option in `o` is not set, then the corresponding
192    /// option in `self` is used. If it's not set in `self` either, then it
193    /// remains not set.
194    pub(crate) fn overwrite(&self, o: Config) -> Config {
195        Config {
196            pre: o.pre.or_else(|| self.pre.clone()),
197            visited_capacity: o.visited_capacity.or(self.visited_capacity),
198        }
199    }
200}
201
202/// A builder for a bounded backtracker.
203///
204/// This builder permits configuring options for the syntax of a pattern, the
205/// NFA construction and the `BoundedBacktracker` construction. This builder
206/// is different from a general purpose regex builder in that it permits fine
207/// grain configuration of the construction process. The trade off for this is
208/// complexity, and the possibility of setting a configuration that might not
209/// make sense. For example, there are two different UTF-8 modes:
210///
211/// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls
212/// whether the pattern itself can contain sub-expressions that match invalid
213/// UTF-8.
214/// * [`thompson::Config::utf8`] controls how the regex iterators themselves
215/// advance the starting position of the next search when a match with zero
216/// length is found.
217///
218/// Generally speaking, callers will want to either enable all of these or
219/// disable all of these.
220///
221/// # Example
222///
223/// This example shows how to disable UTF-8 mode in the syntax and the regex
224/// itself. This is generally what you want for matching on arbitrary bytes.
225///
226/// ```
227/// use regex_automata::{
228///     nfa::thompson::{self, backtrack::BoundedBacktracker},
229///     util::syntax,
230///     Match,
231/// };
232///
233/// let re = BoundedBacktracker::builder()
234///     .syntax(syntax::Config::new().utf8(false))
235///     .thompson(thompson::Config::new().utf8(false))
236///     .build(r"foo(?-u:[^b])ar.*")?;
237/// let mut cache = re.create_cache();
238///
239/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
240/// let expected = Some(Ok(Match::must(0, 1..9)));
241/// let got = re.try_find_iter(&mut cache, haystack).next();
242/// assert_eq!(expected, got);
243/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
244/// // but the subsequent `.*` does not! Disabling UTF-8
245/// // on the syntax permits this.
246/// //
247/// // N.B. This example does not show the impact of
248/// // disabling UTF-8 mode on a BoundedBacktracker Config, since that
249/// // only impacts regexes that can produce matches of
250/// // length 0.
251/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]);
252///
253/// # Ok::<(), Box<dyn std::error::Error>>(())
254/// ```
255#[derive(Clone, Debug)]
256pub struct Builder {
257    config: Config,
258    #[cfg(feature = "syntax")]
259    thompson: thompson::Compiler,
260}
261
262impl Builder {
263    /// Create a new BoundedBacktracker builder with its default configuration.
264    pub fn new() -> Builder {
265        Builder {
266            config: Config::default(),
267            #[cfg(feature = "syntax")]
268            thompson: thompson::Compiler::new(),
269        }
270    }
271
272    /// Build a `BoundedBacktracker` from the given pattern.
273    ///
274    /// If there was a problem parsing or compiling the pattern, then an error
275    /// is returned.
276    #[cfg(feature = "syntax")]
277    pub fn build(
278        &self,
279        pattern: &str,
280    ) -> Result<BoundedBacktracker, BuildError> {
281        self.build_many(&[pattern])
282    }
283
284    /// Build a `BoundedBacktracker` from the given patterns.
285    #[cfg(feature = "syntax")]
286    pub fn build_many<P: AsRef<str>>(
287        &self,
288        patterns: &[P],
289    ) -> Result<BoundedBacktracker, BuildError> {
290        let nfa = self.thompson.build_many(patterns)?;
291        self.build_from_nfa(nfa)
292    }
293
294    /// Build a `BoundedBacktracker` directly from its NFA.
295    ///
296    /// Note that when using this method, any configuration that applies to the
297    /// construction of the NFA itself will of course be ignored, since the NFA
298    /// given here is already built.
299    pub fn build_from_nfa(
300        &self,
301        nfa: NFA,
302    ) -> Result<BoundedBacktracker, BuildError> {
303        nfa.look_set_any().available().map_err(BuildError::word)?;
304        Ok(BoundedBacktracker { config: self.config.clone(), nfa })
305    }
306
307    /// Apply the given `BoundedBacktracker` configuration options to this
308    /// builder.
309    pub fn configure(&mut self, config: Config) -> &mut Builder {
310        self.config = self.config.overwrite(config);
311        self
312    }
313
314    /// Set the syntax configuration for this builder using
315    /// [`syntax::Config`](crate::util::syntax::Config).
316    ///
317    /// This permits setting things like case insensitivity, Unicode and multi
318    /// line mode.
319    ///
320    /// These settings only apply when constructing a `BoundedBacktracker`
321    /// directly from a pattern.
322    #[cfg(feature = "syntax")]
323    pub fn syntax(
324        &mut self,
325        config: crate::util::syntax::Config,
326    ) -> &mut Builder {
327        self.thompson.syntax(config);
328        self
329    }
330
331    /// Set the Thompson NFA configuration for this builder using
332    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
333    ///
334    /// This permits setting things like if additional time should be spent
335    /// shrinking the size of the NFA.
336    ///
337    /// These settings only apply when constructing a `BoundedBacktracker`
338    /// directly from a pattern.
339    #[cfg(feature = "syntax")]
340    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
341        self.thompson.configure(config);
342        self
343    }
344}
345
346/// A backtracking regex engine that bounds its execution to avoid exponential
347/// blow-up.
348///
349/// This regex engine only implements leftmost-first match semantics and
350/// only supports leftmost searches. It effectively does the same thing as a
351/// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because
352/// it doesn't have to worry about copying capturing group spans for most NFA
353/// states. Instead, the backtracker can maintain one set of captures (provided
354/// by the caller) and never needs to copy them. In exchange, the backtracker
355/// bounds itself to ensure it doesn't exhibit worst case exponential time.
356/// This results in the backtracker only being able to handle short haystacks
357/// given reasonable memory usage.
358///
359/// # Searches may return an error!
360///
361/// By design, this backtracking regex engine is bounded. This bound is
362/// implemented by not visiting any combination of NFA state ID and position
363/// in a haystack more than once. Thus, the total memory required to bound
364/// backtracking is proportional to `haystack.len() * nfa.states().len()`.
365/// This can obviously get quite large, since large haystacks aren't terribly
366/// uncommon. To avoid using exorbitant memory, the capacity is bounded by
367/// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total
368/// capacity required for a particular regex and a haystack exceeds this
369/// capacity, then the search routine will return an error.
370///
371/// Unlike other regex engines that may return an error at search time (like
372/// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded
373/// backtracker will work for every haystack. Therefore, this regex engine
374/// _only_ exposes fallible search routines to avoid the footgun of panicking
375/// when running a search on a haystack that is too big.
376///
377/// If one wants to use the fallible search APIs without handling the
378/// error, the only way to guarantee an error won't occur from the
379/// haystack length is to ensure the haystack length does not exceed
380/// [`BoundedBacktracker::max_haystack_len`].
381///
382/// # Example: Unicode word boundaries
383///
384/// This example shows that the bounded backtracker implements Unicode word
385/// boundaries correctly by default.
386///
387/// ```
388/// # if cfg!(miri) { return Ok(()); } // miri takes too long
389/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
390///
391/// let re = BoundedBacktracker::new(r"\b\w+\b")?;
392/// let mut cache = re.create_cache();
393///
394/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
395/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
396/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
397/// assert_eq!(None, it.next());
398/// # Ok::<(), Box<dyn std::error::Error>>(())
399/// ```
400///
401/// # Example: multiple regex patterns
402///
403/// The bounded backtracker supports searching for multiple patterns
404/// simultaneously, just like other regex engines. Note though that because it
405/// uses a backtracking strategy, this regex engine is unlikely to scale well
406/// as more patterns are added. But then again, as more patterns are added, the
407/// maximum haystack length allowed will also shorten (assuming the visited
408/// capacity remains invariant).
409///
410/// ```
411/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
412///
413/// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
414/// let mut cache = re.create_cache();
415///
416/// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
417/// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
418/// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
419/// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
420/// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
421/// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
422/// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
423/// assert_eq!(None, it.next());
424/// # Ok::<(), Box<dyn std::error::Error>>(())
425/// ```
426#[derive(Clone, Debug)]
427pub struct BoundedBacktracker {
428    config: Config,
429    nfa: NFA,
430}
431
432impl BoundedBacktracker {
433    /// Parse the given regular expression using the default configuration and
434    /// return the corresponding `BoundedBacktracker`.
435    ///
436    /// If you want a non-default configuration, then use the [`Builder`] to
437    /// set your own configuration.
438    ///
439    /// # Example
440    ///
441    /// ```
442    /// use regex_automata::{
443    ///     nfa::thompson::backtrack::BoundedBacktracker,
444    ///     Match,
445    /// };
446    ///
447    /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
448    /// let mut cache = re.create_cache();
449    /// assert_eq!(
450    ///     Some(Ok(Match::must(0, 3..14))),
451    ///     re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(),
452    /// );
453    /// # Ok::<(), Box<dyn std::error::Error>>(())
454    /// ```
455    #[cfg(feature = "syntax")]
456    pub fn new(pattern: &str) -> Result<BoundedBacktracker, BuildError> {
457        BoundedBacktracker::builder().build(pattern)
458    }
459
460    /// Like `new`, but parses multiple patterns into a single "multi regex."
461    /// This similarly uses the default regex configuration.
462    ///
463    /// # Example
464    ///
465    /// ```
466    /// use regex_automata::{
467    ///     nfa::thompson::backtrack::BoundedBacktracker,
468    ///     Match,
469    /// };
470    ///
471    /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
472    /// let mut cache = re.create_cache();
473    ///
474    /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
475    /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
476    /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
477    /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
478    /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
479    /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
480    /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
481    /// assert_eq!(None, it.next());
482    /// # Ok::<(), Box<dyn std::error::Error>>(())
483    /// ```
484    #[cfg(feature = "syntax")]
485    pub fn new_many<P: AsRef<str>>(
486        patterns: &[P],
487    ) -> Result<BoundedBacktracker, BuildError> {
488        BoundedBacktracker::builder().build_many(patterns)
489    }
490
491    /// # Example
492    ///
493    /// This shows how to hand assemble a regular expression via its HIR,
494    /// compile an NFA from it and build a BoundedBacktracker from the NFA.
495    ///
496    /// ```
497    /// use regex_automata::{
498    ///     nfa::thompson::{NFA, backtrack::BoundedBacktracker},
499    ///     Match,
500    /// };
501    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
502    ///
503    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
504    ///     ClassBytesRange::new(b'0', b'9'),
505    ///     ClassBytesRange::new(b'A', b'Z'),
506    ///     ClassBytesRange::new(b'_', b'_'),
507    ///     ClassBytesRange::new(b'a', b'z'),
508    /// ])));
509    ///
510    /// let config = NFA::config().nfa_size_limit(Some(1_000));
511    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
512    ///
513    /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
514    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
515    /// let expected = Some(Match::must(0, 3..4));
516    /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
517    /// assert_eq!(expected, caps.get_match());
518    ///
519    /// # Ok::<(), Box<dyn std::error::Error>>(())
520    /// ```
521    pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
522        BoundedBacktracker::builder().build_from_nfa(nfa)
523    }
524
525    /// Create a new `BoundedBacktracker` that matches every input.
526    ///
527    /// # Example
528    ///
529    /// ```
530    /// use regex_automata::{
531    ///     nfa::thompson::backtrack::BoundedBacktracker,
532    ///     Match,
533    /// };
534    ///
535    /// let re = BoundedBacktracker::always_match()?;
536    /// let mut cache = re.create_cache();
537    ///
538    /// let expected = Some(Ok(Match::must(0, 0..0)));
539    /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
540    /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
541    /// # Ok::<(), Box<dyn std::error::Error>>(())
542    /// ```
543    pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
544        let nfa = thompson::NFA::always_match();
545        BoundedBacktracker::new_from_nfa(nfa)
546    }
547
548    /// Create a new `BoundedBacktracker` that never matches any input.
549    ///
550    /// # Example
551    ///
552    /// ```
553    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
554    ///
555    /// let re = BoundedBacktracker::never_match()?;
556    /// let mut cache = re.create_cache();
557    ///
558    /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
559    /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
560    /// # Ok::<(), Box<dyn std::error::Error>>(())
561    /// ```
562    pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
563        let nfa = thompson::NFA::never_match();
564        BoundedBacktracker::new_from_nfa(nfa)
565    }
566
567    /// Return a default configuration for a `BoundedBacktracker`.
568    ///
569    /// This is a convenience routine to avoid needing to import the `Config`
570    /// type when customizing the construction of a `BoundedBacktracker`.
571    ///
572    /// # Example
573    ///
574    /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
575    /// disabled, zero-width matches that split a codepoint are allowed.
576    /// Otherwise they are never reported.
577    ///
578    /// In the code below, notice that `""` is permitted to match positions
579    /// that split the encoding of a codepoint.
580    ///
581    /// ```
582    /// use regex_automata::{
583    ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
584    ///     Match,
585    /// };
586    ///
587    /// let re = BoundedBacktracker::builder()
588    ///     .thompson(thompson::Config::new().utf8(false))
589    ///     .build(r"")?;
590    /// let mut cache = re.create_cache();
591    ///
592    /// let haystack = "a☃z";
593    /// let mut it = re.try_find_iter(&mut cache, haystack);
594    /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
595    /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
596    /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
597    /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
598    /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
599    /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
600    /// assert_eq!(None, it.next());
601    ///
602    /// # Ok::<(), Box<dyn std::error::Error>>(())
603    /// ```
604    pub fn config() -> Config {
605        Config::new()
606    }
607
608    /// Return a builder for configuring the construction of a
609    /// `BoundedBacktracker`.
610    ///
611    /// This is a convenience routine to avoid needing to import the
612    /// [`Builder`] type in common cases.
613    ///
614    /// # Example
615    ///
616    /// This example shows how to use the builder to disable UTF-8 mode
617    /// everywhere.
618    ///
619    /// ```
620    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
621    /// use regex_automata::{
622    ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
623    ///     util::syntax,
624    ///     Match,
625    /// };
626    ///
627    /// let re = BoundedBacktracker::builder()
628    ///     .syntax(syntax::Config::new().utf8(false))
629    ///     .thompson(thompson::Config::new().utf8(false))
630    ///     .build(r"foo(?-u:[^b])ar.*")?;
631    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
632    ///
633    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
634    /// let expected = Some(Match::must(0, 1..9));
635    /// re.try_captures(&mut cache, haystack, &mut caps)?;
636    /// assert_eq!(expected, caps.get_match());
637    ///
638    /// # Ok::<(), Box<dyn std::error::Error>>(())
639    /// ```
640    pub fn builder() -> Builder {
641        Builder::new()
642    }
643
644    /// Create a new cache for this regex.
645    ///
646    /// The cache returned should only be used for searches for this
647    /// regex. If you want to reuse the cache for another regex, then you
648    /// must call [`Cache::reset`] with that regex (or, equivalently,
649    /// [`BoundedBacktracker::reset_cache`]).
650    pub fn create_cache(&self) -> Cache {
651        Cache::new(self)
652    }
653
654    /// Create a new empty set of capturing groups that is guaranteed to be
655    /// valid for the search APIs on this `BoundedBacktracker`.
656    ///
657    /// A `Captures` value created for a specific `BoundedBacktracker` cannot
658    /// be used with any other `BoundedBacktracker`.
659    ///
660    /// This is a convenience function for [`Captures::all`]. See the
661    /// [`Captures`] documentation for an explanation of its alternative
662    /// constructors that permit the `BoundedBacktracker` to do less work
663    /// during a search, and thus might make it faster.
664    pub fn create_captures(&self) -> Captures {
665        Captures::all(self.get_nfa().group_info().clone())
666    }
667
668    /// Reset the given cache such that it can be used for searching with the
669    /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
670    ///
671    /// A cache reset permits reusing memory already allocated in this cache
672    /// with a different `BoundedBacktracker`.
673    ///
674    /// # Example
675    ///
676    /// This shows how to re-purpose a cache for use with a different
677    /// `BoundedBacktracker`.
678    ///
679    /// ```
680    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
681    /// use regex_automata::{
682    ///     nfa::thompson::backtrack::BoundedBacktracker,
683    ///     Match,
684    /// };
685    ///
686    /// let re1 = BoundedBacktracker::new(r"\w")?;
687    /// let re2 = BoundedBacktracker::new(r"\W")?;
688    ///
689    /// let mut cache = re1.create_cache();
690    /// assert_eq!(
691    ///     Some(Ok(Match::must(0, 0..2))),
692    ///     re1.try_find_iter(&mut cache, "Δ").next(),
693    /// );
694    ///
695    /// // Using 'cache' with re2 is not allowed. It may result in panics or
696    /// // incorrect results. In order to re-purpose the cache, we must reset
697    /// // it with the BoundedBacktracker we'd like to use it with.
698    /// //
699    /// // Similarly, after this reset, using the cache with 're1' is also not
700    /// // allowed.
701    /// cache.reset(&re2);
702    /// assert_eq!(
703    ///     Some(Ok(Match::must(0, 0..3))),
704    ///     re2.try_find_iter(&mut cache, "☃").next(),
705    /// );
706    ///
707    /// # Ok::<(), Box<dyn std::error::Error>>(())
708    /// ```
709    pub fn reset_cache(&self, cache: &mut Cache) {
710        cache.reset(self);
711    }
712
713    /// Returns the total number of patterns compiled into this
714    /// `BoundedBacktracker`.
715    ///
716    /// In the case of a `BoundedBacktracker` that contains no patterns, this
717    /// returns `0`.
718    ///
719    /// # Example
720    ///
721    /// This example shows the pattern length for a `BoundedBacktracker` that
722    /// never matches:
723    ///
724    /// ```
725    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
726    ///
727    /// let re = BoundedBacktracker::never_match()?;
728    /// assert_eq!(re.pattern_len(), 0);
729    /// # Ok::<(), Box<dyn std::error::Error>>(())
730    /// ```
731    ///
732    /// And another example for a `BoundedBacktracker` that matches at every
733    /// position:
734    ///
735    /// ```
736    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
737    ///
738    /// let re = BoundedBacktracker::always_match()?;
739    /// assert_eq!(re.pattern_len(), 1);
740    /// # Ok::<(), Box<dyn std::error::Error>>(())
741    /// ```
742    ///
743    /// And finally, a `BoundedBacktracker` that was constructed from multiple
744    /// patterns:
745    ///
746    /// ```
747    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
748    ///
749    /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
750    /// assert_eq!(re.pattern_len(), 3);
751    /// # Ok::<(), Box<dyn std::error::Error>>(())
752    /// ```
753    pub fn pattern_len(&self) -> usize {
754        self.nfa.pattern_len()
755    }
756
757    /// Return the config for this `BoundedBacktracker`.
758    #[inline]
759    pub fn get_config(&self) -> &Config {
760        &self.config
761    }
762
763    /// Returns a reference to the underlying NFA.
764    #[inline]
765    pub fn get_nfa(&self) -> &NFA {
766        &self.nfa
767    }
768
769    /// Returns the maximum haystack length supported by this backtracker.
770    ///
771    /// This routine is a function of both [`Config::visited_capacity`] and the
772    /// internal size of the backtracker's NFA.
773    ///
774    /// # Example
775    ///
776    /// This example shows how the maximum haystack length can vary depending
777    /// on the size of the regex itself. Note though that the specific maximum
778    /// values here are not an API guarantee. The default visited capacity is
779    /// subject to change and not covered by semver.
780    ///
781    /// ```
782    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
783    /// use regex_automata::{
784    ///     nfa::thompson::backtrack::BoundedBacktracker,
785    ///     Match, MatchError,
786    /// };
787    ///
788    /// // If you're only using ASCII, you get a big budget.
789    /// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
790    /// let mut cache = re.create_cache();
791    /// assert_eq!(re.max_haystack_len(), 299_592);
792    /// // Things work up to the max.
793    /// let mut haystack = "a".repeat(299_592);
794    /// let expected = Some(Ok(Match::must(0, 0..299_592)));
795    /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
796    /// // But you'll get an error if you provide a haystack that's too big.
797    /// // Notice that we use the 'try_find_iter' routine instead, which
798    /// // yields Result<Match, MatchError> instead of Match.
799    /// haystack.push('a');
800    /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
801    /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
802    ///
803    /// // Unicode inflates the size of the underlying NFA quite a bit, and
804    /// // thus means that the backtracker can only handle smaller haystacks,
805    /// // assuming that the visited capacity remains unchanged.
806    /// let re = BoundedBacktracker::new(r"\w+")?;
807    /// assert!(re.max_haystack_len() <= 7_000);
808    /// // But we can increase the visited capacity to handle bigger haystacks!
809    /// let re = BoundedBacktracker::builder()
810    ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
811    ///     .build(r"\w+")?;
812    /// assert!(re.max_haystack_len() >= 25_000);
813    /// assert!(re.max_haystack_len() <= 28_000);
814    /// # Ok::<(), Box<dyn std::error::Error>>(())
815    /// ```
816    #[inline]
817    pub fn max_haystack_len(&self) -> usize {
818        // The capacity given in the config is "bytes of heap memory," but the
819        // capacity we use here is "number of bits." So convert the capacity in
820        // bytes to the capacity in bits.
821        let capacity = 8 * self.get_config().get_visited_capacity();
822        let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
823        let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
824        // It's possible for `real_capacity` to be smaller than the number of
825        // NFA states for particularly large regexes, so we saturate towards
826        // zero.
827        (real_capacity / self.nfa.states().len()).saturating_sub(1)
828    }
829}
830
831impl BoundedBacktracker {
832    /// Returns true if and only if this regex matches the given haystack.
833    ///
834    /// In the case of a backtracking regex engine, and unlike most other
835    /// regex engines in this crate, short circuiting isn't practical. However,
836    /// this routine may still be faster because it instructs backtracking to
837    /// not keep track of any capturing groups.
838    ///
839    /// # Errors
840    ///
841    /// This routine only errors if the search could not complete. For this
842    /// backtracking regex engine, this only occurs when the haystack length
843    /// exceeds [`BoundedBacktracker::max_haystack_len`].
844    ///
845    /// When a search cannot complete, callers cannot know whether a match
846    /// exists or not.
847    ///
848    /// # Example
849    ///
850    /// ```
851    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
852    ///
853    /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
854    /// let mut cache = re.create_cache();
855    ///
856    /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
857    /// assert!(!re.try_is_match(&mut cache, "foobar")?);
858    /// # Ok::<(), Box<dyn std::error::Error>>(())
859    /// ```
860    ///
861    /// # Example: consistency with search APIs
862    ///
863    /// `is_match` is guaranteed to return `true` whenever `find` returns a
864    /// match. This includes searches that are executed entirely within a
865    /// codepoint:
866    ///
867    /// ```
868    /// use regex_automata::{
869    ///     nfa::thompson::backtrack::BoundedBacktracker,
870    ///     Input,
871    /// };
872    ///
873    /// let re = BoundedBacktracker::new("a*")?;
874    /// let mut cache = re.create_cache();
875    ///
876    /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
877    /// # Ok::<(), Box<dyn std::error::Error>>(())
878    /// ```
879    ///
880    /// Notice that when UTF-8 mode is disabled, then the above reports a
881    /// match because the restriction against zero-width matches that split a
882    /// codepoint has been lifted:
883    ///
884    /// ```
885    /// use regex_automata::{
886    ///     nfa::thompson::{backtrack::BoundedBacktracker, NFA},
887    ///     Input,
888    /// };
889    ///
890    /// let re = BoundedBacktracker::builder()
891    ///     .thompson(NFA::config().utf8(false))
892    ///     .build("a*")?;
893    /// let mut cache = re.create_cache();
894    ///
895    /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?);
896    /// # Ok::<(), Box<dyn std::error::Error>>(())
897    /// ```
898    #[inline]
899    pub fn try_is_match<'h, I: Into<Input<'h>>>(
900        &self,
901        cache: &mut Cache,
902        input: I,
903    ) -> Result<bool, MatchError> {
904        let input = input.into().earliest(true);
905        self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some())
906    }
907
908    /// Executes a leftmost forward search and returns a `Match` if one exists.
909    ///
910    /// This routine only includes the overall match span. To get
911    /// access to the individual spans of each capturing group, use
912    /// [`BoundedBacktracker::try_captures`].
913    ///
914    /// # Errors
915    ///
916    /// This routine only errors if the search could not complete. For this
917    /// backtracking regex engine, this only occurs when the haystack length
918    /// exceeds [`BoundedBacktracker::max_haystack_len`].
919    ///
920    /// When a search cannot complete, callers cannot know whether a match
921    /// exists or not.
922    ///
923    /// # Example
924    ///
925    /// ```
926    /// use regex_automata::{
927    ///     nfa::thompson::backtrack::BoundedBacktracker,
928    ///     Match,
929    /// };
930    ///
931    /// let re = BoundedBacktracker::new("foo[0-9]+")?;
932    /// let mut cache = re.create_cache();
933    /// let expected = Match::must(0, 0..8);
934    /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
935    ///
936    /// # Ok::<(), Box<dyn std::error::Error>>(())
937    /// ```
938    #[inline]
939    pub fn try_find<'h, I: Into<Input<'h>>>(
940        &self,
941        cache: &mut Cache,
942        input: I,
943    ) -> Result<Option<Match>, MatchError> {
944        let input = input.into();
945        if self.get_nfa().pattern_len() == 1 {
946            let mut slots = [None, None];
947            let pid = match self.try_search_slots(cache, &input, &mut slots)? {
948                None => return Ok(None),
949                Some(pid) => pid,
950            };
951            let start = match slots[0] {
952                None => return Ok(None),
953                Some(s) => s.get(),
954            };
955            let end = match slots[1] {
956                None => return Ok(None),
957                Some(s) => s.get(),
958            };
959            return Ok(Some(Match::new(pid, Span { start, end })));
960        }
961        let ginfo = self.get_nfa().group_info();
962        let slots_len = ginfo.implicit_slot_len();
963        let mut slots = vec![None; slots_len];
964        let pid = match self.try_search_slots(cache, &input, &mut slots)? {
965            None => return Ok(None),
966            Some(pid) => pid,
967        };
968        let start = match slots[pid.as_usize() * 2] {
969            None => return Ok(None),
970            Some(s) => s.get(),
971        };
972        let end = match slots[pid.as_usize() * 2 + 1] {
973            None => return Ok(None),
974            Some(s) => s.get(),
975        };
976        Ok(Some(Match::new(pid, Span { start, end })))
977    }
978
979    /// Executes a leftmost forward search and writes the spans of capturing
980    /// groups that participated in a match into the provided [`Captures`]
981    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
982    /// to return `false`.
983    ///
984    /// # Errors
985    ///
986    /// This routine only errors if the search could not complete. For this
987    /// backtracking regex engine, this only occurs when the haystack length
988    /// exceeds [`BoundedBacktracker::max_haystack_len`].
989    ///
990    /// When a search cannot complete, callers cannot know whether a match
991    /// exists or not.
992    ///
993    /// # Example
994    ///
995    /// ```
996    /// use regex_automata::{
997    ///     nfa::thompson::backtrack::BoundedBacktracker,
998    ///     Span,
999    /// };
1000    ///
1001    /// let re = BoundedBacktracker::new(
1002    ///     r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
1003    /// )?;
1004    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1005    ///
1006    /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
1007    /// assert!(caps.is_match());
1008    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
1009    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
1010    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
1011    ///
1012    /// # Ok::<(), Box<dyn std::error::Error>>(())
1013    /// ```
1014    #[inline]
1015    pub fn try_captures<'h, I: Into<Input<'h>>>(
1016        &self,
1017        cache: &mut Cache,
1018        input: I,
1019        caps: &mut Captures,
1020    ) -> Result<(), MatchError> {
1021        self.try_search(cache, &input.into(), caps)
1022    }
1023
1024    /// Returns an iterator over all non-overlapping leftmost matches in the
1025    /// given bytes. If no match exists, then the iterator yields no elements.
1026    ///
1027    /// If the regex engine returns an error at any point, then the iterator
1028    /// will yield that error.
1029    ///
1030    /// # Example
1031    ///
1032    /// ```
1033    /// use regex_automata::{
1034    ///     nfa::thompson::backtrack::BoundedBacktracker,
1035    ///     Match, MatchError,
1036    /// };
1037    ///
1038    /// let re = BoundedBacktracker::new("foo[0-9]+")?;
1039    /// let mut cache = re.create_cache();
1040    ///
1041    /// let text = "foo1 foo12 foo123";
1042    /// let result: Result<Vec<Match>, MatchError> = re
1043    ///     .try_find_iter(&mut cache, text)
1044    ///     .collect();
1045    /// let matches = result?;
1046    /// assert_eq!(matches, vec![
1047    ///     Match::must(0, 0..4),
1048    ///     Match::must(0, 5..10),
1049    ///     Match::must(0, 11..17),
1050    /// ]);
1051    /// # Ok::<(), Box<dyn std::error::Error>>(())
1052    /// ```
1053    #[inline]
1054    pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1055        &'r self,
1056        cache: &'c mut Cache,
1057        input: I,
1058    ) -> TryFindMatches<'r, 'c, 'h> {
1059        let caps = Captures::matches(self.get_nfa().group_info().clone());
1060        let it = iter::Searcher::new(input.into());
1061        TryFindMatches { re: self, cache, caps, it }
1062    }
1063
1064    /// Returns an iterator over all non-overlapping `Captures` values. If no
1065    /// match exists, then the iterator yields no elements.
1066    ///
1067    /// This yields the same matches as [`BoundedBacktracker::try_find_iter`],
1068    /// but it includes the spans of all capturing groups that participate in
1069    /// each match.
1070    ///
1071    /// If the regex engine returns an error at any point, then the iterator
1072    /// will yield that error.
1073    ///
1074    /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
1075    /// how to correctly iterate over all matches in a haystack while avoiding
1076    /// the creation of a new `Captures` value for every match. (Which you are
1077    /// forced to do with an `Iterator`.)
1078    ///
1079    /// # Example
1080    ///
1081    /// ```
1082    /// use regex_automata::{
1083    ///     nfa::thompson::backtrack::BoundedBacktracker,
1084    ///     Span,
1085    /// };
1086    ///
1087    /// let re = BoundedBacktracker::new("foo(?P<numbers>[0-9]+)")?;
1088    /// let mut cache = re.create_cache();
1089    ///
1090    /// let text = "foo1 foo12 foo123";
1091    /// let mut spans = vec![];
1092    /// for result in re.try_captures_iter(&mut cache, text) {
1093    ///     let caps = result?;
1094    ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
1095    ///     spans.push(caps.get_group_by_name("numbers").unwrap());
1096    /// }
1097    /// assert_eq!(spans, vec![
1098    ///     Span::from(3..4),
1099    ///     Span::from(8..10),
1100    ///     Span::from(14..17),
1101    /// ]);
1102    /// # Ok::<(), Box<dyn std::error::Error>>(())
1103    /// ```
1104    #[inline]
1105    pub fn try_captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
1106        &'r self,
1107        cache: &'c mut Cache,
1108        input: I,
1109    ) -> TryCapturesMatches<'r, 'c, 'h> {
1110        let caps = self.create_captures();
1111        let it = iter::Searcher::new(input.into());
1112        TryCapturesMatches { re: self, cache, caps, it }
1113    }
1114}
1115
1116impl BoundedBacktracker {
1117    /// Executes a leftmost forward search and writes the spans of capturing
1118    /// groups that participated in a match into the provided [`Captures`]
1119    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
1120    /// to return `false`.
1121    ///
1122    /// This is like [`BoundedBacktracker::try_captures`], but it accepts a
1123    /// concrete `&Input` instead of an `Into<Input>`.
1124    ///
1125    /// # Errors
1126    ///
1127    /// This routine only errors if the search could not complete. For this
1128    /// backtracking regex engine, this only occurs when the haystack length
1129    /// exceeds [`BoundedBacktracker::max_haystack_len`].
1130    ///
1131    /// When a search cannot complete, callers cannot know whether a match
1132    /// exists or not.
1133    ///
1134    /// # Example: specific pattern search
1135    ///
1136    /// This example shows how to build a multi bounded backtracker that
1137    /// permits searching for specific patterns.
1138    ///
1139    /// ```
1140    /// use regex_automata::{
1141    ///     nfa::thompson::backtrack::BoundedBacktracker,
1142    ///     Anchored, Input, Match, PatternID,
1143    /// };
1144    ///
1145    /// let re = BoundedBacktracker::new_many(&[
1146    ///     "[a-z0-9]{6}",
1147    ///     "[a-z][a-z0-9]{5}",
1148    /// ])?;
1149    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1150    /// let haystack = "foo123";
1151    ///
1152    /// // Since we are using the default leftmost-first match and both
1153    /// // patterns match at the same starting position, only the first pattern
1154    /// // will be returned in this case when doing a search for any of the
1155    /// // patterns.
1156    /// let expected = Some(Match::must(0, 0..6));
1157    /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?;
1158    /// assert_eq!(expected, caps.get_match());
1159    ///
1160    /// // But if we want to check whether some other pattern matches, then we
1161    /// // can provide its pattern ID.
1162    /// let expected = Some(Match::must(1, 0..6));
1163    /// let input = Input::new(haystack)
1164    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
1165    /// re.try_search(&mut cache, &input, &mut caps)?;
1166    /// assert_eq!(expected, caps.get_match());
1167    ///
1168    /// # Ok::<(), Box<dyn std::error::Error>>(())
1169    /// ```
1170    ///
1171    /// # Example: specifying the bounds of a search
1172    ///
1173    /// This example shows how providing the bounds of a search can produce
1174    /// different results than simply sub-slicing the haystack.
1175    ///
1176    /// ```
1177    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1178    /// use regex_automata::{
1179    ///     nfa::thompson::backtrack::BoundedBacktracker,
1180    ///     Match, Input,
1181    /// };
1182    ///
1183    /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?;
1184    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
1185    /// let haystack = "foo123bar";
1186    ///
1187    /// // Since we sub-slice the haystack, the search doesn't know about
1188    /// // the larger context and assumes that `123` is surrounded by word
1189    /// // boundaries. And of course, the match position is reported relative
1190    /// // to the sub-slice as well, which means we get `0..3` instead of
1191    /// // `3..6`.
1192    /// let expected = Some(Match::must(0, 0..3));
1193    /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?;
1194    /// assert_eq!(expected, caps.get_match());
1195    ///
1196    /// // But if we provide the bounds of the search within the context of the
1197    /// // entire haystack, then the search can take the surrounding context
1198    /// // into account. (And if we did find a match, it would be reported
1199    /// // as a valid offset into `haystack` instead of its sub-slice.)
1200    /// let expected = None;
1201    /// re.try_search(
1202    ///     &mut cache, &Input::new(haystack).range(3..6), &mut caps,
1203    /// )?;
1204    /// assert_eq!(expected, caps.get_match());
1205    ///
1206    /// # Ok::<(), Box<dyn std::error::Error>>(())
1207    /// ```
1208    #[inline]
1209    pub fn try_search(
1210        &self,
1211        cache: &mut Cache,
1212        input: &Input<'_>,
1213        caps: &mut Captures,
1214    ) -> Result<(), MatchError> {
1215        caps.set_pattern(None);
1216        let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
1217        caps.set_pattern(pid);
1218        Ok(())
1219    }
1220
1221    /// Executes a leftmost forward search and writes the spans of capturing
1222    /// groups that participated in a match into the provided `slots`, and
1223    /// returns the matching pattern ID. The contents of the slots for patterns
1224    /// other than the matching pattern are unspecified. If no match was found,
1225    /// then `None` is returned and the contents of all `slots` is unspecified.
1226    ///
1227    /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw
1228    /// slots slice instead of a `Captures` value. This is useful in contexts
1229    /// where you don't want or need to allocate a `Captures`.
1230    ///
1231    /// It is legal to pass _any_ number of slots to this routine. If the regex
1232    /// engine would otherwise write a slot offset that doesn't fit in the
1233    /// provided slice, then it is simply skipped. In general though, there are
1234    /// usually three slice lengths you might want to use:
1235    ///
1236    /// * An empty slice, if you only care about which pattern matched.
1237    /// * A slice with
1238    /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1239    /// slots, if you only care about the overall match spans for each matching
1240    /// pattern.
1241    /// * A slice with
1242    /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1243    /// permits recording match offsets for every capturing group in every
1244    /// pattern.
1245    ///
1246    /// # Errors
1247    ///
1248    /// This routine only errors if the search could not complete. For this
1249    /// backtracking regex engine, this only occurs when the haystack length
1250    /// exceeds [`BoundedBacktracker::max_haystack_len`].
1251    ///
1252    /// When a search cannot complete, callers cannot know whether a match
1253    /// exists or not.
1254    ///
1255    /// # Example
1256    ///
1257    /// This example shows how to find the overall match offsets in a
1258    /// multi-pattern search without allocating a `Captures` value. Indeed, we
1259    /// can put our slots right on the stack.
1260    ///
1261    /// ```
1262    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1263    /// use regex_automata::{
1264    ///     nfa::thompson::backtrack::BoundedBacktracker,
1265    ///     PatternID, Input,
1266    /// };
1267    ///
1268    /// let re = BoundedBacktracker::new_many(&[
1269    ///     r"\pL+",
1270    ///     r"\d+",
1271    /// ])?;
1272    /// let mut cache = re.create_cache();
1273    /// let input = Input::new("!@#123");
1274    ///
1275    /// // We only care about the overall match offsets here, so we just
1276    /// // allocate two slots for each pattern. Each slot records the start
1277    /// // and end of the match.
1278    /// let mut slots = [None; 4];
1279    /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?;
1280    /// assert_eq!(Some(PatternID::must(1)), pid);
1281    ///
1282    /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1283    /// // See 'GroupInfo' for more details on the mapping between groups and
1284    /// // slot indices.
1285    /// let slot_start = pid.unwrap().as_usize() * 2;
1286    /// let slot_end = slot_start + 1;
1287    /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1288    /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1289    ///
1290    /// # Ok::<(), Box<dyn std::error::Error>>(())
1291    /// ```
1292    #[inline]
1293    pub fn try_search_slots(
1294        &self,
1295        cache: &mut Cache,
1296        input: &Input<'_>,
1297        slots: &mut [Option<NonMaxUsize>],
1298    ) -> Result<Option<PatternID>, MatchError> {
1299        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1300        if !utf8empty {
1301            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1302            return Ok(maybe_hm.map(|hm| hm.pattern()));
1303        }
1304        // See PikeVM::try_search_slots for why we do this.
1305        let min = self.get_nfa().group_info().implicit_slot_len();
1306        if slots.len() >= min {
1307            let maybe_hm = self.try_search_slots_imp(cache, input, slots)?;
1308            return Ok(maybe_hm.map(|hm| hm.pattern()));
1309        }
1310        if self.get_nfa().pattern_len() == 1 {
1311            let mut enough = [None, None];
1312            let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1313            // This is OK because we know `enough_slots` is strictly bigger
1314            // than `slots`, otherwise this special case isn't reached.
1315            slots.copy_from_slice(&enough[..slots.len()]);
1316            return Ok(got.map(|hm| hm.pattern()));
1317        }
1318        let mut enough = vec![None; min];
1319        let got = self.try_search_slots_imp(cache, input, &mut enough)?;
1320        // This is OK because we know `enough_slots` is strictly bigger than
1321        // `slots`, otherwise this special case isn't reached.
1322        slots.copy_from_slice(&enough[..slots.len()]);
1323        Ok(got.map(|hm| hm.pattern()))
1324    }
1325
1326    /// This is the actual implementation of `try_search_slots_imp` that
1327    /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1328    /// enabled, 2) the NFA can match the empty string and 3) the caller has
1329    /// provided an insufficient number of slots to record match offsets.
1330    #[inline(never)]
1331    fn try_search_slots_imp(
1332        &self,
1333        cache: &mut Cache,
1334        input: &Input<'_>,
1335        slots: &mut [Option<NonMaxUsize>],
1336    ) -> Result<Option<HalfMatch>, MatchError> {
1337        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1338        let hm = match self.search_imp(cache, input, slots)? {
1339            None => return Ok(None),
1340            Some(hm) if !utf8empty => return Ok(Some(hm)),
1341            Some(hm) => hm,
1342        };
1343        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1344            Ok(self
1345                .search_imp(cache, input, slots)?
1346                .map(|hm| (hm, hm.offset())))
1347        })
1348    }
1349
1350    /// The implementation of standard leftmost backtracking search.
1351    ///
1352    /// Capturing group spans are written to 'caps', but only if requested.
1353    /// 'caps' can be one of three things: 1) totally empty, in which case, we
1354    /// only report the pattern that matched or 2) only has slots for recording
1355    /// the overall match offsets for any pattern or 3) has all slots available
1356    /// for recording the spans of any groups participating in a match.
1357    fn search_imp(
1358        &self,
1359        cache: &mut Cache,
1360        input: &Input<'_>,
1361        slots: &mut [Option<NonMaxUsize>],
1362    ) -> Result<Option<HalfMatch>, MatchError> {
1363        // Unlike in the PikeVM, we write our capturing group spans directly
1364        // into the caller's captures groups. So we have to make sure we're
1365        // starting with a blank slate first. In the PikeVM, we avoid this
1366        // by construction: the spans that are copied to every slot in the
1367        // 'Captures' value already account for presence/absence. In this
1368        // backtracker, we write directly into the caller provided slots, where
1369        // as in the PikeVM, we write into scratch space first and only copy
1370        // them to the caller provided slots when a match is found.
1371        for slot in slots.iter_mut() {
1372            *slot = None;
1373        }
1374        cache.setup_search(&self, input)?;
1375        if input.is_done() {
1376            return Ok(None);
1377        }
1378        let (anchored, start_id) = match input.get_anchored() {
1379            // Only way we're unanchored is if both the caller asked for an
1380            // unanchored search *and* the pattern is itself not anchored.
1381            Anchored::No => (
1382                self.nfa.is_always_start_anchored(),
1383                // We always use the anchored starting state here, even if
1384                // doing an unanchored search. The "unanchored" part of it is
1385                // implemented in the loop below, by simply trying the next
1386                // byte offset if the previous backtracking exploration failed.
1387                self.nfa.start_anchored(),
1388            ),
1389            Anchored::Yes => (true, self.nfa.start_anchored()),
1390            Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
1391                None => return Ok(None),
1392                Some(sid) => (true, sid),
1393            },
1394        };
1395        if anchored {
1396            let at = input.start();
1397            return Ok(self.backtrack(cache, input, at, start_id, slots));
1398        }
1399        let pre = self.get_config().get_prefilter();
1400        let mut at = input.start();
1401        while at <= input.end() {
1402            if let Some(ref pre) = pre {
1403                let span = Span::from(at..input.end());
1404                match pre.find(input.haystack(), span) {
1405                    None => break,
1406                    Some(ref span) => at = span.start,
1407                }
1408            }
1409            if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
1410            {
1411                return Ok(Some(hm));
1412            }
1413            at += 1;
1414        }
1415        Ok(None)
1416    }
1417
1418    /// Look for a match starting at `at` in `input` and write the matching
1419    /// pattern ID and group spans to `caps`. The search uses `start_id` as its
1420    /// starting state in the underlying NFA.
1421    ///
1422    /// If no match was found, then the caller should increment `at` and try
1423    /// at the next position.
1424    #[cfg_attr(feature = "perf-inline", inline(always))]
1425    fn backtrack(
1426        &self,
1427        cache: &mut Cache,
1428        input: &Input<'_>,
1429        at: usize,
1430        start_id: StateID,
1431        slots: &mut [Option<NonMaxUsize>],
1432    ) -> Option<HalfMatch> {
1433        cache.stack.push(Frame::Step { sid: start_id, at });
1434        while let Some(frame) = cache.stack.pop() {
1435            match frame {
1436                Frame::Step { sid, at } => {
1437                    if let Some(hm) = self.step(cache, input, sid, at, slots) {
1438                        return Some(hm);
1439                    }
1440                }
1441                Frame::RestoreCapture { slot, offset } => {
1442                    slots[slot] = offset;
1443                }
1444            }
1445        }
1446        None
1447    }
1448
1449    // LAMENTATION: The actual backtracking search is implemented in about
1450    // 75 lines below. Yet this file is over 2,000 lines long. What have I
1451    // done?
1452
1453    /// Execute a "step" in the backtracing algorithm.
1454    ///
1455    /// A "step" is somewhat of a misnomer, because this routine keeps going
1456    /// until it either runs out of things to try or fins a match. In the
1457    /// former case, it may have pushed some things on to the backtracking
1458    /// stack, in which case, those will be tried next as part of the
1459    /// 'backtrack' routine above.
1460    #[cfg_attr(feature = "perf-inline", inline(always))]
1461    fn step(
1462        &self,
1463        cache: &mut Cache,
1464        input: &Input<'_>,
1465        mut sid: StateID,
1466        mut at: usize,
1467        slots: &mut [Option<NonMaxUsize>],
1468    ) -> Option<HalfMatch> {
1469        loop {
1470            if !cache.visited.insert(sid, at - input.start()) {
1471                return None;
1472            }
1473            match *self.nfa.state(sid) {
1474                State::ByteRange { ref trans } => {
1475                    // Why do we need this? Unlike other regex engines in this
1476                    // crate, the backtracker can steam roll ahead in the
1477                    // haystack outside of the main loop over the bytes in the
1478                    // haystack. While 'trans.matches()' below handles the case
1479                    // of 'at' being out of bounds of 'input.haystack()', we
1480                    // also need to handle the case of 'at' going out of bounds
1481                    // of the span the caller asked to search.
1482                    //
1483                    // We should perhaps make the 'trans.matches()' API accept
1484                    // an '&Input' instead of a '&[u8]'. Or at least, add a new
1485                    // API that does it.
1486                    if at >= input.end() {
1487                        return None;
1488                    }
1489                    if !trans.matches(input.haystack(), at) {
1490                        return None;
1491                    }
1492                    sid = trans.next;
1493                    at += 1;
1494                }
1495                State::Sparse(ref sparse) => {
1496                    if at >= input.end() {
1497                        return None;
1498                    }
1499                    sid = sparse.matches(input.haystack(), at)?;
1500                    at += 1;
1501                }
1502                State::Dense(ref dense) => {
1503                    if at >= input.end() {
1504                        return None;
1505                    }
1506                    sid = dense.matches(input.haystack(), at)?;
1507                    at += 1;
1508                }
1509                State::Look { look, next } => {
1510                    // OK because we don't permit building a searcher with a
1511                    // Unicode word boundary if the requisite Unicode data is
1512                    // unavailable.
1513                    if !self.nfa.look_matcher().matches_inline(
1514                        look,
1515                        input.haystack(),
1516                        at,
1517                    ) {
1518                        return None;
1519                    }
1520                    sid = next;
1521                }
1522                State::Union { ref alternates } => {
1523                    sid = match alternates.get(0) {
1524                        None => return None,
1525                        Some(&sid) => sid,
1526                    };
1527                    cache.stack.extend(
1528                        alternates[1..]
1529                            .iter()
1530                            .copied()
1531                            .rev()
1532                            .map(|sid| Frame::Step { sid, at }),
1533                    );
1534                }
1535                State::BinaryUnion { alt1, alt2 } => {
1536                    sid = alt1;
1537                    cache.stack.push(Frame::Step { sid: alt2, at });
1538                }
1539                State::Capture { next, slot, .. } => {
1540                    if slot.as_usize() < slots.len() {
1541                        cache.stack.push(Frame::RestoreCapture {
1542                            slot,
1543                            offset: slots[slot],
1544                        });
1545                        slots[slot] = NonMaxUsize::new(at);
1546                    }
1547                    sid = next;
1548                }
1549                State::Fail => return None,
1550                State::Match { pattern_id } => {
1551                    return Some(HalfMatch::new(pattern_id, at));
1552                }
1553            }
1554        }
1555    }
1556}
1557
1558/// An iterator over all non-overlapping matches for a fallible search.
1559///
1560/// The iterator yields a `Result<Match, MatchError` value until no more
1561/// matches could be found.
1562///
1563/// The lifetime parameters are as follows:
1564///
1565/// * `'r` represents the lifetime of the BoundedBacktracker.
1566/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1567/// * `'h` represents the lifetime of the haystack being searched.
1568///
1569/// This iterator can be created with the [`BoundedBacktracker::try_find_iter`]
1570/// method.
1571#[derive(Debug)]
1572pub struct TryFindMatches<'r, 'c, 'h> {
1573    re: &'r BoundedBacktracker,
1574    cache: &'c mut Cache,
1575    caps: Captures,
1576    it: iter::Searcher<'h>,
1577}
1578
1579impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
1580    type Item = Result<Match, MatchError>;
1581
1582    #[inline]
1583    fn next(&mut self) -> Option<Result<Match, MatchError>> {
1584        // Splitting 'self' apart seems necessary to appease borrowck.
1585        let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1586            *self;
1587        it.try_advance(|input| {
1588            re.try_search(cache, input, caps)?;
1589            Ok(caps.get_match())
1590        })
1591        .transpose()
1592    }
1593}
1594
1595/// An iterator over all non-overlapping leftmost matches, with their capturing
1596/// groups, for a fallible search.
1597///
1598/// The iterator yields a `Result<Captures, MatchError>` value until no more
1599/// matches could be found.
1600///
1601/// The lifetime parameters are as follows:
1602///
1603/// * `'r` represents the lifetime of the BoundedBacktracker.
1604/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
1605/// * `'h` represents the lifetime of the haystack being searched.
1606///
1607/// This iterator can be created with the
1608/// [`BoundedBacktracker::try_captures_iter`] method.
1609#[derive(Debug)]
1610pub struct TryCapturesMatches<'r, 'c, 'h> {
1611    re: &'r BoundedBacktracker,
1612    cache: &'c mut Cache,
1613    caps: Captures,
1614    it: iter::Searcher<'h>,
1615}
1616
1617impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> {
1618    type Item = Result<Captures, MatchError>;
1619
1620    #[inline]
1621    fn next(&mut self) -> Option<Result<Captures, MatchError>> {
1622        // Splitting 'self' apart seems necessary to appease borrowck.
1623        let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1624            *self;
1625        let _ = it
1626            .try_advance(|input| {
1627                re.try_search(cache, input, caps)?;
1628                Ok(caps.get_match())
1629            })
1630            .transpose()?;
1631        if caps.is_match() {
1632            Some(Ok(caps.clone()))
1633        } else {
1634            None
1635        }
1636    }
1637}
1638
1639/// A cache represents mutable state that a [`BoundedBacktracker`] requires
1640/// during a search.
1641///
1642/// For a given [`BoundedBacktracker`], its corresponding cache may be created
1643/// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`].
1644/// They are equivalent in every way, except the former does not require
1645/// explicitly importing `Cache`.
1646///
1647/// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which
1648/// it was created. It may only be used with that `BoundedBacktracker`. A cache
1649/// and its allocations may be re-purposed via [`Cache::reset`], in which case,
1650/// it can only be used with the new `BoundedBacktracker` (and not the old
1651/// one).
1652#[derive(Clone, Debug)]
1653pub struct Cache {
1654    /// Stack used on the heap for doing backtracking instead of the
1655    /// traditional recursive approach. We don't want recursion because then
1656    /// we're likely to hit a stack overflow for bigger regexes.
1657    stack: Vec<Frame>,
1658    /// The set of (StateID, HaystackOffset) pairs that have been visited
1659    /// by the backtracker within a single search. If such a pair has been
1660    /// visited, then we avoid doing the work for that pair again. This is
1661    /// what "bounds" the backtracking and prevents it from having worst case
1662    /// exponential time.
1663    visited: Visited,
1664}
1665
1666impl Cache {
1667    /// Create a new [`BoundedBacktracker`] cache.
1668    ///
1669    /// A potentially more convenient routine to create a cache is
1670    /// [`BoundedBacktracker::create_cache`], as it does not require also
1671    /// importing the `Cache` type.
1672    ///
1673    /// If you want to reuse the returned `Cache` with some other
1674    /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the
1675    /// desired `BoundedBacktracker`.
1676    pub fn new(re: &BoundedBacktracker) -> Cache {
1677        Cache { stack: vec![], visited: Visited::new(re) }
1678    }
1679
1680    /// Reset this cache such that it can be used for searching with different
1681    /// [`BoundedBacktracker`].
1682    ///
1683    /// A cache reset permits reusing memory already allocated in this cache
1684    /// with a different `BoundedBacktracker`.
1685    ///
1686    /// # Example
1687    ///
1688    /// This shows how to re-purpose a cache for use with a different
1689    /// `BoundedBacktracker`.
1690    ///
1691    /// ```
1692    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1693    /// use regex_automata::{
1694    ///     nfa::thompson::backtrack::BoundedBacktracker,
1695    ///     Match,
1696    /// };
1697    ///
1698    /// let re1 = BoundedBacktracker::new(r"\w")?;
1699    /// let re2 = BoundedBacktracker::new(r"\W")?;
1700    ///
1701    /// let mut cache = re1.create_cache();
1702    /// assert_eq!(
1703    ///     Some(Ok(Match::must(0, 0..2))),
1704    ///     re1.try_find_iter(&mut cache, "Δ").next(),
1705    /// );
1706    ///
1707    /// // Using 'cache' with re2 is not allowed. It may result in panics or
1708    /// // incorrect results. In order to re-purpose the cache, we must reset
1709    /// // it with the BoundedBacktracker we'd like to use it with.
1710    /// //
1711    /// // Similarly, after this reset, using the cache with 're1' is also not
1712    /// // allowed.
1713    /// cache.reset(&re2);
1714    /// assert_eq!(
1715    ///     Some(Ok(Match::must(0, 0..3))),
1716    ///     re2.try_find_iter(&mut cache, "☃").next(),
1717    /// );
1718    ///
1719    /// # Ok::<(), Box<dyn std::error::Error>>(())
1720    /// ```
1721    pub fn reset(&mut self, re: &BoundedBacktracker) {
1722        self.visited.reset(re);
1723    }
1724
1725    /// Returns the heap memory usage, in bytes, of this cache.
1726    ///
1727    /// This does **not** include the stack size used up by this cache. To
1728    /// compute that, use `std::mem::size_of::<Cache>()`.
1729    pub fn memory_usage(&self) -> usize {
1730        self.stack.len() * core::mem::size_of::<Frame>()
1731            + self.visited.memory_usage()
1732    }
1733
1734    /// Clears this cache. This should be called at the start of every search
1735    /// to ensure we start with a clean slate.
1736    ///
1737    /// This also sets the length of the capturing groups used in the current
1738    /// search. This permits an optimization where by 'SlotTable::for_state'
1739    /// only returns the number of slots equivalent to the number of slots
1740    /// given in the 'Captures' value. This may be less than the total number
1741    /// of possible slots, e.g., when one only wants to track overall match
1742    /// offsets. This in turn permits less copying of capturing group spans
1743    /// in the BoundedBacktracker.
1744    fn setup_search(
1745        &mut self,
1746        re: &BoundedBacktracker,
1747        input: &Input<'_>,
1748    ) -> Result<(), MatchError> {
1749        self.stack.clear();
1750        self.visited.setup_search(re, input)?;
1751        Ok(())
1752    }
1753}
1754
1755/// Represents a stack frame on the heap while doing backtracking.
1756///
1757/// Instead of using explicit recursion for backtracking, we use a stack on
1758/// the heap to keep track of things that we want to explore if the current
1759/// backtracking branch turns out to not lead to a match.
1760#[derive(Clone, Debug)]
1761enum Frame {
1762    /// Look for a match starting at `sid` and the given position in the
1763    /// haystack.
1764    Step { sid: StateID, at: usize },
1765    /// Reset the given `slot` to the given `offset` (which might be `None`).
1766    /// This effectively gives a "scope" to capturing groups, such that an
1767    /// offset for a particular group only gets returned if the match goes
1768    /// through that capturing group. If backtracking ends up going down a
1769    /// different branch that results in a different offset (or perhaps none at
1770    /// all), then this "restore capture" frame will cause the offset to get
1771    /// reset.
1772    RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
1773}
1774
1775/// A bitset that keeps track of whether a particular (StateID, offset) has
1776/// been considered during backtracking. If it has already been visited, then
1777/// backtracking skips it. This is what gives backtracking its "bound."
1778#[derive(Clone, Debug)]
1779struct Visited {
1780    /// The actual underlying bitset. Each element in the bitset corresponds
1781    /// to a particular (StateID, offset) pair. States correspond to the rows
1782    /// and the offsets correspond to the columns.
1783    ///
1784    /// If our underlying NFA has N states and the haystack we're searching
1785    /// has M bytes, then we have N*(M+1) entries in our bitset table. The
1786    /// M+1 occurs because our matches are delayed by one byte (to support
1787    /// look-around), and so we need to handle the end position itself rather
1788    /// than stopping just before the end. (If there is no end position, then
1789    /// it's treated as "end-of-input," which is matched by things like '$'.)
1790    ///
1791    /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize))
1792    /// blocks.
1793    ///
1794    /// We use 'usize' to represent our blocks because it makes some of the
1795    /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for
1796    /// our block, we'd either need to cast u32s to usizes or usizes to u32s.
1797    bitset: Vec<usize>,
1798    /// The stride represents one plus length of the haystack we're searching
1799    /// (as described above). The stride must be initialized for each search.
1800    stride: usize,
1801}
1802
1803impl Visited {
1804    /// The size of each block, in bits.
1805    const BLOCK_SIZE: usize = 8 * core::mem::size_of::<usize>();
1806
1807    /// Create a new visited set for the given backtracker.
1808    ///
1809    /// The set is ready to use, but must be setup at the beginning of each
1810    /// search by calling `setup_search`.
1811    fn new(re: &BoundedBacktracker) -> Visited {
1812        let mut visited = Visited { bitset: vec![], stride: 0 };
1813        visited.reset(re);
1814        visited
1815    }
1816
1817    /// Insert the given (StateID, offset) pair into this set. If it already
1818    /// exists, then this is a no-op and it returns false. Otherwise this
1819    /// returns true.
1820    fn insert(&mut self, sid: StateID, at: usize) -> bool {
1821        let table_index = sid.as_usize() * self.stride + at;
1822        let block_index = table_index / Visited::BLOCK_SIZE;
1823        let bit = table_index % Visited::BLOCK_SIZE;
1824        let block_with_bit = 1 << bit;
1825        if self.bitset[block_index] & block_with_bit != 0 {
1826            return false;
1827        }
1828        self.bitset[block_index] |= block_with_bit;
1829        true
1830    }
1831
1832    /// Reset this visited set to work with the given bounded backtracker.
1833    fn reset(&mut self, _: &BoundedBacktracker) {
1834        self.bitset.truncate(0);
1835    }
1836
1837    /// Setup this visited set to work for a search using the given NFA
1838    /// and input configuration. The NFA must be the same NFA used by the
1839    /// BoundedBacktracker given to Visited::reset. Failing to call this might
1840    /// result in panics or silently incorrect search behavior.
1841    fn setup_search(
1842        &mut self,
1843        re: &BoundedBacktracker,
1844        input: &Input<'_>,
1845    ) -> Result<(), MatchError> {
1846        // Our haystack length is only the length of the span of the entire
1847        // haystack that we'll be searching.
1848        let haylen = input.get_span().len();
1849        let err = || MatchError::haystack_too_long(haylen);
1850        // Our stride is one more than the length of the input because our main
1851        // search loop includes the position at input.end(). (And it does this
1852        // because matches are delayed by one byte to account for look-around.)
1853        self.stride = haylen + 1;
1854        let needed_capacity =
1855            match re.get_nfa().states().len().checked_mul(self.stride) {
1856                None => return Err(err()),
1857                Some(capacity) => capacity,
1858            };
1859        let max_capacity = 8 * re.get_config().get_visited_capacity();
1860        if needed_capacity > max_capacity {
1861            return Err(err());
1862        }
1863        let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE);
1864        self.bitset.truncate(needed_blocks);
1865        for block in self.bitset.iter_mut() {
1866            *block = 0;
1867        }
1868        if needed_blocks > self.bitset.len() {
1869            self.bitset.resize(needed_blocks, 0);
1870        }
1871        Ok(())
1872    }
1873
1874    /// Return the heap memory usage, in bytes, of this visited set.
1875    fn memory_usage(&self) -> usize {
1876        self.bitset.len() * core::mem::size_of::<usize>()
1877    }
1878}
1879
1880/// Integer division, but rounds up instead of down.
1881fn div_ceil(lhs: usize, rhs: usize) -> usize {
1882    if lhs % rhs == 0 {
1883        lhs / rhs
1884    } else {
1885        (lhs / rhs) + 1
1886    }
1887}
1888
1889#[cfg(test)]
1890mod tests {
1891    use super::*;
1892
1893    // This is a regression test for the maximum haystack length computation.
1894    // Previously, it assumed that the total capacity of the backtracker's
1895    // bitset would always be greater than the number of NFA states. But there
1896    // is of course no guarantee that this is true. This regression test
1897    // ensures that not only does `max_haystack_len` not panic, but that it
1898    // should return `0`.
1899    #[cfg(feature = "syntax")]
1900    #[test]
1901    fn max_haystack_len_overflow() {
1902        let re = BoundedBacktracker::builder()
1903            .configure(BoundedBacktracker::config().visited_capacity(10))
1904            .build(r"[0-9A-Za-z]{100}")
1905            .unwrap();
1906        assert_eq!(0, re.max_haystack_len());
1907    }
1908}
regex_automata/nfa/thompson/backtrack.rs

regex_automata/nfa/thompson/
backtrack.rs