regex_automata/nfa/thompson/
pikevm.rs

1/*!
2An NFA backed Pike VM for executing regex searches with capturing groups.
3
4This module provides a [`PikeVM`] that works by simulating an NFA and
5resolving all spans of capturing groups that participate in a match.
6*/
7
8#[cfg(feature = "internal-instrument-pikevm")]
9use core::cell::RefCell;
10
11use alloc::{vec, vec::Vec};
12
13use crate::{
14    nfa::thompson::{self, BuildError, State, NFA},
15    util::{
16        captures::Captures,
17        empty, iter,
18        prefilter::Prefilter,
19        primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
20        search::{
21            Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
22        },
23        sparse_set::SparseSet,
24    },
25};
26
27/// A simple macro for conditionally executing instrumentation logic when
28/// the 'trace' log level is enabled. This is a compile-time no-op when the
29/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
30/// this makes it easier to avoid doing extra work when instrumentation isn't
31/// enabled.
32///
33/// This macro accepts a closure of type `|&mut Counters|`. The closure can
34/// then increment counters (or whatever) in accordance with what one wants
35/// to track.
36macro_rules! instrument {
37    ($fun:expr) => {
38        #[cfg(feature = "internal-instrument-pikevm")]
39        {
40            let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
41            COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
42        }
43    };
44}
45
46#[cfg(feature = "internal-instrument-pikevm")]
47std::thread_local! {
48    /// Effectively global state used to keep track of instrumentation
49    /// counters. The "proper" way to do this is to thread it through the
50    /// PikeVM, but it makes the code quite icky. Since this is just a
51    /// debugging feature, we're content to relegate it to thread local
52    /// state. When instrumentation is enabled, the counters are reset at the
53    /// beginning of every search and printed (with the 'trace' log level) at
54    /// the end of every search.
55    static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
56}
57
58/// The configuration used for building a [`PikeVM`].
59///
60/// A PikeVM configuration is a simple data object that is typically used with
61/// [`Builder::configure`]. It can be cheaply cloned.
62///
63/// A default configuration can be created either with `Config::new`, or
64/// perhaps more conveniently, with [`PikeVM::config`].
65#[derive(Clone, Debug, Default)]
66pub struct Config {
67    match_kind: Option<MatchKind>,
68    pre: Option<Option<Prefilter>>,
69}
70
71impl Config {
72    /// Return a new default PikeVM configuration.
73    pub fn new() -> Config {
74        Config::default()
75    }
76
77    /// Set the desired match semantics.
78    ///
79    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
80    /// match semantics of Perl-like regex engines. That is, when multiple
81    /// patterns would match at the same leftmost position, the pattern that
82    /// appears first in the concrete syntax is chosen.
83    ///
84    /// Currently, the only other kind of match semantics supported is
85    /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
86    /// where all possible matches are visited in the NFA by the `PikeVM`.
87    ///
88    /// Typically, `All` is used when one wants to execute an overlapping
89    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
90    /// sense to use `All` with the various "leftmost" find routines, since the
91    /// leftmost routines depend on the `LeftmostFirst` automata construction
92    /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM`
93    /// simulating dead states as a way to terminate the search and report a
94    /// match. `LeftmostFirst` also supports non-greedy matches using this
95    /// strategy where as `All` does not.
96    pub fn match_kind(mut self, kind: MatchKind) -> Config {
97        self.match_kind = Some(kind);
98        self
99    }
100
101    /// Set a prefilter to be used whenever a start state is entered.
102    ///
103    /// A [`Prefilter`] in this context is meant to accelerate searches by
104    /// looking for literal prefixes that every match for the corresponding
105    /// pattern (or patterns) must start with. Once a prefilter produces a
106    /// match, the underlying search routine continues on to try and confirm
107    /// the match.
108    ///
109    /// Be warned that setting a prefilter does not guarantee that the search
110    /// will be faster. While it's usually a good bet, if the prefilter
111    /// produces a lot of false positive candidates (i.e., positions matched
112    /// by the prefilter but not by the regex), then the overall result can
113    /// be slower than if you had just executed the regex engine without any
114    /// prefilters.
115    ///
116    /// By default no prefilter is set.
117    ///
118    /// # Example
119    ///
120    /// ```
121    /// use regex_automata::{
122    ///     nfa::thompson::pikevm::PikeVM,
123    ///     util::prefilter::Prefilter,
124    ///     Input, Match, MatchKind,
125    /// };
126    ///
127    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
128    /// let re = PikeVM::builder()
129    ///     .configure(PikeVM::config().prefilter(pre))
130    ///     .build(r"(foo|bar)[a-z]+")?;
131    /// let mut cache = re.create_cache();
132    /// let input = Input::new("foo1 barfox bar");
133    /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input));
134    ///
135    /// # Ok::<(), Box<dyn std::error::Error>>(())
136    /// ```
137    ///
138    /// Be warned though that an incorrect prefilter can lead to incorrect
139    /// results!
140    ///
141    /// ```
142    /// use regex_automata::{
143    ///     nfa::thompson::pikevm::PikeVM,
144    ///     util::prefilter::Prefilter,
145    ///     Input, HalfMatch, MatchKind,
146    /// };
147    ///
148    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
149    /// let re = PikeVM::builder()
150    ///     .configure(PikeVM::config().prefilter(pre))
151    ///     .build(r"(foo|bar)[a-z]+")?;
152    /// let mut cache = re.create_cache();
153    /// let input = Input::new("foo1 barfox bar");
154    /// // No match reported even though there clearly is one!
155    /// assert_eq!(None, re.find(&mut cache, input));
156    ///
157    /// # Ok::<(), Box<dyn std::error::Error>>(())
158    /// ```
159    pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
160        self.pre = Some(pre);
161        self
162    }
163
164    /// Returns the match semantics set in this configuration.
165    pub fn get_match_kind(&self) -> MatchKind {
166        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
167    }
168
169    /// Returns the prefilter set in this configuration, if one at all.
170    pub fn get_prefilter(&self) -> Option<&Prefilter> {
171        self.pre.as_ref().unwrap_or(&None).as_ref()
172    }
173
174    /// Overwrite the default configuration such that the options in `o` are
175    /// always used. If an option in `o` is not set, then the corresponding
176    /// option in `self` is used. If it's not set in `self` either, then it
177    /// remains not set.
178    pub(crate) fn overwrite(&self, o: Config) -> Config {
179        Config {
180            match_kind: o.match_kind.or(self.match_kind),
181            pre: o.pre.or_else(|| self.pre.clone()),
182        }
183    }
184}
185
186/// A builder for a `PikeVM`.
187///
188/// This builder permits configuring options for the syntax of a pattern,
189/// the NFA construction and the `PikeVM` construction. This builder is
190/// different from a general purpose regex builder in that it permits fine
191/// grain configuration of the construction process. The trade off for this is
192/// complexity, and the possibility of setting a configuration that might not
193/// make sense. For example, there are two different UTF-8 modes:
194///
195/// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8)
196/// controls whether the pattern itself can contain sub-expressions that match
197/// invalid UTF-8.
198/// * [`thompson::Config::utf8`] controls whether empty matches that split a
199/// Unicode codepoint are reported or not.
200///
201/// Generally speaking, callers will want to either enable all of these or
202/// disable all of these.
203///
204/// # Example
205///
206/// This example shows how to disable UTF-8 mode in the syntax and the regex
207/// itself. This is generally what you want for matching on arbitrary bytes.
208///
209/// ```
210/// use regex_automata::{
211///     nfa::thompson::{self, pikevm::PikeVM},
212///     util::syntax,
213///     Match,
214/// };
215///
216/// let re = PikeVM::builder()
217///     .syntax(syntax::Config::new().utf8(false))
218///     .thompson(thompson::Config::new().utf8(false))
219///     .build(r"foo(?-u:[^b])ar.*")?;
220/// let mut cache = re.create_cache();
221///
222/// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
223/// let expected = Some(Match::must(0, 1..9));
224/// let got = re.find_iter(&mut cache, haystack).next();
225/// assert_eq!(expected, got);
226/// // Notice that `(?-u:[^b])` matches invalid UTF-8,
227/// // but the subsequent `.*` does not! Disabling UTF-8
228/// // on the syntax permits this.
229/// //
230/// // N.B. This example does not show the impact of
231/// // disabling UTF-8 mode on a PikeVM Config, since that
232/// // only impacts regexes that can produce matches of
233/// // length 0.
234/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]);
235///
236/// # Ok::<(), Box<dyn std::error::Error>>(())
237/// ```
238#[derive(Clone, Debug)]
239pub struct Builder {
240    config: Config,
241    #[cfg(feature = "syntax")]
242    thompson: thompson::Compiler,
243}
244
245impl Builder {
246    /// Create a new PikeVM builder with its default configuration.
247    pub fn new() -> Builder {
248        Builder {
249            config: Config::default(),
250            #[cfg(feature = "syntax")]
251            thompson: thompson::Compiler::new(),
252        }
253    }
254
255    /// Build a `PikeVM` from the given pattern.
256    ///
257    /// If there was a problem parsing or compiling the pattern, then an error
258    /// is returned.
259    #[cfg(feature = "syntax")]
260    pub fn build(&self, pattern: &str) -> Result<PikeVM, BuildError> {
261        self.build_many(&[pattern])
262    }
263
264    /// Build a `PikeVM` from the given patterns.
265    #[cfg(feature = "syntax")]
266    pub fn build_many<P: AsRef<str>>(
267        &self,
268        patterns: &[P],
269    ) -> Result<PikeVM, BuildError> {
270        let nfa = self.thompson.build_many(patterns)?;
271        self.build_from_nfa(nfa)
272    }
273
274    /// Build a `PikeVM` directly from its NFA.
275    ///
276    /// Note that when using this method, any configuration that applies to the
277    /// construction of the NFA itself will of course be ignored, since the NFA
278    /// given here is already built.
279    pub fn build_from_nfa(&self, nfa: NFA) -> Result<PikeVM, BuildError> {
280        nfa.look_set_any().available().map_err(BuildError::word)?;
281        Ok(PikeVM { config: self.config.clone(), nfa })
282    }
283
284    /// Apply the given `PikeVM` configuration options to this builder.
285    pub fn configure(&mut self, config: Config) -> &mut Builder {
286        self.config = self.config.overwrite(config);
287        self
288    }
289
290    /// Set the syntax configuration for this builder using
291    /// [`syntax::Config`](crate::util::syntax::Config).
292    ///
293    /// This permits setting things like case insensitivity, Unicode and multi
294    /// line mode.
295    ///
296    /// These settings only apply when constructing a PikeVM directly from a
297    /// pattern.
298    #[cfg(feature = "syntax")]
299    pub fn syntax(
300        &mut self,
301        config: crate::util::syntax::Config,
302    ) -> &mut Builder {
303        self.thompson.syntax(config);
304        self
305    }
306
307    /// Set the Thompson NFA configuration for this builder using
308    /// [`nfa::thompson::Config`](crate::nfa::thompson::Config).
309    ///
310    /// This permits setting things like if additional time should be spent
311    /// shrinking the size of the NFA.
312    ///
313    /// These settings only apply when constructing a PikeVM directly from a
314    /// pattern.
315    #[cfg(feature = "syntax")]
316    pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder {
317        self.thompson.configure(config);
318        self
319    }
320}
321
322/// A virtual machine for executing regex searches with capturing groups.
323///
324/// # Infallible APIs
325///
326/// Unlike most other regex engines in this crate, a `PikeVM` never returns an
327/// error at search time. It supports all [`Anchored`] configurations, never
328/// quits and works on haystacks of arbitrary length.
329///
330/// There are two caveats to mention though:
331///
332/// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`],
333/// then the PikeVM will report "no match." This is consistent with all other
334/// regex engines in this crate.
335/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`]
336/// that has insufficient capacity to store all valid pattern IDs, then if a
337/// match occurs for a `PatternID` that cannot be inserted, it is silently
338/// dropped as if it did not match.
339///
340/// # Advice
341///
342/// The `PikeVM` is generally the most "powerful" regex engine in this crate.
343/// "Powerful" in this context means that it can handle any regular expression
344/// that is parseable by `regex-syntax` and any size haystack. Regretably,
345/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in
346/// practice. This results in an annoying situation where one generally tries
347/// to pick any other regex engine (or perhaps none at all) before being
348/// forced to fall back to a `PikeVM`.
349///
350/// For example, a common strategy for dealing with capturing groups is to
351/// actually look for the overall match of the regex using a faster regex
352/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall
353/// match is found, one can then run the `PikeVM` on just the match span to
354/// find the spans of the capturing groups. In this way, the faster regex
355/// engine does the majority of the work, while the `PikeVM` only lends its
356/// power in a more limited role.
357///
358/// Unfortunately, this isn't always possible because the faster regex engines
359/// don't support all of the regex features in `regex-syntax`. This notably
360/// includes (and is currently limited to) Unicode word boundaries. So if
361/// your pattern has Unicode word boundaries, you typically can't use a
362/// DFA-based regex engine at all (unless you [enable heuristic support for
363/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass
364/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for
365/// anchored searches only, but in a cruel sort of joke, many Unicode features
366/// tend to result in making the regex _not_ one-pass.)
367///
368/// # Example
369///
370/// This example shows that the `PikeVM` implements Unicode word boundaries
371/// correctly by default.
372///
373/// ```
374/// # if cfg!(miri) { return Ok(()); } // miri takes too long
375/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
376///
377/// let re = PikeVM::new(r"\b\w+\b")?;
378/// let mut cache = re.create_cache();
379///
380/// let mut it = re.find_iter(&mut cache, "Шерлок Холмс");
381/// assert_eq!(Some(Match::must(0, 0..12)), it.next());
382/// assert_eq!(Some(Match::must(0, 13..23)), it.next());
383/// assert_eq!(None, it.next());
384/// # Ok::<(), Box<dyn std::error::Error>>(())
385/// ```
386#[derive(Clone, Debug)]
387pub struct PikeVM {
388    config: Config,
389    nfa: NFA,
390}
391
392impl PikeVM {
393    /// Parse the given regular expression using the default configuration and
394    /// return the corresponding `PikeVM`.
395    ///
396    /// If you want a non-default configuration, then use the [`Builder`] to
397    /// set your own configuration.
398    ///
399    /// # Example
400    ///
401    /// ```
402    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
403    ///
404    /// let re = PikeVM::new("foo[0-9]+bar")?;
405    /// let mut cache = re.create_cache();
406    /// assert_eq!(
407    ///     Some(Match::must(0, 3..14)),
408    ///     re.find_iter(&mut cache, "zzzfoo12345barzzz").next(),
409    /// );
410    /// # Ok::<(), Box<dyn std::error::Error>>(())
411    /// ```
412    #[cfg(feature = "syntax")]
413    pub fn new(pattern: &str) -> Result<PikeVM, BuildError> {
414        PikeVM::builder().build(pattern)
415    }
416
417    /// Like `new`, but parses multiple patterns into a single "multi regex."
418    /// This similarly uses the default regex configuration.
419    ///
420    /// # Example
421    ///
422    /// ```
423    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
424    ///
425    /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?;
426    /// let mut cache = re.create_cache();
427    ///
428    /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux");
429    /// assert_eq!(Some(Match::must(0, 0..3)), it.next());
430    /// assert_eq!(Some(Match::must(1, 4..5)), it.next());
431    /// assert_eq!(Some(Match::must(0, 6..9)), it.next());
432    /// assert_eq!(Some(Match::must(1, 10..14)), it.next());
433    /// assert_eq!(Some(Match::must(1, 15..16)), it.next());
434    /// assert_eq!(Some(Match::must(0, 17..21)), it.next());
435    /// assert_eq!(None, it.next());
436    /// # Ok::<(), Box<dyn std::error::Error>>(())
437    /// ```
438    #[cfg(feature = "syntax")]
439    pub fn new_many<P: AsRef<str>>(
440        patterns: &[P],
441    ) -> Result<PikeVM, BuildError> {
442        PikeVM::builder().build_many(patterns)
443    }
444
445    /// Like `new`, but builds a PikeVM directly from an NFA. This is useful
446    /// if you already have an NFA, or even if you hand-assembled the NFA.
447    ///
448    /// # Example
449    ///
450    /// This shows how to hand assemble a regular expression via its HIR,
451    /// compile an NFA from it and build a PikeVM from the NFA.
452    ///
453    /// ```
454    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
455    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
456    ///
457    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
458    ///     ClassBytesRange::new(b'0', b'9'),
459    ///     ClassBytesRange::new(b'A', b'Z'),
460    ///     ClassBytesRange::new(b'_', b'_'),
461    ///     ClassBytesRange::new(b'a', b'z'),
462    /// ])));
463    ///
464    /// let config = NFA::config().nfa_size_limit(Some(1_000));
465    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
466    ///
467    /// let re = PikeVM::new_from_nfa(nfa)?;
468    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
469    /// let expected = Some(Match::must(0, 3..4));
470    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
471    /// assert_eq!(expected, caps.get_match());
472    ///
473    /// # Ok::<(), Box<dyn std::error::Error>>(())
474    /// ```
475    pub fn new_from_nfa(nfa: NFA) -> Result<PikeVM, BuildError> {
476        PikeVM::builder().build_from_nfa(nfa)
477    }
478
479    /// Create a new `PikeVM` that matches every input.
480    ///
481    /// # Example
482    ///
483    /// ```
484    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
485    ///
486    /// let re = PikeVM::always_match()?;
487    /// let mut cache = re.create_cache();
488    ///
489    /// let expected = Match::must(0, 0..0);
490    /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next());
491    /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next());
492    /// # Ok::<(), Box<dyn std::error::Error>>(())
493    /// ```
494    pub fn always_match() -> Result<PikeVM, BuildError> {
495        let nfa = thompson::NFA::always_match();
496        PikeVM::new_from_nfa(nfa)
497    }
498
499    /// Create a new `PikeVM` that never matches any input.
500    ///
501    /// # Example
502    ///
503    /// ```
504    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
505    ///
506    /// let re = PikeVM::never_match()?;
507    /// let mut cache = re.create_cache();
508    ///
509    /// assert_eq!(None, re.find_iter(&mut cache, "").next());
510    /// assert_eq!(None, re.find_iter(&mut cache, "foo").next());
511    /// # Ok::<(), Box<dyn std::error::Error>>(())
512    /// ```
513    pub fn never_match() -> Result<PikeVM, BuildError> {
514        let nfa = thompson::NFA::never_match();
515        PikeVM::new_from_nfa(nfa)
516    }
517
518    /// Return a default configuration for a `PikeVM`.
519    ///
520    /// This is a convenience routine to avoid needing to import the `Config`
521    /// type when customizing the construction of a `PikeVM`.
522    ///
523    /// # Example
524    ///
525    /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
526    /// disabled, zero-width matches that split a codepoint are allowed.
527    /// Otherwise they are never reported.
528    ///
529    /// In the code below, notice that `""` is permitted to match positions
530    /// that split the encoding of a codepoint.
531    ///
532    /// ```
533    /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
534    ///
535    /// let re = PikeVM::builder()
536    ///     .thompson(thompson::Config::new().utf8(false))
537    ///     .build(r"")?;
538    /// let mut cache = re.create_cache();
539    ///
540    /// let haystack = "a☃z";
541    /// let mut it = re.find_iter(&mut cache, haystack);
542    /// assert_eq!(Some(Match::must(0, 0..0)), it.next());
543    /// assert_eq!(Some(Match::must(0, 1..1)), it.next());
544    /// assert_eq!(Some(Match::must(0, 2..2)), it.next());
545    /// assert_eq!(Some(Match::must(0, 3..3)), it.next());
546    /// assert_eq!(Some(Match::must(0, 4..4)), it.next());
547    /// assert_eq!(Some(Match::must(0, 5..5)), it.next());
548    /// assert_eq!(None, it.next());
549    ///
550    /// # Ok::<(), Box<dyn std::error::Error>>(())
551    /// ```
552    pub fn config() -> Config {
553        Config::new()
554    }
555
556    /// Return a builder for configuring the construction of a `PikeVM`.
557    ///
558    /// This is a convenience routine to avoid needing to import the
559    /// [`Builder`] type in common cases.
560    ///
561    /// # Example
562    ///
563    /// This example shows how to use the builder to disable UTF-8 mode
564    /// everywhere.
565    ///
566    /// ```
567    /// use regex_automata::{
568    ///     nfa::thompson::{self, pikevm::PikeVM},
569    ///     util::syntax,
570    ///     Match,
571    /// };
572    ///
573    /// let re = PikeVM::builder()
574    ///     .syntax(syntax::Config::new().utf8(false))
575    ///     .thompson(thompson::Config::new().utf8(false))
576    ///     .build(r"foo(?-u:[^b])ar.*")?;
577    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
578    ///
579    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
580    /// let expected = Some(Match::must(0, 1..9));
581    /// re.captures(&mut cache, haystack, &mut caps);
582    /// assert_eq!(expected, caps.get_match());
583    ///
584    /// # Ok::<(), Box<dyn std::error::Error>>(())
585    /// ```
586    pub fn builder() -> Builder {
587        Builder::new()
588    }
589
590    /// Create a new empty set of capturing groups that is guaranteed to be
591    /// valid for the search APIs on this `PikeVM`.
592    ///
593    /// A `Captures` value created for a specific `PikeVM` cannot be used with
594    /// any other `PikeVM`.
595    ///
596    /// This is a convenience function for [`Captures::all`]. See the
597    /// [`Captures`] documentation for an explanation of its alternative
598    /// constructors that permit the `PikeVM` to do less work during a search,
599    /// and thus might make it faster.
600    pub fn create_captures(&self) -> Captures {
601        Captures::all(self.get_nfa().group_info().clone())
602    }
603
604    /// Create a new cache for this `PikeVM`.
605    ///
606    /// The cache returned should only be used for searches for this
607    /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then
608    /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently,
609    /// [`PikeVM::reset_cache`]).
610    pub fn create_cache(&self) -> Cache {
611        Cache::new(self)
612    }
613
614    /// Reset the given cache such that it can be used for searching with the
615    /// this `PikeVM` (and only this `PikeVM`).
616    ///
617    /// A cache reset permits reusing memory already allocated in this cache
618    /// with a different `PikeVM`.
619    ///
620    /// # Example
621    ///
622    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
623    ///
624    /// ```
625    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
626    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
627    ///
628    /// let re1 = PikeVM::new(r"\w")?;
629    /// let re2 = PikeVM::new(r"\W")?;
630    ///
631    /// let mut cache = re1.create_cache();
632    /// assert_eq!(
633    ///     Some(Match::must(0, 0..2)),
634    ///     re1.find_iter(&mut cache, "Δ").next(),
635    /// );
636    ///
637    /// // Using 'cache' with re2 is not allowed. It may result in panics or
638    /// // incorrect results. In order to re-purpose the cache, we must reset
639    /// // it with the PikeVM we'd like to use it with.
640    /// //
641    /// // Similarly, after this reset, using the cache with 're1' is also not
642    /// // allowed.
643    /// re2.reset_cache(&mut cache);
644    /// assert_eq!(
645    ///     Some(Match::must(0, 0..3)),
646    ///     re2.find_iter(&mut cache, "☃").next(),
647    /// );
648    ///
649    /// # Ok::<(), Box<dyn std::error::Error>>(())
650    /// ```
651    pub fn reset_cache(&self, cache: &mut Cache) {
652        cache.reset(self);
653    }
654
655    /// Returns the total number of patterns compiled into this `PikeVM`.
656    ///
657    /// In the case of a `PikeVM` that contains no patterns, this returns `0`.
658    ///
659    /// # Example
660    ///
661    /// This example shows the pattern length for a `PikeVM` that never
662    /// matches:
663    ///
664    /// ```
665    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
666    ///
667    /// let re = PikeVM::never_match()?;
668    /// assert_eq!(re.pattern_len(), 0);
669    /// # Ok::<(), Box<dyn std::error::Error>>(())
670    /// ```
671    ///
672    /// And another example for a `PikeVM` that matches at every position:
673    ///
674    /// ```
675    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
676    ///
677    /// let re = PikeVM::always_match()?;
678    /// assert_eq!(re.pattern_len(), 1);
679    /// # Ok::<(), Box<dyn std::error::Error>>(())
680    /// ```
681    ///
682    /// And finally, a `PikeVM` that was constructed from multiple patterns:
683    ///
684    /// ```
685    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
686    ///
687    /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
688    /// assert_eq!(re.pattern_len(), 3);
689    /// # Ok::<(), Box<dyn std::error::Error>>(())
690    /// ```
691    pub fn pattern_len(&self) -> usize {
692        self.nfa.pattern_len()
693    }
694
695    /// Return the config for this `PikeVM`.
696    #[inline]
697    pub fn get_config(&self) -> &Config {
698        &self.config
699    }
700
701    /// Returns a reference to the underlying NFA.
702    #[inline]
703    pub fn get_nfa(&self) -> &NFA {
704        &self.nfa
705    }
706}
707
708impl PikeVM {
709    /// Returns true if and only if this `PikeVM` matches the given haystack.
710    ///
711    /// This routine may short circuit if it knows that scanning future
712    /// input will never lead to a different result. In particular, if the
713    /// underlying NFA enters a match state, then this routine will return
714    /// `true` immediately without inspecting any future input. (Consider how
715    /// this might make a difference given the regex `a+` on the haystack
716    /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
717    /// but routines like `find` need to continue searching because `+` is
718    /// greedy by default.)
719    ///
720    /// # Example
721    ///
722    /// This shows basic usage:
723    ///
724    /// ```
725    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
726    ///
727    /// let re = PikeVM::new("foo[0-9]+bar")?;
728    /// let mut cache = re.create_cache();
729    ///
730    /// assert!(re.is_match(&mut cache, "foo12345bar"));
731    /// assert!(!re.is_match(&mut cache, "foobar"));
732    /// # Ok::<(), Box<dyn std::error::Error>>(())
733    /// ```
734    ///
735    /// # Example: consistency with search APIs
736    ///
737    /// `is_match` is guaranteed to return `true` whenever `find` returns a
738    /// match. This includes searches that are executed entirely within a
739    /// codepoint:
740    ///
741    /// ```
742    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};
743    ///
744    /// let re = PikeVM::new("a*")?;
745    /// let mut cache = re.create_cache();
746    ///
747    /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
748    /// # Ok::<(), Box<dyn std::error::Error>>(())
749    /// ```
750    ///
751    /// Notice that when UTF-8 mode is disabled, then the above reports a
752    /// match because the restriction against zero-width matches that split a
753    /// codepoint has been lifted:
754    ///
755    /// ```
756    /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input};
757    ///
758    /// let re = PikeVM::builder()
759    ///     .thompson(NFA::config().utf8(false))
760    ///     .build("a*")?;
761    /// let mut cache = re.create_cache();
762    ///
763    /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
764    /// # Ok::<(), Box<dyn std::error::Error>>(())
765    /// ```
766    #[inline]
767    pub fn is_match<'h, I: Into<Input<'h>>>(
768        &self,
769        cache: &mut Cache,
770        input: I,
771    ) -> bool {
772        let input = input.into().earliest(true);
773        self.search_slots(cache, &input, &mut []).is_some()
774    }
775
776    /// Executes a leftmost forward search and returns a `Match` if one exists.
777    ///
778    /// This routine only includes the overall match span. To get access to the
779    /// individual spans of each capturing group, use [`PikeVM::captures`].
780    ///
781    /// # Example
782    ///
783    /// Leftmost first match semantics corresponds to the match with the
784    /// smallest starting offset, but where the end offset is determined by
785    /// preferring earlier branches in the original regular expression. For
786    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
787    /// will match `Samwise` in `Samwise`.
788    ///
789    /// Generally speaking, the "leftmost first" match is how most backtracking
790    /// regular expressions tend to work. This is in contrast to POSIX-style
791    /// regular expressions that yield "leftmost longest" matches. Namely,
792    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
793    /// leftmost longest semantics. (This crate does not currently support
794    /// leftmost longest semantics.)
795    ///
796    /// ```
797    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
798    ///
799    /// let re = PikeVM::new("foo[0-9]+")?;
800    /// let mut cache = re.create_cache();
801    /// let expected = Match::must(0, 0..8);
802    /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
803    ///
804    /// // Even though a match is found after reading the first byte (`a`),
805    /// // the leftmost first match semantics demand that we find the earliest
806    /// // match that prefers earlier parts of the pattern over later parts.
807    /// let re = PikeVM::new("abc|a")?;
808    /// let mut cache = re.create_cache();
809    /// let expected = Match::must(0, 0..3);
810    /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
811    ///
812    /// # Ok::<(), Box<dyn std::error::Error>>(())
813    /// ```
814    #[inline]
815    pub fn find<'h, I: Into<Input<'h>>>(
816        &self,
817        cache: &mut Cache,
818        input: I,
819    ) -> Option<Match> {
820        let input = input.into();
821        if self.get_nfa().pattern_len() == 1 {
822            let mut slots = [None, None];
823            let pid = self.search_slots(cache, &input, &mut slots)?;
824            let start = slots[0]?.get();
825            let end = slots[1]?.get();
826            return Some(Match::new(pid, Span { start, end }));
827        }
828        let ginfo = self.get_nfa().group_info();
829        let slots_len = ginfo.implicit_slot_len();
830        let mut slots = vec![None; slots_len];
831        let pid = self.search_slots(cache, &input, &mut slots)?;
832        let start = slots[pid.as_usize() * 2]?.get();
833        let end = slots[pid.as_usize() * 2 + 1]?.get();
834        Some(Match::new(pid, Span { start, end }))
835    }
836
837    /// Executes a leftmost forward search and writes the spans of capturing
838    /// groups that participated in a match into the provided [`Captures`]
839    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
840    /// to return `false`.
841    ///
842    /// # Example
843    ///
844    /// ```
845    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
846    ///
847    /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
848    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
849    ///
850    /// re.captures(&mut cache, "2010-03-14", &mut caps);
851    /// assert!(caps.is_match());
852    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
853    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
854    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
855    ///
856    /// # Ok::<(), Box<dyn std::error::Error>>(())
857    /// ```
858    #[inline]
859    pub fn captures<'h, I: Into<Input<'h>>>(
860        &self,
861        cache: &mut Cache,
862        input: I,
863        caps: &mut Captures,
864    ) {
865        self.search(cache, &input.into(), caps)
866    }
867
868    /// Returns an iterator over all non-overlapping leftmost matches in the
869    /// given bytes. If no match exists, then the iterator yields no elements.
870    ///
871    /// # Example
872    ///
873    /// ```
874    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
875    ///
876    /// let re = PikeVM::new("foo[0-9]+")?;
877    /// let mut cache = re.create_cache();
878    ///
879    /// let text = "foo1 foo12 foo123";
880    /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
881    /// assert_eq!(matches, vec![
882    ///     Match::must(0, 0..4),
883    ///     Match::must(0, 5..10),
884    ///     Match::must(0, 11..17),
885    /// ]);
886    /// # Ok::<(), Box<dyn std::error::Error>>(())
887    /// ```
888    #[inline]
889    pub fn find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
890        &'r self,
891        cache: &'c mut Cache,
892        input: I,
893    ) -> FindMatches<'r, 'c, 'h> {
894        let caps = Captures::matches(self.get_nfa().group_info().clone());
895        let it = iter::Searcher::new(input.into());
896        FindMatches { re: self, cache, caps, it }
897    }
898
899    /// Returns an iterator over all non-overlapping `Captures` values. If no
900    /// match exists, then the iterator yields no elements.
901    ///
902    /// This yields the same matches as [`PikeVM::find_iter`], but it includes
903    /// the spans of all capturing groups that participate in each match.
904    ///
905    /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
906    /// how to correctly iterate over all matches in a haystack while avoiding
907    /// the creation of a new `Captures` value for every match. (Which you are
908    /// forced to do with an `Iterator`.)
909    ///
910    /// # Example
911    ///
912    /// ```
913    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
914    ///
915    /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
916    /// let mut cache = re.create_cache();
917    ///
918    /// let text = "foo1 foo12 foo123";
919    /// let matches: Vec<Span> = re
920    ///     .captures_iter(&mut cache, text)
921    ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
922    ///     .map(|caps| caps.get_group_by_name("numbers").unwrap())
923    ///     .collect();
924    /// assert_eq!(matches, vec![
925    ///     Span::from(3..4),
926    ///     Span::from(8..10),
927    ///     Span::from(14..17),
928    /// ]);
929    /// # Ok::<(), Box<dyn std::error::Error>>(())
930    /// ```
931    #[inline]
932    pub fn captures_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
933        &'r self,
934        cache: &'c mut Cache,
935        input: I,
936    ) -> CapturesMatches<'r, 'c, 'h> {
937        let caps = self.create_captures();
938        let it = iter::Searcher::new(input.into());
939        CapturesMatches { re: self, cache, caps, it }
940    }
941}
942
943impl PikeVM {
944    /// Executes a leftmost forward search and writes the spans of capturing
945    /// groups that participated in a match into the provided [`Captures`]
946    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
947    /// to return `false`.
948    ///
949    /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input`
950    /// instead of an `Into<Input>`.
951    ///
952    /// # Example: specific pattern search
953    ///
954    /// This example shows how to build a multi-PikeVM that permits searching
955    /// for specific patterns.
956    ///
957    /// ```
958    /// use regex_automata::{
959    ///     nfa::thompson::pikevm::PikeVM,
960    ///     Anchored, Match, PatternID, Input,
961    /// };
962    ///
963    /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
964    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
965    /// let haystack = "foo123";
966    ///
967    /// // Since we are using the default leftmost-first match and both
968    /// // patterns match at the same starting position, only the first pattern
969    /// // will be returned in this case when doing a search for any of the
970    /// // patterns.
971    /// let expected = Some(Match::must(0, 0..6));
972    /// re.search(&mut cache, &Input::new(haystack), &mut caps);
973    /// assert_eq!(expected, caps.get_match());
974    ///
975    /// // But if we want to check whether some other pattern matches, then we
976    /// // can provide its pattern ID.
977    /// let expected = Some(Match::must(1, 0..6));
978    /// let input = Input::new(haystack)
979    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
980    /// re.search(&mut cache, &input, &mut caps);
981    /// assert_eq!(expected, caps.get_match());
982    ///
983    /// # Ok::<(), Box<dyn std::error::Error>>(())
984    /// ```
985    ///
986    /// # Example: specifying the bounds of a search
987    ///
988    /// This example shows how providing the bounds of a search can produce
989    /// different results than simply sub-slicing the haystack.
990    ///
991    /// ```
992    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
993    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
994    ///
995    /// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
996    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
997    /// let haystack = "foo123bar";
998    ///
999    /// // Since we sub-slice the haystack, the search doesn't know about
1000    /// // the larger context and assumes that `123` is surrounded by word
1001    /// // boundaries. And of course, the match position is reported relative
1002    /// // to the sub-slice as well, which means we get `0..3` instead of
1003    /// // `3..6`.
1004    /// let expected = Some(Match::must(0, 0..3));
1005    /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps);
1006    /// assert_eq!(expected, caps.get_match());
1007    ///
1008    /// // But if we provide the bounds of the search within the context of the
1009    /// // entire haystack, then the search can take the surrounding context
1010    /// // into account. (And if we did find a match, it would be reported
1011    /// // as a valid offset into `haystack` instead of its sub-slice.)
1012    /// let expected = None;
1013    /// let input = Input::new(haystack).range(3..6);
1014    /// re.search(&mut cache, &input, &mut caps);
1015    /// assert_eq!(expected, caps.get_match());
1016    ///
1017    /// # Ok::<(), Box<dyn std::error::Error>>(())
1018    /// ```
1019    #[inline]
1020    pub fn search(
1021        &self,
1022        cache: &mut Cache,
1023        input: &Input<'_>,
1024        caps: &mut Captures,
1025    ) {
1026        caps.set_pattern(None);
1027        let pid = self.search_slots(cache, input, caps.slots_mut());
1028        caps.set_pattern(pid);
1029    }
1030
1031    /// Executes a leftmost forward search and writes the spans of capturing
1032    /// groups that participated in a match into the provided `slots`, and
1033    /// returns the matching pattern ID. The contents of the slots for patterns
1034    /// other than the matching pattern are unspecified. If no match was found,
1035    /// then `None` is returned and the contents of `slots` is unspecified.
1036    ///
1037    /// This is like [`PikeVM::search`], but it accepts a raw slots slice
1038    /// instead of a `Captures` value. This is useful in contexts where you
1039    /// don't want or need to allocate a `Captures`.
1040    ///
1041    /// It is legal to pass _any_ number of slots to this routine. If the regex
1042    /// engine would otherwise write a slot offset that doesn't fit in the
1043    /// provided slice, then it is simply skipped. In general though, there are
1044    /// usually three slice lengths you might want to use:
1045    ///
1046    /// * An empty slice, if you only care about which pattern matched.
1047    /// * A slice with
1048    /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len)
1049    /// slots, if you only care about the overall match spans for each matching
1050    /// pattern.
1051    /// * A slice with
1052    /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
1053    /// permits recording match offsets for every capturing group in every
1054    /// pattern.
1055    ///
1056    /// # Example
1057    ///
1058    /// This example shows how to find the overall match offsets in a
1059    /// multi-pattern search without allocating a `Captures` value. Indeed, we
1060    /// can put our slots right on the stack.
1061    ///
1062    /// ```
1063    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1064    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input};
1065    ///
1066    /// let re = PikeVM::new_many(&[
1067    ///     r"\pL+",
1068    ///     r"\d+",
1069    /// ])?;
1070    /// let mut cache = re.create_cache();
1071    /// let input = Input::new("!@#123");
1072    ///
1073    /// // We only care about the overall match offsets here, so we just
1074    /// // allocate two slots for each pattern. Each slot records the start
1075    /// // and end of the match.
1076    /// let mut slots = [None; 4];
1077    /// let pid = re.search_slots(&mut cache, &input, &mut slots);
1078    /// assert_eq!(Some(PatternID::must(1)), pid);
1079    ///
1080    /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
1081    /// // See 'GroupInfo' for more details on the mapping between groups and
1082    /// // slot indices.
1083    /// let slot_start = pid.unwrap().as_usize() * 2;
1084    /// let slot_end = slot_start + 1;
1085    /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
1086    /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
1087    ///
1088    /// # Ok::<(), Box<dyn std::error::Error>>(())
1089    /// ```
1090    #[inline]
1091    pub fn search_slots(
1092        &self,
1093        cache: &mut Cache,
1094        input: &Input<'_>,
1095        slots: &mut [Option<NonMaxUsize>],
1096    ) -> Option<PatternID> {
1097        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1098        if !utf8empty {
1099            let hm = self.search_slots_imp(cache, input, slots)?;
1100            return Some(hm.pattern());
1101        }
1102        // There is an unfortunate special case where if the regex can
1103        // match the empty string and UTF-8 mode is enabled, the search
1104        // implementation requires that the slots have at least as much space
1105        // to report the bounds of any match. This is so zero-width matches
1106        // that split a codepoint can be filtered out.
1107        //
1108        // Note that if utf8empty is true, we specialize the case for when
1109        // the number of patterns is 1. In that case, we can just use a stack
1110        // allocation. Otherwise we resort to a heap allocation, which we
1111        // convince ourselves we're fine with due to the pathological nature of
1112        // this case.
1113        let min = self.get_nfa().group_info().implicit_slot_len();
1114        if slots.len() >= min {
1115            let hm = self.search_slots_imp(cache, input, slots)?;
1116            return Some(hm.pattern());
1117        }
1118        if self.get_nfa().pattern_len() == 1 {
1119            let mut enough = [None, None];
1120            let got = self.search_slots_imp(cache, input, &mut enough);
1121            // This is OK because we know `enough` is strictly bigger than
1122            // `slots`, otherwise this special case isn't reached.
1123            slots.copy_from_slice(&enough[..slots.len()]);
1124            return got.map(|hm| hm.pattern());
1125        }
1126        let mut enough = vec![None; min];
1127        let got = self.search_slots_imp(cache, input, &mut enough);
1128        // This is OK because we know `enough` is strictly bigger than `slots`,
1129        // otherwise this special case isn't reached.
1130        slots.copy_from_slice(&enough[..slots.len()]);
1131        got.map(|hm| hm.pattern())
1132    }
1133
1134    /// This is the actual implementation of `search_slots_imp` that
1135    /// doesn't account for the special case when 1) the NFA has UTF-8 mode
1136    /// enabled, 2) the NFA can match the empty string and 3) the caller has
1137    /// provided an insufficient number of slots to record match offsets.
1138    #[inline(never)]
1139    fn search_slots_imp(
1140        &self,
1141        cache: &mut Cache,
1142        input: &Input<'_>,
1143        slots: &mut [Option<NonMaxUsize>],
1144    ) -> Option<HalfMatch> {
1145        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1146        let hm = match self.search_imp(cache, input, slots) {
1147            None => return None,
1148            Some(hm) if !utf8empty => return Some(hm),
1149            Some(hm) => hm,
1150        };
1151        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
1152            Ok(self
1153                .search_imp(cache, input, slots)
1154                .map(|hm| (hm, hm.offset())))
1155        })
1156        // OK because the PikeVM never errors.
1157        .unwrap()
1158    }
1159
1160    /// Writes the set of patterns that match anywhere in the given search
1161    /// configuration to `patset`. If multiple patterns match at the same
1162    /// position and this `PikeVM` was configured with [`MatchKind::All`]
1163    /// semantics, then all matching patterns are written to the given set.
1164    ///
1165    /// Unless all of the patterns in this `PikeVM` are anchored, then
1166    /// generally speaking, this will visit every byte in the haystack.
1167    ///
1168    /// This search routine *does not* clear the pattern set. This gives some
1169    /// flexibility to the caller (e.g., running multiple searches with the
1170    /// same pattern set), but does make the API bug-prone if you're reusing
1171    /// the same pattern set for multiple searches but intended them to be
1172    /// independent.
1173    ///
1174    /// If a pattern ID matched but the given `PatternSet` does not have
1175    /// sufficient capacity to store it, then it is not inserted and silently
1176    /// dropped.
1177    ///
1178    /// # Example
1179    ///
1180    /// This example shows how to find all matching patterns in a haystack,
1181    /// even when some patterns match at the same position as other patterns.
1182    ///
1183    /// ```
1184    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1185    /// use regex_automata::{
1186    ///     nfa::thompson::pikevm::PikeVM,
1187    ///     Input, MatchKind, PatternSet,
1188    /// };
1189    ///
1190    /// let patterns = &[
1191    ///     r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
1192    /// ];
1193    /// let re = PikeVM::builder()
1194    ///     .configure(PikeVM::config().match_kind(MatchKind::All))
1195    ///     .build_many(patterns)?;
1196    /// let mut cache = re.create_cache();
1197    ///
1198    /// let input = Input::new("foobar");
1199    /// let mut patset = PatternSet::new(re.pattern_len());
1200    /// re.which_overlapping_matches(&mut cache, &input, &mut patset);
1201    /// let expected = vec![0, 2, 3, 4, 6];
1202    /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
1203    /// assert_eq!(expected, got);
1204    ///
1205    /// # Ok::<(), Box<dyn std::error::Error>>(())
1206    /// ```
1207    #[inline]
1208    pub fn which_overlapping_matches(
1209        &self,
1210        cache: &mut Cache,
1211        input: &Input<'_>,
1212        patset: &mut PatternSet,
1213    ) {
1214        self.which_overlapping_imp(cache, input, patset)
1215    }
1216}
1217
1218impl PikeVM {
1219    /// The implementation of standard leftmost search.
1220    ///
1221    /// Capturing group spans are written to `slots`, but only if requested.
1222    /// `slots` can be any length. Any slot in the NFA that is activated but
1223    /// which is out of bounds for the given `slots` is ignored.
1224    fn search_imp(
1225        &self,
1226        cache: &mut Cache,
1227        input: &Input<'_>,
1228        slots: &mut [Option<NonMaxUsize>],
1229    ) -> Option<HalfMatch> {
1230        cache.setup_search(slots.len());
1231        if input.is_done() {
1232            return None;
1233        }
1234        // Why do we even care about this? Well, in our 'Captures'
1235        // representation, we use usize::MAX as a sentinel to indicate "no
1236        // match." This isn't problematic so long as our haystack doesn't have
1237        // a maximal length. Byte slices are guaranteed by Rust to have a
1238        // length that fits into isize, and so this assert should always pass.
1239        // But we put it here to make our assumption explicit.
1240        assert!(
1241            input.haystack().len() < core::usize::MAX,
1242            "byte slice lengths must be less than usize MAX",
1243        );
1244        instrument!(|c| c.reset(&self.nfa));
1245
1246        // Whether we want to visit all match states instead of emulating the
1247        // 'leftmost' semantics of typical backtracking regex engines.
1248        let allmatches =
1249            self.config.get_match_kind().continue_past_first_match();
1250        let (anchored, start_id) = match self.start_config(input) {
1251            None => return None,
1252            Some(config) => config,
1253        };
1254
1255        let pre =
1256            if anchored { None } else { self.get_config().get_prefilter() };
1257        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1258        let mut hm = None;
1259        // Yes, our search doesn't end at input.end(), but includes it. This
1260        // is necessary because matches are delayed by one byte, just like
1261        // how the DFA engines work. The delay is used to handle look-behind
1262        // assertions. In the case of the PikeVM, the delay is implemented
1263        // by not considering a match to exist until it is visited in
1264        // 'steps'. Technically, we know a match exists in the previous
1265        // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
1266        // determinization. We don't mark a DFA state as a match state if it
1267        // contains an NFA match state, but rather, whether the DFA state was
1268        // generated by a transition from a DFA state that contains an NFA
1269        // match state.)
1270        let mut at = input.start();
1271        while at <= input.end() {
1272            // If we have no states left to visit, then there are some cases
1273            // where we know we can quit early or even skip ahead.
1274            if curr.set.is_empty() {
1275                // We have a match and we haven't been instructed to continue
1276                // on even after finding a match, so we can quit.
1277                if hm.is_some() && !allmatches {
1278                    break;
1279                }
1280                // If we're running an anchored search and we've advanced
1281                // beyond the start position with no other states to try, then
1282                // we will never observe a match and thus can stop.
1283                if anchored && at > input.start() {
1284                    break;
1285                }
1286                // If there no states left to explore at this position and we
1287                // know we can't terminate early, then we are effectively at
1288                // the starting state of the NFA. If we fell through here,
1289                // we'd end up adding our '(?s-u:.)*?' prefix and it would be
1290                // the only thing in 'curr'. So we might as well just skip
1291                // ahead until we find something that we know might advance us
1292                // forward.
1293                if let Some(ref pre) = pre {
1294                    let span = Span::from(at..input.end());
1295                    match pre.find(input.haystack(), span) {
1296                        None => break,
1297                        Some(ref span) => at = span.start,
1298                    }
1299                }
1300            }
1301            // Instead of using the NFA's unanchored start state, we actually
1302            // always use its anchored starting state. As a result, when doing
1303            // an unanchored search, we need to simulate our own '(?s-u:.)*?'
1304            // prefix, to permit a match to appear anywhere.
1305            //
1306            // Now, we don't *have* to do things this way. We could use the
1307            // NFA's unanchored starting state and do one 'epsilon_closure'
1308            // call from that starting state before the main loop here. And
1309            // that is just as correct. However, it turns out to be slower
1310            // than our approach here because it slightly increases the cost
1311            // of processing each byte by requiring us to visit more NFA
1312            // states to deal with the additional NFA states in the unanchored
1313            // prefix. By simulating it explicitly here, we lower those costs
1314            // substantially. The cost is itself small, but it adds up for
1315            // large haystacks.
1316            //
1317            // In order to simulate the '(?s-u:.)*?' prefix---which is not
1318            // greedy---we are careful not to perform an epsilon closure on
1319            // the start state if we already have a match. Namely, if we
1320            // did otherwise, we would never reach a terminating condition
1321            // because there would always be additional states to process.
1322            // In effect, the exclusion of running 'epsilon_closure' when
1323            // we have a match corresponds to the "dead" states we have in
1324            // our DFA regex engines. Namely, in a DFA, match states merely
1325            // instruct the search execution to record the current offset as
1326            // the most recently seen match. It is the dead state that actually
1327            // indicates when to stop the search (other than EOF or quit
1328            // states).
1329            //
1330            // However, when 'allmatches' is true, the caller has asked us to
1331            // leave in every possible match state. This tends not to make a
1332            // whole lot of sense in unanchored searches, because it means the
1333            // search really cannot terminate until EOF. And often, in that
1334            // case, you wind up skipping over a bunch of matches and are left
1335            // with the "last" match. Arguably, it just doesn't make a lot of
1336            // sense to run a 'leftmost' search (which is what this routine is)
1337            // with 'allmatches' set to true. But the DFAs support it and this
1338            // matches their behavior. (Generally, 'allmatches' is useful for
1339            // overlapping searches or leftmost anchored searches to find the
1340            // longest possible match by ignoring match priority.)
1341            //
1342            // Additionally, when we're running an anchored search, this
1343            // epsilon closure should only be computed at the beginning of the
1344            // search. If we re-computed it at every position, we would be
1345            // simulating an unanchored search when we were tasked to perform
1346            // an anchored search.
1347            if (!hm.is_some() || allmatches)
1348                && (!anchored || at == input.start())
1349            {
1350                // Since we are adding to the 'curr' active states and since
1351                // this is for the start ID, we use a slots slice that is
1352                // guaranteed to have the right length but where every element
1353                // is absent. This is exactly what we want, because this
1354                // epsilon closure is responsible for simulating an unanchored
1355                // '(?s:.)*?' prefix. It is specifically outside of any
1356                // capturing groups, and thus, using slots that are always
1357                // absent is correct.
1358                //
1359                // Note though that we can't just use '&mut []' here, since
1360                // this epsilon closure may traverse through 'Captures' epsilon
1361                // transitions, and thus must be able to write offsets to the
1362                // slots given which are later copied to slot values in 'curr'.
1363                let slots = next.slot_table.all_absent();
1364                self.epsilon_closure(stack, slots, curr, input, at, start_id);
1365            }
1366            if let Some(pid) = self.nexts(stack, curr, next, input, at, slots)
1367            {
1368                hm = Some(HalfMatch::new(pid, at));
1369            }
1370            // Unless the caller asked us to return early, we need to mush on
1371            // to see if we can extend our match. (But note that 'nexts' will
1372            // quit right after seeing a match when match_kind==LeftmostFirst,
1373            // as is consistent with leftmost-first match priority.)
1374            if input.get_earliest() && hm.is_some() {
1375                break;
1376            }
1377            core::mem::swap(curr, next);
1378            next.set.clear();
1379            at += 1;
1380        }
1381        instrument!(|c| c.eprint(&self.nfa));
1382        hm
1383    }
1384
1385    /// The implementation for the 'which_overlapping_matches' API. Basically,
1386    /// we do a single scan through the entire haystack (unless our regex
1387    /// or search is anchored) and record every pattern that matched. In
1388    /// particular, when MatchKind::All is used, this supports overlapping
1389    /// matches. So if we have the regexes 'sam' and 'samwise', they will
1390    /// *both* be reported in the pattern set when searching the haystack
1391    /// 'samwise'.
1392    fn which_overlapping_imp(
1393        &self,
1394        cache: &mut Cache,
1395        input: &Input<'_>,
1396        patset: &mut PatternSet,
1397    ) {
1398        // NOTE: This is effectively a copy of 'search_imp' above, but with no
1399        // captures support and instead writes patterns that matched directly
1400        // to 'patset'. See that routine for better commentary about what's
1401        // going on in this routine. We probably could unify the routines using
1402        // generics or more helper routines, but I'm not sure it's worth it.
1403        //
1404        // NOTE: We somewhat go out of our way here to support things like
1405        // 'input.get_earliest()' and 'leftmost-first' match semantics. Neither
1406        // of those seem particularly relevant to this routine, but they are
1407        // both supported by the DFA analogs of this routine by construction
1408        // and composition, so it seems like good sense to have the PikeVM
1409        // match that behavior.
1410
1411        cache.setup_search(0);
1412        if input.is_done() {
1413            return;
1414        }
1415        assert!(
1416            input.haystack().len() < core::usize::MAX,
1417            "byte slice lengths must be less than usize MAX",
1418        );
1419        instrument!(|c| c.reset(&self.nfa));
1420
1421        let allmatches =
1422            self.config.get_match_kind().continue_past_first_match();
1423        let (anchored, start_id) = match self.start_config(input) {
1424            None => return,
1425            Some(config) => config,
1426        };
1427
1428        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
1429        for at in input.start()..=input.end() {
1430            let any_matches = !patset.is_empty();
1431            if curr.set.is_empty() {
1432                if any_matches && !allmatches {
1433                    break;
1434                }
1435                if anchored && at > input.start() {
1436                    break;
1437                }
1438            }
1439            if !any_matches || allmatches {
1440                let slots = &mut [];
1441                self.epsilon_closure(stack, slots, curr, input, at, start_id);
1442            }
1443            self.nexts_overlapping(stack, curr, next, input, at, patset);
1444            // If we found a match and filled our set, then there is no more
1445            // additional info that we can provide. Thus, we can quit. We also
1446            // quit if the caller asked us to stop at the earliest point that
1447            // we know a match exists.
1448            if patset.is_full() || input.get_earliest() {
1449                break;
1450            }
1451            core::mem::swap(curr, next);
1452            next.set.clear();
1453        }
1454        instrument!(|c| c.eprint(&self.nfa));
1455    }
1456
1457    /// Process the active states in 'curr' to find the states (written to
1458    /// 'next') we should process for the next byte in the haystack.
1459    ///
1460    /// 'stack' is used to perform a depth first traversal of the NFA when
1461    /// computing an epsilon closure.
1462    ///
1463    /// When a match is found, the slots for that match state (in 'curr') are
1464    /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
1465    /// stops (unless the PikeVM was configured with MatchKind::All semantics).
1466    #[cfg_attr(feature = "perf-inline", inline(always))]
1467    fn nexts(
1468        &self,
1469        stack: &mut Vec<FollowEpsilon>,
1470        curr: &mut ActiveStates,
1471        next: &mut ActiveStates,
1472        input: &Input<'_>,
1473        at: usize,
1474        slots: &mut [Option<NonMaxUsize>],
1475    ) -> Option<PatternID> {
1476        instrument!(|c| c.record_state_set(&curr.set));
1477        let mut pid = None;
1478        let ActiveStates { ref set, ref mut slot_table } = *curr;
1479        for sid in set.iter() {
1480            pid = match self.next(stack, slot_table, next, input, at, sid) {
1481                None => continue,
1482                Some(pid) => Some(pid),
1483            };
1484            slots.copy_from_slice(slot_table.for_state(sid));
1485            if !self.config.get_match_kind().continue_past_first_match() {
1486                break;
1487            }
1488        }
1489        pid
1490    }
1491
1492    /// Like 'nexts', but for the overlapping case. This doesn't write any
1493    /// slots, and instead just writes which pattern matched in 'patset'.
1494    #[cfg_attr(feature = "perf-inline", inline(always))]
1495    fn nexts_overlapping(
1496        &self,
1497        stack: &mut Vec<FollowEpsilon>,
1498        curr: &mut ActiveStates,
1499        next: &mut ActiveStates,
1500        input: &Input<'_>,
1501        at: usize,
1502        patset: &mut PatternSet,
1503    ) {
1504        instrument!(|c| c.record_state_set(&curr.set));
1505        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
1506        let ActiveStates { ref set, ref mut slot_table } = *curr;
1507        for sid in set.iter() {
1508            let pid = match self.next(stack, slot_table, next, input, at, sid)
1509            {
1510                None => continue,
1511                Some(pid) => pid,
1512            };
1513            // This handles the case of finding a zero-width match that splits
1514            // a codepoint. Namely, if we're in UTF-8 mode AND we know we can
1515            // match the empty string, then the only valid way of getting to
1516            // this point with an offset that splits a codepoint is when we
1517            // have an empty match. Such matches, in UTF-8 mode, must not be
1518            // reported. So we just skip them here and pretend as if we did
1519            // not see a match.
1520            if utf8empty && !input.is_char_boundary(at) {
1521                continue;
1522            }
1523            let _ = patset.try_insert(pid);
1524            if !self.config.get_match_kind().continue_past_first_match() {
1525                break;
1526            }
1527        }
1528    }
1529
1530    /// Starting from 'sid', if the position 'at' in the 'input' haystack has a
1531    /// transition defined out of 'sid', then add the state transitioned to and
1532    /// its epsilon closure to the 'next' set of states to explore.
1533    ///
1534    /// 'stack' is used by the epsilon closure computation to perform a depth
1535    /// first traversal of the NFA.
1536    ///
1537    /// 'curr_slot_table' should be the table of slots for the current set of
1538    /// states being explored. If there is a transition out of 'sid', then
1539    /// sid's row in the slot table is used to perform the epsilon closure.
1540    #[cfg_attr(feature = "perf-inline", inline(always))]
1541    fn next(
1542        &self,
1543        stack: &mut Vec<FollowEpsilon>,
1544        curr_slot_table: &mut SlotTable,
1545        next: &mut ActiveStates,
1546        input: &Input<'_>,
1547        at: usize,
1548        sid: StateID,
1549    ) -> Option<PatternID> {
1550        instrument!(|c| c.record_step(sid));
1551        match *self.nfa.state(sid) {
1552            State::Fail
1553            | State::Look { .. }
1554            | State::Union { .. }
1555            | State::BinaryUnion { .. }
1556            | State::Capture { .. } => None,
1557            State::ByteRange { ref trans } => {
1558                if trans.matches(input.haystack(), at) {
1559                    let slots = curr_slot_table.for_state(sid);
1560                    // OK because 'at <= haystack.len() < usize::MAX', so
1561                    // adding 1 will never wrap.
1562                    let at = at.wrapping_add(1);
1563                    self.epsilon_closure(
1564                        stack, slots, next, input, at, trans.next,
1565                    );
1566                }
1567                None
1568            }
1569            State::Sparse(ref sparse) => {
1570                if let Some(next_sid) = sparse.matches(input.haystack(), at) {
1571                    let slots = curr_slot_table.for_state(sid);
1572                    // OK because 'at <= haystack.len() < usize::MAX', so
1573                    // adding 1 will never wrap.
1574                    let at = at.wrapping_add(1);
1575                    self.epsilon_closure(
1576                        stack, slots, next, input, at, next_sid,
1577                    );
1578                }
1579                None
1580            }
1581            State::Dense(ref dense) => {
1582                if let Some(next_sid) = dense.matches(input.haystack(), at) {
1583                    let slots = curr_slot_table.for_state(sid);
1584                    // OK because 'at <= haystack.len() < usize::MAX', so
1585                    // adding 1 will never wrap.
1586                    let at = at.wrapping_add(1);
1587                    self.epsilon_closure(
1588                        stack, slots, next, input, at, next_sid,
1589                    );
1590                }
1591                None
1592            }
1593            State::Match { pattern_id } => Some(pattern_id),
1594        }
1595    }
1596
1597    /// Compute the epsilon closure of 'sid', writing the closure into 'next'
1598    /// while copying slot values from 'curr_slots' into corresponding states
1599    /// in 'next'. 'curr_slots' should be the slot values corresponding to
1600    /// 'sid'.
1601    ///
1602    /// The given 'stack' is used to perform a depth first traversal of the
1603    /// NFA by recursively following all epsilon transitions out of 'sid'.
1604    /// Conditional epsilon transitions are followed if and only if they are
1605    /// satisfied for the position 'at' in the 'input' haystack.
1606    ///
1607    /// While this routine may write to 'curr_slots', once it returns, any
1608    /// writes are undone and the original values (even if absent) are
1609    /// restored.
1610    #[cfg_attr(feature = "perf-inline", inline(always))]
1611    fn epsilon_closure(
1612        &self,
1613        stack: &mut Vec<FollowEpsilon>,
1614        curr_slots: &mut [Option<NonMaxUsize>],
1615        next: &mut ActiveStates,
1616        input: &Input<'_>,
1617        at: usize,
1618        sid: StateID,
1619    ) {
1620        instrument!(|c| {
1621            c.record_closure(sid);
1622            c.record_stack_push(sid);
1623        });
1624        stack.push(FollowEpsilon::Explore(sid));
1625        while let Some(frame) = stack.pop() {
1626            match frame {
1627                FollowEpsilon::RestoreCapture { slot, offset: pos } => {
1628                    curr_slots[slot] = pos;
1629                }
1630                FollowEpsilon::Explore(sid) => {
1631                    self.epsilon_closure_explore(
1632                        stack, curr_slots, next, input, at, sid,
1633                    );
1634                }
1635            }
1636        }
1637    }
1638
1639    /// Explore all of the epsilon transitions out of 'sid'. This is mostly
1640    /// split out from 'epsilon_closure' in order to clearly delineate
1641    /// the actual work of computing an epsilon closure from the stack
1642    /// book-keeping.
1643    ///
1644    /// This will push any additional explorations needed on to 'stack'.
1645    ///
1646    /// 'curr_slots' should refer to the slots for the currently active NFA
1647    /// state. That is, the current state we are stepping through. These
1648    /// slots are mutated in place as new 'Captures' states are traversed
1649    /// during epsilon closure, but the slots are restored to their original
1650    /// values once the full epsilon closure is completed. The ultimate use of
1651    /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that
1652    /// the capturing group spans are forwarded from the currently active state
1653    /// to the next.
1654    ///
1655    /// 'next' refers to the next set of active states. Computing an epsilon
1656    /// closure may increase the next set of active states.
1657    ///
1658    /// 'input' refers to the caller's input configuration and 'at' refers to
1659    /// the current position in the haystack. These are used to check whether
1660    /// conditional epsilon transitions (like look-around) are satisfied at
1661    /// the current position. If they aren't, then the epsilon closure won't
1662    /// include them.
1663    #[cfg_attr(feature = "perf-inline", inline(always))]
1664    fn epsilon_closure_explore(
1665        &self,
1666        stack: &mut Vec<FollowEpsilon>,
1667        curr_slots: &mut [Option<NonMaxUsize>],
1668        next: &mut ActiveStates,
1669        input: &Input<'_>,
1670        at: usize,
1671        mut sid: StateID,
1672    ) {
1673        // We can avoid pushing some state IDs on to our stack in precisely
1674        // the cases where a 'push(x)' would be immediately followed by a 'x
1675        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
1676        // to be the next state ID we want to explore once we're done with
1677        // our initial exploration. In practice, this avoids a lot of stack
1678        // thrashing.
1679        loop {
1680            instrument!(|c| c.record_set_insert(sid));
1681            // Record this state as part of our next set of active states. If
1682            // we've already explored it, then no need to do it again.
1683            if !next.set.insert(sid) {
1684                return;
1685            }
1686            match *self.nfa.state(sid) {
1687                State::Fail
1688                | State::Match { .. }
1689                | State::ByteRange { .. }
1690                | State::Sparse { .. }
1691                | State::Dense { .. } => {
1692                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
1693                    return;
1694                }
1695                State::Look { look, next } => {
1696                    // OK because we don't permit building a searcher with a
1697                    // Unicode word boundary if the requisite Unicode data is
1698                    // unavailable.
1699                    if !self.nfa.look_matcher().matches_inline(
1700                        look,
1701                        input.haystack(),
1702                        at,
1703                    ) {
1704                        return;
1705                    }
1706                    sid = next;
1707                }
1708                State::Union { ref alternates } => {
1709                    sid = match alternates.get(0) {
1710                        None => return,
1711                        Some(&sid) => sid,
1712                    };
1713                    instrument!(|c| {
1714                        for &alt in &alternates[1..] {
1715                            c.record_stack_push(alt);
1716                        }
1717                    });
1718                    stack.extend(
1719                        alternates[1..]
1720                            .iter()
1721                            .copied()
1722                            .rev()
1723                            .map(FollowEpsilon::Explore),
1724                    );
1725                }
1726                State::BinaryUnion { alt1, alt2 } => {
1727                    sid = alt1;
1728                    instrument!(|c| c.record_stack_push(sid));
1729                    stack.push(FollowEpsilon::Explore(alt2));
1730                }
1731                State::Capture { next, slot, .. } => {
1732                    // There's no need to do anything with slots that
1733                    // ultimately won't be copied into the caller-provided
1734                    // 'Captures' value. So we just skip dealing with them at
1735                    // all.
1736                    if slot.as_usize() < curr_slots.len() {
1737                        instrument!(|c| c.record_stack_push(sid));
1738                        stack.push(FollowEpsilon::RestoreCapture {
1739                            slot,
1740                            offset: curr_slots[slot],
1741                        });
1742                        // OK because length of a slice must fit into an isize.
1743                        curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap());
1744                    }
1745                    sid = next;
1746                }
1747            }
1748        }
1749    }
1750
1751    /// Return the starting configuration of a PikeVM search.
1752    ///
1753    /// The "start config" is basically whether the search should be anchored
1754    /// or not and the NFA state ID at which to begin the search. The state ID
1755    /// returned always corresponds to an anchored starting state even when the
1756    /// search is unanchored. This is because the PikeVM search loop deals with
1757    /// unanchored searches with an explicit epsilon closure out of the start
1758    /// state.
1759    ///
1760    /// This routine accounts for both the caller's `Input` configuration
1761    /// and the pattern itself. For example, even if the caller asks for an
1762    /// unanchored search, if the pattern itself is anchored, then this will
1763    /// always return 'true' because implementing an unanchored search in that
1764    /// case would be incorrect.
1765    ///
1766    /// Similarly, if the caller requests an anchored search for a particular
1767    /// pattern, then the starting state ID returned will reflect that.
1768    ///
1769    /// If a pattern ID is given in the input configuration that is not in
1770    /// this regex, then `None` is returned.
1771    fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> {
1772        match input.get_anchored() {
1773            // Only way we're unanchored is if both the caller asked for an
1774            // unanchored search *and* the pattern is itself not anchored.
1775            Anchored::No => Some((
1776                self.nfa.is_always_start_anchored(),
1777                self.nfa.start_anchored(),
1778            )),
1779            Anchored::Yes => Some((true, self.nfa.start_anchored())),
1780            Anchored::Pattern(pid) => {
1781                Some((true, self.nfa.start_pattern(pid)?))
1782            }
1783        }
1784    }
1785}
1786
1787/// An iterator over all non-overlapping matches for a particular search.
1788///
1789/// The iterator yields a [`Match`] value until no more matches could be found.
1790///
1791/// The lifetime parameters are as follows:
1792///
1793/// * `'r` represents the lifetime of the PikeVM.
1794/// * `'c` represents the lifetime of the PikeVM's cache.
1795/// * `'h` represents the lifetime of the haystack being searched.
1796///
1797/// This iterator can be created with the [`PikeVM::find_iter`] method.
1798#[derive(Debug)]
1799pub struct FindMatches<'r, 'c, 'h> {
1800    re: &'r PikeVM,
1801    cache: &'c mut Cache,
1802    caps: Captures,
1803    it: iter::Searcher<'h>,
1804}
1805
1806impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> {
1807    type Item = Match;
1808
1809    #[inline]
1810    fn next(&mut self) -> Option<Match> {
1811        // Splitting 'self' apart seems necessary to appease borrowck.
1812        let FindMatches { re, ref mut cache, ref mut caps, ref mut it } =
1813            *self;
1814        // 'advance' converts errors into panics, which is OK here because
1815        // the PikeVM can never return an error.
1816        it.advance(|input| {
1817            re.search(cache, input, caps);
1818            Ok(caps.get_match())
1819        })
1820    }
1821}
1822
1823/// An iterator over all non-overlapping leftmost matches, with their capturing
1824/// groups, for a particular search.
1825///
1826/// The iterator yields a [`Captures`] value until no more matches could be
1827/// found.
1828///
1829/// The lifetime parameters are as follows:
1830///
1831/// * `'r` represents the lifetime of the PikeVM.
1832/// * `'c` represents the lifetime of the PikeVM's cache.
1833/// * `'h` represents the lifetime of the haystack being searched.
1834///
1835/// This iterator can be created with the [`PikeVM::captures_iter`] method.
1836#[derive(Debug)]
1837pub struct CapturesMatches<'r, 'c, 'h> {
1838    re: &'r PikeVM,
1839    cache: &'c mut Cache,
1840    caps: Captures,
1841    it: iter::Searcher<'h>,
1842}
1843
1844impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> {
1845    type Item = Captures;
1846
1847    #[inline]
1848    fn next(&mut self) -> Option<Captures> {
1849        // Splitting 'self' apart seems necessary to appease borrowck.
1850        let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
1851            *self;
1852        // 'advance' converts errors into panics, which is OK here because
1853        // the PikeVM can never return an error.
1854        it.advance(|input| {
1855            re.search(cache, input, caps);
1856            Ok(caps.get_match())
1857        });
1858        if caps.is_match() {
1859            Some(caps.clone())
1860        } else {
1861            None
1862        }
1863    }
1864}
1865
1866/// A cache represents mutable state that a [`PikeVM`] requires during a
1867/// search.
1868///
1869/// For a given [`PikeVM`], its corresponding cache may be created either via
1870/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in
1871/// every way, except the former does not require explicitly importing `Cache`.
1872///
1873/// A particular `Cache` is coupled with the [`PikeVM`] from which it
1874/// was created. It may only be used with that `PikeVM`. A cache and its
1875/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
1876/// only be used with the new `PikeVM` (and not the old one).
1877#[derive(Clone, Debug)]
1878pub struct Cache {
1879    /// Stack used while computing epsilon closure. This effectively lets us
1880    /// move what is more naturally expressed through recursion to a stack
1881    /// on the heap.
1882    stack: Vec<FollowEpsilon>,
1883    /// The current active states being explored for the current byte in the
1884    /// haystack.
1885    curr: ActiveStates,
1886    /// The next set of states we're building that will be explored for the
1887    /// next byte in the haystack.
1888    next: ActiveStates,
1889}
1890
1891impl Cache {
1892    /// Create a new [`PikeVM`] cache.
1893    ///
1894    /// A potentially more convenient routine to create a cache is
1895    /// [`PikeVM::create_cache`], as it does not require also importing the
1896    /// `Cache` type.
1897    ///
1898    /// If you want to reuse the returned `Cache` with some other `PikeVM`,
1899    /// then you must call [`Cache::reset`] with the desired `PikeVM`.
1900    pub fn new(re: &PikeVM) -> Cache {
1901        Cache {
1902            stack: vec![],
1903            curr: ActiveStates::new(re),
1904            next: ActiveStates::new(re),
1905        }
1906    }
1907
1908    /// Reset this cache such that it can be used for searching with a
1909    /// different [`PikeVM`].
1910    ///
1911    /// A cache reset permits reusing memory already allocated in this cache
1912    /// with a different `PikeVM`.
1913    ///
1914    /// # Example
1915    ///
1916    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
1917    ///
1918    /// ```
1919    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
1920    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
1921    ///
1922    /// let re1 = PikeVM::new(r"\w")?;
1923    /// let re2 = PikeVM::new(r"\W")?;
1924    ///
1925    /// let mut cache = re1.create_cache();
1926    /// assert_eq!(
1927    ///     Some(Match::must(0, 0..2)),
1928    ///     re1.find_iter(&mut cache, "Δ").next(),
1929    /// );
1930    ///
1931    /// // Using 'cache' with re2 is not allowed. It may result in panics or
1932    /// // incorrect results. In order to re-purpose the cache, we must reset
1933    /// // it with the PikeVM we'd like to use it with.
1934    /// //
1935    /// // Similarly, after this reset, using the cache with 're1' is also not
1936    /// // allowed.
1937    /// cache.reset(&re2);
1938    /// assert_eq!(
1939    ///     Some(Match::must(0, 0..3)),
1940    ///     re2.find_iter(&mut cache, "☃").next(),
1941    /// );
1942    ///
1943    /// # Ok::<(), Box<dyn std::error::Error>>(())
1944    /// ```
1945    pub fn reset(&mut self, re: &PikeVM) {
1946        self.curr.reset(re);
1947        self.next.reset(re);
1948    }
1949
1950    /// Returns the heap memory usage, in bytes, of this cache.
1951    ///
1952    /// This does **not** include the stack size used up by this cache. To
1953    /// compute that, use `std::mem::size_of::<Cache>()`.
1954    pub fn memory_usage(&self) -> usize {
1955        use core::mem::size_of;
1956        (self.stack.len() * size_of::<FollowEpsilon>())
1957            + self.curr.memory_usage()
1958            + self.next.memory_usage()
1959    }
1960
1961    /// Clears this cache. This should be called at the start of every search
1962    /// to ensure we start with a clean slate.
1963    ///
1964    /// This also sets the length of the capturing groups used in the current
1965    /// search. This permits an optimization where by 'SlotTable::for_state'
1966    /// only returns the number of slots equivalent to the number of slots
1967    /// given in the 'Captures' value. This may be less than the total number
1968    /// of possible slots, e.g., when one only wants to track overall match
1969    /// offsets. This in turn permits less copying of capturing group spans
1970    /// in the PikeVM.
1971    fn setup_search(&mut self, captures_slot_len: usize) {
1972        self.stack.clear();
1973        self.curr.setup_search(captures_slot_len);
1974        self.next.setup_search(captures_slot_len);
1975    }
1976}
1977
1978/// A set of active states used to "simulate" the execution of an NFA via the
1979/// PikeVM.
1980///
1981/// There are two sets of these used during NFA simulation. One set corresponds
1982/// to the "current" set of states being traversed for the current position
1983/// in a haystack. The other set corresponds to the "next" set of states being
1984/// built, which will become the new "current" set for the next position in the
1985/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
1986/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
1987///
1988/// In addition to representing a set of NFA states, this also maintains slot
1989/// values for each state. These slot values are what turn the NFA simulation
1990/// into the "Pike VM." Namely, they track capturing group values for each
1991/// state. During the computation of epsilon closure, we copy slot values from
1992/// states in the "current" set to the "next" set. Eventually, once a match
1993/// is found, the slot values for that match state are what we write to the
1994/// caller provided 'Captures' value.
1995#[derive(Clone, Debug)]
1996struct ActiveStates {
1997    /// The set of active NFA states. This set preserves insertion order, which
1998    /// is critical for simulating the match semantics of backtracking regex
1999    /// engines.
2000    set: SparseSet,
2001    /// The slots for every NFA state, where each slot stores a (possibly
2002    /// absent) offset. Every capturing group has two slots. One for a start
2003    /// offset and one for an end offset.
2004    slot_table: SlotTable,
2005}
2006
2007impl ActiveStates {
2008    /// Create a new set of active states for the given PikeVM. The active
2009    /// states returned may only be used with the given PikeVM. (Use 'reset'
2010    /// to re-purpose the allocation for a different PikeVM.)
2011    fn new(re: &PikeVM) -> ActiveStates {
2012        let mut active = ActiveStates {
2013            set: SparseSet::new(0),
2014            slot_table: SlotTable::new(),
2015        };
2016        active.reset(re);
2017        active
2018    }
2019
2020    /// Reset this set of active states such that it can be used with the given
2021    /// PikeVM (and only that PikeVM).
2022    fn reset(&mut self, re: &PikeVM) {
2023        self.set.resize(re.get_nfa().states().len());
2024        self.slot_table.reset(re);
2025    }
2026
2027    /// Return the heap memory usage, in bytes, used by this set of active
2028    /// states.
2029    ///
2030    /// This does not include the stack size of this value.
2031    fn memory_usage(&self) -> usize {
2032        self.set.memory_usage() + self.slot_table.memory_usage()
2033    }
2034
2035    /// Setup this set of active states for a new search. The given slot
2036    /// length should be the number of slots in a caller provided 'Captures'
2037    /// (and may be zero).
2038    fn setup_search(&mut self, captures_slot_len: usize) {
2039        self.set.clear();
2040        self.slot_table.setup_search(captures_slot_len);
2041    }
2042}
2043
2044/// A table of slots, where each row represent a state in an NFA. Thus, the
2045/// table has room for storing slots for every single state in an NFA.
2046///
2047/// This table is represented with a single contiguous allocation. In general,
2048/// the notion of "capturing group" doesn't really exist at this level of
2049/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
2050/// maps to a pair of slots, one for the start offset and one for the end
2051/// offset.) Slots are indexed by the 'Captures' NFA state.
2052///
2053/// N.B. Not every state actually needs a row of slots. Namely, states that
2054/// only have epsilon transitions currently never have anything written to
2055/// their rows in this table. Thus, the table is somewhat wasteful in its heap
2056/// usage. However, it is important to maintain fast random access by state
2057/// ID, which means one giant table tends to work well. RE2 takes a different
2058/// approach here and allocates each row as its own reference counted thing.
2059/// I explored such a strategy at one point here, but couldn't get it to work
2060/// well using entirely safe code. (To the ambitious reader: I encourage you to
2061/// re-litigate that experiment.) I very much wanted to stick to safe code, but
2062/// could be convinced otherwise if there was a solid argument and the safety
2063/// was encapsulated well.
2064#[derive(Clone, Debug)]
2065struct SlotTable {
2066    /// The actual table of offsets.
2067    table: Vec<Option<NonMaxUsize>>,
2068    /// The number of slots per state, i.e., the table's stride or the length
2069    /// of each row.
2070    slots_per_state: usize,
2071    /// The number of slots in the caller-provided 'Captures' value for the
2072    /// current search. Setting this to 'slots_per_state' is always correct,
2073    /// but may be wasteful.
2074    slots_for_captures: usize,
2075}
2076
2077impl SlotTable {
2078    /// Create a new slot table.
2079    ///
2080    /// One should call 'reset' with the corresponding PikeVM before use.
2081    fn new() -> SlotTable {
2082        SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
2083    }
2084
2085    /// Reset this slot table such that it can be used with the given PikeVM
2086    /// (and only that PikeVM).
2087    fn reset(&mut self, re: &PikeVM) {
2088        let nfa = re.get_nfa();
2089        self.slots_per_state = nfa.group_info().slot_len();
2090        // This is always correct, but may be reduced for a particular search
2091        // if a 'Captures' has fewer slots, e.g., none at all or only slots
2092        // for tracking the overall match instead of all slots for every
2093        // group.
2094        self.slots_for_captures = core::cmp::max(
2095            self.slots_per_state,
2096            nfa.pattern_len().checked_mul(2).unwrap(),
2097        );
2098        let len = nfa
2099            .states()
2100            .len()
2101            .checked_mul(self.slots_per_state)
2102            // Add space to account for scratch space used during a search.
2103            .and_then(|x| x.checked_add(self.slots_for_captures))
2104            // It seems like this could actually panic on legitimate inputs on
2105            // 32-bit targets, and very likely to panic on 16-bit. Should we
2106            // somehow convert this to an error? What about something similar
2107            // for the lazy DFA cache? If you're tripping this assert, please
2108            // file a bug.
2109            .expect("slot table length doesn't overflow");
2110        // This happens about as often as a regex is compiled, so it probably
2111        // should be at debug level, but I found it quite distracting and not
2112        // particularly useful.
2113        trace!(
2114            "resizing PikeVM active states table to {} entries \
2115             (slots_per_state={})",
2116            len,
2117            self.slots_per_state,
2118        );
2119        self.table.resize(len, None);
2120    }
2121
2122    /// Return the heap memory usage, in bytes, used by this slot table.
2123    ///
2124    /// This does not include the stack size of this value.
2125    fn memory_usage(&self) -> usize {
2126        self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
2127    }
2128
2129    /// Perform any per-search setup for this slot table.
2130    ///
2131    /// In particular, this sets the length of the number of slots used in the
2132    /// 'Captures' given by the caller (if any at all). This number may be
2133    /// smaller than the total number of slots available, e.g., when the caller
2134    /// is only interested in tracking the overall match and not the spans of
2135    /// every matching capturing group. Only tracking the overall match can
2136    /// save a substantial amount of time copying capturing spans during a
2137    /// search.
2138    fn setup_search(&mut self, captures_slot_len: usize) {
2139        self.slots_for_captures = captures_slot_len;
2140    }
2141
2142    /// Return a mutable slice of the slots for the given state.
2143    ///
2144    /// Note that the length of the slice returned may be less than the total
2145    /// number of slots available for this state. In particular, the length
2146    /// always matches the number of slots indicated via 'setup_search'.
2147    fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
2148        let i = sid.as_usize() * self.slots_per_state;
2149        &mut self.table[i..i + self.slots_for_captures]
2150    }
2151
2152    /// Return a slice of slots of appropriate length where every slot offset
2153    /// is guaranteed to be absent. This is useful in cases where you need to
2154    /// compute an epsilon closure outside of the user supplied regex, and thus
2155    /// never want it to have any capturing slots set.
2156    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
2157        let i = self.table.len() - self.slots_for_captures;
2158        &mut self.table[i..i + self.slots_for_captures]
2159    }
2160}
2161
2162/// Represents a stack frame for use while computing an epsilon closure.
2163///
2164/// (An "epsilon closure" refers to the set of reachable NFA states from a
2165/// single state without consuming any input. That is, the set of all epsilon
2166/// transitions not only from that single state, but from every other state
2167/// reachable by an epsilon transition as well. This is why it's called a
2168/// "closure." Computing an epsilon closure is also done during DFA
2169/// determinization! Compare and contrast the epsilon closure here in this
2170/// PikeVM and the one used for determinization in crate::util::determinize.)
2171///
2172/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
2173/// first traversal over all epsilon transitions from a particular state.
2174/// (A depth first traversal is important because it emulates the same priority
2175/// of matches that is typically found in backtracking regex engines.) This
2176/// depth first traversal is naturally expressed using recursion, but to avoid
2177/// a call stack size proportional to the size of a regex, we put our stack on
2178/// the heap instead.
2179///
2180/// This stack thus consists of call frames. The typical call frame is
2181/// `Explore`, which instructs epsilon closure to explore the epsilon
2182/// transitions from that state. (Subsequent epsilon transitions are then
2183/// pushed on to the stack as more `Explore` frames.) If the state ID being
2184/// explored has no epsilon transitions, then the capturing group slots are
2185/// copied from the original state that sparked the epsilon closure (from the
2186/// 'step' routine) to the state ID being explored. This way, capturing group
2187/// slots are forwarded from the previous state to the next.
2188///
2189/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to
2190/// set the position for a particular slot back to some particular offset. This
2191/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
2192/// set the offset of the slot indicated in `Capture` to the current offset,
2193/// and then push the old offset on to the stack as a `RestoreCapture` frame.
2194/// Thus, the new offset is only used until the epsilon closure reverts back to
2195/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
2196/// transition its "scope" to only states that come "after" it during depth
2197/// first traversal.
2198#[derive(Clone, Debug)]
2199enum FollowEpsilon {
2200    /// Explore the epsilon transitions from a state ID.
2201    Explore(StateID),
2202    /// Reset the given `slot` to the given `offset` (which might be `None`).
2203    RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
2204}
2205
2206/// A set of counters that "instruments" a PikeVM search. To enable this, you
2207/// must enable the 'internal-instrument-pikevm' feature. Then run your Rust
2208/// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in
2209/// the environment. The metrics collected will be dumped automatically for
2210/// every search executed by the PikeVM.
2211///
2212/// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an
2213/// absolute decrease in wall-clock performance, even if the 'trace' log level
2214/// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't
2215/// enabled.) The main point of instrumentation is to get counts of various
2216/// events that occur during the PikeVM's execution.
2217///
2218/// This is a somewhat hacked together collection of metrics that are useful
2219/// to gather from a PikeVM search. In particular, it lets us scrutinize the
2220/// performance profile of a search beyond what general purpose profiling tools
2221/// give us. Namely, we orient the profiling data around the specific states of
2222/// the NFA.
2223///
2224/// In other words, this lets us see which parts of the NFA graph are most
2225/// frequently activated. This then provides direction for optimization
2226/// opportunities.
2227///
2228/// The really sad part about this is that it absolutely clutters up the PikeVM
2229/// implementation. :'( Another approach would be to just manually add this
2230/// code in whenever I want this kind of profiling data, but it's complicated
2231/// and tedious enough that I went with this approach... for now.
2232///
2233/// When instrumentation is enabled (which also turns on 'logging'), then a
2234/// `Counters` is initialized for every search and `trace`'d just before the
2235/// search returns to the caller.
2236///
2237/// Tip: When debugging performance problems with the PikeVM, it's best to try
2238/// to work with an NFA that is as small as possible. Otherwise the state graph
2239/// is likely to be too big to digest.
2240#[cfg(feature = "internal-instrument-pikevm")]
2241#[derive(Clone, Debug)]
2242struct Counters {
2243    /// The number of times the NFA is in a particular permutation of states.
2244    state_sets: alloc::collections::BTreeMap<Vec<StateID>, u64>,
2245    /// The number of times 'step' is called for a particular state ID (which
2246    /// indexes this array).
2247    steps: Vec<u64>,
2248    /// The number of times an epsilon closure was computed for a state.
2249    closures: Vec<u64>,
2250    /// The number of times a particular state ID is pushed on to a stack while
2251    /// computing an epsilon closure.
2252    stack_pushes: Vec<u64>,
2253    /// The number of times a particular state ID is inserted into a sparse set
2254    /// while computing an epsilon closure.
2255    set_inserts: Vec<u64>,
2256}
2257
2258#[cfg(feature = "internal-instrument-pikevm")]
2259impl Counters {
2260    fn empty() -> Counters {
2261        Counters {
2262            state_sets: alloc::collections::BTreeMap::new(),
2263            steps: vec![],
2264            closures: vec![],
2265            stack_pushes: vec![],
2266            set_inserts: vec![],
2267        }
2268    }
2269
2270    fn reset(&mut self, nfa: &NFA) {
2271        let len = nfa.states().len();
2272
2273        self.state_sets.clear();
2274
2275        self.steps.clear();
2276        self.steps.resize(len, 0);
2277
2278        self.closures.clear();
2279        self.closures.resize(len, 0);
2280
2281        self.stack_pushes.clear();
2282        self.stack_pushes.resize(len, 0);
2283
2284        self.set_inserts.clear();
2285        self.set_inserts.resize(len, 0);
2286    }
2287
2288    fn eprint(&self, nfa: &NFA) {
2289        trace!("===== START PikeVM Instrumentation Output =====");
2290        // We take the top-K most occurring state sets. Otherwise the output
2291        // is likely to be overwhelming. And we probably only care about the
2292        // most frequently occurring ones anyway.
2293        const LIMIT: usize = 20;
2294        let mut set_counts =
2295            self.state_sets.iter().collect::<Vec<(&Vec<StateID>, &u64)>>();
2296        set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count));
2297        trace!("## PikeVM frequency of state sets (top {})", LIMIT);
2298        for (set, count) in set_counts.iter().take(LIMIT) {
2299            trace!("{:?}: {}", set, count);
2300        }
2301        if set_counts.len() > LIMIT {
2302            trace!(
2303                "... {} sets omitted (out of {} total)",
2304                set_counts.len() - LIMIT,
2305                set_counts.len(),
2306            );
2307        }
2308
2309        trace!("");
2310        trace!("## PikeVM total frequency of events");
2311        trace!(
2312            "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}",
2313            self.steps.iter().copied().sum::<u64>(),
2314            self.closures.iter().copied().sum::<u64>(),
2315            self.stack_pushes.iter().copied().sum::<u64>(),
2316            self.set_inserts.iter().copied().sum::<u64>(),
2317        );
2318
2319        trace!("");
2320        trace!("## PikeVM frequency of events broken down by state");
2321        for sid in 0..self.steps.len() {
2322            trace!(
2323                "{:06}: steps: {}, closures: {}, \
2324                 stack-pushes: {}, set-inserts: {}",
2325                sid,
2326                self.steps[sid],
2327                self.closures[sid],
2328                self.stack_pushes[sid],
2329                self.set_inserts[sid],
2330            );
2331        }
2332
2333        trace!("");
2334        trace!("## NFA debug display");
2335        trace!("{:?}", nfa);
2336        trace!("===== END PikeVM Instrumentation Output =====");
2337    }
2338
2339    fn record_state_set(&mut self, set: &SparseSet) {
2340        let set = set.iter().collect::<Vec<StateID>>();
2341        *self.state_sets.entry(set).or_insert(0) += 1;
2342    }
2343
2344    fn record_step(&mut self, sid: StateID) {
2345        self.steps[sid] += 1;
2346    }
2347
2348    fn record_closure(&mut self, sid: StateID) {
2349        self.closures[sid] += 1;
2350    }
2351
2352    fn record_stack_push(&mut self, sid: StateID) {
2353        self.stack_pushes[sid] += 1;
2354    }
2355
2356    fn record_set_insert(&mut self, sid: StateID) {
2357        self.set_inserts[sid] += 1;
2358    }
2359}
regex_automata/nfa/thompson/pikevm.rs

regex_automata/nfa/thompson/
pikevm.rs