regex_automata/dfa/
sparse.rs

1/*!
2Types and routines specific to sparse DFAs.
3
4This module is the home of [`sparse::DFA`](DFA).
5
Unlike the [`dense`] module, this module does not contain a builder or
configuration specific to sparse DFAs. Instead, the intended way to build a
8sparse DFA is either by using a default configuration with its constructor
9[`sparse::DFA::new`](DFA::new), or by first configuring the construction of a
10dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`].
11For example, this configures a sparse DFA to do an overlapping search:
12
13```
14use regex_automata::{
15    dfa::{Automaton, OverlappingState, dense},
16    HalfMatch, Input, MatchKind,
17};
18
19let dense_re = dense::Builder::new()
20    .configure(dense::Config::new().match_kind(MatchKind::All))
21    .build(r"Samwise|Sam")?;
22let sparse_re = dense_re.to_sparse()?;
23
24// Setup our haystack and initial start state.
25let input = Input::new("Samwise");
26let mut state = OverlappingState::start();
27
28// First, 'Sam' will match.
29sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
30assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match());
31
32// And now 'Samwise' will match.
33sparse_re.try_search_overlapping_fwd(&input, &mut state)?;
34assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match());
35# Ok::<(), Box<dyn std::error::Error>>(())
36```
37*/
38
39#[cfg(feature = "dfa-build")]
40use core::iter;
41use core::{fmt, mem::size_of};
42
43#[cfg(feature = "dfa-build")]
44use alloc::{vec, vec::Vec};
45
46#[cfg(feature = "dfa-build")]
47use crate::dfa::dense::{self, BuildError};
48use crate::{
49    dfa::{
50        automaton::{fmt_state_indicator, Automaton, StartError},
51        dense::Flags,
52        special::Special,
53        StartKind, DEAD,
54    },
55    util::{
56        alphabet::{ByteClasses, ByteSet},
57        escape::DebugByte,
58        int::{Pointer, Usize, U16, U32},
59        prefilter::Prefilter,
60        primitives::{PatternID, StateID},
61        search::Anchored,
62        start::{self, Start, StartByteMap},
63        wire::{self, DeserializeError, Endian, SerializeError},
64    },
65};
66
// Label naming the sparse DFA format.
// NOTE(review): presumably written at the start of a serialized sparse DFA
// and checked on deserialization; the code using it is outside this view, so
// confirm against the (de)serialization routines.
const LABEL: &str = "rust-regex-automata-dfa-sparse";
// Version number associated with the sparse DFA format.
// NOTE(review): presumably bumped whenever the serialized layout changes
// incompatibly; confirm against the (de)serialization routines.
const VERSION: u32 = 2;
69
/// A sparse deterministic finite automaton (DFA) with variable sized states.
///
/// In contrast to a [`dense::DFA`], a sparse DFA uses a more space efficient
/// representation for its transitions. Consequently, sparse DFAs may use much
/// less memory than dense DFAs, but this comes at a price. In particular,
/// reading the more space efficient transitions takes more work, and
/// consequently, searching using a sparse DFA is typically slower than a dense
/// DFA.
///
/// A sparse DFA can be built using the default configuration via the
/// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a
/// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse
/// DFA using [`dense::DFA::to_sparse`].
///
/// In general, a sparse DFA supports all the same search operations as a dense
/// DFA.
///
/// Making the choice between a dense and sparse DFA depends on your specific
/// work load. If you can sacrifice a bit of search time performance, then a
/// sparse DFA might be the best choice. In particular, while sparse DFAs are
/// probably always slower than dense DFAs, you may find that they are easily
/// fast enough for your purposes!
///
/// # Type parameters
///
/// A `DFA` has one type parameter, `T`, which is used to represent the parts
/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
///
/// # The `Automaton` trait
///
/// This type implements the [`Automaton`] trait, which means it can be used
/// for searching. For example:
///
/// ```
/// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
///
/// let dfa = DFA::new("foo[0-9]+")?;
/// let expected = Some(HalfMatch::must(0, 8));
/// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone)]
pub struct DFA<T> {
    // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
    // representation-wise. In reality, it is perhaps more complicated. Namely,
    // in a dense DFA, all information needs to be very cheaply accessible
    // using only state IDs. In a sparse DFA however, each state uses a
    // variable amount of space because each state encodes more information
    // than just its transitions. Each state also includes an accelerator if
    // one exists, along with the matching pattern IDs if the state is a match
    // state.
    //
    // That is, a lot of the complexity is pushed down into how each state
    // itself is represented.

    // The encoded states, together with the byte classes and the state and
    // pattern counts (see `from_dense` for how each state is written).
    tt: Transitions<T>,
    // The table of start states. Its `kind` field is what `start_kind`
    // reports.
    st: StartTable<T>,
    // Information about special states. When converting from a dense DFA,
    // this is the dense DFA's `Special` with its state IDs remapped.
    special: Special,
    // An optional prefilter. When converting from a dense DFA, this is a
    // clone of the dense DFA's prefilter, if it has one.
    pre: Option<Prefilter>,
    // A set of bytes, copied from the dense DFA during conversion.
    // NOTE(review): presumably the bytes that cause a search to quit; the
    // code that consults it is outside this view, so confirm.
    quitset: ByteSet,
    // Flags copied from the dense DFA during conversion.
    flags: Flags,
}
131
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Compile a single regular expression into a sparse DFA using the
    /// default configuration.
    ///
    /// The pattern is first compiled into a dense DFA, which is then
    /// converted into its sparse form. An error from either step is returned
    /// to the caller.
    ///
    /// For a non-default configuration, build a dense DFA with
    /// [`dense::Builder`] and then call [`dense::DFA::to_sparse`] on it.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
    ///
    /// let expected = Some(HalfMatch::must(0, 11));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense_dfa = dense::Builder::new().build(pattern)?;
        dense_dfa.to_sparse()
    }

    /// Compile several regular expressions into a single sparse multi-DFA
    /// using the default configuration.
    ///
    /// The patterns are first compiled into one dense DFA, which is then
    /// converted into its sparse form. An error from either step is returned
    /// to the caller.
    ///
    /// For a non-default configuration, build a dense DFA with
    /// [`dense::Builder`] and then call [`dense::DFA::to_sparse`] on it.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input};
    ///
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let expected = Some(HalfMatch::must(1, 3));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        let dense_dfa = dense::Builder::new().build_many(patterns)?;
        dense_dfa.to_sparse()
    }
}
185
#[cfg(feature = "dfa-build")]
impl DFA<Vec<u8>> {
    /// Create a new sparse DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = sparse::DFA::always_match()?;
    ///
    /// let expected = Some(HalfMatch::must(0, 0));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::always_match().and_then(|dense| dense.to_sparse())
    }

    /// Create a new sparse DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse}, Input};
    ///
    /// let dfa = sparse::DFA::never_match()?;
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA<Vec<u8>>, BuildError> {
        dense::DFA::never_match().and_then(|dense| dense.to_sparse())
    }

    /// Build a sparse DFA from the given dense DFA.
    ///
    /// This returns an error if a sparse state's byte offset cannot be
    /// represented as a `StateID`, or if building the start table fails.
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
    ) -> Result<DFA<Vec<u8>>, BuildError> {
        // A sparse state's identifier is the byte offset at which that state
        // is encoded in the transition table. That means no identifier is
        // known until every preceding state has been given exactly the space
        // it needs, so the table is built in two passes:
        //
        // 1. Write the "shell" of every state: its transition length, its
        //    input byte ranges, zeroed space for its 'next' state IDs, its
        //    pattern IDs (match states only) and its accelerator. While doing
        //    so, record where each dense state ended up.
        // 2. Using that record, go back and fill in the 'next' state IDs.

        // This capacity is only a lower bound (the real size is almost
        // certainly bigger), but it should spare us a few reallocations.
        let mut sparse: Vec<u8> =
            Vec::with_capacity(StateID::SIZE * dfa.state_len());
        // Maps the index of each dense state to the ID of the corresponding
        // sparse state. Populated in the first pass, consumed in the second.
        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_len()];
        for state in dfa.states() {
            let state_start = sparse.len();

            remap[dfa.to_index(state.id())] = StateID::new(state_start)
                .map_err(|_| BuildError::too_many_states())?;
            let is_match = dfa.is_match_state(state.id());

            // Placeholder for the transition length, filled in below once
            // the number of transitions is known.
            sparse.extend_from_slice(&[0, 0]);

            let mut transition_len: u16 = 0;
            for (start, end, _) in state.sparse_transitions() {
                match (start.as_u8(), end.as_u8()) {
                    (None, None) => {}
                    (Some(lo), Some(hi)) => {
                        transition_len += 1;
                        sparse.extend_from_slice(&[lo, hi]);
                    }
                    (Some(_), None) | (None, Some(_)) => {
                        // sparse_transitions never groups EOI together with
                        // any other transition, so this cannot happen.
                        unreachable!()
                    }
                }
            }
            // Always append a dummy range for the EOI transition. It is never
            // read during a search, but it keeps the number of byte ranges
            // equal to the number of 'next' states, so only one length needs
            // to be tracked.
            //
            // N.B. The loop above may not yield the EOI transition at all
            // (it may point to a DEAD state). Writing it unconditionally
            // guarantees a transition length >0 and lets
            // Automaton::next_eoi_state simply ask for the last transition.
            // There are probably other/better ways to do this.
            transition_len += 1;
            sparse.extend_from_slice(&[0, 0]);

            // Sanity check the transition length.
            assert_ne!(
                transition_len, 0,
                "transition length should be non-zero",
            );
            assert!(
                transition_len <= 257,
                "expected transition length {} to be <= 257",
                transition_len,
            );

            // Now fill in the transition length. Because it never exceeds
            // 257, the most significant bit is free to mark match states.
            let ntrans = if is_match {
                transition_len | (1 << 15)
            } else {
                transition_len
            };
            wire::NE::write_u16(ntrans, &mut sparse[state_start..]);

            // Zeroed space for the 'next' state IDs, one per transition.
            // Unwraps are OK since the transition length is <= 257 and the
            // smallest supported usize is 16 bits.
            let next_len = usize::try_from(transition_len)
                .unwrap()
                .checked_mul(StateID::SIZE)
                .unwrap();
            sparse.extend(iter::repeat(0).take(next_len));

            // Match states additionally carry the IDs of the patterns they
            // match, preceded by a u32 length prefix.
            if is_match {
                let plen = dfa.match_pattern_len(state.id());
                let mut at = sparse.len();
                // Zero-fill the space first. Unwraps are OK since plen is
                // guaranteed to be <= PatternID::LIMIT, which in turn is
                // guaranteed to fit into a u32.
                let pattern_bytes = size_of::<u32>()
                    .checked_mul(plen)
                    .unwrap()
                    .checked_add(size_of::<u32>())
                    .unwrap();
                sparse.extend(iter::repeat(0).take(pattern_bytes));

                // The length prefix.
                wire::NE::write_u32(
                    // Cannot fail: u32::MAX is not a valid pattern ID, so
                    // the number of pattern IDs always fits in a u32.
                    plen.try_into().expect("pattern ID length fits in u32"),
                    &mut sparse[at..],
                );
                at += size_of::<u32>();

                // The pattern IDs themselves.
                for &pid in dfa.pattern_id_slice(state.id()) {
                    at += wire::write_pattern_id::<wire::NE>(
                        pid,
                        &mut sparse[at..],
                    );
                }
            }

            // Finally, the accelerator: a length byte N followed by N bytes.
            // It occupies at least 1 byte and at most 4 bytes. The N bytes
            // are (exhaustively) the bytes that must be seen in order to
            // leave this state.
            let accel = dfa.accelerator(state.id());
            sparse.push(accel.len().try_into().unwrap());
            sparse.extend_from_slice(accel);
        }

        let mut converted = DFA {
            tt: Transitions {
                sparse,
                classes: dfa.byte_classes().clone(),
                state_len: dfa.state_len(),
                pattern_len: dfa.pattern_len(),
            },
            st: StartTable::from_dense_dfa(dfa, &remap)?,
            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
            pre: dfa.get_prefilter().cloned(),
            quitset: dfa.quitset().clone(),
            flags: dfa.flags().clone(),
        };
        // Second pass: walk the dense states once more and write the
        // remapped 'next' state ID of every transition into the
        // corresponding sparse state.
        for dense_state in dfa.states() {
            let sparse_id = remap[dfa.to_index(dense_state.id())];
            let mut sparse_state = converted.tt.state_mut(sparse_id);
            let transitions = dense_state.sparse_transitions().enumerate();
            for (i, (_, _, dense_next)) in transitions {
                sparse_state.set_next_at(i, remap[dfa.to_index(dense_next)]);
            }
        }
        debug!(
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
            converted.memory_usage(),
            dfa.memory_usage(),
        );
        Ok(converted)
    }
}
405
406impl<T: AsRef<[u8]>> DFA<T> {
407    /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
408    /// DFA returned always uses `&[u8]` for its transitions.
409    pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
410        DFA {
411            tt: self.tt.as_ref(),
412            st: self.st.as_ref(),
413            special: self.special,
414            pre: self.pre.clone(),
415            quitset: self.quitset,
416            flags: self.flags,
417        }
418    }
419
420    /// Return an owned version of this sparse DFA. Specifically, the DFA
421    /// returned always uses `Vec<u8>` for its transitions.
422    ///
423    /// Effectively, this returns a sparse DFA whose transitions live on the
424    /// heap.
425    #[cfg(feature = "alloc")]
426    pub fn to_owned(&self) -> DFA<alloc::vec::Vec<u8>> {
427        DFA {
428            tt: self.tt.to_owned(),
429            st: self.st.to_owned(),
430            special: self.special,
431            pre: self.pre.clone(),
432            quitset: self.quitset,
433            flags: self.flags,
434        }
435    }
436
437    /// Returns the starting state configuration for this DFA.
438    ///
439    /// The default is [`StartKind::Both`], which means the DFA supports both
440    /// unanchored and anchored searches. However, this can generally lead to
441    /// bigger DFAs. Therefore, a DFA might be compiled with support for just
442    /// unanchored or anchored searches. In that case, running a search with
443    /// an unsupported configuration will panic.
444    pub fn start_kind(&self) -> StartKind {
445        self.st.kind
446    }
447
448    /// Returns true only if this DFA has starting states for each pattern.
449    ///
450    /// When a DFA has starting states for each pattern, then a search with the
451    /// DFA can be configured to only look for anchored matches of a specific
452    /// pattern. Specifically, APIs like [`Automaton::try_search_fwd`] can
453    /// accept a [`Anchored::Pattern`] if and only if this method returns true.
454    /// Otherwise, an error will be returned.
455    ///
456    /// Note that if the DFA is empty, this always returns false.
457    pub fn starts_for_each_pattern(&self) -> bool {
458        self.st.pattern_len.is_some()
459    }
460
461    /// Returns the equivalence classes that make up the alphabet for this DFA.
462    ///
463    /// Unless [`dense::Config::byte_classes`] was disabled, it is possible
464    /// that multiple distinct bytes are grouped into the same equivalence
465    /// class if it is impossible for them to discriminate between a match and
466    /// a non-match. This has the effect of reducing the overall alphabet size
467    /// and in turn potentially substantially reducing the size of the DFA's
468    /// transition table.
469    ///
470    /// The downside of using equivalence classes like this is that every state
471    /// transition will automatically use this map to convert an arbitrary
472    /// byte to its corresponding equivalence class. In practice this has a
473    /// negligible impact on performance.
474    pub fn byte_classes(&self) -> &ByteClasses {
475        &self.tt.classes
476    }
477
478    /// Returns the memory usage, in bytes, of this DFA.
479    ///
480    /// The memory usage is computed based on the number of bytes used to
481    /// represent this DFA.
482    ///
483    /// This does **not** include the stack size used up by this DFA. To
484    /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
485    pub fn memory_usage(&self) -> usize {
486        self.tt.memory_usage() + self.st.memory_usage()
487    }
488}
489
490/// Routines for converting a sparse DFA to other representations, such as raw
491/// bytes suitable for persistent storage.
492impl<T: AsRef<[u8]>> DFA<T> {
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// This method is only available when the `dfa-build` feature is
    /// enabled.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_little_endian would work on a little endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
    /// // ignore it.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_little_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::LE>()
    }
533
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// This method is only available when the `dfa-build` feature is
    /// enabled.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_big_endian would work on a big endian target.
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
    /// // ignore it.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_big_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::BE>()
    }
574
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
    /// format.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Note that unlike a [`dense::DFA`]'s serialization methods, this does
    /// not add any initial padding to the returned bytes. Padding isn't
    /// required for sparse DFAs since they have no alignment requirements.
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling the DFA. For
    /// example, if serialization and deserialization happen in the same
    /// process or on the same machine. Otherwise, when serializing a DFA for
    /// use in a portable environment, you'll almost certainly want to
    /// serialize _both_ a little endian and a big endian version and then
    /// load the correct one based on the target's configuration.
    ///
    /// This method is only available when the `dfa-build` feature is
    /// enabled.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let buf = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
    /// // ignore it.
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_native_endian(&self) -> Vec<u8> {
        // Delegate to the endian-generic implementation.
        self.to_bytes::<wire::NE>()
    }
622
623    /// The implementation of the public `to_bytes` serialization methods,
624    /// which is generic over endianness.
625    #[cfg(feature = "dfa-build")]
626    fn to_bytes<E: Endian>(&self) -> Vec<u8> {
627        let mut buf = vec![0; self.write_to_len()];
628        // This should always succeed since the only possible serialization
629        // error is providing a buffer that's too small, but we've ensured that
630        // `buf` is big enough here.
631        self.write_to::<E>(&mut buf).unwrap();
632        buf
633    }
634
    /// Serialize this DFA as raw bytes to the given slice, in little endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_little_endian would work on a little endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_little_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::LE>(dst)
    }
681
    /// Serialize this DFA as raw bytes to the given slice, in big endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
    /// let mut buf = [0u8; 4 * (1<<10)];
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_big_endian would work on a big endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_big_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // Delegate to the endian-generic implementation.
        self.write_to::<wire::BE>(dst)
    }
728
729    /// Serialize this DFA as raw bytes to the given slice, in native endian
730    /// format. Upon success, the total number of bytes written to `dst` is
731    /// returned.
732    ///
733    /// The written bytes are guaranteed to be deserialized correctly and
734    /// without errors in a semver compatible release of this crate by a
735    /// `DFA`'s deserialization APIs (assuming all other criteria for the
736    /// deserialization APIs has been satisfied):
737    ///
738    /// * [`DFA::from_bytes`]
739    /// * [`DFA::from_bytes_unchecked`]
740    ///
741    /// Generally speaking, native endian format should only be used when
742    /// you know that the target you're compiling the DFA for matches the
743    /// endianness of the target on which you're compiling DFA. For example,
744    /// if serialization and deserialization happen in the same process or on
745    /// the same machine. Otherwise, when serializing a DFA for use in a
746    /// portable environment, you'll almost certainly want to serialize _both_
747    /// a little endian and a big endian version and then load the correct one
748    /// based on the target's configuration.
749    ///
750    /// # Errors
751    ///
752    /// This returns an error if the given destination slice is not big enough
753    /// to contain the full serialized DFA. If an error occurs, then nothing
754    /// is written to `dst`.
755    ///
756    /// # Example
757    ///
758    /// This example shows how to serialize and deserialize a DFA without
759    /// dynamic memory allocation.
760    ///
761    /// ```
762    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
763    ///
764    /// // Compile our original DFA.
765    /// let original_dfa = DFA::new("foo[0-9]+")?;
766    ///
767    /// // Create a 4KB buffer on the stack to store our serialized DFA.
768    /// let mut buf = [0u8; 4 * (1<<10)];
769    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
770    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
771    ///
772    /// let expected = Some(HalfMatch::must(0, 8));
773    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
774    /// # Ok::<(), Box<dyn std::error::Error>>(())
775    /// ```
    pub fn write_to_native_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        // All of the real work happens in the endian-generic `write_to`; this
        // entry point only fixes the byte order to `wire::NE`.
        self.write_to::<wire::NE>(dst)
    }
782
783    /// The implementation of the public `write_to` serialization methods,
784    /// which is generic over endianness.
785    fn write_to<E: Endian>(
786        &self,
787        dst: &mut [u8],
788    ) -> Result<usize, SerializeError> {
789        let mut nw = 0;
790        nw += wire::write_label(LABEL, &mut dst[nw..])?;
791        nw += wire::write_endianness_check::<E>(&mut dst[nw..])?;
792        nw += wire::write_version::<E>(VERSION, &mut dst[nw..])?;
793        nw += {
794            // Currently unused, intended for future flexibility
795            E::write_u32(0, &mut dst[nw..]);
796            size_of::<u32>()
797        };
798        nw += self.flags.write_to::<E>(&mut dst[nw..])?;
799        nw += self.tt.write_to::<E>(&mut dst[nw..])?;
800        nw += self.st.write_to::<E>(&mut dst[nw..])?;
801        nw += self.special.write_to::<E>(&mut dst[nw..])?;
802        nw += self.quitset.write_to::<E>(&mut dst[nw..])?;
803        Ok(nw)
804    }
805
806    /// Return the total number of bytes required to serialize this DFA.
807    ///
808    /// This is useful for determining the size of the buffer required to pass
809    /// to one of the serialization routines:
810    ///
811    /// * [`DFA::write_to_little_endian`]
812    /// * [`DFA::write_to_big_endian`]
813    /// * [`DFA::write_to_native_endian`]
814    ///
815    /// Passing a buffer smaller than the size returned by this method will
816    /// result in a serialization error.
817    ///
818    /// # Example
819    ///
820    /// This example shows how to dynamically allocate enough room to serialize
821    /// a sparse DFA.
822    ///
823    /// ```
824    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
825    ///
826    /// // Compile our original DFA.
827    /// let original_dfa = DFA::new("foo[0-9]+")?;
828    ///
829    /// let mut buf = vec![0; original_dfa.write_to_len()];
830    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
831    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
832    ///
833    /// let expected = Some(HalfMatch::must(0, 8));
834    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
835    /// # Ok::<(), Box<dyn std::error::Error>>(())
836    /// ```
837    pub fn write_to_len(&self) -> usize {
838        wire::write_label_len(LABEL)
839        + wire::write_endianness_check_len()
840        + wire::write_version_len()
841        + size_of::<u32>() // unused, intended for future flexibility
842        + self.flags.write_to_len()
843        + self.tt.write_to_len()
844        + self.st.write_to_len()
845        + self.special.write_to_len()
846        + self.quitset.write_to_len()
847    }
848}
849
850impl<'a> DFA<&'a [u8]> {
851    /// Safely deserialize a sparse DFA with a specific state identifier
852    /// representation. Upon success, this returns both the deserialized DFA
853    /// and the number of bytes read from the given slice. Namely, the contents
854    /// of the slice beyond the DFA are not read.
855    ///
856    /// Deserializing a DFA using this routine will never allocate heap memory.
857    /// For safety purposes, the DFA's transitions will be verified such that
858    /// every transition points to a valid state. If this verification is too
859    /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
860    /// will always execute in constant time.
861    ///
862    /// The bytes given must be generated by one of the serialization APIs
863    /// of a `DFA` using a semver compatible release of this crate. Those
864    /// include:
865    ///
866    /// * [`DFA::to_bytes_little_endian`]
867    /// * [`DFA::to_bytes_big_endian`]
868    /// * [`DFA::to_bytes_native_endian`]
869    /// * [`DFA::write_to_little_endian`]
870    /// * [`DFA::write_to_big_endian`]
871    /// * [`DFA::write_to_native_endian`]
872    ///
873    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
874    /// `write_to` methods do not allocate and write to an existing slice
875    /// (which may be on the stack). Since deserialization always uses the
876    /// native endianness of the target platform, the serialization API you use
877    /// should match the endianness of the target platform. (It's often a good
878    /// idea to generate serialized DFAs for both forms of endianness and then
879    /// load the correct one based on endianness.)
880    ///
881    /// # Errors
882    ///
883    /// Generally speaking, it's easier to state the conditions in which an
884    /// error is _not_ returned. All of the following must be true:
885    ///
886    /// * The bytes given must be produced by one of the serialization APIs
887    ///   on this DFA, as mentioned above.
    /// * The endianness of the target platform matches the endianness used to
    ///   serialize the provided DFA.
890    ///
891    /// If any of the above are not true, then an error will be returned.
892    ///
893    /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
894    /// DFA has no alignment requirements. That is, an alignment of `1` is
895    /// valid.
896    ///
897    /// # Panics
898    ///
899    /// This routine will never panic for any input.
900    ///
901    /// # Example
902    ///
903    /// This example shows how to serialize a DFA to raw bytes, deserialize it
904    /// and then use it for searching.
905    ///
906    /// ```
907    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
908    ///
909    /// let initial = DFA::new("foo[0-9]+")?;
910    /// let bytes = initial.to_bytes_native_endian();
911    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
912    ///
913    /// let expected = Some(HalfMatch::must(0, 8));
914    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
915    /// # Ok::<(), Box<dyn std::error::Error>>(())
916    /// ```
917    ///
918    /// # Example: loading a DFA from static memory
919    ///
920    /// One use case this library supports is the ability to serialize a
921    /// DFA to disk and then use `include_bytes!` to store it in a compiled
922    /// Rust program. Those bytes can then be cheaply deserialized into a
923    /// `DFA` structure at runtime and used for searching without having to
924    /// re-compile the DFA (which can be quite costly).
925    ///
926    /// We can show this in two parts. The first part is serializing the DFA to
927    /// a file:
928    ///
929    /// ```no_run
930    /// use regex_automata::dfa::sparse::DFA;
931    ///
932    /// let dfa = DFA::new("foo[0-9]+")?;
933    ///
934    /// // Write a big endian serialized version of this DFA to a file.
935    /// let bytes = dfa.to_bytes_big_endian();
936    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
937    ///
938    /// // Do it again, but this time for little endian.
939    /// let bytes = dfa.to_bytes_little_endian();
940    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
941    /// # Ok::<(), Box<dyn std::error::Error>>(())
942    /// ```
943    ///
944    /// And now the second part is embedding the DFA into the compiled program
945    /// and deserializing it at runtime on first use. We use conditional
946    /// compilation to choose the correct endianness. We do not need to employ
947    /// any special tricks to ensure a proper alignment, since a sparse DFA has
948    /// no alignment requirements.
949    ///
950    /// ```no_run
951    /// use regex_automata::{
952    ///     dfa::{Automaton, sparse::DFA},
953    ///     util::lazy::Lazy,
954    ///     HalfMatch, Input,
955    /// };
956    ///
957    /// // This crate provides its own "lazy" type, kind of like
958    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
    /// // no-std environments and lets us write this using completely
960    /// // safe code.
961    /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
962    ///     # const _: &str = stringify! {
963    ///     #[cfg(target_endian = "big")]
964    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
965    ///     #[cfg(target_endian = "little")]
966    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
967    ///     # };
968    ///     # static BYTES: &[u8] = b"";
969    ///
970    ///     let (dfa, _) = DFA::from_bytes(BYTES)
971    ///         .expect("serialized DFA should be valid");
972    ///     dfa
973    /// });
974    ///
975    /// let expected = Ok(Some(HalfMatch::must(0, 8)));
976    /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
977    /// ```
978    ///
979    /// Alternatively, consider using
980    /// [`lazy_static`](https://crates.io/crates/lazy_static)
981    /// or
982    /// [`once_cell`](https://crates.io/crates/once_cell),
983    /// which will guarantee safety for you.
984    pub fn from_bytes(
985        slice: &'a [u8],
986    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
987        // SAFETY: This is safe because we validate both the sparse transitions
988        // (by trying to decode every state) and start state ID list below. If
989        // either validation fails, then we return an error.
990        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
991        let seen = dfa.tt.validate(&dfa.special)?;
992        dfa.st.validate(&dfa.special, &seen)?;
993        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
994        // so it has already been validated.
995        Ok((dfa, nread))
996    }
997
998    /// Deserialize a DFA with a specific state identifier representation in
999    /// constant time by omitting the verification of the validity of the
1000    /// sparse transitions.
1001    ///
1002    /// This is just like [`DFA::from_bytes`], except it can potentially return
1003    /// a DFA that exhibits undefined behavior if its transitions contains
1004    /// invalid state identifiers.
1005    ///
1006    /// This routine is useful if you need to deserialize a DFA cheaply and
1007    /// cannot afford the transition validation performed by `from_bytes`.
1008    ///
1009    /// # Safety
1010    ///
1011    /// This routine is not safe because it permits callers to provide
1012    /// arbitrary transitions with possibly incorrect state identifiers. While
1013    /// the various serialization routines will never return an incorrect
1014    /// DFA, there is no guarantee that the bytes provided here are correct.
1015    /// While `from_bytes_unchecked` will still do several forms of basic
1016    /// validation, this routine does not check that the transitions themselves
1017    /// are correct. Given an incorrect transition table, it is possible for
1018    /// the search routines to access out-of-bounds memory because of explicit
1019    /// bounds check elision.
1020    ///
1021    /// # Example
1022    ///
1023    /// ```
1024    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
1025    ///
1026    /// let initial = DFA::new("foo[0-9]+")?;
1027    /// let bytes = initial.to_bytes_native_endian();
1028    /// // SAFETY: This is guaranteed to be safe since the bytes given come
1029    /// // directly from a compatible serialization routine.
1030    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
1031    ///
1032    /// let expected = Some(HalfMatch::must(0, 8));
1033    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
1034    /// # Ok::<(), Box<dyn std::error::Error>>(())
1035    /// ```
1036    pub unsafe fn from_bytes_unchecked(
1037        slice: &'a [u8],
1038    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1039        let mut nr = 0;
1040
1041        nr += wire::read_label(&slice[nr..], LABEL)?;
1042        nr += wire::read_endianness_check(&slice[nr..])?;
1043        nr += wire::read_version(&slice[nr..], VERSION)?;
1044
1045        let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
1046        nr += size_of::<u32>();
1047
1048        let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
1049        nr += nread;
1050
1051        let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
1052        nr += nread;
1053
1054        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
1055        nr += nread;
1056
1057        let (special, nread) = Special::from_bytes(&slice[nr..])?;
1058        nr += nread;
1059        if special.max.as_usize() >= tt.sparse().len() {
1060            return Err(DeserializeError::generic(
1061                "max should not be greater than or equal to sparse bytes",
1062            ));
1063        }
1064
1065        let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
1066        nr += nread;
1067
1068        // Prefilters don't support serialization, so they're always absent.
1069        let pre = None;
1070        Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
1071    }
1072}
1073
1074impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
1075    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1076        writeln!(f, "sparse::DFA(")?;
1077        for state in self.tt.states() {
1078            fmt_state_indicator(f, self, state.id())?;
1079            writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
1080        }
1081        writeln!(f, "")?;
1082        for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
1083            if i % self.st.stride == 0 {
1084                match anchored {
1085                    Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
1086                    Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
1087                    Anchored::Pattern(pid) => writeln!(
1088                        f,
1089                        "START_GROUP(pattern: {:?})",
1090                        pid.as_usize()
1091                    )?,
1092                }
1093            }
1094            writeln!(f, "  {:?} => {:06?}", sty, start_id.as_usize())?;
1095        }
1096        writeln!(f, "state length: {:?}", self.tt.state_len)?;
1097        writeln!(f, "pattern length: {:?}", self.pattern_len())?;
1098        writeln!(f, "flags: {:?}", self.flags)?;
1099        writeln!(f, ")")?;
1100        Ok(())
1101    }
1102}
1103
1104// SAFETY: We assert that our implementation of each method is correct.
1105unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
1106    #[inline]
1107    fn is_special_state(&self, id: StateID) -> bool {
1108        self.special.is_special_state(id)
1109    }
1110
1111    #[inline]
1112    fn is_dead_state(&self, id: StateID) -> bool {
1113        self.special.is_dead_state(id)
1114    }
1115
1116    #[inline]
1117    fn is_quit_state(&self, id: StateID) -> bool {
1118        self.special.is_quit_state(id)
1119    }
1120
1121    #[inline]
1122    fn is_match_state(&self, id: StateID) -> bool {
1123        self.special.is_match_state(id)
1124    }
1125
1126    #[inline]
1127    fn is_start_state(&self, id: StateID) -> bool {
1128        self.special.is_start_state(id)
1129    }
1130
1131    #[inline]
1132    fn is_accel_state(&self, id: StateID) -> bool {
1133        self.special.is_accel_state(id)
1134    }
1135
1136    // This is marked as inline to help dramatically boost sparse searching,
1137    // which decodes each state it enters to follow the next transition.
1138    #[cfg_attr(feature = "perf-inline", inline(always))]
1139    fn next_state(&self, current: StateID, input: u8) -> StateID {
1140        let input = self.tt.classes.get(input);
1141        self.tt.state(current).next(input)
1142    }
1143
1144    #[inline]
1145    unsafe fn next_state_unchecked(
1146        &self,
1147        current: StateID,
1148        input: u8,
1149    ) -> StateID {
1150        self.next_state(current, input)
1151    }
1152
1153    #[inline]
1154    fn next_eoi_state(&self, current: StateID) -> StateID {
1155        self.tt.state(current).next_eoi()
1156    }
1157
1158    #[inline]
1159    fn pattern_len(&self) -> usize {
1160        self.tt.pattern_len
1161    }
1162
1163    #[inline]
1164    fn match_len(&self, id: StateID) -> usize {
1165        self.tt.state(id).pattern_len()
1166    }
1167
1168    #[inline]
1169    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
1170        // This is an optimization for the very common case of a DFA with a
1171        // single pattern. This conditional avoids a somewhat more costly path
1172        // that finds the pattern ID from the state machine, which requires
1173        // a bit of slicing/pointer-chasing. This optimization tends to only
1174        // matter when matches are frequent.
1175        if self.tt.pattern_len == 1 {
1176            return PatternID::ZERO;
1177        }
1178        self.tt.state(id).pattern_id(match_index)
1179    }
1180
1181    #[inline]
1182    fn has_empty(&self) -> bool {
1183        self.flags.has_empty
1184    }
1185
1186    #[inline]
1187    fn is_utf8(&self) -> bool {
1188        self.flags.is_utf8
1189    }
1190
1191    #[inline]
1192    fn is_always_start_anchored(&self) -> bool {
1193        self.flags.is_always_start_anchored
1194    }
1195
1196    #[inline]
1197    fn start_state(
1198        &self,
1199        config: &start::Config,
1200    ) -> Result<StateID, StartError> {
1201        let anchored = config.get_anchored();
1202        let start = match config.get_look_behind() {
1203            None => Start::Text,
1204            Some(byte) => {
1205                if !self.quitset.is_empty() && self.quitset.contains(byte) {
1206                    return Err(StartError::quit(byte));
1207                }
1208                self.st.start_map.get(byte)
1209            }
1210        };
1211        self.st.start(anchored, start)
1212    }
1213
1214    #[inline]
1215    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
1216        match mode {
1217            Anchored::No => self.st.universal_start_unanchored,
1218            Anchored::Yes => self.st.universal_start_anchored,
1219            Anchored::Pattern(_) => None,
1220        }
1221    }
1222
1223    #[inline]
1224    fn accelerator(&self, id: StateID) -> &[u8] {
1225        self.tt.state(id).accelerator()
1226    }
1227
1228    #[inline]
1229    fn get_prefilter(&self) -> Option<&Prefilter> {
1230        self.pre.as_ref()
1231    }
1232}
1233
/// The transition table portion of a sparse DFA.
///
/// The transition table is the core part of the DFA in that it describes how
/// to move from one state to another based on the input sequence observed.
///
/// Unlike a typical dense table based DFA, states in a sparse transition
/// table have variable size. That is, states with more transitions use more
/// space than states with fewer transitions. This means that finding the next
/// transition takes more work than with a dense DFA, but also typically uses
/// much less space.
#[derive(Clone)]
struct Transitions<T> {
    /// The raw encoding of each state in this DFA.
    ///
    /// Each state has the following information:
    ///
    /// * A set of transitions to subsequent states. Transitions to the dead
    ///   state are omitted.
    /// * If the state can be accelerated, then any additional accelerator
    ///   information.
    /// * If the state is a match state, then the state contains all pattern
    ///   IDs that match when in that state.
    ///
    /// To decode a state, use `Transitions::state`. A state's ID is its byte
    /// offset into this encoding.
    ///
    /// In practice, `T` is either `Vec<u8>` or `&[u8]`.
    sparse: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition).
    ///
    /// Note that, unlike in a dense DFA, the number of equivalence classes
    /// does not determine how much space a state uses: sparse states have
    /// variable size (see the type-level documentation above).
    ///
    /// The only time the number of equivalence classes is fewer than 257 is
    /// if the DFA's kind uses byte classes which is the default. Equivalence
    /// classes should generally only be disabled when debugging, so that
    /// the transitions themselves aren't obscured. Disabling them has no
    /// other benefit, since the equivalence class map is always used while
    /// searching. In the vast majority of cases, the number of equivalence
    /// classes is substantially smaller than 257, particularly when large
    /// Unicode classes aren't used.
    ///
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
    /// in the current implementation, since equivalence classes generally tend
    /// to correspond to continuous ranges of bytes that map to the same
    /// transition. So in a sparse DFA, equivalence classes don't really lead
    /// to a space savings. In the future, it would be good to try and remove
    /// them from sparse DFAs entirely, but requires a bit of work since sparse
    /// DFAs are built from dense DFAs, which are in turn built on top of
    /// equivalence classes.
    classes: ByteClasses,
    /// The total number of states in this DFA. Note that a DFA always has at
    /// least one state---the dead state---even the empty DFA. In particular,
    /// the dead state always has ID 0 and is correspondingly always the first
    /// state. The dead state is never a match state.
    state_len: usize,
    /// The total number of unique patterns represented by these match states.
    pattern_len: usize,
}
1298
1299impl<'a> Transitions<&'a [u8]> {
1300    unsafe fn from_bytes_unchecked(
1301        mut slice: &'a [u8],
1302    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
1303        let slice_start = slice.as_ptr().as_usize();
1304
1305        let (state_len, nr) =
1306            wire::try_read_u32_as_usize(&slice, "state length")?;
1307        slice = &slice[nr..];
1308
1309        let (pattern_len, nr) =
1310            wire::try_read_u32_as_usize(&slice, "pattern length")?;
1311        slice = &slice[nr..];
1312
1313        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
1314        slice = &slice[nr..];
1315
1316        let (len, nr) =
1317            wire::try_read_u32_as_usize(&slice, "sparse transitions length")?;
1318        slice = &slice[nr..];
1319
1320        wire::check_slice_len(slice, len, "sparse states byte length")?;
1321        let sparse = &slice[..len];
1322        slice = &slice[len..];
1323
1324        let trans = Transitions { sparse, classes, state_len, pattern_len };
1325        Ok((trans, slice.as_ptr().as_usize() - slice_start))
1326    }
1327}
1328
1329impl<T: AsRef<[u8]>> Transitions<T> {
1330    /// Writes a serialized form of this transition table to the buffer given.
1331    /// If the buffer is too small, then an error is returned. To determine
1332    /// how big the buffer must be, use `write_to_len`.
1333    fn write_to<E: Endian>(
1334        &self,
1335        mut dst: &mut [u8],
1336    ) -> Result<usize, SerializeError> {
1337        let nwrite = self.write_to_len();
1338        if dst.len() < nwrite {
1339            return Err(SerializeError::buffer_too_small(
1340                "sparse transition table",
1341            ));
1342        }
1343        dst = &mut dst[..nwrite];
1344
1345        // write state length
1346        E::write_u32(u32::try_from(self.state_len).unwrap(), dst);
1347        dst = &mut dst[size_of::<u32>()..];
1348
1349        // write pattern length
1350        E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst);
1351        dst = &mut dst[size_of::<u32>()..];
1352
1353        // write byte class map
1354        let n = self.classes.write_to(dst)?;
1355        dst = &mut dst[n..];
1356
1357        // write number of bytes in sparse transitions
1358        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
1359        dst = &mut dst[size_of::<u32>()..];
1360
1361        // write actual transitions
1362        let mut id = DEAD;
1363        while id.as_usize() < self.sparse().len() {
1364            let state = self.state(id);
1365            let n = state.write_to::<E>(&mut dst)?;
1366            dst = &mut dst[n..];
1367            // The next ID is the offset immediately following `state`.
1368            id = StateID::new(id.as_usize() + state.write_to_len()).unwrap();
1369        }
1370        Ok(nwrite)
1371    }
1372
1373    /// Returns the number of bytes the serialized form of this transition
1374    /// table will use.
1375    fn write_to_len(&self) -> usize {
1376        size_of::<u32>()   // state length
1377        + size_of::<u32>() // pattern length
1378        + self.classes.write_to_len()
1379        + size_of::<u32>() // sparse transitions length
1380        + self.sparse().len()
1381    }
1382
    /// Validates that every state ID in this transition table is valid.
    ///
    /// That is, every state ID can be used to correctly index a state in this
    /// table.
    ///
    /// Validation happens in two passes. The first pass walks the sparse
    /// bytes from the dead state onward, checking that each state decodes
    /// and that any ID tagged as special really is one of the special kinds.
    /// The second pass checks the target of every transition. Finally, the
    /// number of states decoded must equal the recorded `state_len`.
    ///
    /// On success, the set of state IDs seen during the first pass is
    /// returned.
    fn validate(&self, sp: &Special) -> Result<Seen, DeserializeError> {
        let mut verified = Seen::new();
        // We need to make sure that we decode the correct number of states.
        // Otherwise, an empty set of transitions would validate even if the
        // recorded state length is non-empty.
        let mut len = 0;
        // We can't use the self.states() iterator because it assumes the state
        // encodings are valid. It could panic if they aren't.
        let mut id = DEAD;
        while id.as_usize() < self.sparse().len() {
            // Before we even decode the state, we check that the ID itself
            // is well formed. That is, if it's a special state then it must
            // actually be a quit, dead, accel, match or start state.
            if sp.is_special_state(id) {
                let is_actually_special = sp.is_dead_state(id)
                    || sp.is_quit_state(id)
                    || sp.is_match_state(id)
                    || sp.is_start_state(id)
                    || sp.is_accel_state(id);
                if !is_actually_special {
                    // This is kind of a cryptic error message...
                    return Err(DeserializeError::generic(
                        "found sparse state tagged as special but \
                         wasn't actually special",
                    ));
                }
            }
            let state = self.try_state(sp, id)?;
            verified.insert(id);
            // The next ID should be the offset immediately following `state`.
            // Both the addition and the conversion to a state ID are checked,
            // so a bogus encoding yields an error here.
            id = StateID::new(wire::add(
                id.as_usize(),
                state.write_to_len(),
                "next state ID offset",
            )?)
            .map_err(|err| {
                DeserializeError::state_id_error(err, "next state ID offset")
            })?;
            len += 1;
        }
        // Now that we've checked that all top-level states are correct and
        // importantly, collected a set of valid state IDs, we have all the
        // information we need to check that all transitions are correct too.
        //
        // Note that we can't use `verified` to iterate because it will
        // be empty in no-std no-alloc contexts. (And yes, that means our
        // verification isn't quite as good.) We can use `self.states()`
        // though at least, since we know that all states can at least be
        // decoded and traversed correctly.
        for state in self.states() {
            // Check that all transitions in this state are correct.
            for i in 0..state.ntrans {
                let to = state.next_at(i);
                // For no-alloc, we just check that the state can decode. It is
                // technically possible that the state ID could still point to
                // a non-existent state even if it decodes (fuzzing proved this
                // to be true), but it shouldn't result in any memory unsafety
                // or panics in non-debug mode.
                #[cfg(not(feature = "alloc"))]
                {
                    let _ = self.try_state(sp, to)?;
                }
                // With alloc, the target must be one of the state IDs that
                // was recorded by the first pass above.
                #[cfg(feature = "alloc")]
                {
                    if !verified.contains(&to) {
                        return Err(DeserializeError::generic(
                            "found transition that points to a \
                             non-existent state",
                        ));
                    }
                }
            }
        }
        // The number of states we decoded must agree with the state length
        // recorded in the serialized table.
        if len != self.state_len {
            return Err(DeserializeError::generic(
                "mismatching sparse state length",
            ));
        }
        Ok(verified)
    }
1467
1468    /// Converts these transitions to a borrowed value.
1469    fn as_ref(&self) -> Transitions<&'_ [u8]> {
1470        Transitions {
1471            sparse: self.sparse(),
1472            classes: self.classes.clone(),
1473            state_len: self.state_len,
1474            pattern_len: self.pattern_len,
1475        }
1476    }
1477
1478    /// Converts these transitions to an owned value.
1479    #[cfg(feature = "alloc")]
1480    fn to_owned(&self) -> Transitions<alloc::vec::Vec<u8>> {
1481        Transitions {
1482            sparse: self.sparse().to_vec(),
1483            classes: self.classes.clone(),
1484            state_len: self.state_len,
1485            pattern_len: self.pattern_len,
1486        }
1487    }
1488
1489    /// Return a convenient representation of the given state.
1490    ///
1491    /// This panics if the state is invalid.
1492    ///
1493    /// This is marked as inline to help dramatically boost sparse searching,
1494    /// which decodes each state it enters to follow the next transition. Other
1495    /// functions involved are also inlined, which should hopefully eliminate
1496    /// a lot of the extraneous decoding that is never needed just to follow
1497    /// the next transition.
1498    #[cfg_attr(feature = "perf-inline", inline(always))]
1499    fn state(&self, id: StateID) -> State<'_> {
1500        let mut state = &self.sparse()[id.as_usize()..];
1501        let mut ntrans = wire::read_u16(&state).as_usize();
1502        let is_match = (1 << 15) & ntrans != 0;
1503        ntrans &= !(1 << 15);
1504        state = &state[2..];
1505
1506        let (input_ranges, state) = state.split_at(ntrans * 2);
1507        let (next, state) = state.split_at(ntrans * StateID::SIZE);
1508        let (pattern_ids, state) = if is_match {
1509            let npats = wire::read_u32(&state).as_usize();
1510            state[4..].split_at(npats * 4)
1511        } else {
1512            (&[][..], state)
1513        };
1514
1515        let accel_len = usize::from(state[0]);
1516        let accel = &state[1..accel_len + 1];
1517        State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
1518    }
1519
1520    /// Like `state`, but will return an error if the state encoding is
1521    /// invalid. This is useful for verifying states after deserialization,
1522    /// which is required for a safe deserialization API.
1523    ///
1524    /// Note that this only verifies that this state is decodable and that
1525    /// all of its data is consistent. It does not verify that its state ID
1526    /// transitions point to valid states themselves, nor does it verify that
1527    /// every pattern ID is valid.
1528    fn try_state(
1529        &self,
1530        sp: &Special,
1531        id: StateID,
1532    ) -> Result<State<'_>, DeserializeError> {
1533        if id.as_usize() > self.sparse().len() {
1534            return Err(DeserializeError::generic(
1535                "invalid caller provided sparse state ID",
1536            ));
1537        }
1538        let mut state = &self.sparse()[id.as_usize()..];
1539        // Encoding format starts with a u16 that stores the total number of
1540        // transitions in this state.
1541        let (mut ntrans, _) =
1542            wire::try_read_u16_as_usize(state, "state transition length")?;
1543        let is_match = ((1 << 15) & ntrans) != 0;
1544        ntrans &= !(1 << 15);
1545        state = &state[2..];
1546        if ntrans > 257 || ntrans == 0 {
1547            return Err(DeserializeError::generic(
1548                "invalid transition length",
1549            ));
1550        }
1551        if is_match && !sp.is_match_state(id) {
1552            return Err(DeserializeError::generic(
1553                "state marked as match but not in match ID range",
1554            ));
1555        } else if !is_match && sp.is_match_state(id) {
1556            return Err(DeserializeError::generic(
1557                "state in match ID range but not marked as match state",
1558            ));
1559        }
1560
1561        // Each transition has two pieces: an inclusive range of bytes on which
1562        // it is defined, and the state ID that those bytes transition to. The
1563        // pairs come first, followed by a corresponding sequence of state IDs.
1564        let input_ranges_len = ntrans.checked_mul(2).unwrap();
1565        wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
1566        let (input_ranges, state) = state.split_at(input_ranges_len);
1567        // Every range should be of the form A-B, where A<=B.
1568        for pair in input_ranges.chunks(2) {
1569            let (start, end) = (pair[0], pair[1]);
1570            if start > end {
1571                return Err(DeserializeError::generic("invalid input range"));
1572            }
1573        }
1574
1575        // And now extract the corresponding sequence of state IDs. We leave
1576        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
1577        // not have any alignment requirements.
1578        let next_len = ntrans
1579            .checked_mul(self.id_len())
1580            .expect("state size * #trans should always fit in a usize");
1581        wire::check_slice_len(state, next_len, "sparse trans state IDs")?;
1582        let (next, state) = state.split_at(next_len);
1583        // We can at least verify that every state ID is in bounds.
1584        for idbytes in next.chunks(self.id_len()) {
1585            let (id, _) =
1586                wire::read_state_id(idbytes, "sparse state ID in try_state")?;
1587            wire::check_slice_len(
1588                self.sparse(),
1589                id.as_usize(),
1590                "invalid sparse state ID",
1591            )?;
1592        }
1593
1594        // If this is a match state, then read the pattern IDs for this state.
1595        // Pattern IDs is a u32-length prefixed sequence of native endian
1596        // encoded 32-bit integers.
1597        let (pattern_ids, state) = if is_match {
1598            let (npats, nr) =
1599                wire::try_read_u32_as_usize(state, "pattern ID length")?;
1600            let state = &state[nr..];
1601            if npats == 0 {
1602                return Err(DeserializeError::generic(
1603                    "state marked as a match, but pattern length is zero",
1604                ));
1605            }
1606
1607            let pattern_ids_len =
1608                wire::mul(npats, 4, "sparse pattern ID byte length")?;
1609            wire::check_slice_len(
1610                state,
1611                pattern_ids_len,
1612                "sparse pattern IDs",
1613            )?;
1614            let (pattern_ids, state) = state.split_at(pattern_ids_len);
1615            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
1616                wire::read_pattern_id(
1617                    patbytes,
1618                    "sparse pattern ID in try_state",
1619                )?;
1620            }
1621            (pattern_ids, state)
1622        } else {
1623            (&[][..], state)
1624        };
1625        if is_match && pattern_ids.is_empty() {
1626            return Err(DeserializeError::generic(
1627                "state marked as a match, but has no pattern IDs",
1628            ));
1629        }
1630        if sp.is_match_state(id) && pattern_ids.is_empty() {
1631            return Err(DeserializeError::generic(
1632                "state marked special as a match, but has no pattern IDs",
1633            ));
1634        }
1635        if sp.is_match_state(id) != is_match {
1636            return Err(DeserializeError::generic(
1637                "whether state is a match or not is inconsistent",
1638            ));
1639        }
1640
1641        // Now read this state's accelerator info. The first byte is the length
1642        // of the accelerator, which is typically 0 (for no acceleration) but
1643        // is no bigger than 3. The length indicates the number of bytes that
1644        // follow, where each byte corresponds to a transition out of this
1645        // state.
1646        if state.is_empty() {
1647            return Err(DeserializeError::generic("no accelerator length"));
1648        }
1649        let (accel_len, state) = (usize::from(state[0]), &state[1..]);
1650
1651        if accel_len > 3 {
1652            return Err(DeserializeError::generic(
1653                "sparse invalid accelerator length",
1654            ));
1655        } else if accel_len == 0 && sp.is_accel_state(id) {
1656            return Err(DeserializeError::generic(
1657                "got no accelerators in state, but in accelerator ID range",
1658            ));
1659        } else if accel_len > 0 && !sp.is_accel_state(id) {
1660            return Err(DeserializeError::generic(
1661                "state in accelerator ID range, but has no accelerators",
1662            ));
1663        }
1664
1665        wire::check_slice_len(
1666            state,
1667            accel_len,
1668            "sparse corrupt accelerator length",
1669        )?;
1670        let (accel, _) = (&state[..accel_len], &state[accel_len..]);
1671
1672        let state = State {
1673            id,
1674            is_match,
1675            ntrans,
1676            input_ranges,
1677            next,
1678            pattern_ids,
1679            accel,
1680        };
1681        if sp.is_quit_state(state.next_at(state.ntrans - 1)) {
1682            return Err(DeserializeError::generic(
1683                "state with EOI transition to quit state is illegal",
1684            ));
1685        }
1686        Ok(state)
1687    }
1688
1689    /// Return an iterator over all of the states in this DFA.
1690    ///
1691    /// The iterator returned yields tuples, where the first element is the
1692    /// state ID and the second element is the state itself.
1693    fn states(&self) -> StateIter<'_, T> {
1694        StateIter { trans: self, id: DEAD.as_usize() }
1695    }
1696
1697    /// Returns the sparse transitions as raw bytes.
1698    fn sparse(&self) -> &[u8] {
1699        self.sparse.as_ref()
1700    }
1701
1702    /// Returns the number of bytes represented by a single state ID.
1703    fn id_len(&self) -> usize {
1704        StateID::SIZE
1705    }
1706
1707    /// Return the memory usage, in bytes, of these transitions.
1708    ///
1709    /// This does not include the size of a `Transitions` value itself.
1710    fn memory_usage(&self) -> usize {
1711        self.sparse().len()
1712    }
1713}
1714
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Decode the state stored at `id` into a representation whose pieces
    /// can be mutated in place.
    ///
    /// No validation is performed: this panics if the bytes at `id` do not
    /// form a well-formed state.
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
        // The top bit of the leading u16 flags a match state. The remaining
        // bits hold the number of transitions.
        const MATCH_FLAG: usize = 1 << 15;

        let encoded = &mut self.sparse_mut()[id.as_usize()..];
        let header = wire::read_u16(&*encoded).as_usize();
        let is_match = (header & MATCH_FLAG) != 0;
        let ntrans = header & !MATCH_FLAG;
        let rest = &mut encoded[2..];

        // Two bytes (an inclusive range) per transition, followed by one
        // state ID per transition.
        let (input_ranges, rest) = rest.split_at_mut(ntrans * 2);
        let (next, rest) = rest.split_at_mut(ntrans * StateID::SIZE);
        // Match states additionally carry a u32 count followed by that many
        // 4-byte pattern IDs.
        let (pattern_ids, rest) = if is_match {
            let npats = wire::read_u32(&*rest).as_usize();
            rest[4..].split_at_mut(npats * 4)
        } else {
            (&mut [][..], rest)
        };

        // Finally, a one byte accelerator length followed by that many bytes.
        let accel_len = usize::from(rest[0]);
        let accel = &mut rest[1..accel_len + 1];
        StateMut {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        }
    }

    /// Returns the raw bytes that encode the sparse transitions, mutably.
    fn sparse_mut(&mut self) -> &mut [u8] {
        let bytes: &mut [u8] = self.sparse.as_mut();
        bytes
    }
}
1753
1754/// The set of all possible starting states in a DFA.
1755///
1756/// See the eponymous type in the `dense` module for more details. This type
1757/// is very similar to `dense::StartTable`, except that its underlying
1758/// representation is `&[u8]` instead of `&[S]`. (The latter would require
1759/// sparse DFAs to be aligned, which is explicitly something we do not require
1760/// because we don't really need it.)
1761#[derive(Clone)]
1762struct StartTable<T> {
1763    /// The initial start state IDs as a contiguous table of native endian
1764    /// encoded integers, represented by `S`.
1765    ///
1766    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
1767    /// requirements.
1768    ///
1769    /// The first `2 * stride` (currently always 8) entries always correspond
1770    /// to the starts states for the entire DFA, with the first 4 entries being
1771    /// for unanchored searches and the second 4 entries being for anchored
1772    /// searches. To keep things simple, we always use 8 entries even if the
1773    /// `StartKind` is not both.
1774    ///
1775    /// After that, there are `stride * patterns` state IDs, where `patterns`
1776    /// may be zero in the case of a DFA with no patterns or in the case where
1777    /// the DFA was built without enabling starting states for each pattern.
1778    table: T,
1779    /// The starting state configuration supported. When 'both', both
1780    /// unanchored and anchored searches work. When 'unanchored', anchored
1781    /// searches panic. When 'anchored', unanchored searches panic.
1782    kind: StartKind,
1783    /// The start state configuration for every possible byte.
1784    start_map: StartByteMap,
1785    /// The number of starting state IDs per pattern.
1786    stride: usize,
1787    /// The total number of patterns for which starting states are encoded.
1788    /// This is `None` for DFAs that were built without start states for each
1789    /// pattern. Thus, one cannot use this field to say how many patterns
1790    /// are in the DFA in all cases. It is specific to how many patterns are
1791    /// represented in this start table.
1792    pattern_len: Option<usize>,
1793    /// The universal starting state for unanchored searches. This is only
1794    /// present when the DFA supports unanchored searches and when all starting
1795    /// state IDs for an unanchored search are equivalent.
1796    universal_start_unanchored: Option<StateID>,
1797    /// The universal starting state for anchored searches. This is only
1798    /// present when the DFA supports anchored searches and when all starting
1799    /// state IDs for an anchored search are equivalent.
1800    universal_start_anchored: Option<StateID>,
1801}
1802
#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u8>> {
    /// Creates a zero-filled start table sized for the given dense DFA.
    ///
    /// The table has room for two strides of whole-DFA start states plus one
    /// stride for each of `pattern_len` patterns (none when `pattern_len` is
    /// `None`). The start kind, start byte map and universal start states are
    /// copied from `dfa`.
    fn new<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        pattern_len: Option<usize>,
    ) -> StartTable<Vec<u8>> {
        let stride = Start::len();
        // The unwraps are OK since the only way we're here is if a dense DFA
        // could be constructed successfully, which uses the same space.
        let per_pattern_ids =
            stride.checked_mul(pattern_len.unwrap_or(0)).unwrap();
        let whole_dfa_ids = stride.checked_mul(2).unwrap();
        let total_ids = per_pattern_ids.checked_add(whole_dfa_ids).unwrap();
        let table_len = total_ids.checked_mul(StateID::SIZE).unwrap();

        let table = vec![0; table_len];
        let kind = dfa.start_kind();
        let start_map = dfa.start_map().clone();
        let universal_start_unanchored =
            dfa.universal_start_state(Anchored::No);
        let universal_start_anchored =
            dfa.universal_start_state(Anchored::Yes);
        StartTable {
            table,
            kind,
            start_map,
            stride,
            pattern_len,
            universal_start_unanchored,
            universal_start_anchored,
        }
    }

    /// Builds a sparse start table from the start states of a dense DFA.
    ///
    /// Each dense start state ID is translated through `remap`, which is
    /// indexed by the dense DFA's state index and holds the corresponding
    /// sparse state ID.
    fn from_dense_dfa<T: AsRef<[u32]>>(
        dfa: &dense::DFA<T>,
        remap: &[StateID],
    ) -> Result<StartTable<Vec<u8>>, BuildError> {
        // Unless the DFA has start states compiled for each pattern, then
        // as far as the starting state table is concerned, there are zero
        // patterns to account for. It will instead only store starting states
        // for the entire DFA.
        let start_pattern_len = match dfa.starts_for_each_pattern() {
            true => Some(dfa.pattern_len()),
            false => None,
        };
        let mut table = Self::new(dfa, start_pattern_len);
        for (dense_id, anchored, start_type) in dfa.starts() {
            let sparse_id = remap[dfa.to_index(dense_id)];
            table.set_start(anchored, start_type, sparse_id);
        }
        Ok(table)
    }
}
1852
1853impl<'a> StartTable<&'a [u8]> {
1854    unsafe fn from_bytes_unchecked(
1855        mut slice: &'a [u8],
1856    ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
1857        let slice_start = slice.as_ptr().as_usize();
1858
1859        let (kind, nr) = StartKind::from_bytes(slice)?;
1860        slice = &slice[nr..];
1861
1862        let (start_map, nr) = StartByteMap::from_bytes(slice)?;
1863        slice = &slice[nr..];
1864
1865        let (stride, nr) =
1866            wire::try_read_u32_as_usize(slice, "sparse start table stride")?;
1867        slice = &slice[nr..];
1868        if stride != Start::len() {
1869            return Err(DeserializeError::generic(
1870                "invalid sparse starting table stride",
1871            ));
1872        }
1873
1874        let (maybe_pattern_len, nr) =
1875            wire::try_read_u32_as_usize(slice, "sparse start table patterns")?;
1876        slice = &slice[nr..];
1877        let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX {
1878            None
1879        } else {
1880            Some(maybe_pattern_len)
1881        };
1882        if pattern_len.map_or(false, |len| len > PatternID::LIMIT) {
1883            return Err(DeserializeError::generic(
1884                "sparse invalid number of patterns",
1885            ));
1886        }
1887
1888        let (universal_unanchored, nr) =
1889            wire::try_read_u32(slice, "universal unanchored start")?;
1890        slice = &slice[nr..];
1891        let universal_start_unanchored = if universal_unanchored == u32::MAX {
1892            None
1893        } else {
1894            Some(StateID::try_from(universal_unanchored).map_err(|e| {
1895                DeserializeError::state_id_error(
1896                    e,
1897                    "universal unanchored start",
1898                )
1899            })?)
1900        };
1901
1902        let (universal_anchored, nr) =
1903            wire::try_read_u32(slice, "universal anchored start")?;
1904        slice = &slice[nr..];
1905        let universal_start_anchored = if universal_anchored == u32::MAX {
1906            None
1907        } else {
1908            Some(StateID::try_from(universal_anchored).map_err(|e| {
1909                DeserializeError::state_id_error(e, "universal anchored start")
1910            })?)
1911        };
1912
1913        let pattern_table_size = wire::mul(
1914            stride,
1915            pattern_len.unwrap_or(0),
1916            "sparse invalid pattern length",
1917        )?;
1918        // Our start states always start with a single stride of start states
1919        // for the entire automaton which permit it to match any pattern. What
1920        // follows it are an optional set of start states for each pattern.
1921        let start_state_len = wire::add(
1922            wire::mul(2, stride, "start state stride too big")?,
1923            pattern_table_size,
1924            "sparse invalid 'any' pattern starts size",
1925        )?;
1926        let table_bytes_len = wire::mul(
1927            start_state_len,
1928            StateID::SIZE,
1929            "sparse pattern table bytes length",
1930        )?;
1931        wire::check_slice_len(
1932            slice,
1933            table_bytes_len,
1934            "sparse start ID table",
1935        )?;
1936        let table = &slice[..table_bytes_len];
1937        slice = &slice[table_bytes_len..];
1938
1939        let sl = StartTable {
1940            table,
1941            kind,
1942            start_map,
1943            stride,
1944            pattern_len,
1945            universal_start_unanchored,
1946            universal_start_anchored,
1947        };
1948        Ok((sl, slice.as_ptr().as_usize() - slice_start))
1949    }
1950}
1951
1952impl<T: AsRef<[u8]>> StartTable<T> {
1953    fn write_to<E: Endian>(
1954        &self,
1955        mut dst: &mut [u8],
1956    ) -> Result<usize, SerializeError> {
1957        let nwrite = self.write_to_len();
1958        if dst.len() < nwrite {
1959            return Err(SerializeError::buffer_too_small(
1960                "sparse starting table ids",
1961            ));
1962        }
1963        dst = &mut dst[..nwrite];
1964
1965        // write start kind
1966        let nw = self.kind.write_to::<E>(dst)?;
1967        dst = &mut dst[nw..];
1968        // write start byte map
1969        let nw = self.start_map.write_to(dst)?;
1970        dst = &mut dst[nw..];
1971        // write stride
1972        E::write_u32(u32::try_from(self.stride).unwrap(), dst);
1973        dst = &mut dst[size_of::<u32>()..];
1974        // write pattern length
1975        E::write_u32(
1976            u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(),
1977            dst,
1978        );
1979        dst = &mut dst[size_of::<u32>()..];
1980        // write universal start unanchored state id, u32::MAX if absent
1981        E::write_u32(
1982            self.universal_start_unanchored
1983                .map_or(u32::MAX, |sid| sid.as_u32()),
1984            dst,
1985        );
1986        dst = &mut dst[size_of::<u32>()..];
1987        // write universal start anchored state id, u32::MAX if absent
1988        E::write_u32(
1989            self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()),
1990            dst,
1991        );
1992        dst = &mut dst[size_of::<u32>()..];
1993        // write start IDs
1994        for (sid, _, _) in self.iter() {
1995            E::write_u32(sid.as_u32(), dst);
1996            dst = &mut dst[StateID::SIZE..];
1997        }
1998        Ok(nwrite)
1999    }
2000
2001    /// Returns the number of bytes the serialized form of this transition
2002    /// table will use.
2003    fn write_to_len(&self) -> usize {
2004        self.kind.write_to_len()
2005        + self.start_map.write_to_len()
2006        + size_of::<u32>() // stride
2007        + size_of::<u32>() // # patterns
2008        + size_of::<u32>() // universal unanchored start
2009        + size_of::<u32>() // universal anchored start
2010        + self.table().len()
2011    }
2012
2013    /// Validates that every starting state ID in this table is valid.
2014    ///
2015    /// That is, every starting state ID can be used to correctly decode a
2016    /// state in the DFA's sparse transitions.
2017    fn validate(
2018        &self,
2019        sp: &Special,
2020        seen: &Seen,
2021    ) -> Result<(), DeserializeError> {
2022        for (id, _, _) in self.iter() {
2023            if !seen.contains(&id) {
2024                return Err(DeserializeError::generic(
2025                    "found invalid start state ID",
2026                ));
2027            }
2028            if sp.is_match_state(id) {
2029                return Err(DeserializeError::generic(
2030                    "start states cannot be match states",
2031                ));
2032            }
2033        }
2034        Ok(())
2035    }
2036
2037    /// Converts this start list to a borrowed value.
2038    fn as_ref(&self) -> StartTable<&'_ [u8]> {
2039        StartTable {
2040            table: self.table(),
2041            kind: self.kind,
2042            start_map: self.start_map.clone(),
2043            stride: self.stride,
2044            pattern_len: self.pattern_len,
2045            universal_start_unanchored: self.universal_start_unanchored,
2046            universal_start_anchored: self.universal_start_anchored,
2047        }
2048    }
2049
2050    /// Converts this start list to an owned value.
2051    #[cfg(feature = "alloc")]
2052    fn to_owned(&self) -> StartTable<alloc::vec::Vec<u8>> {
2053        StartTable {
2054            table: self.table().to_vec(),
2055            kind: self.kind,
2056            start_map: self.start_map.clone(),
2057            stride: self.stride,
2058            pattern_len: self.pattern_len,
2059            universal_start_unanchored: self.universal_start_unanchored,
2060            universal_start_anchored: self.universal_start_anchored,
2061        }
2062    }
2063
2064    /// Return the start state for the given index and pattern ID. If the
2065    /// pattern ID is None, then the corresponding start state for the entire
2066    /// DFA is returned. If the pattern ID is not None, then the corresponding
2067    /// starting state for the given pattern is returned. If this start table
2068    /// does not have individual starting states for each pattern, then this
2069    /// panics.
2070    fn start(
2071        &self,
2072        anchored: Anchored,
2073        start: Start,
2074    ) -> Result<StateID, StartError> {
2075        let start_index = start.as_usize();
2076        let index = match anchored {
2077            Anchored::No => {
2078                if !self.kind.has_unanchored() {
2079                    return Err(StartError::unsupported_anchored(anchored));
2080                }
2081                start_index
2082            }
2083            Anchored::Yes => {
2084                if !self.kind.has_anchored() {
2085                    return Err(StartError::unsupported_anchored(anchored));
2086                }
2087                self.stride + start_index
2088            }
2089            Anchored::Pattern(pid) => {
2090                let len = match self.pattern_len {
2091                    None => {
2092                        return Err(StartError::unsupported_anchored(anchored))
2093                    }
2094                    Some(len) => len,
2095                };
2096                if pid.as_usize() >= len {
2097                    return Ok(DEAD);
2098                }
2099                (2 * self.stride)
2100                    + (self.stride * pid.as_usize())
2101                    + start_index
2102            }
2103        };
2104        let start = index * StateID::SIZE;
2105        // This OK since we're allowed to assume that the start table contains
2106        // valid StateIDs.
2107        Ok(wire::read_state_id_unchecked(&self.table()[start..]).0)
2108    }
2109
2110    /// Return an iterator over all start IDs in this table.
2111    fn iter(&self) -> StartStateIter<'_, T> {
2112        StartStateIter { st: self, i: 0 }
2113    }
2114
2115    /// Returns the total number of start state IDs in this table.
2116    fn len(&self) -> usize {
2117        self.table().len() / StateID::SIZE
2118    }
2119
2120    /// Returns the table as a raw slice of bytes.
2121    fn table(&self) -> &[u8] {
2122        self.table.as_ref()
2123    }
2124
2125    /// Return the memory usage, in bytes, of this start list.
2126    ///
2127    /// This does not include the size of a `StartTable` value itself.
2128    fn memory_usage(&self) -> usize {
2129        self.table().len()
2130    }
2131}
2132
#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Store `id` as the start state for the given anchored mode and start
    /// configuration.
    ///
    /// The ID is written in native endian format.
    ///
    /// # Panics
    ///
    /// For `Anchored::Pattern`, this panics if the table has no per-pattern
    /// start states or if the pattern ID is not less than the number of
    /// patterns. It also panics if the computed slot lies outside the table.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let start_index = start.as_usize();
        let index = match anchored {
            Anchored::No => start_index,
            // Anchored start states live in the second stride.
            Anchored::Yes => self.stride + start_index,
            // Per-pattern start states follow the two whole-DFA strides.
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled");
                assert!(pid < len, "invalid pattern ID {:?}", pid);
                let pattern_offset = self.stride.checked_mul(pid).unwrap();
                let whole_dfa_len = self.stride.checked_mul(2).unwrap();
                let base = pattern_offset.checked_add(whole_dfa_len).unwrap();
                base.checked_add(start_index).unwrap()
            }
        };
        let offset = index * StateID::SIZE;
        let slot = &mut self.table.as_mut()[offset..offset + StateID::SIZE];
        wire::write_state_id::<wire::NE>(id, slot);
    }
}
2166
2167/// An iterator over all state state IDs in a sparse DFA.
2168struct StartStateIter<'a, T> {
2169    st: &'a StartTable<T>,
2170    i: usize,
2171}
2172
2173impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
2174    type Item = (StateID, Anchored, Start);
2175
2176    fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
2177        let i = self.i;
2178        if i >= self.st.len() {
2179            return None;
2180        }
2181        self.i += 1;
2182
2183        // This unwrap is okay since the stride of any DFA must always match
2184        // the number of start state types.
2185        let start_type = Start::from_usize(i % self.st.stride).unwrap();
2186        let anchored = if i < self.st.stride {
2187            Anchored::No
2188        } else if i < (2 * self.st.stride) {
2189            Anchored::Yes
2190        } else {
2191            let pid = (i - (2 * self.st.stride)) / self.st.stride;
2192            Anchored::Pattern(PatternID::new(pid).unwrap())
2193        };
2194        let start = i * StateID::SIZE;
2195        let end = start + StateID::SIZE;
2196        let bytes = self.st.table()[start..end].try_into().unwrap();
2197        // This is OK since we're allowed to assume that any IDs in this start
2198        // table are correct and valid for this DFA.
2199        let id = StateID::from_ne_bytes_unchecked(bytes);
2200        Some((id, anchored, start_type))
2201    }
2202}
2203
2204impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
2205    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2206        f.debug_struct("StartStateIter").field("i", &self.i).finish()
2207    }
2208}
2209
2210/// An iterator over all states in a sparse DFA.
2211///
2212/// This iterator yields tuples, where the first element is the state ID and
2213/// the second element is the state itself.
2214struct StateIter<'a, T> {
2215    trans: &'a Transitions<T>,
2216    id: usize,
2217}
2218
2219impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
2220    type Item = State<'a>;
2221
2222    fn next(&mut self) -> Option<State<'a>> {
2223        if self.id >= self.trans.sparse().len() {
2224            return None;
2225        }
2226        let state = self.trans.state(StateID::new_unchecked(self.id));
2227        self.id = self.id + state.write_to_len();
2228        Some(state)
2229    }
2230}
2231
2232impl<'a, T> fmt::Debug for StateIter<'a, T> {
2233    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2234        f.debug_struct("StateIter").field("id", &self.id).finish()
2235    }
2236}
2237
2238/// A representation of a sparse DFA state that can be cheaply materialized
2239/// from a state identifier.
2240#[derive(Clone)]
2241struct State<'a> {
2242    /// The identifier of this state.
2243    id: StateID,
2244    /// Whether this is a match state or not.
2245    is_match: bool,
2246    /// The number of transitions in this state.
2247    ntrans: usize,
2248    /// Pairs of input ranges, where there is one pair for each transition.
2249    /// Each pair specifies an inclusive start and end byte range for the
2250    /// corresponding transition.
2251    input_ranges: &'a [u8],
2252    /// Transitions to the next state. This slice contains native endian
2253    /// encoded state identifiers, with `S` as the representation. Thus, there
2254    /// are `ntrans * size_of::<S>()` bytes in this slice.
2255    next: &'a [u8],
2256    /// If this is a match state, then this contains the pattern IDs that match
2257    /// when the DFA is in this state.
2258    ///
2259    /// This is a contiguous sequence of 32-bit native endian encoded integers.
2260    pattern_ids: &'a [u8],
2261    /// An accelerator for this state, if present. If this state has no
2262    /// accelerator, then this is an empty slice. When non-empty, this slice
2263    /// has length at most 3 and corresponds to the exhaustive set of bytes
2264    /// that must be seen in order to transition out of this state.
2265    accel: &'a [u8],
2266}
2267
2268impl<'a> State<'a> {
2269    /// Searches for the next transition given an input byte. If no such
2270    /// transition could be found, then a dead state is returned.
2271    ///
2272    /// This is marked as inline to help dramatically boost sparse searching,
2273    /// which decodes each state it enters to follow the next transition.
2274    #[cfg_attr(feature = "perf-inline", inline(always))]
2275    fn next(&self, input: u8) -> StateID {
2276        // This straight linear search was observed to be much better than
2277        // binary search on ASCII haystacks, likely because a binary search
2278        // visits the ASCII case last but a linear search sees it first. A
2279        // binary search does do a little better on non-ASCII haystacks, but
2280        // not by much. There might be a better trade off lurking here.
2281        for i in 0..(self.ntrans - 1) {
2282            let (start, end) = self.range(i);
2283            if start <= input && input <= end {
2284                return self.next_at(i);
2285            }
2286            // We could bail early with an extra branch: if input < b1, then
2287            // we know we'll never find a matching transition. Interestingly,
2288            // this extra branch seems to not help performance, or will even
2289            // hurt it. It's likely very dependent on the DFA itself and what
2290            // is being searched.
2291        }
2292        DEAD
2293    }
2294
2295    /// Returns the next state ID for the special EOI transition.
2296    fn next_eoi(&self) -> StateID {
2297        self.next_at(self.ntrans - 1)
2298    }
2299
2300    /// Returns the identifier for this state.
2301    fn id(&self) -> StateID {
2302        self.id
2303    }
2304
2305    /// Returns the inclusive input byte range for the ith transition in this
2306    /// state.
2307    fn range(&self, i: usize) -> (u8, u8) {
2308        (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
2309    }
2310
2311    /// Returns the next state for the ith transition in this state.
2312    fn next_at(&self, i: usize) -> StateID {
2313        let start = i * StateID::SIZE;
2314        let end = start + StateID::SIZE;
2315        let bytes = self.next[start..end].try_into().unwrap();
2316        StateID::from_ne_bytes_unchecked(bytes)
2317    }
2318
2319    /// Returns the pattern ID for the given match index. If the match index
2320    /// is invalid, then this panics.
2321    fn pattern_id(&self, match_index: usize) -> PatternID {
2322        let start = match_index * PatternID::SIZE;
2323        wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
2324    }
2325
2326    /// Returns the total number of pattern IDs for this state. This is always
2327    /// zero when `is_match` is false.
2328    fn pattern_len(&self) -> usize {
2329        assert_eq!(0, self.pattern_ids.len() % 4);
2330        self.pattern_ids.len() / 4
2331    }
2332
2333    /// Return an accelerator for this state.
2334    fn accelerator(&self) -> &'a [u8] {
2335        self.accel
2336    }
2337
2338    /// Write the raw representation of this state to the given buffer using
2339    /// the given endianness.
2340    fn write_to<E: Endian>(
2341        &self,
2342        mut dst: &mut [u8],
2343    ) -> Result<usize, SerializeError> {
2344        let nwrite = self.write_to_len();
2345        if dst.len() < nwrite {
2346            return Err(SerializeError::buffer_too_small(
2347                "sparse state transitions",
2348            ));
2349        }
2350
2351        let ntrans =
2352            if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans };
2353        E::write_u16(u16::try_from(ntrans).unwrap(), dst);
2354        dst = &mut dst[size_of::<u16>()..];
2355
2356        dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
2357        dst = &mut dst[self.input_ranges.len()..];
2358
2359        for i in 0..self.ntrans {
2360            E::write_u32(self.next_at(i).as_u32(), dst);
2361            dst = &mut dst[StateID::SIZE..];
2362        }
2363
2364        if self.is_match {
2365            E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
2366            dst = &mut dst[size_of::<u32>()..];
2367            for i in 0..self.pattern_len() {
2368                let pid = self.pattern_id(i);
2369                E::write_u32(pid.as_u32(), dst);
2370                dst = &mut dst[PatternID::SIZE..];
2371            }
2372        }
2373
2374        dst[0] = u8::try_from(self.accel.len()).unwrap();
2375        dst[1..][..self.accel.len()].copy_from_slice(self.accel);
2376
2377        Ok(nwrite)
2378    }
2379
2380    /// Return the total number of bytes that this state consumes in its
2381    /// encoded form.
2382    fn write_to_len(&self) -> usize {
2383        let mut len = 2
2384            + (self.ntrans * 2)
2385            + (self.ntrans * StateID::SIZE)
2386            + (1 + self.accel.len());
2387        if self.is_match {
2388            len += size_of::<u32>() + self.pattern_ids.len();
2389        }
2390        len
2391    }
2392}
2393
2394impl<'a> fmt::Debug for State<'a> {
2395    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2396        let mut printed = false;
2397        for i in 0..(self.ntrans - 1) {
2398            let next = self.next_at(i);
2399            if next == DEAD {
2400                continue;
2401            }
2402
2403            if printed {
2404                write!(f, ", ")?;
2405            }
2406            let (start, end) = self.range(i);
2407            if start == end {
2408                write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?;
2409            } else {
2410                write!(
2411                    f,
2412                    "{:?}-{:?} => {:?}",
2413                    DebugByte(start),
2414                    DebugByte(end),
2415                    next.as_usize(),
2416                )?;
2417            }
2418            printed = true;
2419        }
2420        let eoi = self.next_at(self.ntrans - 1);
2421        if eoi != DEAD {
2422            if printed {
2423                write!(f, ", ")?;
2424            }
2425            write!(f, "EOI => {:?}", eoi.as_usize())?;
2426        }
2427        Ok(())
2428    }
2429}
2430
/// A mutable view of a single sparse DFA state. It borrows directly from the
/// DFA's encoded bytes, so it is cheap to build from a state identifier.
#[cfg(feature = "dfa-build")]
struct StateMut<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// The input byte ranges, stored as consecutive `(start, end)` byte
    /// pairs with one pair per transition. Both ends of a pair are
    /// inclusive.
    input_ranges: &'a mut [u8],
    /// The target of each transition, stored as consecutive native endian
    /// encoded state identifiers. Each identifier occupies `StateID::SIZE`
    /// bytes, so this slice holds `ntrans * StateID::SIZE` bytes.
    next: &'a mut [u8],
    /// The pattern IDs that match when the DFA is in this state, if this is
    /// a match state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded
    /// integers.
    pattern_ids: &'a [u8],
    /// The accelerator for this state, or an empty slice when the state has
    /// none. When non-empty, this slice has length at most 3 and corresponds
    /// to the exhaustive set of bytes that must be seen in order to
    /// transition out of this state.
    accel: &'a mut [u8],
}
2460
#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
    /// Overwrites the target of the ith transition with `next`, encoding it
    /// in native endian form.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        let offset = i * StateID::SIZE;
        let slot = &mut self.next[offset..offset + StateID::SIZE];
        wire::write_state_id::<wire::NE>(next, slot);
    }
}
2470
#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
    /// Formats this state exactly as the equivalent read-only `State` would
    /// be formatted, by delegating to its `Debug` implementation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Build a shared view over the same bytes. The mutable slices are
        // reborrowed immutably for the duration of the formatting call.
        let view = State {
            id: self.id,
            is_match: self.is_match,
            ntrans: self.ntrans,
            input_ranges: &*self.input_ranges,
            next: &*self.next,
            pattern_ids: self.pattern_ids,
            accel: &*self.accel,
        };
        fmt::Debug::fmt(&view, f)
    }
}
2486
2487// In order to validate everything, we not only need to make sure we
2488// can decode every state, but that every transition in every state
2489// points to a valid state. There are many duplicative transitions, so
2490// we record state IDs that we've verified so that we don't redo the
2491// decoding work.
2492//
2493// Except, when in no_std mode, we don't have dynamic memory allocation
2494// available to us, so we skip this optimization. It's not clear
2495// whether doing something more clever is worth it just yet. If you're
2496// profiling this code and need it to run faster, please file an issue.
2497//
2498// OK, so we also use this to record the set of valid state IDs. Since
2499// it is possible for a transition to point to an invalid state ID that
2500// still (somehow) deserializes to a valid state. So we need to make
2501// sure our transitions are limited to actually correct state IDs.
2502// The problem is, I'm not sure how to do this verification step in
2503// no-std no-alloc mode. I think we'd *have* to store the set of valid
2504// state IDs in the DFA itself. For now, we don't do this verification
2505// in no-std no-alloc mode. The worst thing that can happen is an
2506// incorrect result. But no panics or memory safety problems should
2507// result. Because we still do validate that the state itself is
2508// "valid" in the sense that everything it points to actually exists.
2509//
2510// ---AG
2511#[derive(Debug)]
2512struct Seen {
2513    #[cfg(feature = "alloc")]
2514    set: alloc::collections::BTreeSet<StateID>,
2515    #[cfg(not(feature = "alloc"))]
2516    set: core::marker::PhantomData<StateID>,
2517}
2518
#[cfg(feature = "alloc")]
impl Seen {
    /// Creates a set with no recorded state IDs.
    fn new() -> Seen {
        let set = alloc::collections::BTreeSet::new();
        Seen { set }
    }
    /// Records the given state ID.
    fn insert(&mut self, id: StateID) {
        // Whether the ID was already present is of no interest here.
        let _ = self.set.insert(id);
    }
    /// Returns true if the given state ID has been recorded.
    fn contains(&self, id: &StateID) -> bool {
        self.set.contains(id)
    }
}
2531
2532#[cfg(not(feature = "alloc"))]
2533impl Seen {
2534    fn new() -> Seen {
2535        Seen { set: core::marker::PhantomData }
2536    }
2537    fn insert(&mut self, _id: StateID) {}
2538    fn contains(&self, _id: &StateID) -> bool {
2539        true
2540    }
2541}
2542
2543/*
2544/// A binary search routine specialized specifically to a sparse DFA state's
2545/// transitions. Specifically, the transitions are defined as a set of pairs
2546/// of input bytes that delineate an inclusive range of bytes. If the input
2547/// byte is in the range, then the corresponding transition is a match.
2548///
2549/// This binary search accepts a slice of these pairs and returns the position
2550/// of the matching pair (the ith transition), or None if no matching pair
2551/// could be found.
2552///
2553/// Note that this routine is not currently used since it was observed to
2554/// either decrease performance when searching ASCII, or did not provide enough
2555/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
2556/// for posterity in case we can find a way to use it.
2557///
2558/// In theory, we could use the standard library's search routine if we could
2559/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
2560/// guaranteed to be safe and is thus UB (since I don't think the in-memory
2561/// representation of `(u8, u8)` has been nailed down). One could define a
2562/// repr(C) type, but the casting doesn't seem justified.
2563#[cfg_attr(feature = "perf-inline", inline(always))]
2564fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
2565    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
2566    debug_assert!(ranges.len() <= 512, "ranges should be short");
2567
2568    let (mut left, mut right) = (0, ranges.len() / 2);
2569    while left < right {
2570        let mid = (left + right) / 2;
2571        let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
2572        if needle < b1 {
2573            right = mid;
2574        } else if needle > b2 {
2575            left = mid + 1;
2576        } else {
2577            return Some(mid);
2578        }
2579    }
2580    None
2581}
2582*/
2583
#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{
        dfa::{dense::DFA, Automaton},
        nfa::thompson,
        Input, MatchError,
    };

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    //
    // This checks that a forward search over a sparse DFA built with
    // `unicode_word_boundary(true)` reports a quit error when a non-ASCII
    // byte sits directly outside the searched span. "β" is encoded in UTF-8
    // as the two bytes 0xCE 0xB2.
    #[test]
    fn heuristic_unicode_forward() {
        // A forward search must run on a DFA compiled from a forward NFA,
        // so the Thompson configuration is deliberately left at its default
        // here. Only the reverse test below compiles a reverse NFA.
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // The span starts at offset 2, right after "β". The byte just
        // before the span is 0xB2 at offset 1.
        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_fwd(&input);
        assert_eq!(Err(expected), got);

        // The span ends at offset 3, right before "β". The byte just after
        // the span is 0xCE at offset 3.
        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_fwd(&input);
        assert_eq!(Err(expected), got);
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    //
    // This is the reverse counterpart of the test above: the DFA is
    // compiled from a reverse NFA and searched with `try_search_rev`.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // The span starts at offset 2, right after "β". The byte just
        // before the span is 0xB2 at offset 1.
        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(expected), got);

        // The span ends at offset 3, right before "β". The byte just after
        // the span is 0xCE at offset 3.
        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(expected), got);
    }
}
regex_automata/dfa/sparse.rs

regex_automata/dfa/
sparse.rs