1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
use crate::util::id::PatternID;

/// The kind of match semantics to use for a DFA.
///
/// The default match kind is `LeftmostFirst`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MatchKind {
    /// Report all possible matches.
    All,
    /// Report only the leftmost matches. When multiple leftmost matches exist,
    /// report the match corresponding to the part of the regex that appears
    /// first in the syntax.
    LeftmostFirst,
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
    // There is prior art in RE2 that shows that we should be able to add
    // LeftmostLongest too. The tricky part of it is supporting ungreedy
    // repetitions. Instead of treating all NFA states as having equivalent
    // priority (as in 'All') or treating all NFA states as having distinct
    // priority based on order (as in 'LeftmostFirst'), we instead group NFA
    // states into sets, and treat members of each set as having equivalent
    // priority, but having greater priority than all following members
    // of different sets.
    //
    // However, it's not clear whether it's really worth adding this. After
    // all, leftmost-longest can be emulated when using literals by using
    // leftmost-first and sorting the literals by length in descending order.
    // However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
    // always match `a` in `ab` when using leftmost-first, but leftmost-longest
    // would match `ab`.
}

impl MatchKind {
    #[cfg(feature = "alloc")]
    pub(crate) fn continue_past_first_match(&self) -> bool {
        *self == MatchKind::All
    }
}

impl Default for MatchKind {
    fn default() -> MatchKind {
        MatchKind::LeftmostFirst
    }
}

/// A representation of a match reported by a regex engine.
///
/// A match records the start and end offsets of the match in the haystack.
///
/// Every match guarantees that `start <= end`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct Match {
    /// The start offset of the match, inclusive.
    start: usize,
    /// The end offset of the match, exclusive.
    end: usize,
}

impl Match {
    /// Create a new match from a byte offset span.
    ///
    /// # Panics
    ///
    /// This panics if `end < start`.
    #[inline]
    pub fn new(start: usize, end: usize) -> Match {
        assert!(start <= end);
        Match { start, end }
    }

    /// The starting position of the match.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// The ending position of the match.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns the match location as a range.
    #[inline]
    pub fn range(&self) -> core::ops::Range<usize> {
        self.start..self.end
    }

    /// Returns true if and only if this match is empty. That is, when
    /// `start() == end()`.
    ///
    /// An empty match can only be returned when the empty string was among
    /// the patterns used to build the Aho-Corasick automaton.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.start == self.end
    }
}

/// A representation of a match reported by a DFA.
///
/// This is called a "half" match because it only includes the end location
/// (or start location for a reverse match) of a match. This corresponds to the
/// information that a single DFA scan can report. Getting the other half of
/// the match requires a second scan with a reversed DFA.
///
/// A half match also includes the pattern that matched. The pattern is
/// identified by an ID, which corresponds to its position (starting from `0`)
/// relative to other patterns used to construct the corresponding DFA. If only
/// a single pattern is provided to the DFA, then all matches are guaranteed to
/// have a pattern ID of `0`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct HalfMatch {
    /// The pattern ID.
    pub(crate) pattern: PatternID,
    /// The offset of the match.
    ///
    /// For forward searches, the offset is exclusive. For reverse searches,
    /// the offset is inclusive.
    pub(crate) offset: usize,
}

impl HalfMatch {
    /// Create a new half match from a pattern ID and a byte offset.
    #[inline]
    pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
        HalfMatch { pattern, offset }
    }

    /// Create a new half match from a pattern ID and a byte offset.
    ///
    /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
    /// [`PatternID`]. This panics if the given `usize` is not representable
    /// as a `PatternID`.
    #[inline]
    pub fn must(pattern: usize, offset: usize) -> HalfMatch {
        HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
    }

    /// Returns the ID of the pattern that matched.
    ///
    /// The ID of a pattern is derived from the position in which it was
    /// originally inserted into the corresponding DFA. The first pattern has
    /// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
    #[inline]
    pub fn pattern(&self) -> PatternID {
        self.pattern
    }

    /// The position of the match.
    ///
    /// If this match was produced by a forward search, then the offset is
    /// exclusive. If this match was produced by a reverse search, then the
    /// offset is inclusive.
    #[inline]
    pub fn offset(&self) -> usize {
        self.offset
    }
}

/// A representation of a multi match reported by a regex engine.
///
/// A multi match has two essential pieces of information: the identifier of
/// the pattern that matched, along with the start and end offsets of the match
/// in the haystack.
///
/// The pattern is identified by an ID, which corresponds to its position
/// (starting from `0`) relative to other patterns used to construct the
/// corresponding regex engine. If only a single pattern is provided, then all
/// multi matches are guaranteed to have a pattern ID of `0`.
///
/// Every multi match guarantees that `start <= end`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct MultiMatch {
    /// The pattern ID.
    pattern: PatternID,
    /// The start offset of the match, inclusive.
    start: usize,
    /// The end offset of the match, exclusive.
    end: usize,
}

impl MultiMatch {
    /// Create a new match from a pattern ID and a byte offset span.
    ///
    /// # Panics
    ///
    /// This panics if `end < start`.
    #[inline]
    pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
        assert!(start <= end);
        MultiMatch { pattern, start, end }
    }

    /// Create a new match from a pattern ID and a byte offset span.
    ///
    /// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
    /// [`PatternID`]. This panics if the given `usize` is not representable
    /// as a `PatternID`.
    ///
    /// # Panics
    ///
    /// This panics if `end < start` or if `pattern > PatternID::MAX`.
    #[inline]
    pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
        MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
    }

    /// Returns the ID of the pattern that matched.
    ///
    /// The ID of a pattern is derived from the position in which it was
    /// originally inserted into the corresponding regex engine. The first
    /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
    /// so on.
    #[inline]
    pub fn pattern(&self) -> PatternID {
        self.pattern
    }

    /// The starting position of the match.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// The ending position of the match.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns the match location as a range.
    #[inline]
    pub fn range(&self) -> core::ops::Range<usize> {
        self.start..self.end
    }

    /// Returns true if and only if this match is empty. That is, when
    /// `start() == end()`.
    ///
    /// An empty match can only be returned when the empty string was among
    /// the patterns used to build the Aho-Corasick automaton.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.start == self.end
    }
}

/// An error type indicating that a search stopped prematurely without finding
/// a match.
///
/// This error type implies that one cannot assume that no matches occur, since
/// the search stopped before completing.
///
/// Normally, when one searches for something, the response is either an
/// affirmative "it was found at this location" or a negative "not found at
/// all." However, in some cases, a regex engine can be configured to stop its
/// search before concluding whether a match exists or not. When this happens,
/// it may be important for the caller to know why the regex engine gave up and
/// where in the input it gave up at. This error type exposes the 'why' and the
/// 'where.'
///
/// For example, the DFAs provided by this library generally cannot correctly
/// implement Unicode word boundaries. Instead, they provide an option to
/// eagerly support them on ASCII text (since Unicode word boundaries are
/// equivalent to ASCII word boundaries when searching ASCII text), but will
/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
/// required to either report the failure to the caller (unergonomic) or
/// otherwise fall back to some other regex engine (ergonomic, but potentially
/// costly).
///
/// More generally, some regex engines offer the ability for callers to specify
/// certain bytes that will trigger the regex engine to automatically quit if
/// they are seen.
///
/// Still yet, there may be other reasons for a failed match. For example,
/// the hybrid DFA provided by this crate can be configured to give up if it
/// believes that it is not efficient. This in turn permits callers to choose a
/// different regex engine.
///
/// # Advice
///
/// While this form of error reporting adds complexity, it is generally
/// possible for callers to configure regex engines to never give up a search,
/// and thus never return an error. Indeed, the default configuration for every
/// regex engine in this crate is such that they will never stop searching
/// early. Therefore, the only way to get a match error is if the regex engine
/// is explicitly configured to do so. Options that enable this behavior
/// document the new error conditions they imply.
///
/// Regex engines for which no errors are possible for any configuration will
/// return the normal `Option<Match>` and not use this error type at all.
///
/// For example, regex engines in the `dfa` sub-module will only report
/// `MatchError::Quit` if instructed by either
/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
/// or by
/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum MatchError {
    // Note that the first version of this type was called `SearchError` and it
    // included a third `None` variant to indicate that the search completed
    // and no match was found. However, this was problematic for iterator
    // APIs where the `None` sentinel for stopping iteration corresponds
    // precisely to the "match not found" case. The fact that the `None`
    // variant was buried inside this type was in turn quite awkward. So
    // instead, I removed the `None` variant, renamed the type and used
    // `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
    // conceptually simpler `Result<Match, MatchError>`. However, we "regain"
    // ergonomics by only putting the more complex API in the `try_` variants
    // ("fallible") of search methods. The infallible APIs will instead just
    // return `Option<Match>` and panic on error.
    /// The search saw a "quit" byte at which it was instructed to stop
    /// searching.
    Quit {
        /// The "quit" byte that was observed that caused the search to stop.
        byte: u8,
        /// The offset at which the quit byte was observed.
        offset: usize,
    },
    /// The search, based on heuristics, determined that it would be better
    /// to stop, typically to provide the caller an opportunity to use an
    /// alternative regex engine.
    ///
    /// Currently, the only way for this to occur is via the lazy DFA and
    /// only when it is configured to do so (it will not return this error by
    /// default).
    GaveUp {
        /// The offset at which the search stopped. This corresponds to the
        /// position immediately following the last byte scanned.
        offset: usize,
    },
}

#[cfg(feature = "std")]
impl std::error::Error for MatchError {}

impl core::fmt::Display for MatchError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        match *self {
            MatchError::Quit { byte, offset } => write!(
                f,
                "quit search after observing byte \\x{:02X} at offset {}",
                byte, offset,
            ),
            MatchError::GaveUp { offset } => {
                write!(f, "gave up searching at offset {}", offset)
            }
        }
    }
}