1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356
use crate::util::id::PatternID;
/// The kind of match semantics to use for a DFA.
///
/// The default match kind is `LeftmostFirst`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MatchKind {
/// Report all possible matches.
All,
/// Report only the leftmost matches. When multiple leftmost matches exist,
/// report the match corresponding to the part of the regex that appears
/// first in the syntax.
LeftmostFirst,
/// Hints that destructuring should not be exhaustive.
///
/// This enum may grow additional variants, so this makes sure clients
/// don't count on exhaustive matching. (Otherwise, adding a new variant
/// could break existing code.)
#[doc(hidden)]
__Nonexhaustive,
// There is prior art in RE2 that shows that we should be able to add
// LeftmostLongest too. The tricky part of it is supporting ungreedy
// repetitions. Instead of treating all NFA states as having equivalent
// priority (as in 'All') or treating all NFA states as having distinct
// priority based on order (as in 'LeftmostFirst'), we instead group NFA
// states into sets, and treat members of each set as having equivalent
// priority, but having greater priority than all following members
// of different sets.
//
// However, it's not clear whether it's really worth adding this. After
// all, leftmost-longest can be emulated when using literals by using
// leftmost-first and sorting the literals by length in descending order.
// However, this won't work for arbitrary regexes. e.g., `\w|\w\w` will
// always match `a` in `ab` when using leftmost-first, but leftmost-longest
// would match `ab`.
}
impl MatchKind {
#[cfg(feature = "alloc")]
pub(crate) fn continue_past_first_match(&self) -> bool {
*self == MatchKind::All
}
}
impl Default for MatchKind {
fn default() -> MatchKind {
MatchKind::LeftmostFirst
}
}
/// A representation of a match reported by a regex engine.
///
/// A match records the start and end offsets of the match in the haystack.
///
/// Every match guarantees that `start <= end`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct Match {
/// The start offset of the match, inclusive.
start: usize,
/// The end offset of the match, exclusive.
end: usize,
}
impl Match {
/// Create a new match from a byte offset span.
///
/// # Panics
///
/// This panics if `end < start`.
#[inline]
pub fn new(start: usize, end: usize) -> Match {
assert!(start <= end);
Match { start, end }
}
/// The starting position of the match.
#[inline]
pub fn start(&self) -> usize {
self.start
}
/// The ending position of the match.
#[inline]
pub fn end(&self) -> usize {
self.end
}
/// Returns the match location as a range.
#[inline]
pub fn range(&self) -> core::ops::Range<usize> {
self.start..self.end
}
/// Returns true if and only if this match is empty. That is, when
/// `start() == end()`.
///
/// An empty match can only be returned when the empty string was among
/// the patterns used to build the Aho-Corasick automaton.
#[inline]
pub fn is_empty(&self) -> bool {
self.start == self.end
}
}
/// A representation of a match reported by a DFA.
///
/// This is called a "half" match because it only includes the end location
/// (or start location for a reverse match) of a match. This corresponds to the
/// information that a single DFA scan can report. Getting the other half of
/// the match requires a second scan with a reversed DFA.
///
/// A half match also includes the pattern that matched. The pattern is
/// identified by an ID, which corresponds to its position (starting from `0`)
/// relative to other patterns used to construct the corresponding DFA. If only
/// a single pattern is provided to the DFA, then all matches are guaranteed to
/// have a pattern ID of `0`.
#[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)]
pub struct HalfMatch {
/// The pattern ID.
pub(crate) pattern: PatternID,
/// The offset of the match.
///
/// For forward searches, the offset is exclusive. For reverse searches,
/// the offset is inclusive.
pub(crate) offset: usize,
}
impl HalfMatch {
/// Create a new half match from a pattern ID and a byte offset.
#[inline]
pub fn new(pattern: PatternID, offset: usize) -> HalfMatch {
HalfMatch { pattern, offset }
}
/// Create a new half match from a pattern ID and a byte offset.
///
/// This is like [`HalfMatch::new`], but accepts a `usize` instead of a
/// [`PatternID`]. This panics if the given `usize` is not representable
/// as a `PatternID`.
#[inline]
pub fn must(pattern: usize, offset: usize) -> HalfMatch {
HalfMatch::new(PatternID::new(pattern).unwrap(), offset)
}
/// Returns the ID of the pattern that matched.
///
/// The ID of a pattern is derived from the position in which it was
/// originally inserted into the corresponding DFA. The first pattern has
/// identifier `0`, and each subsequent pattern is `1`, `2` and so on.
#[inline]
pub fn pattern(&self) -> PatternID {
self.pattern
}
/// The position of the match.
///
/// If this match was produced by a forward search, then the offset is
/// exclusive. If this match was produced by a reverse search, then the
/// offset is inclusive.
#[inline]
pub fn offset(&self) -> usize {
self.offset
}
}
/// A representation of a multi match reported by a regex engine.
///
/// A multi match has two essential pieces of information: the identifier of
/// the pattern that matched, along with the start and end offsets of the match
/// in the haystack.
///
/// The pattern is identified by an ID, which corresponds to its position
/// (starting from `0`) relative to other patterns used to construct the
/// corresponding regex engine. If only a single pattern is provided, then all
/// multi matches are guaranteed to have a pattern ID of `0`.
///
/// Every multi match guarantees that `start <= end`.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub struct MultiMatch {
/// The pattern ID.
pattern: PatternID,
/// The start offset of the match, inclusive.
start: usize,
/// The end offset of the match, exclusive.
end: usize,
}
impl MultiMatch {
/// Create a new match from a pattern ID and a byte offset span.
///
/// # Panics
///
/// This panics if `end < start`.
#[inline]
pub fn new(pattern: PatternID, start: usize, end: usize) -> MultiMatch {
assert!(start <= end);
MultiMatch { pattern, start, end }
}
/// Create a new match from a pattern ID and a byte offset span.
///
/// This is like [`MultiMatch::new`], but accepts a `usize` instead of a
/// [`PatternID`]. This panics if the given `usize` is not representable
/// as a `PatternID`.
///
/// # Panics
///
/// This panics if `end < start` or if `pattern > PatternID::MAX`.
#[inline]
pub fn must(pattern: usize, start: usize, end: usize) -> MultiMatch {
MultiMatch::new(PatternID::new(pattern).unwrap(), start, end)
}
/// Returns the ID of the pattern that matched.
///
/// The ID of a pattern is derived from the position in which it was
/// originally inserted into the corresponding regex engine. The first
/// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and
/// so on.
#[inline]
pub fn pattern(&self) -> PatternID {
self.pattern
}
/// The starting position of the match.
#[inline]
pub fn start(&self) -> usize {
self.start
}
/// The ending position of the match.
#[inline]
pub fn end(&self) -> usize {
self.end
}
/// Returns the match location as a range.
#[inline]
pub fn range(&self) -> core::ops::Range<usize> {
self.start..self.end
}
/// Returns true if and only if this match is empty. That is, when
/// `start() == end()`.
///
/// An empty match can only be returned when the empty string was among
/// the patterns used to build the Aho-Corasick automaton.
#[inline]
pub fn is_empty(&self) -> bool {
self.start == self.end
}
}
/// An error type indicating that a search stopped prematurely without finding
/// a match.
///
/// This error type implies that one cannot assume that no matches occur, since
/// the search stopped before completing.
///
/// Normally, when one searches for something, the response is either an
/// affirmative "it was found at this location" or a negative "not found at
/// all." However, in some cases, a regex engine can be configured to stop its
/// search before concluding whether a match exists or not. When this happens,
/// it may be important for the caller to know why the regex engine gave up and
/// where in the input it gave up at. This error type exposes the 'why' and the
/// 'where.'
///
/// For example, the DFAs provided by this library generally cannot correctly
/// implement Unicode word boundaries. Instead, they provide an option to
/// eagerly support them on ASCII text (since Unicode word boundaries are
/// equivalent to ASCII word boundaries when searching ASCII text), but will
/// "give up" if a non-ASCII byte is seen. In such cases, one is usually
/// required to either report the failure to the caller (unergonomic) or
/// otherwise fall back to some other regex engine (ergonomic, but potentially
/// costly).
///
/// More generally, some regex engines offer the ability for callers to specify
/// certain bytes that will trigger the regex engine to automatically quit if
/// they are seen.
///
/// Still yet, there may be other reasons for a failed match. For example,
/// the hybrid DFA provided by this crate can be configured to give up if it
/// believes that it is not efficient. This in turn permits callers to choose a
/// different regex engine.
///
/// # Advice
///
/// While this form of error reporting adds complexity, it is generally
/// possible for callers to configure regex engines to never give up a search,
/// and thus never return an error. Indeed, the default configuration for every
/// regex engine in this crate is such that they will never stop searching
/// early. Therefore, the only way to get a match error is if the regex engine
/// is explicitly configured to do so. Options that enable this behavior
/// document the new error conditions they imply.
///
/// Regex engines for which no errors are possible for any configuration will
/// return the normal `Option<Match>` and not use this error type at all.
///
/// For example, regex engines in the `dfa` sub-module will only report
/// `MatchError::Quit` if instructed by either
/// [enabling Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary)
/// or by
/// [explicitly specifying one or more quit bytes](crate::dfa::dense::Config::quit).
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub enum MatchError {
// Note that the first version of this type was called `SearchError` and it
// included a third `None` variant to indicate that the search completed
// and no match was found. However, this was problematic for iterator
// APIs where the `None` sentinel for stopping iteration corresponds
// precisely to the "match not found" case. The fact that the `None`
// variant was buried inside this type was in turn quite awkward. So
// instead, I removed the `None` variant, renamed the type and used
// `Result<Option<Match>, MatchError>` in non-iterator APIs instead of the
// conceptually simpler `Result<Match, MatchError>`. However, we "regain"
// ergonomics by only putting the more complex API in the `try_` variants
// ("fallible") of search methods. The infallible APIs will instead just
// return `Option<Match>` and panic on error.
/// The search saw a "quit" byte at which it was instructed to stop
/// searching.
Quit {
/// The "quit" byte that was observed that caused the search to stop.
byte: u8,
/// The offset at which the quit byte was observed.
offset: usize,
},
/// The search, based on heuristics, determined that it would be better
/// to stop, typically to provide the caller an opportunity to use an
/// alternative regex engine.
///
/// Currently, the only way for this to occur is via the lazy DFA and
/// only when it is configured to do so (it will not return this error by
/// default).
GaveUp {
/// The offset at which the search stopped. This corresponds to the
/// position immediately following the last byte scanned.
offset: usize,
},
}
#[cfg(feature = "std")]
impl std::error::Error for MatchError {}
impl core::fmt::Display for MatchError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
match *self {
MatchError::Quit { byte, offset } => write!(
f,
"quit search after observing byte \\x{:02X} at offset {}",
byte, offset,
),
MatchError::GaveUp { offset } => {
write!(f, "gave up searching at offset {}", offset)
}
}
}
}