1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415
/// A state identifier especially tailored for lazy DFAs.
///
/// A lazy state ID logically represents a pointer to a DFA state. In practice,
/// by limiting the number of DFA states it can address, it reserves some
/// bits of its representation to encode some additional information. That
/// additional information is called a "tag." That tag is used to record
/// whether the state it points to is an unknown, dead, quit, start or match
/// state.
///
/// When implementing a low level search routine with a lazy DFA, it is
/// necessary to query the type of the current state to know what to do:
///
/// * **Unknown** - The state has not yet been computed. The
/// parameters used to get this state ID must be re-passed to
/// [`DFA::next_state`](crate::hybrid::dfa::DFA), which will never return an
/// unknown state ID.
/// * **Dead** - A dead state only has transitions to itself. It indicates that
/// the search cannot do anything else and should stop with whatever result it
/// has.
/// * **Quit** - A quit state indicates that the automaton could not answer
/// whether a match exists or not. Correct search implementations must return a
/// [`MatchError::Quit`](crate::MatchError::Quit).
/// * **Start** - A start state indicates that the automaton will begin
/// searching at a starting state. Branching on this isn't required for
/// correctness, but a common optimization is to use this to more quickly look
/// for a prefix.
/// * **Match** - A match state indicates that a match has been found.
/// Depending on the semantics of your search implementation, it may either
/// continue until the end of the haystack or a dead state, or it might quit
/// and return the match immediately.
///
/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
/// can be used to determine if a tag exists at all. This is useful to avoid
/// branching on all of the above types for every byte searched.
///
/// # Example
///
/// This example shows how `LazyStateID` can be used to implement a correct
/// search routine with minimal branching. In particular, this search routine
/// implements "leftmost" matching, which means that it doesn't immediately
/// stop once a match is found. Instead, it continues until it reaches a dead
/// state.
///
/// Notice also how a correct search implementation deals with
/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
/// the lazy DFA routines. When a `CacheError` occurs, it returns
/// [`MatchError::GaveUp`](crate::MatchError::GaveUp).
///
/// ```
/// use regex_automata::{
/// hybrid::dfa::{Cache, DFA},
/// HalfMatch, MatchError, PatternID,
/// };
///
/// fn find_leftmost_first(
/// dfa: &DFA,
/// cache: &mut Cache,
/// haystack: &[u8],
/// ) -> Result<Option<HalfMatch>, MatchError> {
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack. Note that start states can never
/// // be match states (since DFAs in this crate delay matches by 1
/// // byte), so we don't need to check if the start state is a match.
/// let mut sid = dfa.start_state_forward(
/// cache, None, haystack, 0, haystack.len(),
/// ).map_err(|_| MatchError::GaveUp { offset: 0 })?;
/// let mut last_match = None;
/// // Walk all the bytes in the haystack. We can quit early if we see
/// // a dead or a quit state. The former means the automaton will
/// // never transition to any other state. The latter means that the
/// // automaton entered a condition in which its search failed.
/// for (i, &b) in haystack.iter().enumerate() {
/// sid = dfa
/// .next_state(cache, sid, b)
/// .map_err(|_| MatchError::GaveUp { offset: i })?;
/// if sid.is_tagged() {
/// if sid.is_match() {
/// last_match = Some(HalfMatch::new(
/// dfa.match_pattern(cache, sid, 0),
/// i,
/// ));
/// } else if sid.is_dead() {
/// return Ok(last_match);
/// } else if sid.is_quit() {
/// // It is possible to enter into a quit state after
/// // observing a match has occurred. In that case, we
/// // should return the match instead of an error.
/// if last_match.is_some() {
/// return Ok(last_match);
/// }
/// return Err(MatchError::Quit { byte: b, offset: i });
/// }
/// // Implementors may also want to check for start states and
/// // handle them differently for performance reasons. But it is
/// // not necessary for correctness.
/// }
/// }
/// // Matches are always delayed by 1 byte, so we must explicitly walk
/// // the special "EOI" transition at the end of the search.
/// sid = dfa
/// .next_eoi_state(cache, sid)
/// .map_err(|_| MatchError::GaveUp { offset: haystack.len() })?;
/// if sid.is_match() {
/// last_match = Some(HalfMatch::new(
/// dfa.match_pattern(cache, sid, 0),
/// haystack.len(),
/// ));
/// }
/// Ok(last_match)
/// }
///
/// // We use a greedy '+' operator to show how the search doesn't just stop
/// // once a match is detected. It continues extending the match. Using
/// // '[a-z]+?' would also work as expected and stop the search early.
/// // Greediness is built into the automaton.
/// let dfa = DFA::new(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 10);
///
/// // Here's another example that tests our handling of the special
/// // EOI transition. This will fail to find a match if we don't call
/// // 'next_eoi_state' at the end of the search since the match isn't found
/// // until the final byte in the haystack.
/// let dfa = DFA::new(r"[0-9]{4}")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // And note that our search implementation above automatically works
/// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects
/// // the appropriate pattern ID for us.
/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 3);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 7);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 5);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(
Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct LazyStateID(u32);
impl LazyStateID {
#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
const MAX_BIT: usize = 31;
#[cfg(target_pointer_width = "16")]
const MAX_BIT: usize = 15;
const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
const MAX: usize = LazyStateID::MASK_MATCH - 1;
/// Create a new lazy state ID.
///
/// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
/// an error.
#[inline]
pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
if id > LazyStateID::MAX {
return Err(LazyStateIDError { attempted: id as u64 });
}
Ok(LazyStateID::new_unchecked(id))
}
/// Create a new lazy state ID without checking whether the given value
/// exceeds [`LazyStateID::MAX`].
///
/// While this is unchecked, providing an incorrect value must never
/// sacrifice memory safety.
#[inline]
const fn new_unchecked(id: usize) -> LazyStateID {
LazyStateID(id as u32)
}
/// Return this lazy state ID as its raw value if and only if it is not
/// tagged (and thus not an unknown, dead, quit, start or match state ID).
#[inline]
pub(crate) fn as_usize(&self) -> Option<usize> {
if self.is_tagged() {
None
} else {
Some(self.as_usize_unchecked())
}
}
/// Return this lazy state ID as an untagged `usize`.
///
/// If this lazy state ID is tagged, then the usize returned is the state
/// ID without the tag. If the ID was not tagged, then the usize returned
/// is equivalent to the state ID.
#[inline]
pub(crate) fn as_usize_untagged(&self) -> usize {
self.as_usize_unchecked() & LazyStateID::MAX
}
/// Return this lazy state ID as its raw internal `usize` value, which may
/// be tagged (and thus greater than LazyStateID::MAX).
#[inline]
pub(crate) const fn as_usize_unchecked(&self) -> usize {
self.0 as usize
}
#[inline]
pub(crate) const fn to_unknown(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
)
}
#[inline]
pub(crate) const fn to_dead(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
)
}
#[inline]
pub(crate) const fn to_quit(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
)
}
/// Return this lazy state ID as a state ID that is tagged as a start
/// state.
#[inline]
pub(crate) const fn to_start(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_START,
)
}
/// Return this lazy state ID as a lazy state ID that is tagged as a match
/// state.
#[inline]
pub(crate) const fn to_match(&self) -> LazyStateID {
LazyStateID::new_unchecked(
self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
)
}
/// Return true if and only if this lazy state ID is tagged.
///
/// When a lazy state ID is tagged, then one can conclude that it is one
/// of a match, start, dead, quit or unknown state.
#[inline]
pub const fn is_tagged(&self) -> bool {
self.as_usize_unchecked() > LazyStateID::MAX
}
/// Return true if and only if this represents a lazy state ID that is
/// "unknown." That is, the state has not yet been created. When a caller
/// sees this state ID, it generally means that a state has to be computed
/// in order to proceed.
#[inline]
pub const fn is_unknown(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
}
/// Return true if and only if this represents a dead state. A dead state
/// is a state that can never transition to any other state except the
/// dead state. When a dead state is seen, it generally indicates that a
/// search should stop.
#[inline]
pub const fn is_dead(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
}
/// Return true if and only if this represents a quit state. A quit state
/// is a state that is representationally equivalent to a dead state,
/// except it indicates the automaton has reached a point at which it can
/// no longer determine whether a match exists or not. In general, this
/// indicates an error during search and the caller must either pass this
/// error up or use a different search technique.
#[inline]
pub const fn is_quit(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
}
/// Return true if and only if this lazy state ID has been tagged as a
/// start state.
#[inline]
pub const fn is_start(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_START > 0
}
/// Return true if and only if this lazy state ID has been tagged as a
/// match state.
#[inline]
pub const fn is_match(&self) -> bool {
self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
}
}
/// This error occurs when a lazy state ID could not be constructed.
///
/// This occurs when given an integer exceeding the maximum lazy state ID
/// value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct LazyStateIDError {
attempted: u64,
}
impl LazyStateIDError {
/// Returns the value that failed to constructed a lazy state ID.
pub(crate) fn attempted(&self) -> u64 {
self.attempted
}
}
#[cfg(feature = "std")]
impl std::error::Error for LazyStateIDError {}
impl core::fmt::Display for LazyStateIDError {
fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
write!(
f,
"failed to create LazyStateID from {:?}, which exceeds {:?}",
self.attempted(),
LazyStateID::MAX,
)
}
}
/// Represents the current state of an overlapping search.
///
/// This is used for overlapping searches since they need to know something
/// about the previous search. For example, when multiple patterns match at the
/// same position, this state tracks the last reported pattern so that the next
/// search knows whether to report another matching pattern or continue with
/// the search at the next position. Additionally, it also tracks which state
/// the last search call terminated in.
///
/// This type provides no introspection capabilities. The only thing a caller
/// can do is construct it and pass it around to permit search routines to use
/// it to track state.
///
/// Callers should always provide a fresh state constructed via
/// [`OverlappingState::start`] when starting a new search. Reusing state from
/// a previous search may result in incorrect results.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OverlappingState {
/// The state ID of the state at which the search was in when the call
/// terminated. When this is a match state, `last_match` must be set to a
/// non-None value.
///
/// A `None` value indicates the start state of the corresponding
/// automaton. We cannot use the actual ID, since any one automaton may
/// have many start states, and which one is in use depends on several
/// search-time factors.
id: Option<LazyStateID>,
/// Information associated with a match when `id` corresponds to a match
/// state.
last_match: Option<StateMatch>,
}
/// Internal state about the last match that occurred. This records both the
/// offset of the match and the match index.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) struct StateMatch {
/// The index into the matching patterns for the current match state.
pub(crate) match_index: usize,
/// The offset in the haystack at which the match occurred. This is used
/// when reporting multiple matches at the same offset. That is, when
/// an overlapping search runs, the first thing it checks is whether it's
/// already in a match state, and if so, whether there are more patterns
/// to report as matches in that state. If so, it increments `match_index`
/// and returns the pattern and this offset. Once `match_index` exceeds the
/// number of matching patterns in the current state, the search continues.
pub(crate) offset: usize,
}
impl OverlappingState {
/// Create a new overlapping state that begins at the start state of any
/// automaton.
pub fn start() -> OverlappingState {
OverlappingState { id: None, last_match: None }
}
pub(crate) fn id(&self) -> Option<LazyStateID> {
self.id
}
pub(crate) fn set_id(&mut self, id: LazyStateID) {
self.id = Some(id);
}
pub(crate) fn last_match(&mut self) -> Option<&mut StateMatch> {
self.last_match.as_mut()
}
pub(crate) fn set_last_match(&mut self, last_match: StateMatch) {
self.last_match = Some(last_match);
}
}