tantivy_fst/automaton/
mod.rs

1use self::StartsWithStateInternal::*;
2
3/// Automaton describes types that behave as a finite automaton.
4///
5/// All implementors of this trait are represented by *byte based* automata.
6/// Stated differently, all transitions in the automata correspond to a single
7/// byte in the input.
8///
9/// This implementation choice is important for a couple reasons:
10///
11/// 1. The set of possible transitions in each node is small, which may make
12///    efficient memory usage easier.
13/// 2. The finite state transducers in this crate are all byte based, so any
14///    automata used on them must also be byte based.
15///
16/// In practice, this does present somewhat of a problem, for example, if
17/// you're storing UTF-8 encoded strings in a finite state transducer. Consider
18/// using a `Levenshtein` automaton, which accepts a query string and an edit
19/// distance. The edit distance should apply to some notion of *character*,
20/// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for
21/// some definition of "character"). Therefore, the automaton must have UTF-8
22/// decoding built into it. This can be tricky to implement, so you may find
23/// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful.
24pub trait Automaton {
25    /// The type of the state used in the automaton.
26    type State;
27
28    /// Returns a single start state for this automaton.
29    ///
30    /// This method should always return the same value for each
31    /// implementation.
32    fn start(&self) -> Self::State;
33
34    /// Returns true if and only if `state` is a match state.
35    fn is_match(&self, state: &Self::State) -> bool;
36
37    /// Returns true if and only if `state` can lead to a match in zero or more
38    /// steps.
39    ///
40    /// If this returns `false`, then no sequence of inputs from this state
41    /// should ever produce a match. If this does not follow, then those match
42    /// states may never be reached. In other words, behavior may be incorrect.
43    ///
44    /// If this returns `true` even when no match is possible, then behavior
45    /// will be correct, but callers may be forced to do additional work.
46    fn can_match(&self, _state: &Self::State) -> bool {
47        true
48    }
49
50    /// Returns true if and only if `state` matches and must match no matter
51    /// what steps are taken.
52    ///
53    /// If this returns `true`, then every sequence of inputs from this state
54    /// produces a match. If this does not follow, then those match states may
55    /// never be reached. In other words, behavior may be incorrect.
56    ///
57    /// If this returns `false` even when every sequence of inputs will lead to
58    /// a match, then behavior will be correct, but callers may be forced to do
59    /// additional work.
60    fn will_always_match(&self, _state: &Self::State) -> bool {
61        false
62    }
63
64    /// Return the next state given `state` and an input.
65    fn accept(&self, state: &Self::State, byte: u8) -> Self::State;
66
67    /// Returns an automaton that matches the strings that start with something
68    /// this automaton matches.
69    fn starts_with(self) -> StartsWith<Self>
70    where
71        Self: Sized,
72    {
73        StartsWith(self)
74    }
75
76    /// Returns an automaton that matches the strings matched by either this or
77    /// the other automaton.
78    fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs>
79    where
80        Self: Sized,
81    {
82        Union(self, rhs)
83    }
84
85    /// Returns an automaton that matches the strings matched by both this and
86    /// the other automaton.
87    fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs>
88    where
89        Self: Sized,
90    {
91        Intersection(self, rhs)
92    }
93
94    /// Returns an automaton that matches the strings not matched by this
95    /// automaton.
96    fn complement(self) -> Complement<Self>
97    where
98        Self: Sized,
99    {
100        Complement(self)
101    }
102}
103
104impl<'a, T: Automaton> Automaton for &'a T {
105    type State = T::State;
106
107    fn start(&self) -> Self::State {
108        (*self).start()
109    }
110
111    fn is_match(&self, state: &Self::State) -> bool {
112        (*self).is_match(state)
113    }
114
115    fn can_match(&self, state: &Self::State) -> bool {
116        (*self).can_match(state)
117    }
118
119    fn will_always_match(&self, state: &Self::State) -> bool {
120        (*self).will_always_match(state)
121    }
122
123    fn accept(&self, state: &Self::State, byte: u8) -> Self::State {
124        (*self).accept(state, byte)
125    }
126}
127
128/// An automaton that matches if the input contains a specific subsequence.
129#[derive(Clone, Debug)]
130pub struct Subsequence<'a> {
131    subseq: &'a [u8],
132}
133
134impl<'a> Subsequence<'a> {
135    /// Constructs automaton that matches input containing the
136    /// specified subsequence.
137    #[inline]
138    pub fn new(subsequence: &'a str) -> Subsequence<'a> {
139        Subsequence {
140            subseq: subsequence.as_bytes(),
141        }
142    }
143}
144
145impl<'a> Automaton for Subsequence<'a> {
146    type State = usize;
147
148    #[inline]
149    fn start(&self) -> usize {
150        0
151    }
152
153    #[inline]
154    fn is_match(&self, &state: &usize) -> bool {
155        state == self.subseq.len()
156    }
157
158    #[inline]
159    fn can_match(&self, _: &usize) -> bool {
160        true
161    }
162
163    #[inline]
164    fn will_always_match(&self, &state: &usize) -> bool {
165        state == self.subseq.len()
166    }
167
168    #[inline]
169    fn accept(&self, &state: &usize, byte: u8) -> usize {
170        if state == self.subseq.len() {
171            return state;
172        }
173        state + (byte == self.subseq[state]) as usize
174    }
175}
176
177/// An automaton that always matches.
178///
179/// This is useful in a generic context as a way to express that no automaton
180/// should be used.
181#[derive(Clone, Debug)]
182pub struct AlwaysMatch;
183
184impl Automaton for AlwaysMatch {
185    type State = ();
186
187    #[inline]
188    fn start(&self) {}
189    #[inline]
190    fn is_match(&self, _: &()) -> bool {
191        true
192    }
193    #[inline]
194    fn can_match(&self, _: &()) -> bool {
195        true
196    }
197    #[inline]
198    fn will_always_match(&self, _: &()) -> bool {
199        true
200    }
201    #[inline]
202    fn accept(&self, _: &(), _: u8) {}
203}
204
205/// An automaton that matches a string that begins with something that the
206/// wrapped automaton matches.
207#[derive(Clone, Debug)]
208pub struct StartsWith<A>(A);
209
210/// The `Automaton` state for `StartsWith<A>`.
211pub struct StartsWithState<A: Automaton>(StartsWithStateInternal<A>);
212
213enum StartsWithStateInternal<A: Automaton> {
214    Done,
215    Running(A::State),
216}
217
218impl<A: Automaton> Automaton for StartsWith<A> {
219    type State = StartsWithState<A>;
220
221    fn start(&self) -> StartsWithState<A> {
222        StartsWithState({
223            let inner = self.0.start();
224            if self.0.is_match(&inner) {
225                Done
226            } else {
227                Running(inner)
228            }
229        })
230    }
231
232    fn is_match(&self, state: &StartsWithState<A>) -> bool {
233        match state.0 {
234            Done => true,
235            Running(_) => false,
236        }
237    }
238
239    fn can_match(&self, state: &StartsWithState<A>) -> bool {
240        match state.0 {
241            Done => true,
242            Running(ref inner) => self.0.can_match(inner),
243        }
244    }
245
246    fn will_always_match(&self, state: &StartsWithState<A>) -> bool {
247        match state.0 {
248            Done => true,
249            Running(_) => false,
250        }
251    }
252
253    fn accept(&self, state: &StartsWithState<A>, byte: u8) -> StartsWithState<A> {
254        StartsWithState(match state.0 {
255            Done => Done,
256            Running(ref inner) => {
257                let next_inner = self.0.accept(inner, byte);
258                if self.0.is_match(&next_inner) {
259                    Done
260                } else {
261                    Running(next_inner)
262                }
263            }
264        })
265    }
266}
267
268/// An automaton that matches when one of its component automata match.
269#[derive(Clone, Debug)]
270pub struct Union<A, B>(A, B);
271
272/// The `Automaton` state for `Union<A, B>`.
273pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State);
274
275impl<A: Automaton, B: Automaton> Automaton for Union<A, B> {
276    type State = UnionState<A, B>;
277
278    fn start(&self) -> UnionState<A, B> {
279        UnionState(self.0.start(), self.1.start())
280    }
281
282    fn is_match(&self, state: &UnionState<A, B>) -> bool {
283        self.0.is_match(&state.0) || self.1.is_match(&state.1)
284    }
285
286    fn can_match(&self, state: &UnionState<A, B>) -> bool {
287        self.0.can_match(&state.0) || self.1.can_match(&state.1)
288    }
289
290    fn will_always_match(&self, state: &UnionState<A, B>) -> bool {
291        self.0.will_always_match(&state.0) || self.1.will_always_match(&state.1)
292    }
293
294    fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> {
295        UnionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
296    }
297}
298
299/// An automaton that matches when both of its component automata match.
300#[derive(Clone, Debug)]
301pub struct Intersection<A, B>(A, B);
302
303/// The `Automaton` state for `Intersection<A, B>`.
304pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State);
305
306impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> {
307    type State = IntersectionState<A, B>;
308
309    fn start(&self) -> IntersectionState<A, B> {
310        IntersectionState(self.0.start(), self.1.start())
311    }
312
313    fn is_match(&self, state: &IntersectionState<A, B>) -> bool {
314        self.0.is_match(&state.0) && self.1.is_match(&state.1)
315    }
316
317    fn can_match(&self, state: &IntersectionState<A, B>) -> bool {
318        self.0.can_match(&state.0) && self.1.can_match(&state.1)
319    }
320
321    fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool {
322        self.0.will_always_match(&state.0) && self.1.will_always_match(&state.1)
323    }
324
325    fn accept(&self, state: &IntersectionState<A, B>, byte: u8) -> IntersectionState<A, B> {
326        IntersectionState(self.0.accept(&state.0, byte), self.1.accept(&state.1, byte))
327    }
328}
329
330/// An automaton that matches exactly when the automaton it wraps does not.
331#[derive(Clone, Debug)]
332pub struct Complement<A>(A);
333
334/// The `Automaton` state for `Complement<A>`.
335pub struct ComplementState<A: Automaton>(A::State);
336
337impl<A: Automaton> Automaton for Complement<A> {
338    type State = ComplementState<A>;
339
340    fn start(&self) -> ComplementState<A> {
341        ComplementState(self.0.start())
342    }
343
344    fn is_match(&self, state: &ComplementState<A>) -> bool {
345        !self.0.is_match(&state.0)
346    }
347
348    fn can_match(&self, state: &ComplementState<A>) -> bool {
349        !self.0.will_always_match(&state.0)
350    }
351
352    fn will_always_match(&self, state: &ComplementState<A>) -> bool {
353        !self.0.can_match(&state.0)
354    }
355
356    fn accept(&self, state: &ComplementState<A>, byte: u8) -> ComplementState<A> {
357        ComplementState(self.0.accept(&state.0, byte))
358    }
359}