tantivy_fst/raw/mod.rs
1/*!
2Operations on raw finite state transducers.
3
4This sub-module exposes the guts of a finite state transducer. Many parts of
5it, such as construction and traversal, are mirrored in the `set` and `map`
6sub-modules. Other parts of it, such as direct access to nodes and transitions
7in the transducer, do not have any analog.
8
9# Overview of types
10
11`Fst` is a read only interface to pre-constructed finite state transducers.
12`Node` is a read only interface to a single node in a transducer. `Builder` is
13used to create new finite state transducers. (Once a transducer is created, it
14can never be modified.) `Stream` is a stream of all inputs and outputs in a
15transducer. `StreamBuilder` builds range queries. `OpBuilder` collects streams
16and executes set operations like `union` or `intersection` on them with the
17option of specifying a merge strategy for output values.
18
19Most of the rest of the types are streams from set operations.
20*/
21use std::cmp;
22use std::fmt;
23use std::ops::Deref;
24
25use byteorder::{LittleEndian, ReadBytesExt};
26
27use crate::automaton::{AlwaysMatch, Automaton};
28use crate::error::Result;
29use crate::stream::{IntoStreamer, Streamer};
30
31pub use self::build::Builder;
32pub use self::error::Error;
33use self::node::node_new;
34pub use self::node::{Node, Transitions};
35pub use self::ops::{
36 Chain, Difference, IndexedValue, Intersection, OpBuilder, SymmetricDifference, Union,
37};
38
39mod build;
40mod common_inputs;
41mod counting_writer;
42mod error;
43mod node;
44mod ops;
45mod pack;
46mod registry;
47mod registry_minimal;
48#[cfg(test)]
49mod tests;
50
51/// The API version of this crate.
52///
53/// This version number is written to every finite state transducer created by
54/// this crate. When a finite state transducer is read, its version number is
55/// checked against this value.
56///
57/// Currently, any version mismatch results in an error. Fixing this requires
58/// regenerating the finite state transducer or switching to a version of this
59/// crate that is compatible with the serialized transducer. This particular
60/// behavior may be relaxed in future versions.
61pub const VERSION: u64 = 2;
62
63/// A sentinel value used to indicate an empty final state.
64const EMPTY_ADDRESS: CompiledAddr = 0;
65
66/// A sentinel value used to indicate an invalid state.
67///
68/// This is never the address of a node in a serialized transducer.
69const NONE_ADDRESS: CompiledAddr = 1;
70
71/// Default capacity for the key buffer of a stream.
72const KEY_BUFFER_CAPACITY: usize = 128;
73
74/// FstType is a convention used to indicate the type of the underlying
75/// transducer.
76///
77/// This crate reserves the range 0-255 (inclusive) but currently leaves the
78/// meaning of 0-255 unspecified.
79pub type FstType = u64;
80
81/// CompiledAddr is the type used to address nodes in a finite state
82/// transducer.
83///
84/// It is most useful as a pointer to nodes. It can be used in the `Fst::node`
85/// method to resolve the pointer.
86pub type CompiledAddr = usize;
87
88/// An acyclic deterministic finite state transducer.
89///
90/// # How does it work?
91///
92/// The short answer: it's just like a prefix trie, which compresses keys
93/// based only on their prefixes, except that a automaton/transducer also
94/// compresses suffixes.
95///
96/// The longer answer is that keys in an automaton are stored only in the
97/// transitions from one state to another. A key can be acquired by tracing
98/// a path from the root of the automaton to any match state. The inputs along
99/// each transition are concatenated. Once a match state is reached, the
100/// concatenation of inputs up until that point corresponds to a single key.
101///
102/// But why is it called a transducer instead of an automaton? A finite state
103/// transducer is just like a finite state automaton, except that it has output
104/// transitions in addition to input transitions. Namely, the value associated
105/// with any particular key is determined by summing the outputs along every
106/// input transition that leads to the key's corresponding match state.
107///
108/// This is best demonstrated with a couple images. First, let's ignore the
109/// "transducer" aspect and focus on a plain automaton.
110///
111/// Consider that your keys are abbreviations of some of the months in the
112/// Gregorian calendar:
113///
114/// ```ignore
115/// jan
116/// feb
117/// mar
118/// apr
119/// may
120/// jun
121/// jul
122/// ```
123///
124/// The corresponding automaton that stores all of these as keys looks like
125/// this:
126///
127/// 
128///
129/// Notice here how the prefix and suffix of `jan` and `jun` are shared.
130/// Similarly, the prefixes of `jun` and `jul` are shared and the prefixes
131/// of `mar` and `may` are shared.
132///
133/// All of the keys from this automaton can be enumerated in lexicographic
134/// order by following every transition from each node in lexicographic
135/// order. Since it is acyclic, the procedure will terminate.
136///
137/// A key can be found by tracing it through the transitions in the automaton.
138/// For example, the key `aug` is known not to be in the automaton by only
139/// visiting the root state (because there is no `a` transition). For another
140/// example, the key `jax` is known not to be in the set only after moving
141/// through the transitions for `j` and `a`. Namely, after those transitions
142/// are followed, there are no transitions for `x`.
143///
144/// Notice here that looking up a key is proportional the length of the key
145/// itself. Namely, lookup time is not affected by the number of keys in the
146/// automaton!
147///
148/// Additionally, notice that the automaton exploits the fact that many keys
149/// share common prefixes and suffixes. For example, `jun` and `jul` are
150/// represented with no more states than would be required to represent either
151/// one on its own. Instead, the only change is a single extra transition. This
152/// is a form of compression and is key to how the automatons produced by this
153/// crate are so small.
154///
155/// Let's move on to finite state transducers. Consider the same set of keys
156/// as above, but let's assign their numeric month values:
157///
158/// ```ignore
159/// jan,1
160/// feb,2
161/// mar,3
162/// apr,4
163/// may,5
164/// jun,6
165/// jul,7
166/// ```
167///
168/// The corresponding transducer looks very similar to the automaton above,
169/// except outputs have been added to some of the transitions:
170///
171/// 
172///
173/// All of the operations with a transducer are the same as described above
174/// for automatons. Additionally, the same compression techniques are used:
175/// common prefixes and suffixes in keys are exploited.
176///
177/// The key difference is that some transitions have been given an output.
178/// As one follows input transitions, one must sum the outputs as they
179/// are seen. (A transition with no output represents the additive identity,
180/// or `0` in this case.) For example, when looking up `feb`, the transition
181/// `f` has output `2`, the transition `e` has output `0`, and the transition
182/// `b` also has output `0`. The sum of these is `2`, which is exactly the
183/// value we associated with `feb`.
184///
185/// For another more interesting example, consider `jul`. The `j` transition
186/// has output `1`, the `u` transition has output `5` and the `l` transition
187/// has output `1`. Summing these together gets us `7`, which is again the
188/// correct value associated with `jul`. Notice that if we instead looked up
189/// the `jun` key, then the `n` transition would be followed instead of the
190/// `l` transition, which has no output. Therefore, the `jun` key equals
191/// `1+5+0=6`.
192///
193/// The trick to transducers is that there exists a unique path through the
194/// transducer for every key, and its outputs are stored appropriately along
195/// this path such that the correct value is returned when they are all summed
196/// together. This process also enables the data that makes up each value to be
197/// shared across many values in the transducer in exactly the same way that
198/// keys are shared. This is yet another form of compression!
199///
200/// # Bonus: a billion strings
201///
202/// The amount of compression one can get from automata can be absolutely
203/// ridiuclous. Consider the particular case of storing all billion strings
204/// in the range `0000000001-1000000000`, e.g.,
205///
206/// ```ignore
207/// 0000000001
208/// 0000000002
209/// ...
210/// 0000000100
211/// 0000000101
212/// ...
213/// 0999999999
214/// 1000000000
215/// ```
216///
217/// The corresponding automaton looks like this:
218///
219/// ![finite state automaton - one billion strings]
220/// (http://burntsushi.net/stuff/one-billion.png)
221///
222/// Indeed, the on disk size of this automaton is a mere **251 bytes**.
223///
224/// Of course, this is a bit of a pathological best case, but it does serve
225/// to show how good compression can be in the optimal case.
226///
227/// Also, check out the
228/// [corresponding transducer](http://burntsushi.net/stuff/one-billion-map.svg)
229/// that maps each string to its integer value. It's a bit bigger, but still
230/// only takes up **896 bytes** of space on disk. This demonstrates that
231/// output values are also compressible.
232///
233/// # Does this crate produce minimal transducers?
234///
235/// For any non-trivial sized set of keys, it is unlikely that this crate will
236/// produce a minimal transducer. As far as this author knows, guaranteeing a
237/// minimal transducer requires working memory proportional to the number of
238/// states. This can be quite costly and is anathema to the main design goal of
239/// this crate: provide the ability to work with gigantic sets of strings with
240/// constant memory overhead.
241///
242/// Instead, construction of a finite state transducer uses a cache of
243/// states. More frequently used states are cached and reused, which provides
244/// reasonably good compression ratios. (No comprehensive benchmarks exist to
245/// back up this claim.)
246///
247/// It is possible that this crate may expose a way to guarantee minimal
248/// construction of transducers at the expense of exorbitant memory
249/// requirements.
250///
251/// # Bibliography
252///
253/// I initially got the idea to use finite state tranducers to represent
254/// ordered sets/maps from
255/// [Michael
256/// McCandless'](http://blog.mikemccandless.com/2010/12/using-finite-state-transducers-in.html)
257/// work on incorporating transducers in Lucene.
258///
259/// However, my work would also not have been possible without the hard work
260/// of many academics, especially
261/// [Jan Daciuk](http://galaxy.eti.pg.gda.pl/katedry/kiw/pracownicy/Jan.Daciuk/personal/).
262///
263/// * [Incremental construction of minimal acyclic finite-state automata](http://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601)
264/// (Section 3 provides a decent overview of the algorithm used to construct
265/// transducers in this crate, assuming all outputs are `0`.)
266/// * [Direct Construction of Minimal Acyclic Subsequential Transducers](http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.24.3698&rep=rep1&type=pdf)
267/// (The whole thing. The proof is dense but illuminating. The algorithm at
268/// the end is the money shot, namely, it incorporates output values.)
269/// * [Experiments with Automata Compression](http://www.researchgate.net/profile/Jii_Dvorsky/publication/221568039_Word_Random_Access_Compression/links/0c96052c095630d5b3000000.pdf#page=116), [Smaller Representation of Finite State Automata](http://www.cs.put.poznan.pl/dweiss/site/publications/download/fsacomp.pdf)
270/// (various compression techniques for representing states/transitions)
271/// * [Jan Daciuk's dissertation](http://www.pg.gda.pl/~jandac/thesis.ps.gz)
272/// (excellent for in depth overview)
273/// * [Comparison of Construction Algorithms for Minimal, Acyclic, Deterministic, Finite-State Automata from Sets of Strings](http://www.cs.mun.ca/~harold/Courses/Old/CS4750/Diary/q3p2qx4lv71m5vew.pdf)
274/// (excellent for surface level overview)
275pub struct Fst<Data = Vec<u8>> {
276 meta: FstMeta,
277 data: Data,
278}
279
280struct FstMeta {
281 version: u64,
282 root_addr: CompiledAddr,
283 ty: FstType,
284 len: usize,
285}
286
287impl FstMeta {
288 #[inline(always)]
289 fn root<'f>(&self, data: &'f [u8]) -> Node<'f> {
290 self.node(self.root_addr, data)
291 }
292
293 #[inline(always)]
294 fn node<'f>(&self, addr: CompiledAddr, data: &'f [u8]) -> Node<'f> {
295 node_new(self.version, addr, data)
296 }
297
298 fn empty_final_output(&self, data: &[u8]) -> Option<Output> {
299 let root = self.root(data);
300 if root.is_final() {
301 Some(root.final_output())
302 } else {
303 None
304 }
305 }
306}
307
308impl<Data: Deref<Target = [u8]>> Fst<Data> {
309 /// Open a `Fst` from a given data.
310 pub fn new(data: Data) -> Result<Fst<Data>> {
311 if data.len() < 32 {
312 return Err(Error::Format.into());
313 }
314 // The read_u64 unwraps below are OK because they can never fail.
315 // They can only fail when there is an IO error or if there is an
316 // unexpected EOF. However, we are reading from a byte slice (no
317 // IO errors possible) and we've confirmed the byte slice is at least
318 // N bytes (no unexpected EOF).
319 let version = (&*data).read_u64::<LittleEndian>().unwrap();
320 if version == 0 || version > VERSION {
321 return Err(Error::Version {
322 expected: VERSION,
323 got: version,
324 }
325 .into());
326 }
327 let ty = (&data[8..]).read_u64::<LittleEndian>().unwrap();
328 let root_addr = {
329 let mut last = &data[data.len() - 8..];
330 u64_to_usize(last.read_u64::<LittleEndian>().unwrap())
331 };
332 let len = {
333 let mut last2 = &data[data.len() - 16..];
334 u64_to_usize(last2.read_u64::<LittleEndian>().unwrap())
335 };
336 // The root node is always the last node written, so its address should
337 // be near the end. After the root node is written, we still have to
338 // write the root *address* and the number of keys in the FST.
339 // That's 16 bytes. The extra byte comes from the fact that the root
340 // address points to the last byte in the root node, rather than the
341 // byte immediately following the root node.
342 //
343 // If this check passes, it is still possible that the FST is invalid
344 // but probably unlikely. If this check reports a false positive, then
345 // the program will probably panic. In the worst case, the FST will
346 // operate but be subtly wrong. (This would require the bytes to be in
347 // a format expected by an FST, which is incredibly unlikely.)
348 //
349 // The special check for EMPTY_ADDRESS is needed since an empty FST
350 // has a root node that is empty and final, which means it has the
351 // special address `0`. In that case, the FST is the smallest it can
352 // be: the version, type, root address and number of nodes. That's
353 // 32 bytes (8 byte u64 each).
354 //
355 // This is essentially our own little checksum.
356 if (root_addr == EMPTY_ADDRESS && data.len() != 32) && root_addr + 17 != data.len() {
357 return Err(Error::Format.into());
358 }
359 Ok(Fst {
360 data,
361 meta: FstMeta {
362 version,
363 root_addr,
364 ty,
365 len,
366 },
367 })
368 }
369
370 /// Retrieves the value associated with a key.
371 ///
372 /// If the key does not exist, then `None` is returned.
373 #[inline(never)]
374 pub fn get<B: AsRef<[u8]>>(&self, key: B) -> Option<Output> {
375 let mut node = self.root();
376 let mut out = Output::zero();
377 for &b in key.as_ref() {
378 node = match node.find_input(b) {
379 None => return None,
380 Some(i) => {
381 let t = node.transition(i);
382 out = out.cat(t.out);
383 self.node(t.addr)
384 }
385 }
386 }
387 if !node.is_final() {
388 None
389 } else {
390 Some(out.cat(node.final_output()))
391 }
392 }
393
394 /// Returns true if and only if the given key is in this FST.
395 pub fn contains_key<B: AsRef<[u8]>>(&self, key: B) -> bool {
396 let mut node = self.root();
397 for &b in key.as_ref() {
398 node = match node.find_input(b) {
399 None => return false,
400 Some(i) => self.node(node.transition_addr(i)),
401 }
402 }
403 node.is_final()
404 }
405
406 /// Return a lexicographically ordered stream of all key-value pairs in
407 /// this fst.
408 #[inline]
409 pub fn stream(&self) -> Stream {
410 self.stream_builder(AlwaysMatch).into_stream()
411 }
412
413 fn stream_builder<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
414 StreamBuilder::new(&self.meta, &self.data, aut)
415 }
416
417 /// Return a builder for range queries.
418 ///
419 /// A range query returns a subset of key-value pairs in this fst in a
420 /// range given in lexicographic order.
421 #[inline]
422 pub fn range(&self) -> StreamBuilder {
423 self.stream_builder(AlwaysMatch)
424 }
425
426 /// Executes an automaton on the keys of this map.
427 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
428 self.stream_builder(aut)
429 }
430
431 /// Returns the number of keys in this fst.
432 #[inline]
433 pub fn len(&self) -> usize {
434 self.meta.len
435 }
436
437 /// Returns true if and only if this fst has no keys.
438 #[inline]
439 pub fn is_empty(&self) -> bool {
440 self.len() == 0
441 }
442
443 /// Returns the number of bytes used by this fst.
444 #[inline]
445 pub fn size(&self) -> usize {
446 self.data.len()
447 }
448
449 /// Creates a new fst operation with this fst added to it.
450 ///
451 /// The `OpBuilder` type can be used to add additional fst streams
452 /// and perform set operations like union, intersection, difference and
453 /// symmetric difference on the keys of the fst. These set operations also
454 /// allow one to specify how conflicting values are merged in the stream.
455 #[inline]
456 pub fn op(&self) -> OpBuilder {
457 OpBuilder::default().add(self)
458 }
459
460 /// Returns true if and only if the `self` fst is disjoint with the fst
461 /// `stream`.
462 ///
463 /// `stream` must be a lexicographically ordered sequence of byte strings
464 /// with associated values.
465 pub fn is_disjoint<'f, I, S>(&self, stream: I) -> bool
466 where
467 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
468 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
469 {
470 self.op().add(stream).intersection().next().is_none()
471 }
472
473 /// Returns true if and only if the `self` fst is a subset of the fst
474 /// `stream`.
475 ///
476 /// `stream` must be a lexicographically ordered sequence of byte strings
477 /// with associated values.
478 pub fn is_subset<'f, I, S>(&self, stream: I) -> bool
479 where
480 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
481 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
482 {
483 let mut op = self.op().add(stream).intersection();
484 let mut count = 0;
485 while let Some(_) = op.next() {
486 count += 1;
487 }
488 count == self.len()
489 }
490
491 /// Returns true if and only if the `self` fst is a superset of the fst
492 /// `stream`.
493 ///
494 /// `stream` must be a lexicographically ordered sequence of byte strings
495 /// with associated values.
496 pub fn is_superset<'f, I, S>(&self, stream: I) -> bool
497 where
498 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
499 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
500 {
501 let mut op = self.op().add(stream).union();
502 let mut count = 0;
503 while let Some(_) = op.next() {
504 count += 1;
505 }
506 count == self.len()
507 }
508
509 /// Returns the underlying type of this fst.
510 ///
511 /// FstType is a convention used to indicate the type of the underlying
512 /// transducer.
513 ///
514 /// This crate reserves the range 0-255 (inclusive) but currently leaves
515 /// the meaning of 0-255 unspecified.
516 #[inline]
517 pub fn fst_type(&self) -> FstType {
518 self.meta.ty
519 }
520
521 /// Returns the root node of this fst.
522 #[inline(always)]
523 pub fn root(&self) -> Node {
524 self.meta.root(self.data.deref())
525 }
526
527 /// Returns the node at the given address.
528 ///
529 /// Node addresses can be obtained by reading transitions on `Node` values.
530 #[inline]
531 pub fn node(&self, addr: CompiledAddr) -> Node {
532 self.meta.node(addr, self.data.deref())
533 }
534
535 /// Returns a copy of the binary contents of this FST.
536 #[inline]
537 pub fn to_vec(&self) -> Vec<u8> {
538 self.data.to_vec()
539 }
540}
541
542impl<'a, 'f, Data> IntoStreamer<'a> for &'f Fst<Data>
543where
544 Data: Deref<Target = [u8]>,
545{
546 type Item = (&'a [u8], Output);
547 type Into = Stream<'f>;
548
549 #[inline]
550 fn into_stream(self) -> Self::Into {
551 self.stream()
552 }
553}
554
555/// A builder for constructing range queries on streams.
556///
557/// Once all bounds are set, one should call `into_stream` to get a
558/// `Stream`.
559///
560/// Bounds are not additive. That is, if `ge` is called twice on the same
561/// builder, then the second setting wins.
562///
563/// The `A` type parameter corresponds to an optional automaton to filter
564/// the stream. By default, no filtering is done.
565///
566/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
567pub struct StreamBuilder<'f, A = AlwaysMatch> {
568 meta: &'f FstMeta,
569 data: &'f [u8],
570 aut: A,
571 min: Bound,
572 max: Bound,
573 backward: bool,
574}
575
576impl<'f, A: Automaton> StreamBuilder<'f, A> {
577 fn new(meta: &'f FstMeta, data: &'f [u8], aut: A) -> Self {
578 StreamBuilder {
579 meta,
580 data,
581 aut,
582 min: Bound::Unbounded,
583 max: Bound::Unbounded,
584 backward: false,
585 }
586 }
587
588 /// Specify a greater-than-or-equal-to bound.
589 pub fn ge<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
590 self.min = Bound::Included(bound.as_ref().to_owned());
591 self
592 }
593
594 /// Specify a greater-than bound.
595 pub fn gt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
596 self.min = Bound::Excluded(bound.as_ref().to_owned());
597 self
598 }
599
600 /// Specify a less-than-or-equal-to bound.
601 pub fn le<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
602 self.max = Bound::Included(bound.as_ref().to_owned());
603 self
604 }
605
606 /// Specify a less-than bound.
607 pub fn lt<T: AsRef<[u8]>>(mut self, bound: T) -> Self {
608 self.max = Bound::Excluded(bound.as_ref().to_owned());
609 self
610 }
611
612 /// Sets the `StreamBuilder` to stream the `(key, value)` backward.
613 pub fn backward(mut self) -> Self {
614 self.backward = true;
615 self
616 }
617
618 /// Return this builder and gives the automaton states
619 /// along with the results.
620 pub fn with_state(self) -> StreamWithStateBuilder<'f, A> {
621 StreamWithStateBuilder(self)
622 }
623}
624
625impl<'a, 'f, A: Automaton> IntoStreamer<'a> for StreamBuilder<'f, A> {
626 type Item = (&'a [u8], Output);
627 type Into = Stream<'f, A>;
628
629 fn into_stream(self) -> Stream<'f, A> {
630 Stream::new(
631 self.meta,
632 self.data,
633 self.aut,
634 self.min,
635 self.max,
636 self.backward,
637 )
638 }
639}
640
641/// A builder for constructing range queries of streams
642/// that returns results along with automaton states.
643///
644/// Once all bounds are set, one should call `into_stream` to get a
645/// `StreamWithState`.
646///
647/// Bounds are not additive. That is, if `ge` is called twice on the same
648/// builder, then the second setting wins.
649///
650/// The `A` type parameter corresponds to an optional automaton to filter
651/// the stream. By default, no filtering is done.
652///
653/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
654pub struct StreamWithStateBuilder<'f, A = AlwaysMatch>(StreamBuilder<'f, A>);
655
656impl<'a, 'f, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'f, A>
657where
658 A::State: Clone,
659{
660 type Item = (&'a [u8], Output, A::State);
661 type Into = StreamWithState<'f, A>;
662
663 fn into_stream(self) -> StreamWithState<'f, A> {
664 StreamWithState::new(
665 self.0.meta,
666 self.0.data,
667 self.0.aut,
668 self.0.min,
669 self.0.max,
670 self.0.backward,
671 )
672 }
673}
674
675#[derive(Clone, Debug)]
676enum Bound {
677 Included(Vec<u8>),
678 Excluded(Vec<u8>),
679 Unbounded,
680}
681
682impl Bound {
683 fn exceeded_by(&self, inp: &[u8]) -> bool {
684 match *self {
685 Bound::Included(ref v) => inp > v,
686 Bound::Excluded(ref v) => inp >= v,
687 Bound::Unbounded => false,
688 }
689 }
690
691 fn subceeded_by(&self, inp: &[u8]) -> bool {
692 match *self {
693 Bound::Included(ref v) => inp < v,
694 Bound::Excluded(ref v) => inp <= v,
695 Bound::Unbounded => false,
696 }
697 }
698
699 fn is_empty(&self) -> bool {
700 match *self {
701 Bound::Included(ref v) => v.is_empty(),
702 Bound::Excluded(ref v) => v.is_empty(),
703 Bound::Unbounded => true,
704 }
705 }
706
707 fn is_inclusive(&self) -> bool {
708 match *self {
709 Bound::Excluded(_) => false,
710 _ => true,
711 }
712 }
713}
714
715/// Stream of `key, value` not exposing the state of the automaton.
716pub struct Stream<'f, A = AlwaysMatch>(StreamWithState<'f, A>)
717where
718 A: Automaton;
719
720impl<'f, A: Automaton> Stream<'f, A> {
721 fn new(
722 meta: &'f FstMeta,
723 data: &'f [u8],
724 aut: A,
725 min: Bound,
726 max: Bound,
727 backward: bool,
728 ) -> Self {
729 Self(StreamWithState::new(meta, data, aut, min, max, backward))
730 }
731
732 /// Convert this stream into a vector of byte strings and outputs.
733 ///
734 /// Note that this creates a new allocation for every key in the stream.
735 pub fn into_byte_vec(mut self) -> Vec<(Vec<u8>, u64)> {
736 let mut vs = vec![];
737 while let Some((k, v)) = self.next() {
738 vs.push((k.to_vec(), v.value()));
739 }
740 vs
741 }
742
743 /// Convert this stream into a vector of Unicode strings and outputs.
744 ///
745 /// If any key is not valid UTF-8, then iteration on the stream is stopped
746 /// and a UTF-8 decoding error is returned.
747 ///
748 /// Note that this creates a new allocation for every key in the stream.
749 pub fn into_str_vec(mut self) -> Result<Vec<(String, u64)>> {
750 let mut vs = vec![];
751 while let Some((k, v)) = self.next() {
752 let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
753 vs.push((k, v.value()));
754 }
755 Ok(vs)
756 }
757
758 /// Convert this stream into a vector of byte strings.
759 ///
760 /// Note that this creates a new allocation for every key in the stream.
761 pub fn into_byte_keys(mut self) -> Vec<Vec<u8>> {
762 let mut vs = vec![];
763 while let Some((k, _)) = self.next() {
764 vs.push(k.to_vec());
765 }
766 vs
767 }
768
769 /// Convert this stream into a vector of Unicode strings.
770 ///
771 /// If any key is not valid UTF-8, then iteration on the stream is stopped
772 /// and a UTF-8 decoding error is returned.
773 ///
774 /// Note that this creates a new allocation for every key in the stream.
775 pub fn into_str_keys(mut self) -> Result<Vec<String>> {
776 let mut vs = vec![];
777 while let Some((k, _)) = self.next() {
778 let k = String::from_utf8(k.to_vec()).map_err(Error::from)?;
779 vs.push(k);
780 }
781 Ok(vs)
782 }
783
784 /// Convert this stream into a vector of outputs.
785 pub fn into_values(mut self) -> Vec<u64> {
786 let mut vs = vec![];
787 while let Some((_, v)) = self.next() {
788 vs.push(v.value());
789 }
790 vs
791 }
792}
793
794impl<'f, 'a, A: Automaton> Streamer<'a> for Stream<'f, A> {
795 type Item = (&'a [u8], Output);
796
797 fn next(&'a mut self) -> Option<Self::Item> {
798 self.0.next(|_| ()).map(|(key, out, _)| (key, out))
799 }
800}
801
802/// A lexicographically ordered stream from an fst
803/// of key-value pairs along with the state of the automaton.
804///
805/// The `A` type parameter corresponds to an optional automaton to filter
806/// the stream. By default, no filtering is done.
807///
808/// The `'f` lifetime parameter refers to the lifetime of the underlying fst.
809#[derive(Clone)]
810pub struct StreamWithState<'f, A = AlwaysMatch>
811where
812 A: Automaton,
813{
814 fst: &'f FstMeta,
815 data: &'f [u8],
816 aut: A,
817 inp: Buffer,
818 empty_output: Option<Output>,
819 stack: Vec<StreamState<'f, A::State>>,
820 end_at: Bound,
821 min: Bound,
822 max: Bound,
823 reversed: bool,
824}
825
826#[derive(Clone, Debug)]
827struct StreamState<'f, S> {
828 node: Node<'f>,
829 trans: usize,
830 out: Output,
831 aut_state: S,
832 done: bool, // ('done' = true) means that there are no unexplored transitions in the current state.
833 // 'trans' value should be ignored when done is true.
834}
835
836impl<'f, A: Automaton> StreamWithState<'f, A> {
837 fn new(
838 fst: &'f FstMeta,
839 data: &'f [u8],
840 aut: A,
841 min: Bound,
842 max: Bound,
843 backward: bool,
844 ) -> Self {
845 let min_2 = min.clone();
846 let max_2 = max.clone();
847 let end_at: Bound = if !backward { max.clone() } else { min.clone() };
848 let mut stream = StreamWithState {
849 fst,
850 data,
851 aut,
852 inp: Buffer::new(),
853 empty_output: None,
854 stack: vec![],
855 end_at,
856 min: min_2,
857 max: max_2,
858 reversed: backward,
859 };
860 stream.seek(&min, &max);
861 stream
862 }
863
864 /// Seeks the underlying stream such that the next key to be read is the
865 /// smallest key in the underlying fst that satisfies the given minimum
866 /// bound.
867 ///
868 /// This theoretically should be straight-forward, but we need to make
869 /// sure our stack is correct, which includes accounting for automaton
870 /// states.
871 fn seek(&mut self, min: &Bound, max: &Bound) {
872 let start_bound = if self.reversed { &max } else { &min };
873 if min.is_empty() && min.is_inclusive() {
874 self.empty_output = self.resolve_empty_output(min, max);
875 }
876 if start_bound.is_empty() {
877 self.stack.clear();
878 let node = self.fst.root(self.data);
879 let transition = self.starting_transition(&node);
880 self.stack = vec![StreamState {
881 node,
882 trans: transition.unwrap_or_default(),
883 out: Output::zero(),
884 aut_state: self.aut.start(),
885 done: transition.is_none(),
886 }];
887 return;
888 }
889 let (key, inclusive) = match start_bound {
890 Bound::Excluded(ref start_bound) => (start_bound, false),
891 Bound::Included(ref start_bound) => (start_bound, true),
892 Bound::Unbounded => unreachable!(),
893 };
894 // At this point, we need to find the starting location of `min` in
895 // the FST. However, as we search, we need to maintain a stack of
896 // reader states so that the reader can pick up where we left off.
897 // N.B. We do not necessarily need to stop in a final state, unlike
898 // the one-off `find` method. For the example, the given bound might
899 // not actually exist in the FST.
900 let mut node = self.fst.root(self.data);
901 let mut out = Output::zero();
902 let mut aut_state = self.aut.start();
903 for &b in key {
904 match node.find_input(b) {
905 Some(i) => {
906 let t = node.transition(i);
907 let prev_state = aut_state;
908 aut_state = self.aut.accept(&prev_state, b);
909 self.inp.push(b);
910 let transition = self.next_transition(&node, i);
911 self.stack.push(StreamState {
912 node,
913 trans: transition.unwrap_or_default(),
914 out,
915 aut_state: prev_state,
916 done: transition.is_none(),
917 });
918 out = out.cat(t.out);
919 node = self.fst.node(t.addr, self.data);
920 }
921 None => {
922 // This is a little tricky. We're in this case if the
923 // given bound is not a prefix of any key in the FST.
924 // Since this is a minimum bound, we need to find the
925 // first transition in this node that proceeds the current
926 // input byte.
927 let trans = self.transition_within_bound(&node, b);
928 self.stack.push(StreamState {
929 node,
930 trans: trans.unwrap_or_default(),
931 out,
932 aut_state,
933 done: trans.is_none(),
934 });
935 return;
936 }
937 }
938 }
939 if self.stack.is_empty() {
940 return;
941 }
942 let last = self.stack.len() - 1;
943 let state = &self.stack[last];
944 let transition = if !state.done {
945 self.previous_transition(&state.node, state.trans)
946 } else {
947 self.last_transition(&state.node)
948 };
949 if inclusive {
950 self.stack[last].trans = transition.unwrap_or_default();
951 self.stack[last].done = transition.is_none();
952 self.inp.pop();
953 } else {
954 let next_node = self.fst.node(
955 state.node.transition(transition.unwrap_or_default()).addr,
956 self.data,
957 );
958 let starting_transition = self.starting_transition(&next_node);
959 self.stack.push(StreamState {
960 node: next_node,
961 trans: starting_transition.unwrap_or_default(),
962 out,
963 aut_state,
964 done: starting_transition.is_none(),
965 });
966 }
967 }
968
969 #[inline]
970 fn next<F, T>(&mut self, transform: F) -> Option<(&[u8], Output, T)>
971 where
972 F: Fn(&A::State) -> T,
973 {
974 if !self.reversed {
975 // Inorder empty output (will be first).
976 if let Some(out) = self.empty_output.take() {
977 return Some((&[], out, transform(&self.aut.start())));
978 }
979 }
980 while let Some(state) = self.stack.pop() {
981 if state.done || !self.aut.can_match(&state.aut_state) {
982 if state.node.addr() != self.fst.root_addr {
983 // Reversed return next logic.
984 // If the stack is empty the value should not be returned.
985 if self.reversed && !self.stack.is_empty() && state.node.is_final() {
986 let out_of_bounds =
987 self.min.subceeded_by(&self.inp) || self.max.exceeded_by(&self.inp);
988 if !out_of_bounds && self.aut.is_match(&state.aut_state) {
989 return Some((&self.inp.pop(), state.out, transform(&state.aut_state)));
990 }
991 }
992 self.inp.pop();
993 }
994 continue;
995 }
996 let trans = state.node.transition(state.trans);
997 let out = state.out.cat(trans.out);
998 let next_state = self.aut.accept(&state.aut_state, trans.inp);
999 let is_match = self.aut.is_match(&next_state);
1000 let next_node = self.fst.node(trans.addr, self.data);
1001 self.inp.push(trans.inp);
1002 let current_transition = self.next_transition(&state.node, state.trans);
1003 self.stack.push(StreamState {
1004 trans: current_transition.unwrap_or_default(),
1005 done: current_transition.is_none(),
1006 ..state
1007 });
1008 let ns = transform(&next_state);
1009 let next_transition = self.starting_transition(&next_node);
1010 self.stack.push(StreamState {
1011 node: next_node,
1012 trans: next_transition.unwrap_or_default(),
1013 out,
1014 aut_state: next_state,
1015 done: next_transition.is_none(),
1016 });
1017 // Inorder return next logic.
1018 if !self.reversed {
1019 if self.end_at.exceeded_by(&self.inp) {
1020 // We are done, forever.
1021 self.stack.clear();
1022 return None;
1023 } else if !self.reversed && next_node.is_final() && is_match {
1024 return Some((&self.inp, out.cat(next_node.final_output()), ns));
1025 }
1026 }
1027 }
1028 // If we are streaming backward, we still need to return the empty output, if empty is
1029 // part of our fst, matches the range and the automaton
1030 self.empty_output
1031 .take()
1032 .map(|out| (&[][..], out, transform(&self.aut.start())))
1033 }
1034
1035 // The first transition that is in a bound for a given node.
1036 #[inline]
1037 fn transition_within_bound(&self, node: &Node<'f>, bound: u8) -> Option<usize> {
1038 let mut trans;
1039 if let Some(t) = self.starting_transition(&node) {
1040 trans = t;
1041 } else {
1042 return None;
1043 }
1044 loop {
1045 let transition = node.transition(trans);
1046 if (!self.reversed && transition.inp > bound)
1047 || (self.reversed && transition.inp < bound)
1048 {
1049 return Some(trans);
1050 } else if let Some(t) = self.next_transition(&node, trans) {
1051 trans = t;
1052 } else {
1053 return None;
1054 }
1055 }
1056 }
1057
1058 /// Resolves value of the empty output. Will be none if the empty output should not be returned.
1059 #[inline]
1060 fn resolve_empty_output(&mut self, min: &Bound, max: &Bound) -> Option<Output> {
1061 if min.subceeded_by(&[]) || max.exceeded_by(&[]) {
1062 return None;
1063 }
1064 let start = self.aut.start();
1065 if !self.aut.is_match(&start) {
1066 return None;
1067 }
1068 self.fst.empty_final_output(self.data)
1069 }
1070
1071 #[inline]
1072 fn starting_transition(&self, node: &Node<'f>) -> Option<usize> {
1073 if node.is_empty() {
1074 None
1075 } else if !self.reversed {
1076 Some(0)
1077 } else {
1078 Some(node.len() - 1)
1079 }
1080 }
1081
1082 #[inline]
1083 fn last_transition(&self, node: &Node<'f>) -> Option<usize> {
1084 if node.is_empty() {
1085 None
1086 } else if self.reversed {
1087 Some(0)
1088 } else {
1089 Some(node.len() - 1)
1090 }
1091 }
1092
1093 /// Returns the next transition.
1094 ///
1095 /// The concept of `next` transition is dependent on whether the stream is in reverse mode or
1096 /// not. If all the transitions of this node have been emitted, this method returns None.
1097 #[inline]
1098 fn next_transition(&self, node: &Node<'f>, current_transition: usize) -> Option<usize> {
1099 if self.reversed {
1100 Self::backward_transition(node, current_transition)
1101 } else {
1102 Self::forward_transition(node, current_transition)
1103 }
1104 }
1105
1106 /// See `StreamWithState::next_transition`.
1107 #[inline]
1108 fn previous_transition(&self, node: &Node<'f>, current_transition: usize) -> Option<usize> {
1109 if self.reversed {
1110 Self::forward_transition(node, current_transition)
1111 } else {
1112 Self::backward_transition(node, current_transition)
1113 }
1114 }
1115
1116 /// Returns the next logical transition.
1117 ///
1118 /// This is independent from whether the stream is in backward mode or not.
1119 #[inline]
1120 fn forward_transition(node: &Node<'f>, current_transition: usize) -> Option<usize> {
1121 if current_transition + 1 < node.len() {
1122 Some(current_transition + 1)
1123 } else {
1124 None
1125 }
1126 }
1127
1128 /// See [Stream::forward_transition].
1129 #[inline]
1130 fn backward_transition(node: &Node<'f>, current_transition: usize) -> Option<usize> {
1131 if current_transition > 0 && !node.is_empty() {
1132 Some(current_transition - 1)
1133 } else {
1134 None
1135 }
1136 }
1137}
1138
1139impl<'f, 'a, A: 'a + Automaton> Streamer<'a> for StreamWithState<'f, A>
1140where
1141 A::State: Clone,
1142{
1143 type Item = (&'a [u8], Output, A::State);
1144
1145 fn next(&'a mut self) -> Option<Self::Item> {
1146 self.next(Clone::clone)
1147 }
1148}
1149
1150/// An output is a value that is associated with a key in a finite state
1151/// transducer.
1152///
1153/// Note that outputs must satisfy an algebra. Namely, it must have an additive
1154/// identity and the following binary operations defined: `prefix`,
1155/// `concatenation` and `subtraction`. `prefix` and `concatenation` are
1156/// commutative while `subtraction` is not. `subtraction` is only defined on
1157/// pairs of operands where the first operand is greater than or equal to the
1158/// second operand.
1159///
1160/// Currently, output values must be `u64`. However, in theory, an output value
1161/// can be anything that satisfies the above algebra. Future versions of this
1162/// crate may make outputs generic on this algebra.
1163#[derive(Copy, Clone, Debug, Hash, Eq, Ord, PartialEq, PartialOrd)]
1164pub struct Output(u64);
1165
1166#[derive(Clone)]
1167struct Buffer {
1168 buf: Box<[u8]>,
1169 len: usize,
1170}
1171
1172impl Buffer {
1173 fn new() -> Self {
1174 Buffer {
1175 buf: vec![0u8; KEY_BUFFER_CAPACITY].into_boxed_slice(),
1176 len: 0,
1177 }
1178 }
1179
1180 fn capacity(&self) -> usize {
1181 self.buf.len()
1182 }
1183
1184 fn double_cap(&mut self) {
1185 let old_cap = self.capacity();
1186 let new_cap = old_cap * 2;
1187 let mut new_buf = vec![0u8; new_cap].into_boxed_slice();
1188 new_buf[..old_cap].copy_from_slice(&self.buf[..old_cap]);
1189 self.buf = new_buf;
1190 }
1191
1192 fn push(&mut self, b: u8) {
1193 if self.capacity() <= self.len {
1194 self.double_cap();
1195 }
1196 self.buf[self.len] = b;
1197 self.len += 1;
1198 }
1199
1200 // Pops one byte and returns the entire chain before the byte was popped.
1201 fn pop(&mut self) -> &[u8] {
1202 let len = self.len;
1203 self.len = len - 1;
1204 &self.buf[..len]
1205 }
1206}
1207
1208impl Deref for Buffer {
1209 type Target = [u8];
1210
1211 fn deref(&self) -> &[u8] {
1212 &self.buf[..self.len]
1213 }
1214}
1215
1216impl Output {
1217 /// Create a new output from a `u64`.
1218 #[inline]
1219 pub fn new(v: u64) -> Output {
1220 Output(v)
1221 }
1222
1223 /// Create a zero output.
1224 #[inline]
1225 pub fn zero() -> Output {
1226 Output(0)
1227 }
1228
1229 /// Retrieve the value inside this output.
1230 #[inline]
1231 pub fn value(self) -> u64 {
1232 self.0
1233 }
1234
1235 /// Returns true if this is a zero output.
1236 #[inline]
1237 pub fn is_zero(self) -> bool {
1238 self.0 == 0
1239 }
1240
1241 /// Returns the prefix of this output and `o`.
1242 #[inline]
1243 pub fn prefix(self, o: Output) -> Output {
1244 Output(cmp::min(self.0, o.0))
1245 }
1246
1247 /// Returns the concatenation of this output and `o`.
1248 #[inline]
1249 pub fn cat(self, o: Output) -> Output {
1250 Output(self.0 + o.0)
1251 }
1252
1253 /// Returns the subtraction of `o` from this output.
1254 ///
1255 /// This function panics if `self > o`.
1256 #[inline]
1257 pub fn sub(self, o: Output) -> Output {
1258 Output(
1259 self.0
1260 .checked_sub(o.0)
1261 .expect("BUG: underflow subtraction not allowed"),
1262 )
1263 }
1264}
1265
1266/// A transition from one note to another.
1267#[derive(Copy, Clone, Hash, Eq, PartialEq)]
1268pub struct Transition {
1269 /// The byte input associated with this transition.
1270 pub inp: u8,
1271 /// The output associated with this transition.
1272 pub out: Output,
1273 /// The address of the node that this transition points to.
1274 pub addr: CompiledAddr,
1275}
1276
1277impl Default for Transition {
1278 #[inline]
1279 fn default() -> Self {
1280 Transition {
1281 inp: 0,
1282 out: Output::zero(),
1283 addr: NONE_ADDRESS,
1284 }
1285 }
1286}
1287
1288impl fmt::Debug for Transition {
1289 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1290 if self.out.is_zero() {
1291 write!(f, "{} -> {}", self.inp as char, self.addr)
1292 } else {
1293 write!(
1294 f,
1295 "({}, {}) -> {}",
1296 self.inp as char,
1297 self.out.value(),
1298 self.addr
1299 )
1300 }
1301 }
1302}
1303
1304#[inline]
1305#[cfg(target_pointer_width = "64")]
1306fn u64_to_usize(n: u64) -> usize {
1307 n as usize
1308}
1309
1310#[inline]
1311#[cfg(not(target_pointer_width = "64"))]
1312fn u64_to_usize(n: u64) -> usize {
1313 if n > ::std::usize::MAX as u64 {
1314 panic!(
1315 "\
1316Cannot convert node address {} to a pointer sized variable. If this FST
1317is very large and was generated on a system with a larger pointer size
1318than this system, then it is not possible to read this FST on this
1319system.",
1320 n
1321 );
1322 }
1323 n as usize
1324}