tantivy_fst/
map.rs

1use std::fmt;
2use std::io;
3use std::iter::FromIterator;
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7pub use crate::raw::IndexedValue;
8use crate::raw::Output;
9use crate::stream::{IntoStreamer, Streamer};
10use crate::Result;
11use std::ops::Deref;
12
13/// Map is a lexicographically ordered map from byte strings to integers.
14///
15/// A `Map` is constructed with the `MapBuilder` type. Alternatively, a `Map`
16/// can be constructed in memory from a lexicographically ordered iterator
17/// of key-value pairs (`Map::from_iter`).
18///
19/// A key feature of `Map` is that it can be serialized to disk compactly. Its
20/// underlying representation is built such that the `Map` can be memory mapped
21/// (`Map::from_path`) and searched without necessarily loading the entire
22/// map into memory.
23///
24/// It supports most common operations associated with maps, such as key
25/// lookup and search. It also supports set operations on its keys along with
26/// the ability to specify how conflicting values are merged together. Maps
27/// also support range queries and automata based searches (e.g. a regular
28/// expression).
29///
30/// Maps are represented by a finite state transducer where inputs are the keys
31/// and outputs are the values. As such, maps have the following invariants:
32///
33/// 1. Once constructed, a `Map` can never be modified.
34/// 2. Maps must be constructed with lexicographically ordered byte sequences.
35///    There is no restricting on the ordering of values.
36///
37/// # Differences with sets
38///
39/// Maps and sets are represented by the same underlying data structure: the
40/// finite state transducer. The principal difference between them is that
41/// sets always have their output values set to `0`. This has an impact on the
42/// representation size and is reflected in the type system for convenience.
43/// A secondary but subtle difference is that duplicate values can be added
44/// to a set, but it is an error to do so with maps. That is, a set can have
45/// the same key added sequentially, but a map can't.
46///
47/// # The future
48///
49/// It is regrettable that the output value is fixed to `u64`. Indeed, it is
50/// not necessary, but it was a major simplification in the implementation.
51/// In the future, the value type may become generic to an extent (outputs must
52/// satisfy a basic algebra).
53///
54/// Keys will always be byte strings; however, we may grow more conveniences
55/// around dealing with them (such as a serialization/deserialization step,
56/// although it isn't clear where exactly this should live).
57pub struct Map<Data>(raw::Fst<Data>);
58
59impl Map<Vec<u8>> {
60    /// Creates a map from its representation as a raw byte sequence.
61    ///
62    /// Note that this operation is very cheap (no allocations and no copies).
63    ///
64    /// The map must have been written with a compatible finite state
65    /// transducer builder (`MapBuilder` qualifies). If the format is invalid
66    /// or if there is a mismatch between the API version of this library
67    /// and the map, then an error is returned.
68    pub fn from_bytes(bytes: Vec<u8>) -> Result<Map<Vec<u8>>> {
69        raw::Fst::new(bytes).map(Map)
70    }
71
72    /// Create a `Map` from an iterator of lexicographically ordered byte
73    /// strings and associated values.
74    ///
75    /// If the iterator does not yield unique keys in lexicographic order, then
76    /// an error is returned.
77    ///
78    /// Note that this is a convenience function to build a map in memory.
79    /// To build a map that streams to an arbitrary `io::Write`, use
80    /// `MapBuilder`.
81    pub fn from_iter<K, I>(iter: I) -> Result<Self>
82    where
83        K: AsRef<[u8]>,
84        I: IntoIterator<Item = (K, u64)>,
85    {
86        let mut builder = MapBuilder::memory();
87        builder.extend_iter(iter)?;
88        Map::from_bytes(builder.into_inner()?)
89    }
90}
91
92impl<Data: Deref<Target = [u8]>> Map<Data> {
93    /// Tests the membership of a single key.
94    ///
95    /// # Example
96    ///
97    /// ```rust
98    /// use tantivy_fst::Map;
99    ///
100    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
101    ///
102    /// assert_eq!(map.contains_key("b"), true);
103    /// assert_eq!(map.contains_key("z"), false);
104    /// ```
105    pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
106        self.0.contains_key(key)
107    }
108
109    /// Retrieves the value associated with a key.
110    ///
111    /// If the key does not exist, then `None` is returned.
112    ///
113    /// # Example
114    ///
115    /// ```rust
116    /// use tantivy_fst::Map;
117    ///
118    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
119    ///
120    /// assert_eq!(map.get("b"), Some(2));
121    /// assert_eq!(map.get("z"), None);
122    /// ```
123    pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<u64> {
124        self.0.get(key).map(|output| output.value())
125    }
126
127    /// Return a lexicographically ordered stream of all key-value pairs in
128    /// this map.
129    ///
130    /// While this is a stream, it does require heap space proportional to the
131    /// longest key in the map.
132    ///
133    /// If the map is memory mapped, then no further heap space is needed.
134    /// Note though that your operating system may fill your page cache
135    /// (which will cause the resident memory usage of the process to go up
136    /// correspondingly).
137    ///
138    /// # Example
139    ///
140    /// Since streams are not iterators, the traditional `for` loop cannot be
141    /// used. `while let` is useful instead:
142    ///
143    /// ```rust
144    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
145    ///
146    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
147    /// let mut stream = map.stream();
148    ///
149    /// let mut kvs = vec![];
150    /// while let Some((k, v)) = stream.next() {
151    ///     kvs.push((k.to_vec(), v));
152    /// }
153    /// assert_eq!(kvs, vec![
154    ///     (b"a".to_vec(), 1),
155    ///     (b"b".to_vec(), 2),
156    ///     (b"c".to_vec(), 3),
157    /// ]);
158    /// ```
159    #[inline]
160    pub fn stream(&self) -> Stream {
161        Stream(self.0.stream())
162    }
163
164    /// Return a lexicographically ordered stream of all keys in this map.
165    ///
166    /// Memory requirements are the same as described on `Map::stream`.
167    ///
168    /// # Example
169    ///
170    /// ```rust
171    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
172    ///
173    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
174    /// let mut stream = map.keys();
175    ///
176    /// let mut keys = vec![];
177    /// while let Some(k) = stream.next() {
178    ///     keys.push(k.to_vec());
179    /// }
180    /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
181    /// ```
182    #[inline]
183    pub fn keys(&self) -> Keys {
184        Keys(self.0.stream())
185    }
186
187    /// Return a stream of all values in this map ordered lexicographically
188    /// by each value's corresponding key.
189    ///
190    /// Memory requirements are the same as described on `Map::stream`.
191    ///
192    /// # Example
193    ///
194    /// ```rust
195    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
196    ///
197    /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
198    /// let mut stream = map.values();
199    ///
200    /// let mut values = vec![];
201    /// while let Some(v) = stream.next() {
202    ///     values.push(v);
203    /// }
204    /// assert_eq!(values, vec![1, 2, 3]);
205    /// ```
206    #[inline]
207    pub fn values(&self) -> Values {
208        Values(self.0.stream())
209    }
210
211    /// Return a builder for range queries.
212    ///
213    /// A range query returns a subset of key-value pairs in this map in a
214    /// range given in lexicographic order.
215    ///
216    /// Memory requirements are the same as described on `Map::stream`.
217    /// Notably, only the keys in the range are read; keys outside the range
218    /// are not.
219    ///
220    /// # Example
221    ///
222    /// Returns only the key-value pairs in the range given.
223    ///
224    /// ```rust
225    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
226    ///
227    /// let map = Map::from_iter(vec![
228    ///     ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5),
229    /// ]).unwrap();
230    /// let mut stream = map.range().ge("b").lt("e").into_stream();
231    ///
232    /// let mut kvs = vec![];
233    /// while let Some((k, v)) = stream.next() {
234    ///     kvs.push((k.to_vec(), v));
235    /// }
236    /// assert_eq!(kvs, vec![
237    ///     (b"b".to_vec(), 2),
238    ///     (b"c".to_vec(), 3),
239    ///     (b"d".to_vec(), 4),
240    /// ]);
241    /// ```
242    #[inline]
243    pub fn range(&self) -> StreamBuilder {
244        StreamBuilder(self.0.range())
245    }
246
247    /// Executes an automaton on the keys of this map.
248    ///
249    /// Note that this returns a `StreamBuilder`, which can be used to
250    /// add a range query to the search (see the `range` method).
251    ///
252    /// Memory requirements are the same as described on `Map::stream`.
253    ///
254    /// # Example
255    ///
256    /// An implementation of regular expressions for `Automaton` is available
257    /// in the `fst-regex` crate, which can be used to search maps.
258    ///
259    /// ```rust
260    ///
261    /// use std::error::Error;
262    ///
263    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
264    /// #[cfg(feature = "regex")]
265    /// use tantivy_fst::Regex;
266    ///
267    /// #[cfg(feature = "regex")]
268    /// {
269    ///     let map = Map::from_iter(vec![
270    ///         ("foo", 1), ("foo1", 2), ("foo2", 3), ("foo3", 4), ("foobar", 5),
271    ///     ]).unwrap();
272    ///
273    ///     let re = Regex::new("f[a-z]+3?").unwrap();
274    ///     let mut stream = map.search(&re).into_stream();
275    ///
276    ///     let mut kvs = vec![];
277    ///     while let Some((k, v)) = stream.next() {
278    ///         kvs.push((k.to_vec(), v));
279    ///     }
280    ///     assert_eq!(kvs, vec![
281    ///         (b"foo".to_vec(), 1),
282    ///         (b"foo3".to_vec(), 4),
283    ///         (b"foobar".to_vec(), 5),
284    ///     ]);
285    ///
286    /// }
287    /// ```
288    pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
289        StreamBuilder(self.0.search(aut))
290    }
291
292    /// Returns the number of elements in this map.
293    #[inline]
294    pub fn len(&self) -> usize {
295        self.0.len()
296    }
297
298    /// Returns true if and only if this map is empty.
299    #[inline]
300    pub fn is_empty(&self) -> bool {
301        self.0.is_empty()
302    }
303
304    /// Creates a new map operation with this map added to it.
305    ///
306    /// The `OpBuilder` type can be used to add additional map streams
307    /// and perform set operations like union, intersection, difference and
308    /// symmetric difference on the keys of the map. These set operations also
309    /// allow one to specify how conflicting values are merged in the stream.
310    ///
311    /// # Example
312    ///
313    /// This example demonstrates a union on multiple map streams. Notice that
314    /// the stream returned from the union is not a sequence of key-value
315    /// pairs, but rather a sequence of keys associated with one or more
316    /// values. Namely, a key is associated with each value associated with
317    /// that same key in the all of the streams.
318    ///
319    /// ```rust
320    /// use tantivy_fst::{Streamer, Map};
321    /// use tantivy_fst::{map::IndexedValue};
322    ///
323    /// let map1 = Map::from_iter(vec![
324    ///     ("a", 1), ("b", 2), ("c", 3),
325    /// ]).unwrap();
326    /// let map2 = Map::from_iter(vec![
327    ///     ("a", 10), ("y", 11), ("z", 12),
328    /// ]).unwrap();
329    ///
330    /// let mut union = map1.op().add(&map2).union();
331    ///
332    /// let mut kvs = vec![];
333    /// while let Some((k, vs)) = union.next() {
334    ///     kvs.push((k.to_vec(), vs.to_vec()));
335    /// }
336    /// assert_eq!(kvs, vec![
337    ///     (b"a".to_vec(), vec![
338    ///         IndexedValue { index: 0, value: 1 },
339    ///         IndexedValue { index: 1, value: 10 },
340    ///     ]),
341    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
342    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
343    ///     (b"y".to_vec(), vec![IndexedValue { index: 1, value: 11 }]),
344    ///     (b"z".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
345    /// ]);
346    /// ```
347    #[inline]
348    pub fn op(&self) -> OpBuilder {
349        OpBuilder::new().add(self)
350    }
351
352    /// Returns a reference to the underlying raw finite state transducer.
353    #[inline]
354    pub fn as_fst(&self) -> &raw::Fst<Data> {
355        &self.0
356    }
357}
358
359impl<Data: Deref<Target = [u8]>> fmt::Debug for Map<Data> {
360    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
361        write!(f, "Map([")?;
362        let mut stream = self.stream();
363        let mut first = true;
364        while let Some((k, v)) = stream.next() {
365            if !first {
366                write!(f, ", ")?;
367            }
368            first = false;
369            write!(f, "({}, {})", String::from_utf8_lossy(k), v)?;
370        }
371        write!(f, "])")
372    }
373}
374
375// Construct a map from an Fst object.
376impl<Data> From<raw::Fst<Data>> for Map<Data> {
377    #[inline]
378    fn from(fst: raw::Fst<Data>) -> Self {
379        Map(fst)
380    }
381}
382
383/// Returns the underlying finite state transducer.
384impl<Data> AsRef<raw::Fst<Data>> for Map<Data> {
385    #[inline]
386    fn as_ref(&self) -> &raw::Fst<Data> {
387        &self.0
388    }
389}
390
391impl<'m, 'a, Data: Deref<Target = [u8]>> IntoStreamer<'a> for &'m Map<Data> {
392    type Item = (&'a [u8], u64);
393    type Into = Stream<'m>;
394
395    #[inline]
396    fn into_stream(self) -> Self::Into {
397        Stream(self.0.stream())
398    }
399}
400
401/// A builder for creating a map.
402///
403/// This is not your average everyday builder. It has two important qualities
404/// that make it a bit unique from what you might expect:
405///
406/// 1. All keys must be added in lexicographic order. Adding a key out of order
407///    will result in an error. Additionally, adding a duplicate key will also
408///    result in an error. That is, once a key is associated with a value,
409///    that association can never be modified or deleted.
410/// 2. The representation of a map is streamed to *any* `io::Write` as it is
411///    built. For an in memory representation, this can be a `Vec<u8>`.
412///
413/// Point (2) is especially important because it means that a map can be
414/// constructed *without storing the entire map in memory*. Namely, since it
415/// works with any `io::Write`, it can be streamed directly to a file.
416///
417/// With that said, the builder does use memory, but **memory usage is bounded
418/// to a constant size**. The amount of memory used trades off with the
419/// compression ratio. Currently, the implementation hard codes this trade off
420/// which can result in about 5-20MB of heap usage during construction. (N.B.
421/// Guaranteeing a maximal compression ratio requires memory proportional to
422/// the size of the map, which defeats some of the benefit of streaming
423/// it to disk. In practice, a small bounded amount of memory achieves
424/// close-to-minimal compression ratios.)
425///
426/// The algorithmic complexity of map construction is `O(n)` where `n` is the
427/// number of elements added to the map.
428///
429/// # Example: build in memory
430///
431/// This shows how to use the builder to construct a map in memory. Note that
432/// `Map::from_iter` provides a convenience function that achieves this same
433/// goal without needing to explicitly use `MapBuilder`.
434///
435/// ```rust
436/// use tantivy_fst::{IntoStreamer, Streamer, Map, MapBuilder};
437///
438/// let mut build = MapBuilder::memory();
439/// build.insert("bruce", 1).unwrap();
440/// build.insert("clarence", 2).unwrap();
441/// build.insert("stevie", 3).unwrap();
442///
443/// // You could also call `finish()` here, but since we're building the map in
444/// // memory, there would be no way to get the `Vec<u8>` back.
445/// let bytes = build.into_inner().unwrap();
446///
447/// // At this point, the map has been constructed, but here's how to read it.
448/// let map = Map::from_bytes(bytes).unwrap();
449/// let mut stream = map.into_stream();
450/// let mut kvs = vec![];
451/// while let Some((k, v)) = stream.next() {
452///     kvs.push((k.to_vec(), v));
453/// }
454/// assert_eq!(kvs, vec![
455///     (b"bruce".to_vec(), 1),
456///     (b"clarence".to_vec(), 2),
457///     (b"stevie".to_vec(), 3),
458/// ]);
459/// ```
460pub struct MapBuilder<W>(raw::Builder<W>);
461
462impl MapBuilder<Vec<u8>> {
463    /// Create a builder that builds a map in memory.
464    #[inline]
465    pub fn memory() -> Self {
466        MapBuilder(raw::Builder::memory())
467    }
468}
469
470impl<W: io::Write> MapBuilder<W> {
471    /// Create a builder that builds a map by writing it to `wtr` in a
472    /// streaming fashion.
473    pub fn new(wtr: W) -> Result<MapBuilder<W>> {
474        raw::Builder::new_type(wtr, 0).map(MapBuilder)
475    }
476
477    /// Insert a new key-value pair into the map.
478    ///
479    /// Keys must be convertible to byte strings. Values must be a `u64`, which
480    /// is a restriction of the current implementation of finite state
481    /// transducers. (Values may one day be expanded to other types.)
482    ///
483    /// If a key is inserted that is less than or equal to any previous key
484    /// added, then an error is returned. Similarly, if there was a problem
485    /// writing to the underlying writer, an error is returned.
486    pub fn insert<K: AsRef<[u8]>>(&mut self, key: K, val: u64) -> Result<()> {
487        self.0.insert(key, val)
488    }
489
490    /// Calls insert on each item in the iterator.
491    ///
492    /// If an error occurred while adding an element, processing is stopped
493    /// and the error is returned.
494    ///
495    /// If a key is inserted that is less than or equal to any previous key
496    /// added, then an error is returned. Similarly, if there was a problem
497    /// writing to the underlying writer, an error is returned.
498    pub fn extend_iter<K, I>(&mut self, iter: I) -> Result<()>
499    where
500        K: AsRef<[u8]>,
501        I: IntoIterator<Item = (K, u64)>,
502    {
503        self.0
504            .extend_iter(iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))))
505    }
506
507    /// Calls insert on each item in the stream.
508    ///
509    /// Note that unlike `extend_iter`, this is not generic on the items in
510    /// the stream.
511    ///
512    /// If a key is inserted that is less than or equal to any previous key
513    /// added, then an error is returned. Similarly, if there was a problem
514    /// writing to the underlying writer, an error is returned.
515    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
516    where
517        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
518        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
519    {
520        self.0.extend_stream(StreamOutput(stream.into_stream()))
521    }
522
523    /// Finishes the construction of the map and flushes the underlying
524    /// writer. After completion, the data written to `W` may be read using
525    /// one of `Map`'s constructor methods.
526    pub fn finish(self) -> Result<()> {
527        self.0.finish()
528    }
529
530    /// Just like `finish`, except it returns the underlying writer after
531    /// flushing it.
532    pub fn into_inner(self) -> Result<W> {
533        self.0.into_inner()
534    }
535
536    /// Gets a reference to the underlying writer.
537    pub fn get_ref(&self) -> &W {
538        self.0.get_ref()
539    }
540
541    /// Returns the number of bytes written to the underlying writer
542    pub fn bytes_written(&self) -> u64 {
543        self.0.bytes_written()
544    }
545}
546
547/// A lexicographically ordered stream of key-value pairs from a map.
548///
549/// The `A` type parameter corresponds to an optional automaton to filter
550/// the stream. By default, no filtering is done.
551///
552/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
553pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>)
554where
555    A: Automaton;
556
557impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> {
558    type Item = (&'a [u8], u64);
559
560    fn next(&'a mut self) -> Option<Self::Item> {
561        self.0.next().map(|(key, out)| (key, out.value()))
562    }
563}
564
565impl<'m, A: Automaton> Stream<'m, A> {
566    /// Convert this stream into a vector of byte strings and outputs.
567    ///
568    /// Note that this creates a new allocation for every key in the stream.
569    pub fn into_byte_vec(self) -> Vec<(Vec<u8>, u64)> {
570        self.0.into_byte_vec()
571    }
572
573    /// Convert this stream into a vector of Unicode strings and outputs.
574    ///
575    /// If any key is not valid UTF-8, then iteration on the stream is stopped
576    /// and a UTF-8 decoding error is returned.
577    ///
578    /// Note that this creates a new allocation for every key in the stream.
579    pub fn into_str_vec(self) -> Result<Vec<(String, u64)>> {
580        self.0.into_str_vec()
581    }
582
583    /// Convert this stream into a vector of byte strings.
584    ///
585    /// Note that this creates a new allocation for every key in the stream.
586    pub fn into_byte_keys(self) -> Vec<Vec<u8>> {
587        self.0.into_byte_keys()
588    }
589
590    /// Convert this stream into a vector of Unicode strings.
591    ///
592    /// If any key is not valid UTF-8, then iteration on the stream is stopped
593    /// and a UTF-8 decoding error is returned.
594    ///
595    /// Note that this creates a new allocation for every key in the stream.
596    pub fn into_str_keys(self) -> Result<Vec<String>> {
597        self.0.into_str_keys()
598    }
599
600    /// Convert this stream into a vector of outputs.
601    pub fn into_values(self) -> Vec<u64> {
602        self.0.into_values()
603    }
604}
605
606/// A lexicographically ordered stream of keys from a map.
607///
608/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
609pub struct Keys<'m>(raw::Stream<'m>);
610
611impl<'a, 'm> Streamer<'a> for Keys<'m> {
612    type Item = &'a [u8];
613
614    #[inline]
615    fn next(&'a mut self) -> Option<Self::Item> {
616        self.0.next().map(|(key, _)| key)
617    }
618}
619
620/// A stream of values from a map, lexicographically ordered by each value's
621/// corresponding key.
622///
623/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
624pub struct Values<'m>(raw::Stream<'m>);
625
626impl<'a, 'm> Streamer<'a> for Values<'m> {
627    type Item = u64;
628
629    #[inline]
630    fn next(&'a mut self) -> Option<Self::Item> {
631        self.0.next().map(|(_, out)| out.value())
632    }
633}
634
635/// A builder for constructing range queries on streams.
636///
637/// Once all bounds are set, one should call `into_stream` to get a
638/// `Stream`.
639///
640/// Bounds are not additive. That is, if `ge` is called twice on the same
641/// builder, then the second setting wins.
642///
643/// The `A` type parameter corresponds to an optional automaton to filter
644/// the stream. By default, no filtering is done.
645///
646/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
647pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>);
648
649impl<'m, A: Automaton> StreamBuilder<'m, A> {
650    /// Specify a greater-than-or-equal-to bound.
651    pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> Self {
652        StreamBuilder(self.0.ge(bound))
653    }
654
655    /// Specify a greater-than bound.
656    pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> Self {
657        StreamBuilder(self.0.gt(bound))
658    }
659
660    /// Specify a less-than-or-equal-to bound.
661    pub fn le<T: AsRef<[u8]>>(self, bound: T) -> Self {
662        StreamBuilder(self.0.le(bound))
663    }
664
665    /// Specify a less-than bound.
666    pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> Self {
667        StreamBuilder(self.0.lt(bound))
668    }
669
670    /// Make it iterate backwards.
671    pub fn backward(self) -> Self {
672        StreamBuilder(self.0.backward())
673    }
674
675    /// Return this builder and gives the automaton states
676    /// along with the results.
677    pub fn with_state(self) -> StreamWithStateBuilder<'m, A> {
678        StreamWithStateBuilder(self.0.with_state())
679    }
680}
681
682impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> {
683    type Item = (&'a [u8], u64);
684    type Into = Stream<'m, A>;
685
686    fn into_stream(self) -> Self::Into {
687        Stream(self.0.into_stream())
688    }
689}
690
691/// A builder for constructing range queries of streams
692/// that returns results along with automaton states.
693///
694/// Once all bounds are set, one should call `into_stream` to get a
695/// `StreamWithState`.
696///
697/// Bounds are not additive. That is, if `ge` is called twice on the same
698/// builder, then the second setting wins.
699///
700/// The `A` type parameter corresponds to an optional automaton to filter
701/// the stream. By default, no filtering is done.
702///
703/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
704pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>(raw::StreamWithStateBuilder<'m, A>);
705
706impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'m, A>
707where
708    A::State: Clone,
709{
710    type Item = (&'a [u8], u64, A::State);
711    type Into = StreamWithState<'m, A>;
712
713    fn into_stream(self) -> Self::Into {
714        StreamWithState(self.0.into_stream())
715    }
716}
717
718/// A builder for collecting map streams on which to perform set operations
719/// on the keys of maps.
720///
721/// Set operations include intersection, union, difference and symmetric
722/// difference. The result of each set operation is itself a stream that emits
723/// pairs of keys and a sequence of each occurrence of that key in the
724/// participating streams. This information allows one to perform set
725/// operations on maps and customize how conflicting output values are handled.
726///
727/// All set operations work efficiently on an arbitrary number of
728/// streams with memory proportional to the number of streams.
729///
730/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
731/// where `n1, n2, n3, ...` correspond to the number of elements in each
732/// stream.
733///
734/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
735pub struct OpBuilder<'m>(raw::OpBuilder<'m>);
736
737impl<'m> OpBuilder<'m> {
738    /// Create a new set operation builder.
739    #[inline]
740    pub fn new() -> Self {
741        OpBuilder(raw::OpBuilder::default())
742    }
743
744    /// Add a stream to this set operation.
745    ///
746    /// This is useful for a chaining style pattern, e.g.,
747    /// `builder.add(stream1).add(stream2).union()`.
748    ///
749    /// The stream must emit a lexicographically ordered sequence of key-value
750    /// pairs.
751    pub fn add<I, S>(mut self, streamable: I) -> Self
752    where
753        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
754        S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
755    {
756        self.push(streamable);
757        self
758    }
759
760    /// Add a stream to this set operation.
761    ///
762    /// The stream must emit a lexicographically ordered sequence of key-value
763    /// pairs.
764    pub fn push<I, S>(&mut self, streamable: I)
765    where
766        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
767        S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
768    {
769        self.0.push(StreamOutput(streamable.into_stream()));
770    }
771
772    /// Performs a union operation on all streams that have been added.
773    ///
774    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
775    /// first element of the tuple is the byte string key. The second element
776    /// of the tuple is a list of all occurrences of that key in participating
777    /// streams. The `IndexedValue` contains an index and the value associated
778    /// with that key in that stream. The index uniquely identifies each
779    /// stream, which is an integer that is auto-incremented when a stream
780    /// is added to this operation (starting at `0`).
781    ///
782    /// # Example
783    ///
784    /// ```rust
785    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
786    /// use tantivy_fst::map::IndexedValue;
787    ///
788    /// let map1 = Map::from_iter(vec![
789    ///     ("a", 1), ("b", 2), ("c", 3),
790    /// ]).unwrap();
791    /// let map2 = Map::from_iter(vec![
792    ///     ("a", 11), ("y", 12), ("z", 13),
793    /// ]).unwrap();
794    ///
795    /// let mut union = map1.op().add(&map2).union();
796    ///
797    /// let mut kvs = vec![];
798    /// while let Some((k, vs)) = union.next() {
799    ///     kvs.push((k.to_vec(), vs.to_vec()));
800    /// }
801    /// assert_eq!(kvs, vec![
802    ///     (b"a".to_vec(), vec![
803    ///         IndexedValue { index: 0, value: 1 },
804    ///         IndexedValue { index: 1, value: 11 },
805    ///     ]),
806    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
807    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
808    ///     (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
809    ///     (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
810    /// ]);
811    /// ```
812    #[inline]
813    pub fn union(self) -> Union<'m> {
814        Union(self.0.union())
815    }
816
817    /// Chains all streams that have been added in the order they have been added.
818    ///
819    /// Note that this returns a stream of `(&[u8], &[Output])`. The
820    /// first element of the tuple is the byte string key. The second element
821    /// of the tuple is the value associated with that key in the underlying
822    /// streams. This is useful when the streams area already sorted and not
823    /// overlapping.
824    ///
825    /// # Example
826    ///
827    /// ```rust
828    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
829    /// use tantivy_fst::raw::Output;
830    /// use tantivy_fst::map::IndexedValue;
831    ///
832    /// let map1 = Map::from_iter(vec![
833    ///     ("a", 1), ("b", 2),
834    /// ]).unwrap();
835    /// let map2 = Map::from_iter(vec![
836    ///     ("a", 11), ("y", 12),
837    /// ]).unwrap();
838    ///
839    /// let mut chain = map1.op().add(&map2).chain();
840    ///
841    /// let mut kvs = vec![];
842    /// while let Some((k, v)) = chain.next() {
843    ///     kvs.push((k.to_vec(), v));
844    /// }
845    /// assert_eq!(kvs, vec![
846    ///     (b"a".to_vec(), Output::new(1)),
847    ///     (b"b".to_vec(), Output::new(2)),
848    ///     (b"a".to_vec(), Output::new(11)),
849    ///     (b"y".to_vec(), Output::new(12)),
850    /// ]);
851    /// ```
852    #[inline]
853    pub fn chain(self) -> Chain<'m> {
854        Chain(self.0.chain())
855    }
856    /// Performs an intersection operation on all streams that have been added.
857    ///
858    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
859    /// first element of the tuple is the byte string key. The second element
860    /// of the tuple is a list of all occurrences of that key in participating
861    /// streams. The `IndexedValue` contains an index and the value associated
862    /// with that key in that stream. The index uniquely identifies each
863    /// stream, which is an integer that is auto-incremented when a stream
864    /// is added to this operation (starting at `0`).
865    ///
866    /// # Example
867    ///
868    /// ```rust
869    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
870    /// use tantivy_fst::map::IndexedValue;
871    ///
872    /// let map1 = Map::from_iter(vec![
873    ///     ("a", 1), ("b", 2), ("c", 3),
874    /// ]).unwrap();
875    /// let map2 = Map::from_iter(vec![
876    ///     ("a", 11), ("y", 12), ("z", 13),
877    /// ]).unwrap();
878    ///
879    /// let mut intersection = map1.op().add(&map2).intersection();
880    ///
881    /// let mut kvs = vec![];
882    /// while let Some((k, vs)) = intersection.next() {
883    ///     kvs.push((k.to_vec(), vs.to_vec()));
884    /// }
885    /// assert_eq!(kvs, vec![
886    ///     (b"a".to_vec(), vec![
887    ///         IndexedValue { index: 0, value: 1 },
888    ///         IndexedValue { index: 1, value: 11 },
889    ///     ]),
890    /// ]);
891    /// ```
892    #[inline]
893    pub fn intersection(self) -> Intersection<'m> {
894        Intersection(self.0.intersection())
895    }
896
897    /// Performs a difference operation with respect to the first stream added.
898    /// That is, this returns a stream of all elements in the first stream
899    /// that don't exist in any other stream that has been added.
900    ///
901    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
902    /// first element of the tuple is the byte string key. The second element
903    /// of the tuple is a list of all occurrences of that key in participating
904    /// streams. The `IndexedValue` contains an index and the value associated
905    /// with that key in that stream. The index uniquely identifies each
906    /// stream, which is an integer that is auto-incremented when a stream
907    /// is added to this operation (starting at `0`).
908    ///
909    /// # Example
910    ///
911    /// ```rust
912    /// use tantivy_fst::{Streamer, Map};
913    /// use tantivy_fst::map::IndexedValue;
914    ///
915    /// let map1 = Map::from_iter(vec![
916    ///     ("a", 1), ("b", 2), ("c", 3),
917    /// ]).unwrap();
918    /// let map2 = Map::from_iter(vec![
919    ///     ("a", 11), ("y", 12), ("z", 13),
920    /// ]).unwrap();
921    ///
922    /// let mut difference = map1.op().add(&map2).difference();
923    ///
924    /// let mut kvs = vec![];
925    /// while let Some((k, vs)) = difference.next() {
926    ///     kvs.push((k.to_vec(), vs.to_vec()));
927    /// }
928    /// assert_eq!(kvs, vec![
929    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
930    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
931    /// ]);
932    /// ```
933    #[inline]
934    pub fn difference(self) -> Difference<'m> {
935        Difference(self.0.difference())
936    }
937
938    /// Performs a symmetric difference operation on all of the streams that
939    /// have been added.
940    ///
941    /// When there are only two streams, then the keys returned correspond to
942    /// keys that are in either stream but *not* in both streams.
943    ///
944    /// More generally, for any number of streams, keys that occur in an odd
945    /// number of streams are returned.
946    ///
947    /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
948    /// first element of the tuple is the byte string key. The second element
949    /// of the tuple is a list of all occurrences of that key in participating
950    /// streams. The `IndexedValue` contains an index and the value associated
951    /// with that key in that stream. The index uniquely identifies each
952    /// stream, which is an integer that is auto-incremented when a stream
953    /// is added to this operation (starting at `0`).
954    ///
955    /// # Example
956    ///
957    /// ```rust
958    /// use tantivy_fst::{IntoStreamer, Streamer, Map};
959    /// use tantivy_fst::map::IndexedValue;
960    ///
961    /// let map1 = Map::from_iter(vec![
962    ///     ("a", 1), ("b", 2), ("c", 3),
963    /// ]).unwrap();
964    /// let map2 = Map::from_iter(vec![
965    ///     ("a", 11), ("y", 12), ("z", 13),
966    /// ]).unwrap();
967    ///
968    /// let mut sym_difference = map1.op().add(&map2).symmetric_difference();
969    ///
970    /// let mut kvs = vec![];
971    /// while let Some((k, vs)) = sym_difference.next() {
972    ///     kvs.push((k.to_vec(), vs.to_vec()));
973    /// }
974    /// assert_eq!(kvs, vec![
975    ///     (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
976    ///     (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
977    ///     (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
978    ///     (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
979    /// ]);
980    /// ```
981    #[inline]
982    pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
983        SymmetricDifference(self.0.symmetric_difference())
984    }
985}
986
987impl<'f, I, S> Extend<I> for OpBuilder<'f>
988where
989    I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
990    S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
991{
992    fn extend<T>(&mut self, it: T)
993    where
994        T: IntoIterator<Item = I>,
995    {
996        for stream in it {
997            self.push(stream);
998        }
999    }
1000}
1001
1002impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
1003where
1004    I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
1005    S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
1006{
1007    fn from_iter<T>(it: T) -> Self
1008    where
1009        T: IntoIterator<Item = I>,
1010    {
1011        let mut op = OpBuilder::new();
1012        op.extend(it);
1013        op
1014    }
1015}
1016
1017/// A stream of set union over multiple map streams in lexicographic order.
1018///
1019/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1020pub struct Union<'m>(raw::Union<'m>);
1021
1022impl<'a, 'm> Streamer<'a> for Union<'m> {
1023    type Item = (&'a [u8], &'a [IndexedValue]);
1024
1025    #[inline]
1026    fn next(&'a mut self) -> Option<Self::Item> {
1027        self.0.next()
1028    }
1029}
1030
1031/// A stream of set union over multiple map streams in lexicographic order.
1032///
1033/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1034pub struct Chain<'m>(raw::Chain<'m>);
1035
1036impl<'a, 'm> Streamer<'a> for Chain<'m> {
1037    type Item = (&'a [u8], Output);
1038
1039    #[inline]
1040    fn next(&'a mut self) -> Option<Self::Item> {
1041        self.0.next()
1042    }
1043}
1044
1045/// A stream of set intersection over multiple map streams in lexicographic
1046/// order.
1047///
1048/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1049pub struct Intersection<'m>(raw::Intersection<'m>);
1050
1051impl<'a, 'm> Streamer<'a> for Intersection<'m> {
1052    type Item = (&'a [u8], &'a [IndexedValue]);
1053
1054    #[inline]
1055    fn next(&'a mut self) -> Option<Self::Item> {
1056        self.0.next()
1057    }
1058}
1059
1060/// A stream of set difference over multiple map streams in lexicographic
1061/// order.
1062///
1063/// The difference operation is taken with respect to the first stream and the
1064/// rest of the streams. i.e., All elements in the first stream that do not
1065/// appear in any other streams.
1066///
1067/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1068pub struct Difference<'m>(raw::Difference<'m>);
1069
1070impl<'a, 'm> Streamer<'a> for Difference<'m> {
1071    type Item = (&'a [u8], &'a [IndexedValue]);
1072
1073    #[inline]
1074    fn next(&'a mut self) -> Option<Self::Item> {
1075        self.0.next()
1076    }
1077}
1078
1079/// A stream of set symmetric difference over multiple map streams in
1080/// lexicographic order.
1081///
1082/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1083pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>);
1084
1085impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> {
1086    type Item = (&'a [u8], &'a [IndexedValue]);
1087
1088    #[inline]
1089    fn next(&'a mut self) -> Option<Self::Item> {
1090        self.0.next()
1091    }
1092}
1093
1094/// A specialized stream for mapping map streams (`(&[u8], u64)`) to streams
1095/// used by raw fsts (`(&[u8], Output)`).
1096///
1097/// If this were iterators, we could use `iter::Map`, but doing this on streams
1098/// requires HKT, so we need to write out the monomorphization ourselves.
1099struct StreamOutput<S>(S);
1100
1101impl<'a, S> Streamer<'a> for StreamOutput<S>
1102where
1103    S: Streamer<'a, Item = (&'a [u8], u64)>,
1104{
1105    type Item = (&'a [u8], raw::Output);
1106
1107    fn next(&'a mut self) -> Option<Self::Item> {
1108        self.0.next().map(|(k, v)| (k, raw::Output::new(v)))
1109    }
1110}
1111
1112/// A lexicographically ordered stream of key-value from a map
1113/// along with the states of the automaton.
1114///
1115/// The `Stream` type is based on the `StreamWithState`.
1116pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
1117where
1118    A: Automaton;
1119
1120impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
1121where
1122    A::State: Clone,
1123{
1124    type Item = (&'a [u8], u64, A::State);
1125
1126    fn next(&'a mut self) -> Option<Self::Item> {
1127        self.0
1128            .next()
1129            .map(|(key, out, state)| (key, out.value(), state))
1130    }
1131}