tantivy_fst/map.rs
1use std::fmt;
2use std::io;
3use std::iter::FromIterator;
4
5use crate::automaton::{AlwaysMatch, Automaton};
6use crate::raw;
7pub use crate::raw::IndexedValue;
8use crate::raw::Output;
9use crate::stream::{IntoStreamer, Streamer};
10use crate::Result;
11use std::ops::Deref;
12
13/// Map is a lexicographically ordered map from byte strings to integers.
14///
15/// A `Map` is constructed with the `MapBuilder` type. Alternatively, a `Map`
16/// can be constructed in memory from a lexicographically ordered iterator
17/// of key-value pairs (`Map::from_iter`).
18///
19/// A key feature of `Map` is that it can be serialized to disk compactly. Its
20/// underlying representation is built such that the `Map` can be memory mapped
21/// (`Map::from_path`) and searched without necessarily loading the entire
22/// map into memory.
23///
24/// It supports most common operations associated with maps, such as key
25/// lookup and search. It also supports set operations on its keys along with
26/// the ability to specify how conflicting values are merged together. Maps
27/// also support range queries and automata based searches (e.g. a regular
28/// expression).
29///
30/// Maps are represented by a finite state transducer where inputs are the keys
31/// and outputs are the values. As such, maps have the following invariants:
32///
33/// 1. Once constructed, a `Map` can never be modified.
34/// 2. Maps must be constructed with lexicographically ordered byte sequences.
35/// There is no restricting on the ordering of values.
36///
37/// # Differences with sets
38///
39/// Maps and sets are represented by the same underlying data structure: the
40/// finite state transducer. The principal difference between them is that
41/// sets always have their output values set to `0`. This has an impact on the
42/// representation size and is reflected in the type system for convenience.
43/// A secondary but subtle difference is that duplicate values can be added
44/// to a set, but it is an error to do so with maps. That is, a set can have
45/// the same key added sequentially, but a map can't.
46///
47/// # The future
48///
49/// It is regrettable that the output value is fixed to `u64`. Indeed, it is
50/// not necessary, but it was a major simplification in the implementation.
51/// In the future, the value type may become generic to an extent (outputs must
52/// satisfy a basic algebra).
53///
54/// Keys will always be byte strings; however, we may grow more conveniences
55/// around dealing with them (such as a serialization/deserialization step,
56/// although it isn't clear where exactly this should live).
57pub struct Map<Data>(raw::Fst<Data>);
58
59impl Map<Vec<u8>> {
60 /// Creates a map from its representation as a raw byte sequence.
61 ///
62 /// Note that this operation is very cheap (no allocations and no copies).
63 ///
64 /// The map must have been written with a compatible finite state
65 /// transducer builder (`MapBuilder` qualifies). If the format is invalid
66 /// or if there is a mismatch between the API version of this library
67 /// and the map, then an error is returned.
68 pub fn from_bytes(bytes: Vec<u8>) -> Result<Map<Vec<u8>>> {
69 raw::Fst::new(bytes).map(Map)
70 }
71
72 /// Create a `Map` from an iterator of lexicographically ordered byte
73 /// strings and associated values.
74 ///
75 /// If the iterator does not yield unique keys in lexicographic order, then
76 /// an error is returned.
77 ///
78 /// Note that this is a convenience function to build a map in memory.
79 /// To build a map that streams to an arbitrary `io::Write`, use
80 /// `MapBuilder`.
81 pub fn from_iter<K, I>(iter: I) -> Result<Self>
82 where
83 K: AsRef<[u8]>,
84 I: IntoIterator<Item = (K, u64)>,
85 {
86 let mut builder = MapBuilder::memory();
87 builder.extend_iter(iter)?;
88 Map::from_bytes(builder.into_inner()?)
89 }
90}
91
92impl<Data: Deref<Target = [u8]>> Map<Data> {
93 /// Tests the membership of a single key.
94 ///
95 /// # Example
96 ///
97 /// ```rust
98 /// use tantivy_fst::Map;
99 ///
100 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
101 ///
102 /// assert_eq!(map.contains_key("b"), true);
103 /// assert_eq!(map.contains_key("z"), false);
104 /// ```
105 pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool {
106 self.0.contains_key(key)
107 }
108
109 /// Retrieves the value associated with a key.
110 ///
111 /// If the key does not exist, then `None` is returned.
112 ///
113 /// # Example
114 ///
115 /// ```rust
116 /// use tantivy_fst::Map;
117 ///
118 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
119 ///
120 /// assert_eq!(map.get("b"), Some(2));
121 /// assert_eq!(map.get("z"), None);
122 /// ```
123 pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<u64> {
124 self.0.get(key).map(|output| output.value())
125 }
126
127 /// Return a lexicographically ordered stream of all key-value pairs in
128 /// this map.
129 ///
130 /// While this is a stream, it does require heap space proportional to the
131 /// longest key in the map.
132 ///
133 /// If the map is memory mapped, then no further heap space is needed.
134 /// Note though that your operating system may fill your page cache
135 /// (which will cause the resident memory usage of the process to go up
136 /// correspondingly).
137 ///
138 /// # Example
139 ///
140 /// Since streams are not iterators, the traditional `for` loop cannot be
141 /// used. `while let` is useful instead:
142 ///
143 /// ```rust
144 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
145 ///
146 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
147 /// let mut stream = map.stream();
148 ///
149 /// let mut kvs = vec![];
150 /// while let Some((k, v)) = stream.next() {
151 /// kvs.push((k.to_vec(), v));
152 /// }
153 /// assert_eq!(kvs, vec![
154 /// (b"a".to_vec(), 1),
155 /// (b"b".to_vec(), 2),
156 /// (b"c".to_vec(), 3),
157 /// ]);
158 /// ```
159 #[inline]
160 pub fn stream(&self) -> Stream {
161 Stream(self.0.stream())
162 }
163
164 /// Return a lexicographically ordered stream of all keys in this map.
165 ///
166 /// Memory requirements are the same as described on `Map::stream`.
167 ///
168 /// # Example
169 ///
170 /// ```rust
171 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
172 ///
173 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
174 /// let mut stream = map.keys();
175 ///
176 /// let mut keys = vec![];
177 /// while let Some(k) = stream.next() {
178 /// keys.push(k.to_vec());
179 /// }
180 /// assert_eq!(keys, vec![b"a", b"b", b"c"]);
181 /// ```
182 #[inline]
183 pub fn keys(&self) -> Keys {
184 Keys(self.0.stream())
185 }
186
187 /// Return a stream of all values in this map ordered lexicographically
188 /// by each value's corresponding key.
189 ///
190 /// Memory requirements are the same as described on `Map::stream`.
191 ///
192 /// # Example
193 ///
194 /// ```rust
195 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
196 ///
197 /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap();
198 /// let mut stream = map.values();
199 ///
200 /// let mut values = vec![];
201 /// while let Some(v) = stream.next() {
202 /// values.push(v);
203 /// }
204 /// assert_eq!(values, vec![1, 2, 3]);
205 /// ```
206 #[inline]
207 pub fn values(&self) -> Values {
208 Values(self.0.stream())
209 }
210
211 /// Return a builder for range queries.
212 ///
213 /// A range query returns a subset of key-value pairs in this map in a
214 /// range given in lexicographic order.
215 ///
216 /// Memory requirements are the same as described on `Map::stream`.
217 /// Notably, only the keys in the range are read; keys outside the range
218 /// are not.
219 ///
220 /// # Example
221 ///
222 /// Returns only the key-value pairs in the range given.
223 ///
224 /// ```rust
225 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
226 ///
227 /// let map = Map::from_iter(vec![
228 /// ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5),
229 /// ]).unwrap();
230 /// let mut stream = map.range().ge("b").lt("e").into_stream();
231 ///
232 /// let mut kvs = vec![];
233 /// while let Some((k, v)) = stream.next() {
234 /// kvs.push((k.to_vec(), v));
235 /// }
236 /// assert_eq!(kvs, vec![
237 /// (b"b".to_vec(), 2),
238 /// (b"c".to_vec(), 3),
239 /// (b"d".to_vec(), 4),
240 /// ]);
241 /// ```
242 #[inline]
243 pub fn range(&self) -> StreamBuilder {
244 StreamBuilder(self.0.range())
245 }
246
247 /// Executes an automaton on the keys of this map.
248 ///
249 /// Note that this returns a `StreamBuilder`, which can be used to
250 /// add a range query to the search (see the `range` method).
251 ///
252 /// Memory requirements are the same as described on `Map::stream`.
253 ///
254 /// # Example
255 ///
256 /// An implementation of regular expressions for `Automaton` is available
257 /// in the `fst-regex` crate, which can be used to search maps.
258 ///
259 /// ```rust
260 ///
261 /// use std::error::Error;
262 ///
263 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
264 /// #[cfg(feature = "regex")]
265 /// use tantivy_fst::Regex;
266 ///
267 /// #[cfg(feature = "regex")]
268 /// {
269 /// let map = Map::from_iter(vec![
270 /// ("foo", 1), ("foo1", 2), ("foo2", 3), ("foo3", 4), ("foobar", 5),
271 /// ]).unwrap();
272 ///
273 /// let re = Regex::new("f[a-z]+3?").unwrap();
274 /// let mut stream = map.search(&re).into_stream();
275 ///
276 /// let mut kvs = vec![];
277 /// while let Some((k, v)) = stream.next() {
278 /// kvs.push((k.to_vec(), v));
279 /// }
280 /// assert_eq!(kvs, vec![
281 /// (b"foo".to_vec(), 1),
282 /// (b"foo3".to_vec(), 4),
283 /// (b"foobar".to_vec(), 5),
284 /// ]);
285 ///
286 /// }
287 /// ```
288 pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<A> {
289 StreamBuilder(self.0.search(aut))
290 }
291
292 /// Returns the number of elements in this map.
293 #[inline]
294 pub fn len(&self) -> usize {
295 self.0.len()
296 }
297
298 /// Returns true if and only if this map is empty.
299 #[inline]
300 pub fn is_empty(&self) -> bool {
301 self.0.is_empty()
302 }
303
304 /// Creates a new map operation with this map added to it.
305 ///
306 /// The `OpBuilder` type can be used to add additional map streams
307 /// and perform set operations like union, intersection, difference and
308 /// symmetric difference on the keys of the map. These set operations also
309 /// allow one to specify how conflicting values are merged in the stream.
310 ///
311 /// # Example
312 ///
313 /// This example demonstrates a union on multiple map streams. Notice that
314 /// the stream returned from the union is not a sequence of key-value
315 /// pairs, but rather a sequence of keys associated with one or more
316 /// values. Namely, a key is associated with each value associated with
317 /// that same key in the all of the streams.
318 ///
319 /// ```rust
320 /// use tantivy_fst::{Streamer, Map};
321 /// use tantivy_fst::{map::IndexedValue};
322 ///
323 /// let map1 = Map::from_iter(vec![
324 /// ("a", 1), ("b", 2), ("c", 3),
325 /// ]).unwrap();
326 /// let map2 = Map::from_iter(vec![
327 /// ("a", 10), ("y", 11), ("z", 12),
328 /// ]).unwrap();
329 ///
330 /// let mut union = map1.op().add(&map2).union();
331 ///
332 /// let mut kvs = vec![];
333 /// while let Some((k, vs)) = union.next() {
334 /// kvs.push((k.to_vec(), vs.to_vec()));
335 /// }
336 /// assert_eq!(kvs, vec![
337 /// (b"a".to_vec(), vec![
338 /// IndexedValue { index: 0, value: 1 },
339 /// IndexedValue { index: 1, value: 10 },
340 /// ]),
341 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
342 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
343 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 11 }]),
344 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
345 /// ]);
346 /// ```
347 #[inline]
348 pub fn op(&self) -> OpBuilder {
349 OpBuilder::new().add(self)
350 }
351
352 /// Returns a reference to the underlying raw finite state transducer.
353 #[inline]
354 pub fn as_fst(&self) -> &raw::Fst<Data> {
355 &self.0
356 }
357}
358
359impl<Data: Deref<Target = [u8]>> fmt::Debug for Map<Data> {
360 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
361 write!(f, "Map([")?;
362 let mut stream = self.stream();
363 let mut first = true;
364 while let Some((k, v)) = stream.next() {
365 if !first {
366 write!(f, ", ")?;
367 }
368 first = false;
369 write!(f, "({}, {})", String::from_utf8_lossy(k), v)?;
370 }
371 write!(f, "])")
372 }
373}
374
375// Construct a map from an Fst object.
376impl<Data> From<raw::Fst<Data>> for Map<Data> {
377 #[inline]
378 fn from(fst: raw::Fst<Data>) -> Self {
379 Map(fst)
380 }
381}
382
383/// Returns the underlying finite state transducer.
384impl<Data> AsRef<raw::Fst<Data>> for Map<Data> {
385 #[inline]
386 fn as_ref(&self) -> &raw::Fst<Data> {
387 &self.0
388 }
389}
390
391impl<'m, 'a, Data: Deref<Target = [u8]>> IntoStreamer<'a> for &'m Map<Data> {
392 type Item = (&'a [u8], u64);
393 type Into = Stream<'m>;
394
395 #[inline]
396 fn into_stream(self) -> Self::Into {
397 Stream(self.0.stream())
398 }
399}
400
401/// A builder for creating a map.
402///
403/// This is not your average everyday builder. It has two important qualities
404/// that make it a bit unique from what you might expect:
405///
406/// 1. All keys must be added in lexicographic order. Adding a key out of order
407/// will result in an error. Additionally, adding a duplicate key will also
408/// result in an error. That is, once a key is associated with a value,
409/// that association can never be modified or deleted.
410/// 2. The representation of a map is streamed to *any* `io::Write` as it is
411/// built. For an in memory representation, this can be a `Vec<u8>`.
412///
413/// Point (2) is especially important because it means that a map can be
414/// constructed *without storing the entire map in memory*. Namely, since it
415/// works with any `io::Write`, it can be streamed directly to a file.
416///
417/// With that said, the builder does use memory, but **memory usage is bounded
418/// to a constant size**. The amount of memory used trades off with the
419/// compression ratio. Currently, the implementation hard codes this trade off
420/// which can result in about 5-20MB of heap usage during construction. (N.B.
421/// Guaranteeing a maximal compression ratio requires memory proportional to
422/// the size of the map, which defeats some of the benefit of streaming
423/// it to disk. In practice, a small bounded amount of memory achieves
424/// close-to-minimal compression ratios.)
425///
426/// The algorithmic complexity of map construction is `O(n)` where `n` is the
427/// number of elements added to the map.
428///
429/// # Example: build in memory
430///
431/// This shows how to use the builder to construct a map in memory. Note that
432/// `Map::from_iter` provides a convenience function that achieves this same
433/// goal without needing to explicitly use `MapBuilder`.
434///
435/// ```rust
436/// use tantivy_fst::{IntoStreamer, Streamer, Map, MapBuilder};
437///
438/// let mut build = MapBuilder::memory();
439/// build.insert("bruce", 1).unwrap();
440/// build.insert("clarence", 2).unwrap();
441/// build.insert("stevie", 3).unwrap();
442///
443/// // You could also call `finish()` here, but since we're building the map in
444/// // memory, there would be no way to get the `Vec<u8>` back.
445/// let bytes = build.into_inner().unwrap();
446///
447/// // At this point, the map has been constructed, but here's how to read it.
448/// let map = Map::from_bytes(bytes).unwrap();
449/// let mut stream = map.into_stream();
450/// let mut kvs = vec![];
451/// while let Some((k, v)) = stream.next() {
452/// kvs.push((k.to_vec(), v));
453/// }
454/// assert_eq!(kvs, vec![
455/// (b"bruce".to_vec(), 1),
456/// (b"clarence".to_vec(), 2),
457/// (b"stevie".to_vec(), 3),
458/// ]);
459/// ```
460pub struct MapBuilder<W>(raw::Builder<W>);
461
462impl MapBuilder<Vec<u8>> {
463 /// Create a builder that builds a map in memory.
464 #[inline]
465 pub fn memory() -> Self {
466 MapBuilder(raw::Builder::memory())
467 }
468}
469
470impl<W: io::Write> MapBuilder<W> {
471 /// Create a builder that builds a map by writing it to `wtr` in a
472 /// streaming fashion.
473 pub fn new(wtr: W) -> Result<MapBuilder<W>> {
474 raw::Builder::new_type(wtr, 0).map(MapBuilder)
475 }
476
477 /// Insert a new key-value pair into the map.
478 ///
479 /// Keys must be convertible to byte strings. Values must be a `u64`, which
480 /// is a restriction of the current implementation of finite state
481 /// transducers. (Values may one day be expanded to other types.)
482 ///
483 /// If a key is inserted that is less than or equal to any previous key
484 /// added, then an error is returned. Similarly, if there was a problem
485 /// writing to the underlying writer, an error is returned.
486 pub fn insert<K: AsRef<[u8]>>(&mut self, key: K, val: u64) -> Result<()> {
487 self.0.insert(key, val)
488 }
489
490 /// Calls insert on each item in the iterator.
491 ///
492 /// If an error occurred while adding an element, processing is stopped
493 /// and the error is returned.
494 ///
495 /// If a key is inserted that is less than or equal to any previous key
496 /// added, then an error is returned. Similarly, if there was a problem
497 /// writing to the underlying writer, an error is returned.
498 pub fn extend_iter<K, I>(&mut self, iter: I) -> Result<()>
499 where
500 K: AsRef<[u8]>,
501 I: IntoIterator<Item = (K, u64)>,
502 {
503 self.0
504 .extend_iter(iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))))
505 }
506
507 /// Calls insert on each item in the stream.
508 ///
509 /// Note that unlike `extend_iter`, this is not generic on the items in
510 /// the stream.
511 ///
512 /// If a key is inserted that is less than or equal to any previous key
513 /// added, then an error is returned. Similarly, if there was a problem
514 /// writing to the underlying writer, an error is returned.
515 pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
516 where
517 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
518 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
519 {
520 self.0.extend_stream(StreamOutput(stream.into_stream()))
521 }
522
523 /// Finishes the construction of the map and flushes the underlying
524 /// writer. After completion, the data written to `W` may be read using
525 /// one of `Map`'s constructor methods.
526 pub fn finish(self) -> Result<()> {
527 self.0.finish()
528 }
529
530 /// Just like `finish`, except it returns the underlying writer after
531 /// flushing it.
532 pub fn into_inner(self) -> Result<W> {
533 self.0.into_inner()
534 }
535
536 /// Gets a reference to the underlying writer.
537 pub fn get_ref(&self) -> &W {
538 self.0.get_ref()
539 }
540
541 /// Returns the number of bytes written to the underlying writer
542 pub fn bytes_written(&self) -> u64 {
543 self.0.bytes_written()
544 }
545}
546
547/// A lexicographically ordered stream of key-value pairs from a map.
548///
549/// The `A` type parameter corresponds to an optional automaton to filter
550/// the stream. By default, no filtering is done.
551///
552/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
553pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>)
554where
555 A: Automaton;
556
557impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> {
558 type Item = (&'a [u8], u64);
559
560 fn next(&'a mut self) -> Option<Self::Item> {
561 self.0.next().map(|(key, out)| (key, out.value()))
562 }
563}
564
565impl<'m, A: Automaton> Stream<'m, A> {
566 /// Convert this stream into a vector of byte strings and outputs.
567 ///
568 /// Note that this creates a new allocation for every key in the stream.
569 pub fn into_byte_vec(self) -> Vec<(Vec<u8>, u64)> {
570 self.0.into_byte_vec()
571 }
572
573 /// Convert this stream into a vector of Unicode strings and outputs.
574 ///
575 /// If any key is not valid UTF-8, then iteration on the stream is stopped
576 /// and a UTF-8 decoding error is returned.
577 ///
578 /// Note that this creates a new allocation for every key in the stream.
579 pub fn into_str_vec(self) -> Result<Vec<(String, u64)>> {
580 self.0.into_str_vec()
581 }
582
583 /// Convert this stream into a vector of byte strings.
584 ///
585 /// Note that this creates a new allocation for every key in the stream.
586 pub fn into_byte_keys(self) -> Vec<Vec<u8>> {
587 self.0.into_byte_keys()
588 }
589
590 /// Convert this stream into a vector of Unicode strings.
591 ///
592 /// If any key is not valid UTF-8, then iteration on the stream is stopped
593 /// and a UTF-8 decoding error is returned.
594 ///
595 /// Note that this creates a new allocation for every key in the stream.
596 pub fn into_str_keys(self) -> Result<Vec<String>> {
597 self.0.into_str_keys()
598 }
599
600 /// Convert this stream into a vector of outputs.
601 pub fn into_values(self) -> Vec<u64> {
602 self.0.into_values()
603 }
604}
605
606/// A lexicographically ordered stream of keys from a map.
607///
608/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
609pub struct Keys<'m>(raw::Stream<'m>);
610
611impl<'a, 'm> Streamer<'a> for Keys<'m> {
612 type Item = &'a [u8];
613
614 #[inline]
615 fn next(&'a mut self) -> Option<Self::Item> {
616 self.0.next().map(|(key, _)| key)
617 }
618}
619
620/// A stream of values from a map, lexicographically ordered by each value's
621/// corresponding key.
622///
623/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
624pub struct Values<'m>(raw::Stream<'m>);
625
626impl<'a, 'm> Streamer<'a> for Values<'m> {
627 type Item = u64;
628
629 #[inline]
630 fn next(&'a mut self) -> Option<Self::Item> {
631 self.0.next().map(|(_, out)| out.value())
632 }
633}
634
635/// A builder for constructing range queries on streams.
636///
637/// Once all bounds are set, one should call `into_stream` to get a
638/// `Stream`.
639///
640/// Bounds are not additive. That is, if `ge` is called twice on the same
641/// builder, then the second setting wins.
642///
643/// The `A` type parameter corresponds to an optional automaton to filter
644/// the stream. By default, no filtering is done.
645///
646/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
647pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>);
648
649impl<'m, A: Automaton> StreamBuilder<'m, A> {
650 /// Specify a greater-than-or-equal-to bound.
651 pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> Self {
652 StreamBuilder(self.0.ge(bound))
653 }
654
655 /// Specify a greater-than bound.
656 pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> Self {
657 StreamBuilder(self.0.gt(bound))
658 }
659
660 /// Specify a less-than-or-equal-to bound.
661 pub fn le<T: AsRef<[u8]>>(self, bound: T) -> Self {
662 StreamBuilder(self.0.le(bound))
663 }
664
665 /// Specify a less-than bound.
666 pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> Self {
667 StreamBuilder(self.0.lt(bound))
668 }
669
670 /// Make it iterate backwards.
671 pub fn backward(self) -> Self {
672 StreamBuilder(self.0.backward())
673 }
674
675 /// Return this builder and gives the automaton states
676 /// along with the results.
677 pub fn with_state(self) -> StreamWithStateBuilder<'m, A> {
678 StreamWithStateBuilder(self.0.with_state())
679 }
680}
681
682impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> {
683 type Item = (&'a [u8], u64);
684 type Into = Stream<'m, A>;
685
686 fn into_stream(self) -> Self::Into {
687 Stream(self.0.into_stream())
688 }
689}
690
691/// A builder for constructing range queries of streams
692/// that returns results along with automaton states.
693///
694/// Once all bounds are set, one should call `into_stream` to get a
695/// `StreamWithState`.
696///
697/// Bounds are not additive. That is, if `ge` is called twice on the same
698/// builder, then the second setting wins.
699///
700/// The `A` type parameter corresponds to an optional automaton to filter
701/// the stream. By default, no filtering is done.
702///
703/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
704pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>(raw::StreamWithStateBuilder<'m, A>);
705
706impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> for StreamWithStateBuilder<'m, A>
707where
708 A::State: Clone,
709{
710 type Item = (&'a [u8], u64, A::State);
711 type Into = StreamWithState<'m, A>;
712
713 fn into_stream(self) -> Self::Into {
714 StreamWithState(self.0.into_stream())
715 }
716}
717
718/// A builder for collecting map streams on which to perform set operations
719/// on the keys of maps.
720///
721/// Set operations include intersection, union, difference and symmetric
722/// difference. The result of each set operation is itself a stream that emits
723/// pairs of keys and a sequence of each occurrence of that key in the
724/// participating streams. This information allows one to perform set
725/// operations on maps and customize how conflicting output values are handled.
726///
727/// All set operations work efficiently on an arbitrary number of
728/// streams with memory proportional to the number of streams.
729///
730/// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)`
731/// where `n1, n2, n3, ...` correspond to the number of elements in each
732/// stream.
733///
734/// The `'m` lifetime parameter refers to the lifetime of the underlying set.
735pub struct OpBuilder<'m>(raw::OpBuilder<'m>);
736
737impl<'m> OpBuilder<'m> {
738 /// Create a new set operation builder.
739 #[inline]
740 pub fn new() -> Self {
741 OpBuilder(raw::OpBuilder::default())
742 }
743
744 /// Add a stream to this set operation.
745 ///
746 /// This is useful for a chaining style pattern, e.g.,
747 /// `builder.add(stream1).add(stream2).union()`.
748 ///
749 /// The stream must emit a lexicographically ordered sequence of key-value
750 /// pairs.
751 pub fn add<I, S>(mut self, streamable: I) -> Self
752 where
753 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
754 S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
755 {
756 self.push(streamable);
757 self
758 }
759
760 /// Add a stream to this set operation.
761 ///
762 /// The stream must emit a lexicographically ordered sequence of key-value
763 /// pairs.
764 pub fn push<I, S>(&mut self, streamable: I)
765 where
766 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
767 S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
768 {
769 self.0.push(StreamOutput(streamable.into_stream()));
770 }
771
772 /// Performs a union operation on all streams that have been added.
773 ///
774 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
775 /// first element of the tuple is the byte string key. The second element
776 /// of the tuple is a list of all occurrences of that key in participating
777 /// streams. The `IndexedValue` contains an index and the value associated
778 /// with that key in that stream. The index uniquely identifies each
779 /// stream, which is an integer that is auto-incremented when a stream
780 /// is added to this operation (starting at `0`).
781 ///
782 /// # Example
783 ///
784 /// ```rust
785 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
786 /// use tantivy_fst::map::IndexedValue;
787 ///
788 /// let map1 = Map::from_iter(vec![
789 /// ("a", 1), ("b", 2), ("c", 3),
790 /// ]).unwrap();
791 /// let map2 = Map::from_iter(vec![
792 /// ("a", 11), ("y", 12), ("z", 13),
793 /// ]).unwrap();
794 ///
795 /// let mut union = map1.op().add(&map2).union();
796 ///
797 /// let mut kvs = vec![];
798 /// while let Some((k, vs)) = union.next() {
799 /// kvs.push((k.to_vec(), vs.to_vec()));
800 /// }
801 /// assert_eq!(kvs, vec![
802 /// (b"a".to_vec(), vec![
803 /// IndexedValue { index: 0, value: 1 },
804 /// IndexedValue { index: 1, value: 11 },
805 /// ]),
806 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
807 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
808 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
809 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
810 /// ]);
811 /// ```
812 #[inline]
813 pub fn union(self) -> Union<'m> {
814 Union(self.0.union())
815 }
816
817 /// Chains all streams that have been added in the order they have been added.
818 ///
819 /// Note that this returns a stream of `(&[u8], &[Output])`. The
820 /// first element of the tuple is the byte string key. The second element
821 /// of the tuple is the value associated with that key in the underlying
822 /// streams. This is useful when the streams area already sorted and not
823 /// overlapping.
824 ///
825 /// # Example
826 ///
827 /// ```rust
828 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
829 /// use tantivy_fst::raw::Output;
830 /// use tantivy_fst::map::IndexedValue;
831 ///
832 /// let map1 = Map::from_iter(vec![
833 /// ("a", 1), ("b", 2),
834 /// ]).unwrap();
835 /// let map2 = Map::from_iter(vec![
836 /// ("a", 11), ("y", 12),
837 /// ]).unwrap();
838 ///
839 /// let mut chain = map1.op().add(&map2).chain();
840 ///
841 /// let mut kvs = vec![];
842 /// while let Some((k, v)) = chain.next() {
843 /// kvs.push((k.to_vec(), v));
844 /// }
845 /// assert_eq!(kvs, vec![
846 /// (b"a".to_vec(), Output::new(1)),
847 /// (b"b".to_vec(), Output::new(2)),
848 /// (b"a".to_vec(), Output::new(11)),
849 /// (b"y".to_vec(), Output::new(12)),
850 /// ]);
851 /// ```
852 #[inline]
853 pub fn chain(self) -> Chain<'m> {
854 Chain(self.0.chain())
855 }
856 /// Performs an intersection operation on all streams that have been added.
857 ///
858 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
859 /// first element of the tuple is the byte string key. The second element
860 /// of the tuple is a list of all occurrences of that key in participating
861 /// streams. The `IndexedValue` contains an index and the value associated
862 /// with that key in that stream. The index uniquely identifies each
863 /// stream, which is an integer that is auto-incremented when a stream
864 /// is added to this operation (starting at `0`).
865 ///
866 /// # Example
867 ///
868 /// ```rust
869 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
870 /// use tantivy_fst::map::IndexedValue;
871 ///
872 /// let map1 = Map::from_iter(vec![
873 /// ("a", 1), ("b", 2), ("c", 3),
874 /// ]).unwrap();
875 /// let map2 = Map::from_iter(vec![
876 /// ("a", 11), ("y", 12), ("z", 13),
877 /// ]).unwrap();
878 ///
879 /// let mut intersection = map1.op().add(&map2).intersection();
880 ///
881 /// let mut kvs = vec![];
882 /// while let Some((k, vs)) = intersection.next() {
883 /// kvs.push((k.to_vec(), vs.to_vec()));
884 /// }
885 /// assert_eq!(kvs, vec![
886 /// (b"a".to_vec(), vec![
887 /// IndexedValue { index: 0, value: 1 },
888 /// IndexedValue { index: 1, value: 11 },
889 /// ]),
890 /// ]);
891 /// ```
892 #[inline]
893 pub fn intersection(self) -> Intersection<'m> {
894 Intersection(self.0.intersection())
895 }
896
897 /// Performs a difference operation with respect to the first stream added.
898 /// That is, this returns a stream of all elements in the first stream
899 /// that don't exist in any other stream that has been added.
900 ///
901 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
902 /// first element of the tuple is the byte string key. The second element
903 /// of the tuple is a list of all occurrences of that key in participating
904 /// streams. The `IndexedValue` contains an index and the value associated
905 /// with that key in that stream. The index uniquely identifies each
906 /// stream, which is an integer that is auto-incremented when a stream
907 /// is added to this operation (starting at `0`).
908 ///
909 /// # Example
910 ///
911 /// ```rust
912 /// use tantivy_fst::{Streamer, Map};
913 /// use tantivy_fst::map::IndexedValue;
914 ///
915 /// let map1 = Map::from_iter(vec![
916 /// ("a", 1), ("b", 2), ("c", 3),
917 /// ]).unwrap();
918 /// let map2 = Map::from_iter(vec![
919 /// ("a", 11), ("y", 12), ("z", 13),
920 /// ]).unwrap();
921 ///
922 /// let mut difference = map1.op().add(&map2).difference();
923 ///
924 /// let mut kvs = vec![];
925 /// while let Some((k, vs)) = difference.next() {
926 /// kvs.push((k.to_vec(), vs.to_vec()));
927 /// }
928 /// assert_eq!(kvs, vec![
929 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
930 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
931 /// ]);
932 /// ```
933 #[inline]
934 pub fn difference(self) -> Difference<'m> {
935 Difference(self.0.difference())
936 }
937
938 /// Performs a symmetric difference operation on all of the streams that
939 /// have been added.
940 ///
941 /// When there are only two streams, then the keys returned correspond to
942 /// keys that are in either stream but *not* in both streams.
943 ///
944 /// More generally, for any number of streams, keys that occur in an odd
945 /// number of streams are returned.
946 ///
947 /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The
948 /// first element of the tuple is the byte string key. The second element
949 /// of the tuple is a list of all occurrences of that key in participating
950 /// streams. The `IndexedValue` contains an index and the value associated
951 /// with that key in that stream. The index uniquely identifies each
952 /// stream, which is an integer that is auto-incremented when a stream
953 /// is added to this operation (starting at `0`).
954 ///
955 /// # Example
956 ///
957 /// ```rust
958 /// use tantivy_fst::{IntoStreamer, Streamer, Map};
959 /// use tantivy_fst::map::IndexedValue;
960 ///
961 /// let map1 = Map::from_iter(vec![
962 /// ("a", 1), ("b", 2), ("c", 3),
963 /// ]).unwrap();
964 /// let map2 = Map::from_iter(vec![
965 /// ("a", 11), ("y", 12), ("z", 13),
966 /// ]).unwrap();
967 ///
968 /// let mut sym_difference = map1.op().add(&map2).symmetric_difference();
969 ///
970 /// let mut kvs = vec![];
971 /// while let Some((k, vs)) = sym_difference.next() {
972 /// kvs.push((k.to_vec(), vs.to_vec()));
973 /// }
974 /// assert_eq!(kvs, vec![
975 /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]),
976 /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]),
977 /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]),
978 /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]),
979 /// ]);
980 /// ```
981 #[inline]
982 pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
983 SymmetricDifference(self.0.symmetric_difference())
984 }
985}
986
987impl<'f, I, S> Extend<I> for OpBuilder<'f>
988where
989 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
990 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
991{
992 fn extend<T>(&mut self, it: T)
993 where
994 T: IntoIterator<Item = I>,
995 {
996 for stream in it {
997 self.push(stream);
998 }
999 }
1000}
1001
1002impl<'f, I, S> FromIterator<I> for OpBuilder<'f>
1003where
1004 I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>,
1005 S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>,
1006{
1007 fn from_iter<T>(it: T) -> Self
1008 where
1009 T: IntoIterator<Item = I>,
1010 {
1011 let mut op = OpBuilder::new();
1012 op.extend(it);
1013 op
1014 }
1015}
1016
1017/// A stream of set union over multiple map streams in lexicographic order.
1018///
1019/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1020pub struct Union<'m>(raw::Union<'m>);
1021
1022impl<'a, 'm> Streamer<'a> for Union<'m> {
1023 type Item = (&'a [u8], &'a [IndexedValue]);
1024
1025 #[inline]
1026 fn next(&'a mut self) -> Option<Self::Item> {
1027 self.0.next()
1028 }
1029}
1030
1031/// A stream of set union over multiple map streams in lexicographic order.
1032///
1033/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1034pub struct Chain<'m>(raw::Chain<'m>);
1035
1036impl<'a, 'm> Streamer<'a> for Chain<'m> {
1037 type Item = (&'a [u8], Output);
1038
1039 #[inline]
1040 fn next(&'a mut self) -> Option<Self::Item> {
1041 self.0.next()
1042 }
1043}
1044
1045/// A stream of set intersection over multiple map streams in lexicographic
1046/// order.
1047///
1048/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1049pub struct Intersection<'m>(raw::Intersection<'m>);
1050
1051impl<'a, 'm> Streamer<'a> for Intersection<'m> {
1052 type Item = (&'a [u8], &'a [IndexedValue]);
1053
1054 #[inline]
1055 fn next(&'a mut self) -> Option<Self::Item> {
1056 self.0.next()
1057 }
1058}
1059
1060/// A stream of set difference over multiple map streams in lexicographic
1061/// order.
1062///
1063/// The difference operation is taken with respect to the first stream and the
1064/// rest of the streams. i.e., All elements in the first stream that do not
1065/// appear in any other streams.
1066///
1067/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1068pub struct Difference<'m>(raw::Difference<'m>);
1069
1070impl<'a, 'm> Streamer<'a> for Difference<'m> {
1071 type Item = (&'a [u8], &'a [IndexedValue]);
1072
1073 #[inline]
1074 fn next(&'a mut self) -> Option<Self::Item> {
1075 self.0.next()
1076 }
1077}
1078
1079/// A stream of set symmetric difference over multiple map streams in
1080/// lexicographic order.
1081///
1082/// The `'m` lifetime parameter refers to the lifetime of the underlying map.
1083pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>);
1084
1085impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> {
1086 type Item = (&'a [u8], &'a [IndexedValue]);
1087
1088 #[inline]
1089 fn next(&'a mut self) -> Option<Self::Item> {
1090 self.0.next()
1091 }
1092}
1093
1094/// A specialized stream for mapping map streams (`(&[u8], u64)`) to streams
1095/// used by raw fsts (`(&[u8], Output)`).
1096///
1097/// If this were iterators, we could use `iter::Map`, but doing this on streams
1098/// requires HKT, so we need to write out the monomorphization ourselves.
1099struct StreamOutput<S>(S);
1100
1101impl<'a, S> Streamer<'a> for StreamOutput<S>
1102where
1103 S: Streamer<'a, Item = (&'a [u8], u64)>,
1104{
1105 type Item = (&'a [u8], raw::Output);
1106
1107 fn next(&'a mut self) -> Option<Self::Item> {
1108 self.0.next().map(|(k, v)| (k, raw::Output::new(v)))
1109 }
1110}
1111
1112/// A lexicographically ordered stream of key-value from a map
1113/// along with the states of the automaton.
1114///
1115/// The `Stream` type is based on the `StreamWithState`.
1116pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>)
1117where
1118 A: Automaton;
1119
1120impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A>
1121where
1122 A::State: Clone,
1123{
1124 type Item = (&'a [u8], u64, A::State);
1125
1126 fn next(&'a mut self) -> Option<Self::Item> {
1127 self.0
1128 .next()
1129 .map(|(key, out, state)| (key, out.value(), state))
1130 }
1131}