tantivy_fst/raw/
build.rs

1use std::io::{self, Write};
2
3use byteorder::{LittleEndian, WriteBytesExt};
4
5use crate::error::Result;
6use crate::raw::counting_writer::CountingWriter;
7use crate::raw::error::Error;
8use crate::raw::registry::{Registry, RegistryEntry};
9use crate::raw::{CompiledAddr, FstType, Output, Transition, EMPTY_ADDRESS, NONE_ADDRESS, VERSION};
10// use raw::registry_minimal::{Registry, RegistryEntry};
11use crate::stream::{IntoStreamer, Streamer};
12
13/// A builder for creating a finite state transducer.
14///
15/// This is not your average everyday builder. It has two important qualities
16/// that make it a bit unique from what you might expect:
17///
18/// 1. All keys must be added in lexicographic order. Adding a key out of order
19///    will result in an error. Additionally, adding a duplicate key with an
20///    output value will also result in an error. That is, once a key is
21///    associated with a value, that association can never be modified or
22///    deleted.
23/// 2. The representation of an fst is streamed to *any* `io::Write` as it is
24///    built. For an in memory representation, this can be a `Vec<u8>`.
25///
26/// Point (2) is especially important because it means that an fst can be
27/// constructed *without storing the entire fst in memory*. Namely, since it
28/// works with any `io::Write`, it can be streamed directly to a file.
29///
30/// With that said, the builder does use memory, but **memory usage is bounded
31/// to a constant size**. The amount of memory used trades off with the
32/// compression ratio. Currently, the implementation hard codes this trade off
33/// which can result in about 5-20MB of heap usage during construction. (N.B.
34/// Guaranteeing a maximal compression ratio requires memory proportional to
35/// the size of the fst, which defeats some of the benefit of streaming
36/// it to disk. In practice, a small bounded amount of memory achieves
37/// close-to-minimal compression ratios.)
38///
39/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
40/// number of elements added to the fst.
41pub struct Builder<W> {
42    /// The FST raw data is written directly to `wtr`.
43    ///
44    /// No internal buffering is done.
45    wtr: CountingWriter<W>,
46    /// The stack of unfinished nodes.
47    ///
48    /// An unfinished node is a node that could potentially have a new
49    /// transition added to it when a new word is added to the dictionary.
50    unfinished: UnfinishedNodes,
51    /// A map of finished nodes.
52    ///
53    /// A finished node is one that has been compiled and written to `wtr`.
54    /// After this point, the node is considered immutable and will never
55    /// Achange.
56    registry: Registry,
57    /// The last word added.
58    ///
59    /// This is used to enforce the invariant that words are added in sorted
60    /// order.
61    last: Option<Vec<u8>>,
62    /// The address of the last compiled node.
63    ///
64    /// This is used to optimize states with one transition that point
65    /// to the previously compiled node. (The previously compiled node in
66    /// this case actually corresponds to the next state for the transition,
67    /// since states are compiled in reverse.)
68    last_addr: CompiledAddr,
69    /// The number of keys added.
70    len: usize,
71}
72
73#[derive(Debug)]
74struct UnfinishedNodes {
75    stack: Vec<BuilderNodeUnfinished>,
76}
77
78#[derive(Debug)]
79struct BuilderNodeUnfinished {
80    node: BuilderNode,
81    last: Option<LastTransition>,
82}
83
84#[derive(Debug, Hash, Eq, PartialEq)]
85pub struct BuilderNode {
86    pub is_final: bool,
87    pub final_output: Output,
88    pub trans: Vec<Transition>,
89}
90
91#[derive(Debug)]
92struct LastTransition {
93    inp: u8,
94    out: Output,
95}
96
97impl Builder<Vec<u8>> {
98    /// Create a builder that builds an fst in memory.
99    #[inline]
100    pub fn memory() -> Self {
101        Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
102    }
103}
104
105impl<W: io::Write> Builder<W> {
106    /// Create a builder that builds an fst by writing it to `wtr` in a
107    /// streaming fashion.
108    pub fn new(wtr: W) -> Result<Builder<W>> {
109        Builder::new_type(wtr, 0)
110    }
111
112    /// The same as `new`, except it sets the type of the fst to the type
113    /// given.
114    pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
115        let mut wtr = CountingWriter::new(wtr);
116        // Don't allow any nodes to have address 0-7. We use these to encode
117        // the API version. We also use addresses `0` and `1` as special
118        // sentinel values, so they should never correspond to a real node.
119        wtr.write_u64::<LittleEndian>(VERSION)?;
120        // Similarly for 8-15 for the fst type.
121        wtr.write_u64::<LittleEndian>(ty)?;
122        Ok(Builder {
123            wtr,
124            unfinished: UnfinishedNodes::new(),
125            registry: Registry::new(10_000, 2),
126            last: None,
127            last_addr: NONE_ADDRESS,
128            len: 0,
129        })
130    }
131
132    /// Adds a byte string to this FST with a zero output value.
133    pub fn add<B>(&mut self, bs: B) -> Result<()>
134    where
135        B: AsRef<[u8]>,
136    {
137        self.check_last_key(bs.as_ref(), false)?;
138        self.insert_output(bs, None)
139    }
140
141    /// Insert a new key-value pair into the fst.
142    ///
143    /// Keys must be convertible to byte strings. Values must be a `u64`, which
144    /// is a restriction of the current implementation of finite state
145    /// transducers. (Values may one day be expanded to other types.)
146    ///
147    /// If a key is inserted that is less than or equal to any previous key
148    /// added, then an error is returned. Similarly, if there was a problem
149    /// writing to the underlying writer, an error is returned.
150    pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
151    where
152        B: AsRef<[u8]>,
153    {
154        self.check_last_key(bs.as_ref(), true)?;
155        self.insert_output(bs, Some(Output::new(val)))
156    }
157
158    /// Calls insert on each item in the iterator.
159    ///
160    /// If an error occurred while adding an element, processing is stopped
161    /// and the error is returned.
162    ///
163    /// If a key is inserted that is less than or equal to any previous key
164    /// added, then an error is returned. Similarly, if there was a problem
165    /// writing to the underlying writer, an error is returned.
166    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
167    where
168        T: AsRef<[u8]>,
169        I: IntoIterator<Item = (T, Output)>,
170    {
171        for (key, out) in iter {
172            self.insert(key, out.value())?;
173        }
174        Ok(())
175    }
176
177    /// Calls insert on each item in the stream.
178    ///
179    /// Note that unlike `extend_iter`, this is not generic on the items in
180    /// the stream.
181    ///
182    /// If a key is inserted that is less than or equal to any previous key
183    /// added, then an error is returned. Similarly, if there was a problem
184    /// writing to the underlying writer, an error is returned.
185    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
186    where
187        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
188        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
189    {
190        let mut stream = stream.into_stream();
191        while let Some((key, out)) = stream.next() {
192            self.insert(key, out.value())?;
193        }
194        Ok(())
195    }
196
197    /// Finishes the construction of the fst and flushes the underlying
198    /// writer. After completion, the data written to `W` may be read using
199    /// one of `Fst`'s constructor methods.
200    pub fn finish(self) -> Result<()> {
201        self.into_inner()?;
202        Ok(())
203    }
204
205    /// Just like `finish`, except it returns the underlying writer after
206    /// flushing it.
207    pub fn into_inner(mut self) -> Result<W> {
208        self.compile_from(0)?;
209        let root_node = self.unfinished.pop_root();
210        let root_addr = self.compile(&root_node)?;
211        self.wtr.write_u64::<LittleEndian>(self.len as u64)?;
212        self.wtr.write_u64::<LittleEndian>(root_addr as u64)?;
213        self.wtr.flush()?;
214        Ok(self.wtr.into_inner())
215    }
216
217    fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
218    where
219        B: AsRef<[u8]>,
220    {
221        let bs = bs.as_ref();
222        if bs.is_empty() {
223            self.len = 1; // must be first key, so length is always 1
224            self.unfinished
225                .set_root_output(out.unwrap_or_else(Output::zero));
226            return Ok(());
227        }
228        let (prefix_len, out) = if let Some(out) = out {
229            self.unfinished.find_common_prefix_and_set_output(bs, out)
230        } else {
231            (self.unfinished.find_common_prefix(bs), Output::zero())
232        };
233        if prefix_len == bs.len() {
234            // If the prefix found consumes the entire set of bytes, then
235            // the prefix *equals* the bytes given. This means it is a
236            // duplicate value with no output. So we can give up here.
237            //
238            // If the below assert fails, then that means we let a duplicate
239            // value through even when inserting outputs.
240            assert!(out.is_zero());
241            return Ok(());
242        }
243        self.len += 1;
244        self.compile_from(prefix_len)?;
245        self.unfinished.add_suffix(&bs[prefix_len..], out);
246        Ok(())
247    }
248
249    fn compile_from(&mut self, istate: usize) -> Result<()> {
250        let mut addr = NONE_ADDRESS;
251        while istate + 1 < self.unfinished.len() {
252            let node = if addr == NONE_ADDRESS {
253                self.unfinished.pop_empty()
254            } else {
255                self.unfinished.pop_freeze(addr)
256            };
257            addr = self.compile(&node)?;
258            assert_ne!(addr, NONE_ADDRESS);
259        }
260        self.unfinished.top_last_freeze(addr);
261        Ok(())
262    }
263
264    #[inline]
265    fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
266        if node.is_final && node.trans.is_empty() && node.final_output.is_zero() {
267            return Ok(EMPTY_ADDRESS);
268        }
269        let entry = self.registry.entry(&node);
270        if let RegistryEntry::Found(ref addr) = entry {
271            return Ok(*addr);
272        }
273        let start_addr = self.wtr.count() as CompiledAddr;
274        node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
275        self.last_addr = self.wtr.count() as CompiledAddr - 1;
276        if let RegistryEntry::NotFound(cell) = entry {
277            cell.insert(self.last_addr);
278        }
279        Ok(self.last_addr)
280    }
281
282    fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
283        if let Some(ref mut last) = self.last {
284            if check_dupe && bs == &**last {
285                return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
286            }
287            if bs < &**last {
288                return Err(Error::OutOfOrder {
289                    previous: last.to_vec(),
290                    got: bs.to_vec(),
291                }
292                .into());
293            }
294            last.clear();
295            for &b in bs {
296                last.push(b);
297            }
298        } else {
299            self.last = Some(bs.to_vec());
300        }
301        Ok(())
302    }
303
304    /// Gets a reference to the underlying writer.
305    pub fn get_ref(&self) -> &W {
306        self.wtr.get_ref()
307    }
308
309    /// Returns the number of bytes written to the underlying writer
310    pub fn bytes_written(&self) -> u64 {
311        self.wtr.count()
312    }
313}
314
315impl UnfinishedNodes {
316    fn new() -> UnfinishedNodes {
317        let mut unfinished = UnfinishedNodes {
318            stack: Vec::with_capacity(64),
319        };
320        unfinished.push_empty(false);
321        unfinished
322    }
323
324    fn len(&self) -> usize {
325        self.stack.len()
326    }
327
328    fn push_empty(&mut self, is_final: bool) {
329        self.stack.push(BuilderNodeUnfinished {
330            node: BuilderNode {
331                is_final,
332                ..BuilderNode::default()
333            },
334            last: None,
335        });
336    }
337
338    fn pop_root(&mut self) -> BuilderNode {
339        assert_eq!(self.stack.len(), 1);
340        assert!(self.stack[0].last.is_none());
341        self.stack.pop().unwrap().node
342    }
343
344    fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
345        let mut unfinished = self.stack.pop().unwrap();
346        unfinished.last_compiled(addr);
347        unfinished.node
348    }
349
350    fn pop_empty(&mut self) -> BuilderNode {
351        let unfinished = self.stack.pop().unwrap();
352        assert!(unfinished.last.is_none());
353        unfinished.node
354    }
355
356    fn set_root_output(&mut self, out: Output) {
357        self.stack[0].node.is_final = true;
358        self.stack[0].node.final_output = out;
359    }
360
361    fn top_last_freeze(&mut self, addr: CompiledAddr) {
362        let last = self.stack.len().checked_sub(1).unwrap();
363        self.stack[last].last_compiled(addr);
364    }
365
366    fn add_suffix(&mut self, bs: &[u8], out: Output) {
367        if bs.is_empty() {
368            return;
369        }
370        let last = self.stack.len().checked_sub(1).unwrap();
371        assert!(self.stack[last].last.is_none());
372        self.stack[last].last = Some(LastTransition { inp: bs[0], out });
373        for &b in &bs[1..] {
374            self.stack.push(BuilderNodeUnfinished {
375                node: BuilderNode::default(),
376                last: Some(LastTransition {
377                    inp: b,
378                    out: Output::zero(),
379                }),
380            });
381        }
382        self.push_empty(true);
383    }
384
385    fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
386        bs.iter()
387            .zip(&self.stack)
388            .take_while(|&(&b, ref node)| node.last.as_ref().map(|t| t.inp == b).unwrap_or(false))
389            .count()
390    }
391
392    fn find_common_prefix_and_set_output(&mut self, bs: &[u8], mut out: Output) -> (usize, Output) {
393        let mut i = 0;
394        while i < bs.len() {
395            let add_prefix = match self.stack[i].last.as_mut() {
396                Some(ref mut t) if t.inp == bs[i] => {
397                    i += 1;
398                    let common_pre = t.out.prefix(out);
399                    let add_prefix = t.out.sub(common_pre);
400                    out = out.sub(common_pre);
401                    t.out = common_pre;
402                    add_prefix
403                }
404                _ => break,
405            };
406            if !add_prefix.is_zero() {
407                self.stack[i].add_output_prefix(add_prefix);
408            }
409        }
410        (i, out)
411    }
412}
413
414impl BuilderNodeUnfinished {
415    fn last_compiled(&mut self, addr: CompiledAddr) {
416        if let Some(trans) = self.last.take() {
417            self.node.trans.push(Transition {
418                inp: trans.inp,
419                out: trans.out,
420                addr,
421            });
422        }
423    }
424
425    fn add_output_prefix(&mut self, prefix: Output) {
426        if self.node.is_final {
427            self.node.final_output = prefix.cat(self.node.final_output);
428        }
429        for t in &mut self.node.trans {
430            t.out = prefix.cat(t.out);
431        }
432        if let Some(ref mut t) = self.last {
433            t.out = prefix.cat(t.out);
434        }
435    }
436}
437
438impl Clone for BuilderNode {
439    fn clone(&self) -> Self {
440        BuilderNode {
441            is_final: self.is_final,
442            final_output: self.final_output,
443            trans: self.trans.clone(),
444        }
445    }
446
447    fn clone_from(&mut self, source: &Self) {
448        self.is_final = source.is_final;
449        self.final_output = source.final_output;
450        self.trans.clear();
451        self.trans.extend(source.trans.iter());
452    }
453}
454
455impl Default for BuilderNode {
456    fn default() -> Self {
457        BuilderNode {
458            is_final: false,
459            final_output: Output::zero(),
460            trans: vec![],
461        }
462    }
463}