elasticlunr/
inverted_index.rs

1//! Implements an elasticlunr.js inverted index. Most users do not need to use this module directly.
2
3use std::collections::BTreeMap;
4
5#[derive(Debug, Copy, Clone, Serialize, Deserialize, PartialEq)]
6struct TermFrequency {
7    #[serde(rename = "tf")]
8    pub term_freq: f64,
9}
10
11#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Default)]
12struct IndexItem {
13    pub docs: BTreeMap<String, TermFrequency>,
14    #[serde(rename = "df")]
15    pub doc_freq: i64,
16    #[serde(flatten, serialize_with = "IndexItem::serialize")]
17    pub children: BTreeMap<char, IndexItem>,
18}
19
20impl IndexItem {
21    fn new() -> Self {
22        Default::default()
23    }
24
25    fn serialize<S>(map: &BTreeMap<char, IndexItem>, ser: S) -> Result<S::Ok, S::Error>
26    where
27        S: ::serde::Serializer,
28    {
29        use serde::ser::SerializeMap;
30
31        let mut ser_map = ser.serialize_map(Some(map.len()))?;
32        let mut buf = [0u8; 4];
33        for (key, value) in map {
34            let key = key.encode_utf8(&mut buf);
35            ser_map.serialize_entry(key, value)?;
36        }
37        ser_map.end()
38    }
39
40    fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) {
41        let mut iter = token.chars();
42        if let Some(character) = iter.next() {
43            let mut item = self
44                .children
45                .entry(character)
46                .or_insert_with(IndexItem::new);
47
48            for character in iter {
49                let tmp = item;
50                item = tmp.children.entry(character).or_insert_with(IndexItem::new);
51            }
52
53            if !item.docs.contains_key(doc_ref) {
54                item.doc_freq += 1;
55            }
56            item.docs
57                .insert(doc_ref.into(), TermFrequency { term_freq });
58        }
59    }
60
61    fn get_node(&self, token: &str) -> Option<&IndexItem> {
62        let mut root = self;
63        for ch in token.chars() {
64            if let Some(item) = root.children.get(&ch) {
65                root = item;
66            } else {
67                return None;
68            }
69        }
70
71        Some(root)
72    }
73
74    fn remove_token(&mut self, doc_ref: &str, token: &str) {
75        let mut iter = token.char_indices();
76        if let Some((_, ch)) = iter.next() {
77            if let Some(item) = self.children.get_mut(&ch) {
78                if let Some((idx, _)) = iter.next() {
79                    item.remove_token(doc_ref, &token[idx..]);
80                } else if item.docs.contains_key(doc_ref) {
81                    item.docs.remove(doc_ref);
82                    item.doc_freq -= 1;
83                }
84            }
85        }
86    }
87}
88
89/// Implements an elasticlunr.js inverted index. Most users do not need to use this type directly.
90#[derive(Serialize, Deserialize, Debug, PartialEq, Default)]
91pub struct InvertedIndex {
92    root: IndexItem,
93}
94
95impl InvertedIndex {
96    pub fn new() -> Self {
97        Default::default()
98    }
99
100    pub fn add_token(&mut self, doc_ref: &str, token: &str, term_freq: f64) {
101        self.root.add_token(doc_ref, token, term_freq)
102    }
103
104    pub fn has_token(&self, token: &str) -> bool {
105        self.root.get_node(token).map_or(false, |_| true)
106    }
107
108    pub fn remove_token(&mut self, doc_ref: &str, token: &str) {
109        self.root.remove_token(doc_ref, token)
110    }
111
112    pub fn get_docs(&self, token: &str) -> Option<BTreeMap<String, f64>> {
113        self.root.get_node(token).map(|node| {
114            node.docs
115                .iter()
116                .map(|(k, &v)| (k.clone(), v.term_freq))
117                .collect()
118        })
119    }
120
121    pub fn get_term_frequency(&self, doc_ref: &str, token: &str) -> f64 {
122        self.root
123            .get_node(token)
124            .and_then(|node| node.docs.get(doc_ref))
125            .map_or(0., |docs| docs.term_freq)
126    }
127
128    pub fn get_doc_frequency(&self, token: &str) -> i64 {
129        self.root.get_node(token).map_or(0, |node| node.doc_freq)
130    }
131}
132
133#[cfg(test)]
134mod tests {
135    use super::*;
136
137    #[test]
138    fn adding_token() {
139        let mut inverted_index = InvertedIndex::new();
140        let token = "foo";
141
142        inverted_index.add_token("123", token, 1.);
143        assert_eq!(inverted_index.get_doc_frequency("foo"), 1);
144        assert_eq!(inverted_index.get_term_frequency("123", "foo"), 1.);
145    }
146
147    #[test]
148    fn has_token() {
149        let mut inverted_index = InvertedIndex::new();
150        let token = "foo";
151
152        inverted_index.add_token("123", token, 1.);
153        assert!(inverted_index.has_token(token));
154        assert!(inverted_index.has_token("fo"));
155        assert!(inverted_index.has_token("f"));
156
157        assert!(!inverted_index.has_token("bar"));
158        assert!(!inverted_index.has_token("foo "));
159        assert!(!inverted_index.has_token("foo  "))
160    }
161
162    #[test]
163    fn adding_another_document_to_the_token() {
164        let mut inverted_index = InvertedIndex::new();
165        let token = "foo";
166
167        inverted_index.add_token("123", token, 1.);
168        inverted_index.add_token("456", token, 1.);
169
170        assert_eq!(inverted_index.get_term_frequency("123", "foo"), 1.);
171        assert_eq!(inverted_index.get_term_frequency("456", "foo"), 1.);
172        assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
173    }
174
175    #[test]
176    fn df_of_nonexistant_token() {
177        let mut inverted_index = InvertedIndex::new();
178        let token = "foo";
179
180        inverted_index.add_token("123", token, 1.);
181        inverted_index.add_token("456", token, 1.);
182
183        assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
184        assert_eq!(inverted_index.get_doc_frequency("fox"), 0);
185    }
186
187    #[test]
188    fn adding_existing_doc() {
189        let mut inverted_index = InvertedIndex::new();
190        let token = "foo";
191
192        inverted_index.add_token("123", token, 1.);
193        inverted_index.add_token("456", token, 1.);
194        inverted_index.add_token("456", token, 100.);
195
196        assert_eq!(inverted_index.get_term_frequency("456", "foo"), 100.);
197        assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
198    }
199
200    #[test]
201    fn checking_token_exists_in() {
202        let mut inverted_index = InvertedIndex::new();
203        let token = "foo";
204
205        inverted_index.add_token("123", token, 1.);
206
207        assert!(inverted_index.has_token(token));
208    }
209
210    #[test]
211    fn checking_if_a_token_does_not_exist() {
212        let mut inverted_index = InvertedIndex::new();
213        let token = "foo";
214
215        inverted_index.add_token("123", token, 1.);
216        assert!(!inverted_index.has_token("fooo"));
217        assert!(!inverted_index.has_token("bar"));
218        assert!(!inverted_index.has_token("fof"));
219    }
220
221    #[test]
222    fn retrieving_items() {
223        let mut inverted_index = InvertedIndex::new();
224        let token = "foo";
225
226        inverted_index.add_token("123", token, 1.);
227        assert_eq!(
228            inverted_index.get_docs(token).unwrap(),
229            btreemap! {
230                "123".into() => 1.
231            }
232        );
233
234        assert_eq!(inverted_index.get_docs(""), Some(BTreeMap::new()));
235
236        inverted_index.add_token("234", "boo", 100.);
237        inverted_index.add_token("345", "too", 101.);
238
239        assert_eq!(
240            inverted_index.get_docs(token).unwrap(),
241            btreemap! {
242                "123".into() => 1.
243            }
244        );
245
246        inverted_index.add_token("234", token, 100.);
247        inverted_index.add_token("345", token, 101.);
248
249        assert_eq!(
250            inverted_index.get_docs(token).unwrap(),
251            btreemap! {
252                "123".into() => 1.,
253                "234".into() => 100.,
254                "345".into() => 101.,
255            }
256        );
257    }
258
259    #[test]
260    fn retrieving_nonexistant_items() {
261        let inverted_index = InvertedIndex::new();
262
263        assert_eq!(inverted_index.get_docs("foo"), None);
264        assert_eq!(inverted_index.get_docs("fox"), None);
265    }
266
267    #[test]
268    fn df_of_items() {
269        let mut inverted_index = InvertedIndex::new();
270
271        inverted_index.add_token("123", "foo", 1.);
272        inverted_index.add_token("456", "foo", 1.);
273        inverted_index.add_token("789", "bar", 1.);
274
275        assert_eq!(inverted_index.get_doc_frequency("foo"), 2);
276        assert_eq!(inverted_index.get_doc_frequency("bar"), 1);
277        assert_eq!(inverted_index.get_doc_frequency("baz"), 0);
278        assert_eq!(inverted_index.get_doc_frequency("ba"), 0);
279        assert_eq!(inverted_index.get_doc_frequency("b"), 0);
280        assert_eq!(inverted_index.get_doc_frequency("fo"), 0);
281        assert_eq!(inverted_index.get_doc_frequency("f"), 0);
282    }
283
284    #[test]
285    fn removing_document_from_token() {
286        let mut inverted_index = InvertedIndex::new();
287        assert_eq!(inverted_index.get_docs("foo"), None);
288
289        inverted_index.add_token("123", "foo", 1.);
290        assert_eq!(
291            inverted_index.get_docs("foo").unwrap(),
292            btreemap! {
293                "123".into() => 1.,
294            }
295        );
296
297        inverted_index.remove_token("123", "foo");
298        assert_eq!(inverted_index.get_docs("foo"), Some(BTreeMap::new()));
299        assert_eq!(inverted_index.get_doc_frequency("foo"), 0);
300        assert_eq!(inverted_index.has_token("foo"), true);
301    }
302
303    #[test]
304    fn removing_nonexistant_document() {
305        let mut inverted_index = InvertedIndex::new();
306
307        inverted_index.add_token("123", "foo", 1.);
308        inverted_index.add_token("567", "bar", 1.);
309        inverted_index.remove_token("foo", "456");
310
311        assert_eq!(
312            inverted_index.get_docs("foo").unwrap(),
313            btreemap! {
314                "123".into() => 1.
315            }
316        );
317        assert_eq!(inverted_index.get_doc_frequency("foo"), 1);
318    }
319
320    #[test]
321    fn removing_documet_nonexistant_key() {
322        let mut inverted_index = InvertedIndex::new();
323
324        inverted_index.remove_token("123", "foo");
325        assert!(!inverted_index.has_token("foo"));
326        assert_eq!(inverted_index.get_doc_frequency("foo"), 0);
327    }
328
329    #[test]
330    fn get_term_frequency() {
331        let mut inverted_index = InvertedIndex::new();
332        let token = "foo";
333
334        inverted_index.add_token("123", token, 2.);
335        inverted_index.add_token("456", token, 3.);
336
337        assert_eq!(inverted_index.get_term_frequency("123", token), 2.);
338        assert_eq!(inverted_index.get_term_frequency("456", token), 3.);
339        assert_eq!(inverted_index.get_term_frequency("789", token), 0.);
340    }
341
342    #[test]
343    fn get_term_frequency_nonexistant_token() {
344        let mut inverted_index = InvertedIndex::new();
345        let token = "foo";
346
347        inverted_index.add_token("123", token, 2.);
348        inverted_index.add_token("456", token, 3.);
349
350        assert_eq!(inverted_index.get_term_frequency("123", "ken"), 0.);
351        assert_eq!(inverted_index.get_term_frequency("456", "ken"), 0.);
352    }
353
354    #[test]
355    fn get_term_frequency_nonexistant_docref() {
356        let mut inverted_index = InvertedIndex::new();
357        let token = "foo";
358
359        inverted_index.add_token("123", token, 2.);
360        inverted_index.add_token("456", token, 3.);
361
362        assert_eq!(inverted_index.get_term_frequency(token, "12"), 0.);
363        assert_eq!(inverted_index.get_term_frequency(token, "23"), 0.);
364        assert_eq!(inverted_index.get_term_frequency(token, "45"), 0.);
365    }
366
367    #[test]
368    fn get_term_frequency_nonexistant_token_and_docref() {
369        let mut inverted_index = InvertedIndex::new();
370        let token = "foo";
371
372        inverted_index.add_token("123", token, 2.);
373        inverted_index.add_token("456", token, 3.);
374
375        assert_eq!(inverted_index.get_term_frequency("token", "1"), 0.);
376        assert_eq!(inverted_index.get_term_frequency("abc", "2"), 0.);
377        assert_eq!(inverted_index.get_term_frequency("fo", "123"), 0.);
378    }
379}