elasticlunr/
lib.rs

1//!# elasticlunr-rs
2//!
3//! [![Build Status](https://travis-ci.org/mattico/elasticlunr-rs.svg?branch=master)](https://travis-ci.org/mattico/elasticlunr-rs)
4//! [![Documentation](https://docs.rs/elasticlunr-rs/badge.svg)](https://docs.rs/elasticlunr-rs)
5//! [![Crates.io](https://img.shields.io/crates/v/elasticlunr-rs.svg)](https://crates.io/crates/elasticlunr-rs)
6//!
7//! A partial port of [elasticlunr](https://github.com/weixsong/elasticlunr.js) to Rust. Intended to
8//! be used for generating compatible search indices.
9//!
10//! Access to all index-generating functionality is provided. Most users will only need to use the
11//! [`Index`](struct.Index.html) or [`IndexBuilder`](struct.IndexBuilder.html) types.
12//!
13//! The [`Language`] trait can be used to implement a custom language.
14//!
15//! ## Example
16//!
17//! ```
18//! use std::fs::File;
19//! use std::io::Write;
20//! use elasticlunr::Index;
21//!
22//! let mut index = Index::new(&["title", "body"]);
23//! index.add_doc("1", &["This is a title", "This is body text!"]);
24//! // Add more docs...
25//! let mut file = File::create("out.json").unwrap();
//! file.write_all(index.to_json_pretty().as_bytes()).unwrap();
27//! ```
28
29#[macro_use]
30extern crate serde_derive;
31
32#[cfg(test)]
33#[macro_use]
34extern crate maplit;
35
/// The version of elasticlunr.js this library was designed for.
///
/// This string is written into the `version` field of every `Index` produced by
/// `IndexBuilder::build`.
pub const ELASTICLUNR_VERSION: &str = "0.9.5";
38
39pub mod config;
40pub mod document_store;
41pub mod inverted_index;
42pub mod lang;
43pub mod pipeline;
44
45use std::collections::BTreeMap;
46
47use document_store::DocumentStore;
48use inverted_index::InvertedIndex;
49use lang::English;
50pub use lang::Language;
51pub use pipeline::Pipeline;
52
/// An optional per-field tokenizer: a boxed closure that turns a field's text into a list of
/// tokens.
///
/// `None` means no custom tokenizer was supplied for the field; `Index::add_doc` then uses the
/// `tokenize` method of the index's language instead.
type Tokenizer = Option<Box<dyn Fn(&str) -> Vec<String>>>;
54
/// A builder for an `Index` with custom parameters.
///
/// # Example
/// ```
/// # use elasticlunr::{Index, IndexBuilder};
/// let mut index = IndexBuilder::new()
///     .save_docs(false)
///     .add_fields(&["title", "subtitle", "body"])
///     .set_ref("doc_id")
///     .build();
/// index.add_doc("doc_a", &["Chapter 1", "Welcome to Copenhagen", "..."]);
/// ```
pub struct IndexBuilder {
    /// Whether documents are saved in the built index's document store (defaults to `true`).
    save: bool,
    /// Names of the document fields, in the order they were added.
    fields: Vec<String>,
    /// Custom tokenizers, one entry per element of `fields` (`None` when the field was added
    /// without a tokenizer).
    field_tokenizers: Vec<Tokenizer>,
    /// Key under which the document reference is stored (defaults to `"id"`).
    ref_field: String,
    /// Pipeline override; when `None`, `build` asks the language for its pipeline.
    pipeline: Option<Pipeline>,
    /// Language used for tokenizing and for creating the default pipeline (defaults to English).
    language: Box<dyn Language>,
}
75
76impl Default for IndexBuilder {
77    fn default() -> Self {
78        IndexBuilder {
79            save: true,
80            fields: Vec::new(),
81            field_tokenizers: Vec::new(),
82            ref_field: "id".into(),
83            pipeline: None,
84            language: Box::new(English::new()),
85        }
86    }
87}
88
89impl IndexBuilder {
90    pub fn new() -> Self {
91        Default::default()
92    }
93
94    pub fn with_language(language: Box<dyn Language>) -> Self {
95        Self {
96            language,
97            ..Default::default()
98        }
99    }
100
101    /// Set whether or not documents should be saved in the `Index`'s document store.
102    pub fn save_docs(mut self, save: bool) -> Self {
103        self.save = save;
104        self
105    }
106
107    /// Add a document field to the `Index`.
108    ///
109    /// # Panics
110    ///
111    /// Panics if a field with the name already exists.
112    pub fn add_field(mut self, field: &str) -> Self {
113        let field = field.into();
114        if self.fields.contains(&field) {
115            panic!("Duplicate fields in index: {}", field);
116        }
117        self.fields.push(field);
118        self.field_tokenizers.push(None);
119        self
120    }
121
122    /// Add a document field to the `Index`, with a custom tokenizer for that field.
123    ///
124    /// # Panics
125    ///
126    /// Panics if a field with the name already exists.
127    pub fn add_field_with_tokenizer(
128        mut self,
129        field: &str,
130        tokenizer: Box<dyn Fn(&str) -> Vec<String>>,
131    ) -> Self {
132        let field = field.into();
133        if self.fields.contains(&field) {
134            panic!("Duplicate fields in index: {}", field);
135        }
136        self.fields.push(field);
137        self.field_tokenizers.push(Some(tokenizer));
138        self
139    }
140
141    /// Add the document fields to the `Index`.
142    ///
143    /// # Panics
144    ///
145    /// Panics if two fields have the same name.
146    pub fn add_fields<I>(mut self, fields: I) -> Self
147    where
148        I: IntoIterator,
149        I::Item: AsRef<str>,
150    {
151        for field in fields {
152            self = self.add_field(field.as_ref())
153        }
154        self
155    }
156
157    /// Set the key used to store the document reference field.
158    pub fn set_ref(mut self, ref_field: &str) -> Self {
159        self.ref_field = ref_field.into();
160        self
161    }
162
163    /// Build an `Index` from this builder.
164    pub fn build(self) -> Index {
165        let IndexBuilder {
166            save,
167            fields,
168            field_tokenizers,
169            ref_field,
170            pipeline,
171            language,
172        } = self;
173
174        let index = fields
175            .iter()
176            .map(|f| (f.clone(), InvertedIndex::new()))
177            .collect();
178
179        let pipeline = pipeline.unwrap_or_else(|| language.make_pipeline());
180
181        Index {
182            index,
183            fields,
184            field_tokenizers,
185            ref_field,
186            document_store: DocumentStore::new(save),
187            pipeline,
188            version: crate::ELASTICLUNR_VERSION,
189            lang: language,
190        }
191    }
192}
193
/// An elasticlunr search index.
///
/// Field names are serialized in camelCase (`document_store` becomes `documentStore`), and
/// `ref_field` is serialized under the key `ref`.
#[derive(Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct Index {
    /// Names of the indexed fields; `add_doc` expects its data in this order.
    fields: Vec<String>,
    /// Custom tokenizers, one entry per field. Never serialized or deserialized.
    #[serde(skip)]
    field_tokenizers: Vec<Tokenizer>,
    /// Pipeline that raw tokens are run through before they are indexed.
    pipeline: Pipeline,
    /// Key under which a document's reference is stored.
    #[serde(rename = "ref")]
    ref_field: String,
    /// Version string; `IndexBuilder::build` sets it to `ELASTICLUNR_VERSION`.
    version: &'static str,
    /// One inverted index per field, keyed by field name.
    index: BTreeMap<String, InvertedIndex>,
    /// Store that receives each document and its per-field token counts.
    document_store: DocumentStore,
    /// Language of the index; (de)serialized as its name via the `ser_lang` module.
    #[serde(with = "ser_lang")]
    lang: Box<dyn Language>,
}
210
211mod ser_lang {
212    use crate::Language;
213    use serde::de;
214    use serde::{Deserializer, Serializer};
215    use std::fmt;
216
217    pub fn serialize<S>(lang: &Box<dyn Language>, serializer: S) -> Result<S::Ok, S::Error>
218    where
219        S: Serializer,
220    {
221        serializer.serialize_str(&lang.name())
222    }
223
224    pub fn deserialize<'de, D>(deserializer: D) -> Result<Box<dyn Language>, D::Error>
225    where
226        D: Deserializer<'de>,
227    {
228        deserializer.deserialize_str(LanguageVisitor)
229    }
230
231    struct LanguageVisitor;
232
233    impl<'de> de::Visitor<'de> for LanguageVisitor {
234        type Value = Box<dyn Language>;
235
236        fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
237            formatter.write_str("a capitalized language name")
238        }
239
240        fn visit_borrowed_str<E>(self, v: &'de str) -> Result<Self::Value, E>
241        where
242            E: de::Error,
243        {
244            match crate::lang::from_name(v) {
245                Some(l) => Ok(l),
246                None => Err(E::custom(format!("Unknown language name: {}", v))),
247            }
248        }
249    }
250}
251
252impl Index {
253    /// Create a new index with the provided fields.
254    ///
255    /// # Example
256    ///
257    /// ```
258    /// # use elasticlunr::{Index};
259    /// let mut index = Index::new(&["title", "body"]);
260    /// index.add_doc("1", &["this is a title", "this is body text"]);
261    /// ```
262    ///
263    /// # Panics
264    ///
265    /// Panics if a field with the name already exists.
266    pub fn new<I>(fields: I) -> Self
267    where
268        I: IntoIterator,
269        I::Item: AsRef<str>,
270    {
271        IndexBuilder::new().add_fields(fields).build()
272    }
273
274    /// Create a new index with the provided fields for the given
275    /// [`Language`](lang/enum.Language.html).
276    ///
277    /// # Example
278    ///
279    /// ```
280    /// use elasticlunr::{Index, lang::English};
281    /// let mut index = Index::with_language(Box::new(English::new()), &["title", "body"]);
282    /// index.add_doc("1", &["this is a title", "this is body text"]);
283    /// ```
284    ///
285    /// # Panics
286    ///
287    /// Panics if a field with the name already exists.
288    pub fn with_language<I>(lang: Box<dyn Language>, fields: I) -> Self
289    where
290        I: IntoIterator,
291        I::Item: AsRef<str>,
292    {
293        IndexBuilder::with_language(lang).add_fields(fields).build()
294    }
295
296    /// Add the data from a document to the index.
297    ///
298    /// *NOTE: The elements of `data` should be provided in the same order as
299    /// the fields used to create the index.*
300    ///
301    /// # Example
302    /// ```
303    /// # use elasticlunr::Index;
304    /// let mut index = Index::new(&["title", "body"]);
305    /// index.add_doc("1", &["this is a title", "this is body text"]);
306    /// ```
307    pub fn add_doc<I>(&mut self, doc_ref: &str, data: I)
308    where
309        I: IntoIterator,
310        I::Item: AsRef<str>,
311    {
312        let mut doc = BTreeMap::new();
313        doc.insert(self.ref_field.clone(), doc_ref.into());
314        let mut token_freq = BTreeMap::new();
315
316        for (i, value) in data.into_iter().enumerate() {
317            let field = &self.fields[i];
318            let tokenizer = self.field_tokenizers[i].as_ref();
319            doc.insert(field.clone(), value.as_ref().to_string());
320
321            if field == &self.ref_field {
322                continue;
323            }
324
325            let raw_tokens = if let Some(tokenizer) = tokenizer {
326                tokenizer(value.as_ref())
327            } else {
328                self.lang.tokenize(value.as_ref())
329            };
330
331            let tokens = self.pipeline.run(raw_tokens);
332
333            self.document_store
334                .add_field_length(doc_ref, field, tokens.len());
335
336            for token in tokens {
337                *token_freq.entry(token).or_insert(0u64) += 1;
338            }
339
340            for (token, count) in &token_freq {
341                let freq = (*count as f64).sqrt();
342
343                self.index
344                    .get_mut(field)
345                    .unwrap_or_else(|| panic!("InvertedIndex does not exist for field {}", field))
346                    .add_token(doc_ref, token, freq);
347            }
348        }
349
350        self.document_store.add_doc(doc_ref, doc);
351    }
352
353    pub fn get_fields(&self) -> &[String] {
354        &self.fields
355    }
356
357    /// Returns the index, serialized to pretty-printed JSON.
358    pub fn to_json_pretty(&self) -> String {
359        serde_json::to_string_pretty(&self).unwrap()
360    }
361
362    /// Returns the index, serialized to JSON.
363    pub fn to_json(&self) -> String {
364        serde_json::to_string(&self).unwrap()
365    }
366}
367
#[cfg(test)]
mod tests {
    use super::*;

    /// Every field handed to the builder shows up exactly once in the built index.
    #[test]
    fn add_field_to_builder() {
        let names = ["foo", "bar", "baz"];
        let idx = IndexBuilder::new().add_fields(&names).build();

        let idx_fields = idx.get_fields();
        for expected in names.iter() {
            let occurrences = idx_fields
                .iter()
                .filter(|name| name.as_str() == *expected)
                .count();
            assert_eq!(occurrences, 1);
        }
    }

    /// Adding a document stores it, keyed by the default `id` reference field.
    #[test]
    fn adding_document_to_index() {
        let mut idx = Index::new(&["body"]);
        idx.add_doc("1", &["this is a test"]);

        assert_eq!(idx.document_store.len(), 1);

        let stored = idx.document_store.get_doc("1").unwrap();
        assert_eq!(
            stored,
            btreemap! {
                "id".into() => "1".into(),
                "body".into() => "this is a test".into(),
            }
        );
    }

    /// An empty field does not prevent the remaining fields from being indexed.
    #[test]
    fn adding_document_with_empty_field() {
        let mut idx = Index::new(&["title", "body"]);
        idx.add_doc("1", &["", "test"]);

        let body_index = &idx.index["body"];
        assert_eq!(body_index.get_doc_frequency("test"), 1);
        assert_eq!(body_index.get_docs("test").unwrap()["1"], 1.);
    }

    /// Duplicate field names are rejected with a panic.
    #[test]
    #[should_panic]
    fn creating_index_with_identical_fields_panics() {
        let _idx = Index::new(&["title", "body", "title"]);
    }
}