elasticlunr/lang/
en.rs

1use super::{common::StopWordFilter, Language};
2use crate::pipeline::{FnWrapper, Pipeline, PipelineFn};
3use regex::Regex;
4
/// Word list handed to the `stopWordFilter` pipeline stage.
///
/// Entries are grouped by initial letter; the very first entry is the empty
/// string.
const WORDS: &[&str] = &[
    "",
    "a", "able", "about", "across", "after", "all", "almost", "also", "am", "among", "an", "and",
    "any", "are", "as", "at",
    "be", "because", "been", "but", "by",
    "can", "cannot", "could",
    "dear", "did", "do", "does",
    "either", "else", "ever", "every",
    "for", "from",
    "get", "got",
    "had", "has", "have", "he", "her", "hers", "him", "his", "how", "however",
    "i", "if", "in", "into", "is", "it", "its",
    "just",
    "least", "let", "like", "likely",
    "may", "me", "might", "most", "must", "my",
    "neither", "no", "nor", "not",
    "of", "off", "often", "on", "only", "or", "other", "our", "own",
    "rather",
    "said", "say", "says", "she", "should", "since", "so", "some",
    "than", "that", "the", "their", "them", "then", "there", "these", "they", "this", "tis", "to",
    "too", "twas",
    "us",
    "wants", "was", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why",
    "will", "with", "would",
    "yet", "you", "your",
];
17
18#[derive(Clone)]
19pub struct English {
20    stemmer: Stemmer,
21}
22
23impl English {
24    pub fn new() -> Self {
25        let stemmer = Stemmer::new();
26        Self { stemmer }
27    }
28}
29
30impl Language for English {
31    fn name(&self) -> String {
32        "English".into()
33    }
34    fn code(&self) -> String {
35        "en".into()
36    }
37
38    fn tokenize(&self, text: &str) -> Vec<String> {
39        super::tokenize_whitespace(text)
40    }
41
42    fn make_pipeline(&self) -> Pipeline {
43        Pipeline {
44            queue: vec![
45                Box::new(FnWrapper("trimmer".into(), trimmer)),
46                Box::new(StopWordFilter::new("stopWordFilter", WORDS)),
47                Box::new(self.stemmer.clone()),
48            ],
49        }
50    }
51}
52
/// Strips every leading and trailing character that is not an ASCII letter,
/// an ASCII digit, or an underscore. Interior characters are left untouched.
///
/// Always returns `Some`, even when the trimmed token is empty.
fn trimmer(token: String) -> Option<String> {
    let is_word_char = |c: char| c.is_ascii_alphanumeric() || c == '_';
    let trimmed = token.trim_matches(|c: char| !is_word_char(c));
    Some(trimmed.to_owned())
}
60
/// Step 2 suffix table: each pair is `(suffix, replacement)`.
///
/// The suffixes are the same alternatives, in the same order, as the
/// alternation compiled into the step 2 regex.
static STEP_2: &[(&str, &str)] = &[
    // -ational / -tional
    ("ational", "ate"), ("tional", "tion"),
    // -i endings
    ("enci", "ence"), ("anci", "ance"),
    ("izer", "ize"),
    ("bli", "ble"), ("alli", "al"), ("entli", "ent"), ("eli", "e"), ("ousli", "ous"),
    // -ation family
    ("ization", "ize"), ("ation", "ate"), ("ator", "ate"),
    ("alism", "al"),
    // -ness endings
    ("iveness", "ive"), ("fulness", "ful"), ("ousness", "ous"),
    // -iti endings
    ("aliti", "al"), ("iviti", "ive"), ("biliti", "ble"),
    ("logi", "log"),
];
84
/// Step 3 suffix table: each pair is `(suffix, replacement)`.
///
/// An empty replacement means the suffix is removed outright.
static STEP_3: &[(&str, &str)] = &[
    ("icate", "ic"), ("ative", ""), ("alize", "al"), ("iciti", "ic"),
    ("ical", "ic"), ("ful", ""), ("ness", ""),
];
94
// This is a direct port of the stemmer from elasticlunr.js.
// It is neither very efficient nor idiomatic Rust, but it
// generates identical output.
98
99#[derive(Clone)]
100struct Stemmer {
101    re_mgr0: Regex,
102    re_mgr1: Regex,
103    re_meq1: Regex,
104    re_s_v: Regex,
105
106    re_1a: Regex,
107    re2_1a: Regex,
108    re_1b: Regex,
109    re2_1b: Regex,
110    re2_1b_2: Regex,
111    re3_1b_2: Regex,
112    re4_1b_2: Regex,
113
114    re_1c: Regex,
115    re_2: Regex,
116
117    re_3: Regex,
118
119    re_4: Regex,
120    re2_4: Regex,
121
122    re_5: Regex,
123    re3_5: Regex,
124}
125
126impl PipelineFn for Stemmer {
127    fn name(&self) -> String {
128        "stemmer".into()
129    }
130
131    fn filter(&self, token: String) -> Option<String> {
132        Some(self.stem(token))
133    }
134}
135
// Character class matching a single vowel (`y` counts as a vowel here).
// Expands to a string literal so it can be spliced together with `concat!`.
macro_rules! V {
    () => ("[aeiouy]");
}
142
// Consonant sequence: one non-vowel, then any run of characters that are
// neither vowels nor `y`. Expands to a string literal for use in `concat!`.
macro_rules! CS {
    () => ("[^aeiou][^aeiouy]*");
}
149
// Vowel sequence: one vowel (or `y`), then any run of plain vowels.
// Expands to a string literal for use in `concat!`.
macro_rules! VS {
    () => ("[aeiouy][aeiou]*");
}
156
/// Joins all the given string slices, in order, into one freshly allocated
/// `String`. An empty slice yields an empty string.
#[inline]
fn concat_string(strs: &[&str]) -> String {
    strs.concat()
}
161
162impl Stemmer {
163    fn new() -> Self {
164        let mgr0 = concat!("^(", CS!(), ")?", VS!(), CS!());
165        let meq1 = concat!("^(", CS!(), ")?", VS!(), CS!(), "(", VS!(), ")?$");
166        let mgr1 = concat!("^(", CS!(), ")?", VS!(), CS!(), VS!(), CS!());
167        let s_v = concat!("^(", CS!(), ")?", V!());
168
169        let re_mgr0 = Regex::new(mgr0).unwrap();
170        let re_mgr1 = Regex::new(mgr1).unwrap();
171        let re_meq1 = Regex::new(meq1).unwrap();
172        let re_s_v = Regex::new(s_v).unwrap();
173
174        let re_1a = Regex::new("^(.+?)(ss|i)es$").unwrap();
175        let re2_1a = Regex::new("^(.+?)([^s])s$").unwrap();
176        let re_1b = Regex::new("^(.+?)eed$").unwrap();
177        let re2_1b = Regex::new("^(.+?)(ed|ing)$").unwrap();
178        let re2_1b_2 = Regex::new("(at|bl|iz)$").unwrap();
179        let re3_1b_2 = Regex::new("([^aeiouylsz]{2})$").unwrap();
180        let re4_1b_2 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap();
181
182        let re_1c = Regex::new("^(.+?[^aeiou])y$").unwrap();
183        let re_2 = Regex::new(
184            "^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|\
185             ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$",
186        )
187        .unwrap();
188
189        let re_3 = Regex::new("^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$").unwrap();
190
191        let re_4 = Regex::new(
192            "^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$",
193        )
194        .unwrap();
195        let re2_4 = Regex::new("^(.+?)(s|t)(ion)$").unwrap();
196
197        let re_5 = Regex::new("^(.+?)e$").unwrap();
198        let re3_5 = Regex::new(concat!("^", CS!(), V!(), "[^aeiouwxy]$")).unwrap();
199
200        Stemmer {
201            re_mgr0,
202            re_mgr1,
203            re_meq1,
204            re_s_v,
205            re_1a,
206            re2_1a,
207            re_1b,
208            re2_1b,
209            re2_1b_2,
210            re3_1b_2,
211            re4_1b_2,
212            re_1c,
213            re_2,
214            re_3,
215            re_4,
216            re2_4,
217            re_5,
218            re3_5,
219        }
220    }
221
222    /// Implements the Porter stemming algorithm
223    pub fn stem(&self, mut w: String) -> String {
224        if w.len() < 3 {
225            return w;
226        }
227
228        let starts_with_y = w.as_bytes()[0] == b'y';
229        if starts_with_y {
230            w.remove(0);
231            w.insert(0, 'Y');
232        }
233
234        // TODO: There's probably a better way to handle the
235        // borrowchecker than cloning w a million times
236
237        // Step 1a
238        if let Some(caps) = self.re_1a.captures(&w.clone()) {
239            w = concat_string(&[&caps[1], &caps[2]]);
240        }
241        if let Some(caps) = self.re2_1a.captures(&w.clone()) {
242            w = concat_string(&[&caps[1], &caps[2]]);
243        }
244
245        // Step 1b
246        if let Some(caps) = self.re_1b.captures(&w.clone()) {
247            let stem = &caps[1];
248            if self.re_mgr0.is_match(stem) {
249                w.pop();
250            }
251        } else if let Some(caps) = self.re2_1b.captures(&w.clone()) {
252            let stem = &caps[1];
253            if self.re_s_v.is_match(stem) {
254                w = stem.into();
255
256                let mut re3_1b_2_matched = false;
257
258                if self.re2_1b_2.is_match(&w) {
259                    w.push('e');
260                } else if let Some(m) = self.re3_1b_2.find(&w.clone()) {
261                    let mut suffix = m.as_str().chars();
262                    // Make sure the two characters are the same since we can't use backreferences
263                    if suffix.next() == suffix.next() {
264                        re3_1b_2_matched = true;
265                        w.pop();
266                    }
267                }
268
269                // re4_1b_2 still runs if re3_1b_2 matches but
270                // the matched chcaracters are not the same
271                if !re3_1b_2_matched && self.re4_1b_2.is_match(&w) {
272                    w.push('e');
273                }
274            }
275        }
276
277        // Step 1c - replace suffix y or Y by i if preceded by a non-vowel which is not the first
278        // letter of the word (so cry -> cri, by -> by, say -> say)
279        if let Some(caps) = self.re_1c.captures(&w.clone()) {
280            let stem = &caps[1];
281            w = concat_string(&[stem, "i"]);
282        }
283
284        // Step 2
285        if let Some(caps) = self.re_2.captures(&w.clone()) {
286            let stem = &caps[1];
287            let suffix = &caps[2];
288            if self.re_mgr0.is_match(stem) {
289                w = concat_string(&[stem, STEP_2.iter().find(|&&(k, _)| k == suffix).unwrap().1]);
290            }
291        }
292
293        // Step 3
294        if let Some(caps) = self.re_3.captures(&w.clone()) {
295            let stem = &caps[1];
296            let suffix = &caps[2];
297            if self.re_mgr0.is_match(stem) {
298                w = concat_string(&[stem, STEP_3.iter().find(|&&(k, _)| k == suffix).unwrap().1]);
299            }
300        }
301
302        // Step 4
303        if let Some(caps) = self.re_4.captures(&w.clone()) {
304            let stem = &caps[1];
305            if self.re_mgr1.is_match(stem) {
306                w = stem.into();
307            }
308        } else if let Some(caps) = self.re2_4.captures(&w.clone()) {
309            let stem = concat_string(&[&caps[1], &caps[2]]);
310            if self.re_mgr1.is_match(&stem) {
311                w = stem;
312            }
313        }
314
315        // Step 5
316        if let Some(caps) = self.re_5.captures(&w.clone()) {
317            let stem = &caps[1];
318            if self.re_mgr1.is_match(stem)
319                || (self.re_meq1.is_match(stem) && !(self.re3_5.is_match(stem)))
320            {
321                w = stem.into();
322            }
323        }
324
325        if w.ends_with("ll") && self.re_mgr1.is_match(&w) {
326            w.pop();
327        }
328
329        // replace the original 'y'
330        if starts_with_y {
331            w.remove(0);
332            w.insert(0, 'y');
333        }
334
335        w
336    }
337}
338
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `trimmer` on `input` and checks that it produces `expected`.
    fn assert_trimmed(input: &str, expected: &str) {
        assert_eq!(trimmer(input.to_string()).unwrap(), expected);
    }

    #[test]
    fn latin_characters() {
        assert_trimmed("hello", "hello");
    }

    #[test]
    fn removing_punctuation() {
        let cases = [
            ("hello.", "hello"),
            ("it's", "it's"),
            ("james'", "james"),
            ("stop!", "stop"),
            ("first,", "first"),
            ("", ""),
            ("[tag]", "tag"),
            ("[[[tag]]]", "tag"),
            ("[[!@#@!hello]]]}}}", "hello"),
            ("~!@@@hello***()()()]]", "hello"),
        ];

        for &(input, expected) in &cases {
            assert_trimmed(input, expected);
        }
    }

    #[test]
    fn test_stemmer() {
        // (word, expected stem)
        let cases = [
            ("consign", "consign"),
            ("consigned", "consign"),
            ("consigning", "consign"),
            ("consignment", "consign"),
            ("consist", "consist"),
            ("consisted", "consist"),
            ("consistency", "consist"),
            ("consistent", "consist"),
            ("consistently", "consist"),
            ("consisting", "consist"),
            ("consists", "consist"),
            ("consolation", "consol"),
            ("consolations", "consol"),
            ("consolatory", "consolatori"),
            ("console", "consol"),
            ("consoled", "consol"),
            ("consoles", "consol"),
            ("consolidate", "consolid"),
            ("consolidated", "consolid"),
            ("consolidating", "consolid"),
            ("consoling", "consol"),
            ("consols", "consol"),
            ("consonant", "conson"),
            ("consort", "consort"),
            ("consorted", "consort"),
            ("consorting", "consort"),
            ("conspicuous", "conspicu"),
            ("conspicuously", "conspicu"),
            ("conspiracy", "conspiraci"),
            ("conspirator", "conspir"),
            ("conspirators", "conspir"),
            ("conspire", "conspir"),
            ("conspired", "conspir"),
            ("conspiring", "conspir"),
            ("constable", "constabl"),
            ("constables", "constabl"),
            ("constance", "constanc"),
            ("constancy", "constanc"),
            ("constant", "constant"),
            ("knack", "knack"),
            ("knackeries", "knackeri"),
            ("knacks", "knack"),
            ("knag", "knag"),
            ("knave", "knave"),
            ("knaves", "knave"),
            ("knavish", "knavish"),
            ("kneaded", "knead"),
            ("kneading", "knead"),
            ("knee", "knee"),
            ("kneel", "kneel"),
            ("kneeled", "kneel"),
            ("kneeling", "kneel"),
            ("kneels", "kneel"),
            ("knees", "knee"),
            ("knell", "knell"),
            ("knelt", "knelt"),
            ("knew", "knew"),
            ("knick", "knick"),
            ("knif", "knif"),
            ("knife", "knife"),
            ("knight", "knight"),
            ("knights", "knight"),
            ("knit", "knit"),
            ("knits", "knit"),
            ("knitted", "knit"),
            ("knitting", "knit"),
            ("knives", "knive"),
            ("knob", "knob"),
            ("knobs", "knob"),
            ("knock", "knock"),
            ("knocked", "knock"),
            ("knocker", "knocker"),
            ("knockers", "knocker"),
            ("knocking", "knock"),
            ("knocks", "knock"),
            ("knopp", "knopp"),
            ("knot", "knot"),
            ("knots", "knot"),
            ("lay", "lay"),
            ("try", "tri"),
        ];

        let stemmer = Stemmer::new();
        for &(word, expected) in &cases {
            let stemmed = stemmer.stem(word.to_string());
            assert_eq!(stemmed, expected);
        }
    }
}