1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
use crate::pipeline::PipelineFn;
use regex::Regex;
use std::collections::HashSet;

/// Pipeline stage that drops tokens found in a fixed set of stop words.
///
/// Matching is an exact, case-sensitive string comparison against the words
/// supplied to [`StopWordFilter::new`].
#[derive(Clone, Debug)]
pub struct StopWordFilter {
    /// Identifier reported by this stage's `name()` method.
    name: String,
    /// Words that are rejected by this stage.
    stop_words: HashSet<String>,
}

impl StopWordFilter {
    /// Creates a filter called `name` that rejects every word in `stop_words`.
    ///
    /// Duplicate entries in `stop_words` are collapsed, because the words are
    /// stored in a set.
    pub fn new(name: &str, stop_words: &[&str]) -> Self {
        Self {
            name: name.to_owned(),
            stop_words: stop_words.iter().map(|&word| word.to_owned()).collect(),
        }
    }
}

/// Lets a [`StopWordFilter`] run as a pipeline stage.
impl PipelineFn for StopWordFilter {
    /// Returns a copy of the name given at construction.
    fn name(&self) -> String {
        self.name.to_owned()
    }

    /// Returns `None` when `token` is one of the stop words; otherwise hands
    /// the token back unchanged.
    fn filter(&self, token: String) -> Option<String> {
        Some(token).filter(|word| !self.stop_words.contains(word))
    }
}

/// Pipeline stage that strips leading and trailing non-word characters from a
/// token.
///
/// What counts as a word character is decided by the `word_chars` fragment
/// passed to [`RegexTrimmer::new`].
#[derive(Clone, Debug)]
pub struct RegexTrimmer {
    /// Identifier reported by this stage's `name()` method.
    name: String,
    /// Matches a run of non-word characters anchored at the start or at the
    /// end of the input.
    trimmer: Regex,
}

impl RegexTrimmer {
    /// Creates a trimmer called `name`.
    ///
    /// `word_chars` is spliced verbatim into a negated character class
    /// (`[^…]`), so it must be written in regex character-class syntax (for
    /// example `A-Za-z0-9_`); it is not escaped.
    ///
    /// # Panics
    ///
    /// Panics if the pattern built from `word_chars` is not a valid regular
    /// expression.
    pub fn new(name: &str, word_chars: &str) -> Self {
        let pattern = format!("^[^{0}]+|[^{0}]+$", word_chars);
        // Name the offending trimmer and fragment so a bad language
        // definition is easy to track down.
        let trimmer = Regex::new(&pattern).unwrap_or_else(|err| {
            panic!(
                "RegexTrimmer `{}`: invalid word_chars {:?}: {}",
                name, word_chars, err
            )
        });
        Self {
            name: name.to_owned(),
            trimmer,
        }
    }
}

/// Lets a [`RegexTrimmer`] run as a pipeline stage.
impl PipelineFn for RegexTrimmer {
    /// Returns a copy of the name given at construction.
    fn name(&self) -> String {
        self.name.clone()
    }

    /// Removes every match of the trimming pattern from `token`.
    ///
    /// Returns `None` when nothing is left afterwards; otherwise returns the
    /// trimmed token.
    fn filter(&self, token: String) -> Option<String> {
        let trimmed = self.trimmer.replace_all(&token, "");
        if trimmed.is_empty() {
            return None;
        }
        // `replace_all` only borrows when it found nothing to replace, so the
        // variant tells us whether the token changed without comparing the
        // two strings again.
        match trimmed {
            // Unchanged: hand back the caller's allocation.
            std::borrow::Cow::Borrowed(_) => Some(token),
            std::borrow::Cow::Owned(stripped) => Some(stripped),
        }
    }
}

/// Pipeline stage that passes tokens through a `rust_stemmers` stemmer.
#[cfg(feature = "rust-stemmers")]
pub struct RustStemmer {
    /// Identifier reported by this stage's `name()` method.
    name: String,
    /// Stemmer created for the algorithm chosen at construction.
    stemmer: rust_stemmers::Stemmer,
}

#[cfg(feature = "rust-stemmers")]
impl RustStemmer {
    /// Creates a stemming stage called `name` that uses the algorithm `algo`.
    pub fn new(name: &str, algo: rust_stemmers::Algorithm) -> Self {
        let stemmer = rust_stemmers::Stemmer::create(algo);
        let name = String::from(name);
        Self { name, stemmer }
    }
}

/// Lets a [`RustStemmer`] run as a pipeline stage.
#[cfg(feature = "rust-stemmers")]
impl PipelineFn for RustStemmer {
    /// Returns a copy of the name given at construction.
    fn name(&self) -> String {
        self.name.to_owned()
    }

    /// Stems `token`.
    ///
    /// Returns `None` when the stem is empty. When the stem equals the input,
    /// the original `String` is handed back instead of a fresh copy.
    fn filter(&self, token: String) -> Option<String> {
        let stemmed = self.stemmer.stem(&token);
        if stemmed.is_empty() {
            return None;
        }
        if stemmed != token {
            return Some(stemmed.into_owned());
        }
        // Nothing changed: reuse the caller's allocation.
        Some(token)
    }
}