// elasticlunr/lang/mod.rs

//! Intended to be compatible with <https://github.com/MihaiValentin/lunr-languages>. Each supported
//! language has a trimmer, a stop word filter, and a stemmer. Most users will not need to use
//! these modules directly.
4
5pub mod common;
6
7use crate::Pipeline;
8
9pub trait Language {
10    /// The name of the language in English
11    fn name(&self) -> String;
12
13    /// The ISO 639-1 language code of the language
14    fn code(&self) -> String;
15
16    /// Separates the input text into individual tokens. In most languages a token is a word, separated by whitespace.
17    fn tokenize(&self, text: &str) -> Vec<String>;
18
19    /// Returns the [`Pipeline`] to process the tokens with
20    fn make_pipeline(&self) -> Pipeline;
21}
22
/// Splits a text string into a vector of lowercase tokens.
///
/// Every character for which [`char::is_whitespace`] returns `true`, as well as the hyphen
/// `'-'`, acts as a separator. Runs of consecutive separators never produce empty tokens, and
/// each token is converted with [`str::to_lowercase`]. For example, `"New York-San Francisco"`
/// becomes `["new", "york", "san", "francisco"]`, and an empty or separator-only input yields an
/// empty vector.
pub fn tokenize_whitespace(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|token| !token.is_empty())
        // No `trim()` is needed here: splitting on every whitespace character already
        // guarantees that a token cannot start or end with whitespace.
        .map(str::to_lowercase)
        .collect()
}
30
// Generates the language registry for this module.
//
// Each entry has the form `(TypeName, module_code)` or `(TypeName, module_code, #[attr])`.
// For every entry the macro declares the private module `module_code`, re-exports
// `module_code::TypeName`, and registers the type in `languages`, `from_code` and `from_name`.
// An entry's optional attribute is attached to everything generated for that entry, so a
// language whose `cfg` is not satisfied is left out of all of them.
macro_rules! impl_language {
    ($( ( $name:ident, $code:ident $(, #[$cfgs:meta] )? ), )+) => {
        /// Returns a list of all the [`Language`] implementations in the crate, in the order
        /// in which they are declared.
        pub fn languages() -> Vec<Box<dyn Language>> {
            vec![
                $(
                    $(#[$cfgs])?
                    Box::new($code::$name::new()),
                )+
            ]
        }

        /// Returns the [`Language`] for the given two-character [ISO 639-1][iso] language code if the
        /// language is supported. Returns `None` if not supported. The comparison ignores ASCII
        /// case.
        ///
        /// *Note:*
        ///
        /// The ISO 639-1 code for Dutch is "nl". However "du" is used for the module name
        /// and pipeline suffix in order to match lunr-languages, and the code is compared
        /// against the module name, so Dutch is looked up as "du".
        ///
        /// [iso]: https://en.wikipedia.org/wiki/ISO_639-1
        pub fn from_code(code: &str) -> Option<Box<dyn Language>> {
            match code.to_ascii_lowercase().as_str() {
                $(
                    $(#[$cfgs])?
                    stringify!($code) => Some(Box::new($code::$name::new())),
                )+
                _ => None,
            }
        }

        /// Returns the [`Language`] for the given English language name if the
        /// language is supported. Returns `None` if not supported. The name is matched
        /// exactly, so the first letter must be capitalized.
        pub fn from_name(name: &str) -> Option<Box<dyn Language>> {
            match name {
                $(
                    $(#[$cfgs])?
                    stringify!($name) => Some(Box::new($code::$name::new())),
                )+
                _ => None,
            }
        }

        $(
            $(#[$cfgs])?
            mod $code;

            $(#[$cfgs])?
            pub use $code::$name;
        )+
    };
}
84
85impl_language! {
86    (English, en),
87    (Arabic, ar, #[cfg(feature = "ar")]),
88    (Chinese, zh, #[cfg(feature = "zh")]),
89    (Danish, da, #[cfg(feature = "da")]),
90    (Dutch, du, #[cfg(feature = "du")]),
91    (Finnish, fi, #[cfg(feature = "fi")]),
92    (French, fr, #[cfg(feature = "fr")]),
93    (German, de, #[cfg(feature = "de")]),
94    (Hungarian, hu, #[cfg(feature = "hu")]),
95    (Italian, it, #[cfg(feature = "it")]),
96    (Japanese, ja, #[cfg(feature = "ja")]),
97    (Korean, ko, #[cfg(feature = "ko")]),
98    (Norwegian, no, #[cfg(feature = "no")]),
99    (Portuguese, pt, #[cfg(feature = "pt")]),
100    (Romanian, ro, #[cfg(feature = "ro")]),
101    (Russian, ru, #[cfg(feature = "ru")]),
102    (Spanish, es, #[cfg(feature = "es")]),
103    (Swedish, sv, #[cfg(feature = "sv")]),
104    (Turkish, tr, #[cfg(feature = "tr")]),
105}
106
#[cfg(test)]
mod tests {
    use super::tokenize_whitespace;

    /// Words separated by single spaces come back one token per word.
    #[test]
    fn split_simple_strings() {
        let string = "this is a simple string";
        assert_eq!(
            &tokenize_whitespace(string),
            &["this", "is", "a", "simple", "string"]
        );
    }

    /// Leading, trailing and repeated whitespace never produces empty tokens.
    #[test]
    fn multiple_white_space() {
        let string = "  foo    bar  ";
        assert_eq!(&tokenize_whitespace(string), &["foo", "bar"]);
    }

    /// A hyphen joining two words separates them, and tokens are lowercased.
    #[test]
    fn hyphens() {
        let string = "take the New York-San Francisco flight";
        assert_eq!(
            &tokenize_whitespace(string),
            &["take", "the", "new", "york", "san", "francisco", "flight"]
        );
    }

    /// A free-standing hyphen surrounded by spaces is dropped entirely.
    #[test]
    fn splitting_strings_with_hyphens() {
        let string = "Solve for A - B";
        assert_eq!(&tokenize_whitespace(string), &["solve", "for", "a", "b"]);
    }

    /// Empty input and input made only of separators yield no tokens.
    #[test]
    fn empty_and_separator_only_input() {
        assert!(tokenize_whitespace("").is_empty());
        assert!(tokenize_whitespace(" \t\n - ").is_empty());
    }
}