elasticlunr/lang/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
//! Language-specific text processing, intended to be compatible with
//! <https://github.com/MihaiValentin/lunr-languages>. Each supported language has a trimmer, a
//! stop word filter, and a stemmer. Most users will not need to use these modules directly.
pub mod common;
use crate::Pipeline;
/// A language supported by this crate.
///
/// An implementation supplies the language's English name and its language code, a tokenizer
/// that splits text into tokens, and the [`Pipeline`] those tokens are processed with.
pub trait Language {
/// The name of the language in English
fn name(&self) -> String;
/// The ISO 639-1 language code of the language
fn code(&self) -> String;
/// Separates the input text into individual tokens, returned as owned strings. In most
/// languages a token is a word, separated by whitespace.
fn tokenize(&self, text: &str) -> Vec<String>;
/// Returns the [`Pipeline`] to process the tokens with. The pipeline is returned by value.
fn make_pipeline(&self) -> Pipeline;
}
/// Breaks `text` into lowercase tokens.
///
/// Every whitespace character and the hyphen (`-`) acts as a separator. Consecutive separators
/// never produce empty tokens, so empty or separator-only input yields an empty vector.
pub fn tokenize_whitespace(text: &str) -> Vec<String> {
    let is_separator = |c: char| c == '-' || c.is_whitespace();

    let mut tokens = Vec::new();
    for piece in text.split(is_separator) {
        // Adjacent separators (and leading/trailing ones) show up as empty pieces; drop them.
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.trim().to_lowercase());
    }
    tokens
}
// Generates the per-language plumbing from a list of `(TypeName, module_code)` entries, each
// optionally followed by one attribute (in this file: a `#[cfg(feature = "...")]` gate).
//
// For the whole list it emits:
//   * `languages()` - one boxed instance of every listed language,
//   * `from_code()` - lookup by the stringified module code,
//   * `from_name()` - lookup by the stringified type name,
//   * a private `mod <code>;` plus a `pub use <code>::<TypeName>;` re-export per entry.
//
// The optional attribute is repeated on every element, match arm and item generated for its
// entry, so an entry whose attribute disables it is absent from all of the above.
macro_rules! impl_language {
($( ( $name:ident, $code:ident $(, #[$cfgs:meta] )? ), )+) => {
/// Returns a list of all the [`Language`] implementations in the crate
pub fn languages() -> Vec<Box<dyn Language>> {
vec![
$(
$(#[$cfgs])?
Box::new($code::$name::new()),
)+
]
}
/// Returns the [`Language`] for the given two-character [ISO 639-1][iso] language code if the
/// language is supported. Returns `None` if not supported.
///
/// The code is ASCII-lowercased before it is compared, so the lookup ignores ASCII case.
///
/// *Note:*
///
/// The ISO 639-1 code for Dutch is "nl". However "du" is used for the module name
/// and pipeline suffix in order to match lunr-languages.
///
/// [iso]: https://en.wikipedia.org/wiki/ISO_639-1
pub fn from_code(code: &str) -> Option<Box<dyn Language>> {
match code.to_ascii_lowercase().as_str() {
$(
$(#[$cfgs])?
// The arm's pattern is the module identifier turned into a string literal.
stringify!($code) => Some(Box::new($code::$name::new())),
)+
_ => None,
}
}
/// Returns the [`Language`] for the given English language name if the
/// language is supported. Returns `None` if not supported.
///
/// The comparison is exact and case-sensitive: the name must be spelled like the language's
/// type name, with the first letter capitalized.
pub fn from_name(name: &str) -> Option<Box<dyn Language>> {
match name {
$(
$(#[$cfgs])?
// The arm's pattern is the type identifier turned into a string literal.
stringify!($name) => Some(Box::new($code::$name::new())),
)+
_ => None,
}
}
// Declare each language's module privately and re-export only its language type.
$(
$(#[$cfgs])?
mod $code;
$(#[$cfgs])?
pub use $code::$name;
)+
};
}
// The list of languages handed to `impl_language!`. `English` carries no feature gate and is
// therefore always compiled; every other entry is only compiled when the Cargo feature named
// after its module code is enabled.
impl_language! {
(English, en),
(Arabic, ar, #[cfg(feature = "ar")]),
(Chinese, zh, #[cfg(feature = "zh")]),
(Danish, da, #[cfg(feature = "da")]),
// "du" rather than the ISO 639-1 code "nl", to match lunr-languages.
(Dutch, du, #[cfg(feature = "du")]),
(Finnish, fi, #[cfg(feature = "fi")]),
(French, fr, #[cfg(feature = "fr")]),
(German, de, #[cfg(feature = "de")]),
(Hungarian, hu, #[cfg(feature = "hu")]),
(Italian, it, #[cfg(feature = "it")]),
(Japanese, ja, #[cfg(feature = "ja")]),
(Korean, ko, #[cfg(feature = "ko")]),
(Norwegian, no, #[cfg(feature = "no")]),
(Portuguese, pt, #[cfg(feature = "pt")]),
(Romanian, ro, #[cfg(feature = "ro")]),
(Russian, ru, #[cfg(feature = "ru")]),
(Spanish, es, #[cfg(feature = "es")]),
(Swedish, sv, #[cfg(feature = "sv")]),
(Turkish, tr, #[cfg(feature = "tr")]),
}
#[cfg(test)]
mod tests {
    use super::tokenize_whitespace;

    /// Checks that tokenizing `input` produces exactly `expected`, in order.
    #[track_caller]
    fn assert_tokens(input: &str, expected: &[&str]) {
        assert_eq!(tokenize_whitespace(input), expected);
    }

    /// Plain space-separated words come back unchanged.
    #[test]
    fn split_simple_strings() {
        assert_tokens(
            "this is a simple string",
            &["this", "is", "a", "simple", "string"],
        );
    }

    /// Leading and trailing whitespace does not create empty tokens.
    #[test]
    fn multiple_white_space() {
        assert_tokens(" foo bar ", &["foo", "bar"]);
    }

    /// A hyphen inside a word splits it, and tokens are lowercased.
    #[test]
    fn hyphens() {
        assert_tokens(
            "take the New York-San Francisco flight",
            &["take", "the", "new", "york", "san", "francisco", "flight"],
        );
    }

    /// A free-standing hyphen surrounded by spaces disappears entirely.
    #[test]
    fn splitting_strings_with_hyphens() {
        assert_tokens("Solve for A - B", &["solve", "for", "a", "b"]);
    }
}