1pub mod common;
6
7use crate::Pipeline;
8
/// Common interface implemented by every language type registered in this module.
///
/// The registry functions in this module hand out implementations as
/// `Box<dyn Language>`.
pub trait Language {
    /// Returns the language's name as an owned string.
    ///
    /// Presumably the human-readable name (e.g. `"English"`) — confirm against
    /// the implementors.
    fn name(&self) -> String;

    /// Returns the language's code as an owned string.
    ///
    /// Presumably the short identifier (e.g. `"en"`) — confirm against the
    /// implementors.
    fn code(&self) -> String;

    /// Splits `text` into a list of owned tokens using this language's rules.
    fn tokenize(&self, text: &str) -> Vec<String>;

    /// Builds the [`Pipeline`] associated with this language.
    fn make_pipeline(&self) -> Pipeline;
}
22
/// Splits `text` into lowercase tokens, treating every Unicode whitespace
/// character and every `-` as a separator.
///
/// Empty fragments (produced by consecutive separators, or by separators at the
/// start or end of the input) are discarded, so an empty or separator-only input
/// yields an empty vector.
pub fn tokenize_whitespace(text: &str) -> Vec<String> {
    text.split(|c: char| c.is_whitespace() || c == '-')
        .filter(|s| !s.is_empty())
        // No trimming needed: a fragment cannot contain whitespace, because
        // whitespace is exactly what the input was split on.
        .map(str::to_lowercase)
        .collect()
}
30
// Generates the language registry from a list of
// `(TypeName, module_ident [, #[attribute]]),` entries.
//
// For every entry the macro declares the private module `module_ident`,
// re-exports `TypeName` from it, and registers the type in `languages()`,
// `from_code()` and `from_name()`. The optional attribute is copied onto every
// item and match arm generated for that entry.
macro_rules! impl_language {
    ($( ( $lang:ident, $module:ident $(, #[$gate:meta] )? ), )+) => {
        $(
            $(#[$gate])?
            mod $module;

            $(#[$gate])?
            pub use $module::$lang;
        )+

        /// Returns one boxed instance of every language compiled into this
        /// build, in the order the entries were given to `impl_language!`.
        pub fn languages() -> Vec<Box<dyn Language>> {
            vec![
                $(
                    $(#[$gate])?
                    Box::new($module::$lang::new()),
                )+
            ]
        }

        /// Looks up a language by its code, ignoring ASCII case.
        ///
        /// The code is the module identifier of the entry (for example `en`).
        /// Returns `None` when no compiled-in language is registered under
        /// `code`.
        pub fn from_code(code: &str) -> Option<Box<dyn Language>> {
            let normalized = code.to_ascii_lowercase();
            match normalized.as_str() {
                $(
                    $(#[$gate])?
                    stringify!($module) => Some(Box::new($module::$lang::new())),
                )+
                _ => None,
            }
        }

        /// Looks up a language by the exact, case-sensitive name of its type
        /// (for example `English`).
        ///
        /// Returns `None` when no compiled-in language has that name.
        pub fn from_name(name: &str) -> Option<Box<dyn Language>> {
            match name {
                $(
                    $(#[$gate])?
                    stringify!($lang) => Some(Box::new($module::$lang::new())),
                )+
                _ => None,
            }
        }
    };
}
84
// Language registry. Each entry is `(TypeName, module_ident [, cfg attribute])`.
// The entry order is the order in which `languages()` returns its elements.
// `English` carries no `cfg` attribute, so it is always compiled in; every other
// language is only compiled when the cargo feature named in its `cfg` is enabled.
impl_language! {
    (English, en),
    (Arabic, ar, #[cfg(feature = "ar")]),
    (Chinese, zh, #[cfg(feature = "zh")]),
    (Danish, da, #[cfg(feature = "da")]),
    // NOTE(review): Dutch is registered under `du`, not the ISO 639-1 code `nl`.
    // Renaming would change the feature flag and the `from_code` key — confirm
    // whether this is intentional before touching it.
    (Dutch, du, #[cfg(feature = "du")]),
    (Finnish, fi, #[cfg(feature = "fi")]),
    (French, fr, #[cfg(feature = "fr")]),
    (German, de, #[cfg(feature = "de")]),
    (Hungarian, hu, #[cfg(feature = "hu")]),
    (Italian, it, #[cfg(feature = "it")]),
    (Japanese, ja, #[cfg(feature = "ja")]),
    (Korean, ko, #[cfg(feature = "ko")]),
    (Norwegian, no, #[cfg(feature = "no")]),
    (Portuguese, pt, #[cfg(feature = "pt")]),
    (Romanian, ro, #[cfg(feature = "ro")]),
    (Russian, ru, #[cfg(feature = "ru")]),
    (Spanish, es, #[cfg(feature = "es")]),
    (Swedish, sv, #[cfg(feature = "sv")]),
    (Turkish, tr, #[cfg(feature = "tr")]),
}
106
#[cfg(test)]
mod tests {
    use super::tokenize_whitespace;

    // Plain space-separated words come back unchanged, one token per word.
    #[test]
    fn split_simple_strings() {
        let tokens = tokenize_whitespace("this is a simple string");
        assert_eq!(tokens, ["this", "is", "a", "simple", "string"]);
    }

    // Leading and trailing whitespace must not produce empty tokens.
    #[test]
    fn multiple_white_space() {
        let tokens = tokenize_whitespace(" foo bar ");
        assert_eq!(tokens, ["foo", "bar"]);
    }

    // A hyphen joining two words separates them, and tokens are lowercased.
    #[test]
    fn hyphens() {
        let tokens = tokenize_whitespace("take the New York-San Francisco flight");
        assert_eq!(
            tokens,
            ["take", "the", "new", "york", "san", "francisco", "flight"]
        );
    }

    // A free-standing hyphen surrounded by spaces disappears entirely.
    #[test]
    fn splitting_strings_with_hyphens() {
        let tokens = tokenize_whitespace("Solve for A - B");
        assert_eq!(tokens, ["solve", "for", "a", "b"]);
    }
}