/// Writing systems that [`detect_script`] can report.
///
/// Variants are declared in alphabetical order. Discriminants follow the
/// declaration order (`Arabic` is 0, `Thai` is 23), so inserting or reordering
/// variants changes the numeric value of every later variant.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum Script {
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
62
/// Reports whether `ch` is one of the ASCII characters that script detection skips.
///
/// Returns `true` for every ASCII code point except the letters `A-Z` / `a-z`
/// and DEL (`U+007F`); that is, for `U+0000..=U+0040`, `U+005B..=U+0060` and
/// `U+007B..=U+007E`. Every non-ASCII character yields `false`.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    // DEL is ASCII and non-alphabetic, yet it is not treated as a stop char.
    ch.is_ascii() && !ch.is_ascii_alphabetic() && ch != '\u{007F}'
}
70
71type ScriptCounter = (Script, fn(char) -> bool, usize);
72
73pub fn detect_script(text: &str) -> Option<Script> {
82 let mut script_counters: [ScriptCounter; 24] = [
83 (Script::Latin, is_latin, 0),
84 (Script::Cyrillic, is_cyrillic, 0),
85 (Script::Arabic, is_arabic, 0),
86 (Script::Mandarin, is_mandarin, 0),
87 (Script::Devanagari, is_devanagari, 0),
88 (Script::Hebrew, is_hebrew, 0),
89 (Script::Ethiopic, is_ethiopic, 0),
90 (Script::Georgian, is_georgian, 0),
91 (Script::Bengali, is_bengali, 0),
92 (Script::Hangul, is_hangul, 0),
93 (Script::Hiragana, is_hiragana, 0),
94 (Script::Katakana, is_katakana, 0),
95 (Script::Greek, is_greek, 0),
96 (Script::Kannada, is_kannada, 0),
97 (Script::Tamil, is_tamil, 0),
98 (Script::Thai, is_thai, 0),
99 (Script::Gujarati, is_gujarati, 0),
100 (Script::Gurmukhi, is_gurmukhi, 0),
101 (Script::Telugu, is_telugu, 0),
102 (Script::Malayalam, is_malayalam, 0),
103 (Script::Oriya, is_oriya, 0),
104 (Script::Myanmar, is_myanmar, 0),
105 (Script::Sinhala, is_sinhala, 0),
106 (Script::Khmer, is_khmer, 0),
107 ];
108
109 let half = text.chars().count() / 2;
110
111 for ch in text.chars() {
112 if is_stop_char(ch) {
113 continue;
114 }
115
116 for i in 0..script_counters.len() {
119 let found = {
120 let (script, check_fn, ref mut count) = script_counters[i];
121 if check_fn(ch) {
122 *count += 1;
123 if *count > half {
124 return Some(script);
125 }
126 true
127 } else {
128 false
129 }
130 };
131 if found {
134 if i > 0 {
138 script_counters.swap(i - 1, i);
139 }
140 break;
141 }
142 }
143 }
144
145 let (script, _, count) = script_counters
146 .iter()
147 .cloned()
148 .max_by_key(|&(_, _, count)| count)
149 .unwrap();
150 if count != 0 {
151 Some(script)
152 } else {
153 None
154 }
155}
156
/// Reports whether `ch` falls in one of the code-point spans treated as Cyrillic.
///
/// Accepted: `U+0400..=U+0484`, `U+0487..=U+052F`, `U+1D2B`, `U+1D78`,
/// `U+2DE0..=U+2DFF`, `U+A640..=U+A69D` and `U+A69F`.
/// `U+0485`, `U+0486` and `U+A69E` are deliberately left out of the spans.
fn is_cyrillic(ch: char) -> bool {
    match u32::from(ch) {
        0x0400..=0x0484 | 0x0487..=0x052F => true,
        0x1D2B | 0x1D78 => true,
        0x2DE0..=0x2DFF => true,
        0xA640..=0xA69D | 0xA69F => true,
        _ => false,
    }
}
168
/// Reports whether `ch` falls in one of the code-point spans treated as Latin.
///
/// Accepted: ASCII letters, `U+0080..=U+02AF`, `U+1D00..=U+1DBF`,
/// `U+1E00..=U+1EFF`, `U+2100..=U+214F`, `U+2C60..=U+2C7F`,
/// `U+A720..=U+A7FF` and `U+AB30..=U+AB6F`.
fn is_latin(ch: char) -> bool {
    if ch.is_ascii_alphabetic() {
        return true;
    }
    match u32::from(ch) {
        // Four back-to-back spans (0080-00FF, 0100-017F, 0180-024F, 0250-02AF).
        0x0080..=0x02AF => true,
        // Two back-to-back spans (1D00-1D7F, 1D80-1DBF).
        0x1D00..=0x1DBF => true,
        0x1E00..=0x1EFF => true,
        0x2100..=0x214F => true,
        0x2C60..=0x2C7F => true,
        0xA720..=0xA7FF => true,
        0xAB30..=0xAB6F => true,
        _ => false,
    }
}
187
/// Reports whether `ch` falls in one of the code-point spans treated as Arabic.
///
/// NOTE(review): the second span runs up to `U+07FF`, which reaches past the
/// Arabic Supplement block into Thaana and NKo — confirm this is intended.
fn is_arabic(ch: char) -> bool {
    // Inclusive (first, last) code-point pairs.
    const SPANS: [(u32, u32); 7] = [
        (0x0600, 0x06FF),
        (0x0750, 0x07FF),
        (0x08A0, 0x08FF),
        (0xFB50, 0xFDFF),
        (0xFE70, 0xFEFF),
        (0x10E60, 0x10E7F),
        (0x1EE00, 0x1EEFF),
    ];

    let cp = u32::from(ch);
    SPANS.iter().any(|&(first, last)| first <= cp && cp <= last)
}
200
/// Reports whether `ch` lies in `U+0900..=U+097F`, `U+1CD0..=U+1CFF` or
/// `U+A8E0..=U+A8FF`, the spans treated as Devanagari.
fn is_devanagari(ch: char) -> bool {
    let within = |first: char, last: char| first <= ch && ch <= last;

    within('\u{0900}', '\u{097F}')
        || within('\u{1CD0}', '\u{1CFF}')
        || within('\u{A8E0}', '\u{A8FF}')
}
205
/// Reports whether `ch` lies in `U+1200..=U+139F`, `U+2D80..=U+2DDF` or
/// `U+AB00..=U+AB2F`, the spans treated as Ethiopic.
fn is_ethiopic(ch: char) -> bool {
    // Inclusive (first, last) pairs.
    const SPANS: [(char, char); 3] = [
        ('\u{1200}', '\u{139F}'),
        ('\u{2D80}', '\u{2DDF}'),
        ('\u{AB00}', '\u{AB2F}'),
    ];

    SPANS.iter().any(|&(first, last)| first <= ch && ch <= last)
}
210
/// Reports whether `ch` lies in `U+0590..=U+05FF`, the span treated as Hebrew.
fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
215
/// Reports whether `ch` lies in `U+10A0..=U+10FF`, the span treated as Georgian.
fn is_georgian(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x10A0..=0x10FF).contains(&cp)
}
219
/// Reports whether `ch` falls in one of the code-point spans that are credited
/// to [`Script::Mandarin`].
///
/// Accepted: `U+2E80..=U+2E99`, `U+2E9B..=U+2EF3`, `U+2F00..=U+2FD5`, `U+3005`,
/// `U+3007`, `U+3021..=U+3029`, `U+3038..=U+303B`, `U+3400..=U+4DB5`,
/// `U+4E00..=U+9FCC`, `U+F900..=U+FA6D` and `U+FA70..=U+FAD9`.
fn is_mandarin(ch: char) -> bool {
    match u32::from(ch) {
        // U+2E9A is skipped between the first two spans.
        0x2E80..=0x2E99 | 0x2E9B..=0x2EF3 => true,
        0x2F00..=0x2FD5 => true,
        0x3005 | 0x3007 => true,
        0x3021..=0x3029 | 0x3038..=0x303B => true,
        0x3400..=0x4DB5 => true,
        0x4E00..=0x9FCC => true,
        0xF900..=0xFA6D | 0xFA70..=0xFAD9 => true,
        _ => false,
    }
}
235
/// Reports whether `ch` lies in `U+0980..=U+09FF`, the span treated as Bengali.
fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
239
/// Reports whether `ch` lies in `U+3040..=U+309F`, the span treated as Hiragana.
fn is_hiragana(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x3040..=0x309F).contains(&cp)
}
243
/// Reports whether `ch` lies in `U+30A0..=U+30FF`, the span treated as Katakana.
fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
247
/// Reports whether `ch` falls in one of the code-point spans treated as Hangul.
///
/// Accepted: `U+1100..=U+11FF`, `U+3130..=U+318F`, `U+3200..=U+32FF`,
/// `U+A960..=U+A97F`, `U+AC00..=U+D7FF` and `U+FF00..=U+FFEF`.
///
/// NOTE(review): `U+FF00..=U+FFEF` is accepted in full; that span also holds
/// fullwidth Latin and halfwidth Katakana forms — confirm that crediting them
/// to Hangul is intended.
fn is_hangul(ch: char) -> bool {
    match u32::from(ch) {
        0x1100..=0x11FF => true,
        0x3130..=0x318F => true,
        0x3200..=0x32FF => true,
        0xA960..=0xA97F => true,
        // Two back-to-back spans (AC00-D7AF, D7B0-D7FF).
        0xAC00..=0xD7FF => true,
        0xFF00..=0xFFEF => true,
        _ => false,
    }
}
260
/// Reports whether `ch` lies in `U+0370..=U+03FF`, the span treated as Greek.
fn is_greek(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0370..=0x03FF).contains(&cp)
}
265
/// Reports whether `ch` lies in `U+0C80..=U+0CFF`, the span treated as Kannada.
fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
270
/// Reports whether `ch` lies in `U+0B80..=U+0BFF`, the span treated as Tamil.
fn is_tamil(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0B80..=0x0BFF).contains(&cp)
}
275
/// Reports whether `ch` lies in `U+0E00..=U+0E7F`, the span treated as Thai.
fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
280
/// Reports whether `ch` lies in `U+0A80..=U+0AFF`, the span treated as Gujarati.
fn is_gujarati(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0A80..=0x0AFF).contains(&cp)
}
285
/// Reports whether `ch` lies in `U+0A00..=U+0A7F`, the span treated as Gurmukhi.
fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
291
/// Reports whether `ch` lies in `U+0C00..=U+0C7F`, the span treated as Telugu.
fn is_telugu(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0C00..=0x0C7F).contains(&cp)
}
295
/// Reports whether `ch` lies in `U+0D00..=U+0D7F`, the span treated as Malayalam.
fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
300
/// Reports whether `ch` lies in `U+0B00..=U+0B7F`, the span treated as Oriya.
fn is_oriya(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0B00..=0x0B7F).contains(&cp)
}
305
/// Reports whether `ch` lies in `U+1000..=U+109F`, the span treated as Myanmar.
fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
310
/// Reports whether `ch` lies in `U+0D80..=U+0DFF`, the span treated as Sinhala.
fn is_sinhala(ch: char) -> bool {
    let cp = u32::from(ch);
    (0x0D80..=0x0DFF).contains(&cp)
}
315
/// Reports whether `ch` lies in `U+1780..=U+17FF` or `U+19E0..=U+19FF`, the
/// spans treated as Khmer.
fn is_khmer(ch: char) -> bool {
    let main_span = '\u{1780}'..='\u{17FF}';
    let extra_span = '\u{19E0}'..='\u{19FF}';

    main_span.contains(&ch) || extra_span.contains(&ch)
}