azul_layout/text/
script.rs

1// Taken from: https://github.com/greyblake/whatlang-rs/blob/master/src/scripts/detect.rs
2//
3// See: https://github.com/greyblake/whatlang-rs/pull/67
4
5// License:
6//
7// (The MIT License)
8//
9// Copyright (c) 2017 Sergey Potapov <blake131313@gmail.com>
10// Copyright (c) 2014 Titus Wormer <tituswormer@gmail.com>
11// Copyright (c) 2008 Kent S Johnson
12// Copyright (c) 2006 Jacob R Rideout <kde@jacobrideout.net>
13// Copyright (c) 2004 Maciej Ceglowski
14//
15// Permission is hereby granted, free of charge, to any person obtaining
16// a copy of this software and associated documentation files (the
17// 'Software'), to deal in the Software without restriction, including
18// without limitation the rights to use, copy, modify, merge, publish,
19// distribute, sublicense, and/or sell copies of the Software, and to
20// permit persons to whom the Software is furnished to do so, subject to
21// the following conditions:
22//
23// The above copyright notice and this permission notice shall be
24// included in all copies or substantial portions of the Software.
25//
26// THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND,
27// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
28// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
29// IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
30// CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
31// TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
32// SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
33
/// Writing systems that [`detect_script`] can report.
///
/// The enum is fieldless and `Copy`, so values can be compared and passed
/// around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Script {
    // Keep the variants in alphabetic order (the C bindings rely on it).
    Arabic,
    Bengali,
    Cyrillic,
    Devanagari,
    Ethiopic,
    Georgian,
    Greek,
    Gujarati,
    Gurmukhi,
    Hangul,
    Hebrew,
    Hiragana,
    Kannada,
    Katakana,
    Khmer,
    Latin,
    Malayalam,
    Mandarin,
    Myanmar,
    Oriya,
    Sinhala,
    Tamil,
    Telugu,
    Thai,
}
62
/// Returns `true` for ASCII characters that carry no information for script
/// or language detection: control characters, space, digits and punctuation.
///
/// ASCII letters, `DEL` (U+007F) and every non-ASCII character yield `false`.
#[inline]
pub fn is_stop_char(ch: char) -> bool {
    // NUL ..= '@': controls, space, `!`..`/`, digits, `:`..`@`
    ch <= '\u{0040}'
        // '[' ..= '`'
        || ('\u{005B}'..='\u{0060}').contains(&ch)
        // '{' ..= '~'
        || ('\u{007B}'..='\u{007E}').contains(&ch)
}
70
71type ScriptCounter = (Script, fn(char) -> bool, usize);
72
73/// Detect only a script by a given text
74///
75/// # Example
76/// ```
77/// use whatlang::{detect_script, Script};
78/// let script = detect_script("Благодаря Эсперанто вы обрётете друзей по всему миру!").unwrap();
79/// assert_eq!(script, Script::Cyrillic);
80/// ```
81pub fn detect_script(text: &str) -> Option<Script> {
82    let mut script_counters: [ScriptCounter; 24] = [
83        (Script::Latin, is_latin, 0),
84        (Script::Cyrillic, is_cyrillic, 0),
85        (Script::Arabic, is_arabic, 0),
86        (Script::Mandarin, is_mandarin, 0),
87        (Script::Devanagari, is_devanagari, 0),
88        (Script::Hebrew, is_hebrew, 0),
89        (Script::Ethiopic, is_ethiopic, 0),
90        (Script::Georgian, is_georgian, 0),
91        (Script::Bengali, is_bengali, 0),
92        (Script::Hangul, is_hangul, 0),
93        (Script::Hiragana, is_hiragana, 0),
94        (Script::Katakana, is_katakana, 0),
95        (Script::Greek, is_greek, 0),
96        (Script::Kannada, is_kannada, 0),
97        (Script::Tamil, is_tamil, 0),
98        (Script::Thai, is_thai, 0),
99        (Script::Gujarati, is_gujarati, 0),
100        (Script::Gurmukhi, is_gurmukhi, 0),
101        (Script::Telugu, is_telugu, 0),
102        (Script::Malayalam, is_malayalam, 0),
103        (Script::Oriya, is_oriya, 0),
104        (Script::Myanmar, is_myanmar, 0),
105        (Script::Sinhala, is_sinhala, 0),
106        (Script::Khmer, is_khmer, 0),
107    ];
108
109    let half = text.chars().count() / 2;
110
111    for ch in text.chars() {
112        if is_stop_char(ch) {
113            continue;
114        }
115
116        // For performance reasons, we need to mutate script_counters by calling
117        // `swap` function, it would not be possible to do using normal iterator.
118        for i in 0..script_counters.len() {
119            let found = {
120                let (script, check_fn, ref mut count) = script_counters[i];
121                if check_fn(ch) {
122                    *count += 1;
123                    if *count > half {
124                        return Some(script);
125                    }
126                    true
127                } else {
128                    false
129                }
130            };
131            // Have to let borrow of count fall out of scope before doing swapping, or we could
132            // do this above.
133            if found {
134                // If script was found, move it closer to the front.
135                // If the text contains largely 1 or 2 scripts, this will
136                // cause these scripts to be eventually checked first.
137                if i > 0 {
138                    script_counters.swap(i - 1, i);
139                }
140                break;
141            }
142        }
143    }
144
145    let (script, _, count) = script_counters
146        .iter()
147        .cloned()
148        .max_by_key(|&(_, _, count)| count)
149        .unwrap();
150    if count != 0 {
151        Some(script)
152    } else {
153        None
154    }
155}
156
// Cyrillic letters: the Cyrillic block and its supplement (without
// U+0485..=U+0486), Cyrillic Extended-A and Extended-B (without U+A69E),
// plus the two code points U+1D2B and U+1D78.
fn is_cyrillic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0400..=0x0484
            | 0x0487..=0x052F
            | 0x1D2B
            | 0x1D78
            | 0x2DE0..=0x2DFF
            | 0xA640..=0xA69D
            | 0xA69F
    )
}
168
// Latin letters, see https://en.wikipedia.org/wiki/Latin_script_in_Unicode
fn is_latin(ch: char) -> bool {
    ch.is_ascii_alphabetic()
        || matches!(
            u32::from(ch),
            // Latin-1 Supplement, Latin Extended-A/B and IPA Extensions are contiguous.
            0x0080..=0x02AF
                // Phonetic Extensions followed directly by its supplement.
                | 0x1D00..=0x1DBF
                // Latin Extended Additional
                | 0x1E00..=0x1EFF
                // Letterlike Symbols
                | 0x2100..=0x214F
                // Latin Extended-C
                | 0x2C60..=0x2C7F
                // Latin Extended-D
                | 0xA720..=0xA7FF
                // Latin Extended-E
                | 0xAB30..=0xAB6F
        )
}
187
// Arabic script, based on https://en.wikipedia.org/wiki/Arabic_script_in_Unicode
//
// NOTE(review): U+0750..=U+07FF reaches past Arabic Supplement into the Thaana
// and NKo blocks, and U+FE70..=U+FEFF ends on U+FEFF (byte order mark) —
// confirm that counting those as Arabic is intended.
fn is_arabic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0600..=0x06FF
            | 0x0750..=0x07FF
            | 0x08A0..=0x08FF
            | 0xFB50..=0xFDFF
            | 0xFE70..=0xFEFF
            | 0x10E60..=0x10E7F
            | 0x1EE00..=0x1EEFF
    )
}
200
// Devanagari, based on https://en.wikipedia.org/wiki/Devanagari#Unicode
// Matches U+0900..=U+097F, U+1CD0..=U+1CFF and U+A8E0..=U+A8FF.
fn is_devanagari(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x0900..=0x097F | 0x1CD0..=0x1CFF | 0xA8E0..=0xA8FF
    )
}
205
// Ethiopic, based on https://www.key-shortcut.com/en/writing-systems/ethiopian-script/
// Matches U+1200..=U+139F, U+2D80..=U+2DDF and U+AB00..=U+AB2F.
fn is_ethiopic(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x1200..=0x139F | 0x2D80..=0x2DDF | 0xAB00..=0xAB2F
    )
}
210
// Hebrew block, based on https://en.wikipedia.org/wiki/Hebrew_(Unicode_block)
fn is_hebrew(ch: char) -> bool {
    ('\u{0590}'..='\u{05FF}').contains(&ch)
}
215
// Georgian: U+10A0..=U+10FF.
fn is_georgian(ch: char) -> bool {
    ('\u{10A0}'..='\u{10FF}').contains(&ch)
}
219
// Han (CJK) ideographs and radicals, reported as `Script::Mandarin`.
fn is_mandarin(ch: char) -> bool {
    matches!(
        u32::from(ch),
        // Radicals (U+2E9A and U+2EF4.. are left out)
        0x2E80..=0x2E99
            | 0x2E9B..=0x2EF3
            | 0x2F00..=0x2FD5
            // Individual ideographic marks and numerals
            | 0x3005
            | 0x3007
            | 0x3021..=0x3029
            | 0x3038..=0x303B
            // Unified ideographs
            | 0x3400..=0x4DB5
            | 0x4E00..=0x9FCC
            // Compatibility ideographs
            | 0xF900..=0xFA6D
            | 0xFA70..=0xFAD9
    )
}
235
// Bengali: U+0980..=U+09FF.
fn is_bengali(ch: char) -> bool {
    ('\u{0980}'..='\u{09FF}').contains(&ch)
}
239
// Hiragana: U+3040..=U+309F.
fn is_hiragana(ch: char) -> bool {
    ('\u{3040}'..='\u{309F}').contains(&ch)
}
243
// Katakana: U+30A0..=U+30FF.
fn is_katakana(ch: char) -> bool {
    ('\u{30A0}'..='\u{30FF}').contains(&ch)
}
247
// Hangul is the Korean alphabet. Ranges taken from https://en.wikipedia.org/wiki/Hangul
//
// NOTE(review): U+FF00..=U+FFEF is the entire Halfwidth and Fullwidth Forms
// block, so fullwidth Latin letters and punctuation are counted as Hangul as
// well — confirm that this is intended.
fn is_hangul(ch: char) -> bool {
    matches!(
        u32::from(ch),
        0x1100..=0x11FF
            | 0x3130..=0x318F
            | 0x3200..=0x32FF
            | 0xA960..=0xA97F
            // U+AC00..=U+D7AF and U+D7B0..=U+D7FF are adjacent, hence one range.
            | 0xAC00..=0xD7FF
            | 0xFF00..=0xFFEF
    )
}
260
// Greek and Coptic block, taken from https://en.wikipedia.org/wiki/Greek_and_Coptic
fn is_greek(ch: char) -> bool {
    ('\u{0370}'..='\u{03FF}').contains(&ch)
}
265
// Kannada block, based on https://en.wikipedia.org/wiki/Kannada_(Unicode_block)
fn is_kannada(ch: char) -> bool {
    ('\u{0C80}'..='\u{0CFF}').contains(&ch)
}
270
// Tamil block, based on https://en.wikipedia.org/wiki/Tamil_(Unicode_block)
fn is_tamil(ch: char) -> bool {
    ('\u{0B80}'..='\u{0BFF}').contains(&ch)
}
275
// Thai block, based on https://en.wikipedia.org/wiki/Thai_(Unicode_block)
fn is_thai(ch: char) -> bool {
    ('\u{0E00}'..='\u{0E7F}').contains(&ch)
}
280
// Gujarati block, based on https://en.wikipedia.org/wiki/Gujarati_(Unicode_block)
fn is_gujarati(ch: char) -> bool {
    ('\u{0A80}'..='\u{0AFF}').contains(&ch)
}
285
// Gurmukhi is the script used to write the Punjabi language.
// Block taken from https://en.wikipedia.org/wiki/Gurmukhi_(Unicode_block)
fn is_gurmukhi(ch: char) -> bool {
    ('\u{0A00}'..='\u{0A7F}').contains(&ch)
}
291
// Telugu: U+0C00..=U+0C7F.
fn is_telugu(ch: char) -> bool {
    ('\u{0C00}'..='\u{0C7F}').contains(&ch)
}
295
// Malayalam block, based on https://en.wikipedia.org/wiki/Malayalam_(Unicode_block)
fn is_malayalam(ch: char) -> bool {
    ('\u{0D00}'..='\u{0D7F}').contains(&ch)
}
300
// Oriya block, based on https://en.wikipedia.org/wiki/Oriya_(Unicode_block)
fn is_oriya(ch: char) -> bool {
    ('\u{0B00}'..='\u{0B7F}').contains(&ch)
}
305
// Myanmar block, based on https://en.wikipedia.org/wiki/Myanmar_(Unicode_block)
fn is_myanmar(ch: char) -> bool {
    ('\u{1000}'..='\u{109F}').contains(&ch)
}
310
// Sinhala block, based on https://en.wikipedia.org/wiki/Sinhala_(Unicode_block)
fn is_sinhala(ch: char) -> bool {
    ('\u{0D80}'..='\u{0DFF}').contains(&ch)
}
315
// Khmer, based on https://en.wikipedia.org/wiki/Khmer_alphabet
// Matches U+1780..=U+17FF and U+19E0..=U+19FF.
fn is_khmer(ch: char) -> bool {
    matches!(u32::from(ch), 0x1780..=0x17FF | 0x19E0..=0x19FF)
}