unic_ucd_bidi/
bidi_class.rs

1// Copyright 2015 The Servo Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12//! Unicode `Bidi_Class` Character Property.
13
14use unic_char_property::TotalCharProperty;
15
16char_property! {
17    /// Represents the Unicode character
18    /// [`Bidi_Class`](https://www.unicode.org/reports/tr44/#Bidi_Class) property, also known as the
19    /// *bidirectional character type*.
20    ///
21    /// * <https://www.unicode.org/reports/tr9/#Bidirectional_Character_Types>
22    /// * <https://www.unicode.org/reports/tr44/#Bidi_Class_Values>
23    pub enum BidiClass {
24        abbr => "bc";
25        long => "Bidi_Class";
26        human => "Bidirectional Class";
27
28        /// A strong Right-to-Left (Arabic-type) character
29        ArabicLetter {
30            abbr => AL,
31            long => Arabic_Letter,
32            human => "Right-to-Left Arabic",
33        }
34
35        /// A (non-Eastern) Arabic-Indic digit
36        ArabicNumber {
37            abbr => AN,
38            long => Arabic_Number,
39            human => "Arabic Number",
40        }
41
42        /// A newline character
43        ParagraphSeparator {
44            abbr => B,
45            long => Paragraph_Separator,
46            human => "Paragraph Separator",
47        }
48
49        /// Most format characters, control codes, and noncharacters
50        BoundaryNeutral {
51            abbr => BN,
52            long => Boundary_Neutral,
53            human => "Neutral Boundary",
54        }
55
56        /// A comma, colon, or slash
57        CommonSeparator {
58            abbr => CS,
59            long => Common_Separator,
60            human => "Common Number Separator",
61        }
62
63        /// A ASCII digit or Eastern Arabic-Indic digit
64        EuropeanNumber {
65            abbr => EN,
66            long => European_Number,
67            human => "European Number",
68        }
69
70        /// A plus or minus sign
71        EuropeanSeparator {
72            abbr => ES,
73            long => European_Separator,
74            human => "European Number Separator",
75        }
76
77        /// A terminator in a numeric format context (including currency signs)
78        EuropeanTerminator {
79            abbr => ET,
80            long => European_Terminator,
81            human => "European Number Terminator",
82        }
83
84        /// U+2068: The first strong isolate control
85        FirstStrongIsolate {
86            abbr => FSI,
87            long => First_Strong_Isolate,
88            human => "First Strong Isolate",
89        }
90
91        /// A strong Left-to-Right character
92        LeftToRight {
93            abbr => L,
94            long => Left_To_Right,
95            human => "Left-to-Right",
96        }
97
98        /// U+202A: the Left-to-Right embedding control
99        LeftToRightEmbedding {
100            abbr => LRE,
101            long => Left_To_Right_Embedding,
102            human => "Left-to-Right Embedding",
103        }
104
105        /// U+2066: the Left-to-Right isolate control
106        LeftToRightIsolate {
107            abbr => LRI,
108            long => Left_To_Right_Isolate,
109            human => "Left-to-Right Isolate",
110        }
111
112        /// U+202D: the Left-to-Right override control
113        LeftToRightOverride {
114            abbr => LRO,
115            long => Left_To_Right_Override,
116            human => "LeftToRightOverride",
117        }
118
119        /// A nonspacing mark
120        NonspacingMark {
121            abbr => NSM,
122            long => Nonspacing_Mark,
123            human => "Nonspacing Mark",
124        }
125
126        /// Symbols and Punctuation not in a different category
127        OtherNeutral {
128            abbr => ON,
129            long => Other_Neutral,
130            human => "OtherNeutral",
131        }
132
133        /// U+202C: terminates an embedding or override control
134        PopDirectionalFormat {
135            abbr => PDF,
136            long => Pop_Directional_Format,
137            human => "Pop Directional Format",
138        }
139
140        /// U+2069: terminates an isolate control
141        PopDirectionalIsolate {
142            abbr => PDI,
143            long => Pop_Directional_Isolate,
144            human => "PopDirectionalIsolate",
145        }
146
147        /// A strong Right-to-Left (non-Arabic-type) character
148        RightToLeft {
149            abbr => R,
150            long => Right_To_Left,
151            human => "Right-to-Left",
152        }
153
154        /// U+202B: The Right-to-Left embedding control
155        RightToLeftEmbedding {
156            abbr => RLE,
157            long => Right_To_Left_Embedding,
158            human => "Right-to-Left Embedding",
159        }
160
161        /// U+2067: The Right-to-Left isolate control
162        RightToLeftIsolate {
163            abbr => RLI,
164            long => Right_To_Left_Isolate,
165            human => "Right-to-Left Isolate",
166        }
167
168        /// U+202E: The Right-to-Left override control
169        RightToLeftOverride {
170            abbr => RLO,
171            long => Right_To_Left_Override,
172            human => "Right-to-Left Override",
173        }
174
175        /// A segment-related control code
176        SegmentSeparator {
177            abbr => S,
178            long => Segment_Separator,
179            human => "Segment Separator",
180        }
181
182        /// Whitespace
183        WhiteSpace {
184            abbr => WS,
185            long => White_Space,
186            human => "Whitespace",
187        }
188    }
189
190    /// Abbreviated name aliases for the
191    /// [`Bidi_Class`](https://www.unicode.org/reports/tr44/#Bidi_Class) property.
192    ///
193    /// ## See Also
194    ///
195    /// - <https://www.unicode.org/reports/tr44/#Bidi_Class_Values>
196    /// - <https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt#Bidi_Class>
197    pub mod abbr_names for abbr;
198
199    /// Long name aliases for the
200    /// [`Bidi_Class`](https://www.unicode.org/reports/tr44/#Bidi_Class) property.
201    ///
202    /// ## See Also
203    ///
204    /// - <https://www.unicode.org/reports/tr44/#Bidi_Class_Values>
205    /// - <https://www.unicode.org/Public/UCD/latest/ucd/PropertyValueAliases.txt#Bidi_Class>
206    pub mod long_names for long;
207}
208
209impl TotalCharProperty for BidiClass {
210    fn of(ch: char) -> Self {
211        Self::of(ch)
212    }
213}
214
215/// UCD/extracted/DerivedBidiClass.txt:
216/// "All code points not explicitly listed for `Bidi_Class` have the value `Left_To_Right` (`L`)."
217impl Default for BidiClass {
218    #[inline]
219    fn default() -> Self {
220        BidiClass::LeftToRight
221    }
222}
223
/// Private module holding the `Bidi_Class` lookup table.
mod data {
    // NOTE(review): the glob import presumably lets the included table refer to values by
    // their abbreviated names (`L`, `R`, `AL`, ...) — confirm against the `.rsv` file.
    use super::abbr_names::*;
    use unic_char_property::tables::CharDataTable;
    /// Character-to-`BidiClass` table; its contents are included at compile time from
    /// `../tables/bidi_class.rsv`.
    pub const BIDI_CLASS_TABLE: CharDataTable<super::BidiClass> =
        include!("../tables/bidi_class.rsv");
}
230
231impl BidiClass {
232    /// Find the character `Bidi_Class` property value.
233    pub fn of(ch: char) -> BidiClass {
234        data::BIDI_CLASS_TABLE.find_or_default(ch)
235    }
236
237    /// If the `BidiClass` has strong or explicit Left-to-Right direction.
238    #[inline]
239    pub fn category(&self) -> BidiClassCategory {
240        match *self {
241            BidiClass::LeftToRight | BidiClass::RightToLeft | BidiClass::ArabicLetter => {
242                BidiClassCategory::Strong
243            }
244
245            BidiClass::EuropeanNumber
246            | BidiClass::EuropeanSeparator
247            | BidiClass::EuropeanTerminator
248            | BidiClass::ArabicNumber
249            | BidiClass::CommonSeparator
250            | BidiClass::NonspacingMark
251            | BidiClass::BoundaryNeutral => BidiClassCategory::Weak,
252
253            BidiClass::ParagraphSeparator
254            | BidiClass::SegmentSeparator
255            | BidiClass::WhiteSpace
256            | BidiClass::OtherNeutral => BidiClassCategory::Neutral,
257
258            BidiClass::LeftToRightEmbedding
259            | BidiClass::LeftToRightOverride
260            | BidiClass::RightToLeftEmbedding
261            | BidiClass::RightToLeftOverride
262            | BidiClass::PopDirectionalFormat
263            | BidiClass::LeftToRightIsolate
264            | BidiClass::RightToLeftIsolate
265            | BidiClass::FirstStrongIsolate
266            | BidiClass::PopDirectionalIsolate => BidiClassCategory::ExplicitFormatting,
267        }
268    }
269
270    /// If the `BidiClass` has strong or explicit Left-to-Right direction.
271    #[inline]
272    pub fn is_ltr(&self) -> bool {
273        match *self {
274            BidiClass::LeftToRight
275            | BidiClass::LeftToRightEmbedding
276            | BidiClass::LeftToRightOverride
277            | BidiClass::LeftToRightIsolate => true,
278            _ => false,
279        }
280    }
281
282    /// If the `BidiClass` has strong or explicit Right-To-Left direction.
283    #[inline]
284    pub fn is_rtl(&self) -> bool {
285        match *self {
286            BidiClass::RightToLeft
287            | BidiClass::ArabicLetter
288            | BidiClass::RightToLeftEmbedding
289            | BidiClass::RightToLeftOverride
290            | BidiClass::RightToLeftIsolate => true,
291            _ => false,
292        }
293    }
294}
295
/// Represents **Category** of Unicode character `Bidi_Class` property, as demonstrated under
/// "Table 4. Bidirectional Character Types".
///
/// * <https://www.unicode.org/reports/tr9/#Table_Bidirectional_Character_Types>
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum BidiClassCategory {
    /// Left-to-right and right-to-left types.
    Strong,

    /// Types associated with numbers.
    Weak,

    /// Directional formatting characters.
    ExplicitFormatting,

    /// Everything else.
    Neutral,
}
314
/// Methods for `Bidi_Class` character property.
///
/// All methods take `self` by value.
pub trait CharBidiClass {
    /// Get `BidiClass` of the character.
    fn bidi_class(self) -> BidiClass;

    /// Whether the character has *left-to-right* (LTR) bidi directionality.
    fn is_ltr(self) -> bool;

    /// Whether the character has *right-to-left* (RTL) bidi directionality.
    fn is_rtl(self) -> bool;
}
326
327impl CharBidiClass for char {
328    #[inline]
329    fn bidi_class(self) -> BidiClass {
330        BidiClass::of(self)
331    }
332
333    #[inline]
334    fn is_ltr(self) -> bool {
335        BidiClass::of(self).is_ltr()
336    }
337
338    #[inline]
339    fn is_rtl(self) -> bool {
340        BidiClass::of(self).is_rtl()
341    }
342}
343
/// Methods for `Bidi_Class` character properties of string types.
///
/// Each method asks whether *any* character of the string satisfies the given condition.
pub trait StrBidiClass {
    /// Whether the string has any *explicit* bidi formatting character.
    fn has_bidi_explicit(&self) -> bool;

    /// Whether the string has any character with *left-to-right* (LTR) bidi directionality.
    fn has_ltr(&self) -> bool;

    /// Whether the string has any character with *right-to-left* (RTL) bidi directionality.
    fn has_rtl(&self) -> bool;
}
355
356impl StrBidiClass for str {
357    #[inline]
358    fn has_bidi_explicit(&self) -> bool {
359        self.chars()
360            .any(|ch| ch.bidi_class().category() == BidiClassCategory::ExplicitFormatting)
361    }
362
363    #[inline]
364    fn has_ltr(&self) -> bool {
365        self.chars().any(|ch| ch.is_rtl())
366    }
367
368    #[inline]
369    fn has_rtl(&self) -> bool {
370        self.chars().any(|ch| ch.is_rtl())
371    }
372}
373
// Unit tests for `BidiClass` lookups, property value names, and the `char`/`str` helper traits.
#[cfg(test)]
mod tests {
    use super::abbr_names::*;
    use super::BidiClass;
    use unic_char_property::EnumeratedCharProperty;

    // Spot-checks `BidiClass::of` on a few ASCII code points.
    #[test]
    fn test_ascii() {
        assert_eq!(BidiClass::of('\u{0000}'), BN);
        assert_eq!(BidiClass::of('\u{0040}'), ON);
        assert_eq!(BidiClass::of('\u{0041}'), L);
        assert_eq!(BidiClass::of('\u{0062}'), L);
        assert_eq!(BidiClass::of('\u{007F}'), BN);
    }

    // Spot-checks `BidiClass::of` on code points in the Basic Multilingual Plane.
    #[test]
    fn test_bmp() {
        // Hebrew
        assert_eq!(BidiClass::of('\u{0590}'), R);
        assert_eq!(BidiClass::of('\u{05D0}'), R);
        assert_eq!(BidiClass::of('\u{05D1}'), R);
        assert_eq!(BidiClass::of('\u{05FF}'), R);

        // Arabic
        assert_eq!(BidiClass::of('\u{0600}'), AN);
        assert_eq!(BidiClass::of('\u{0627}'), AL);
        assert_eq!(BidiClass::of('\u{07BF}'), AL);

        // Default R + Arabic Extras
        assert_eq!(BidiClass::of('\u{07C0}'), R);
        assert_eq!(BidiClass::of('\u{085F}'), R);
        assert_eq!(BidiClass::of('\u{0860}'), AL);
        assert_eq!(BidiClass::of('\u{0870}'), R);
        assert_eq!(BidiClass::of('\u{089F}'), R);
        assert_eq!(BidiClass::of('\u{08A0}'), AL);
        // NOTE(review): this repeats the U+089F check from two lines above — possibly a
        // different code point was intended; confirm.
        assert_eq!(BidiClass::of('\u{089F}'), R);
        assert_eq!(BidiClass::of('\u{08FF}'), NSM);

        // Default ET
        assert_eq!(BidiClass::of('\u{20A0}'), ET);
        assert_eq!(BidiClass::of('\u{20CF}'), ET);

        // Arabic Presentation Forms
        assert_eq!(BidiClass::of('\u{FB1D}'), R);
        assert_eq!(BidiClass::of('\u{FB4F}'), R);
        assert_eq!(BidiClass::of('\u{FB50}'), AL);
        assert_eq!(BidiClass::of('\u{FDCF}'), AL);
        assert_eq!(BidiClass::of('\u{FDF0}'), AL);
        assert_eq!(BidiClass::of('\u{FDFF}'), AL);
        assert_eq!(BidiClass::of('\u{FE70}'), AL);
        assert_eq!(BidiClass::of('\u{FEFE}'), AL);
        assert_eq!(BidiClass::of('\u{FEFF}'), BN);

        // noncharacters
        assert_eq!(BidiClass::of('\u{FDD0}'), L);
        assert_eq!(BidiClass::of('\u{FDD1}'), L);
        assert_eq!(BidiClass::of('\u{FDEE}'), L);
        assert_eq!(BidiClass::of('\u{FDEF}'), L);
        assert_eq!(BidiClass::of('\u{FFFE}'), L);
        assert_eq!(BidiClass::of('\u{FFFF}'), L);
    }

    // Spot-checks `BidiClass::of` on code points in the U+10800..U+1EFFF range.
    #[test]
    fn test_smp() {
        // Default AL + R
        assert_eq!(BidiClass::of('\u{10800}'), R);
        assert_eq!(BidiClass::of('\u{10FFF}'), R);
        assert_eq!(BidiClass::of('\u{1E800}'), R);
        assert_eq!(BidiClass::of('\u{1EDFF}'), R);
        assert_eq!(BidiClass::of('\u{1EE00}'), AL);
        assert_eq!(BidiClass::of('\u{1EEFF}'), AL);
        assert_eq!(BidiClass::of('\u{1EF00}'), R);
        assert_eq!(BidiClass::of('\u{1EFFF}'), R);
    }

    // The first code point of each of planes 3 through 10 is expected to report `L`.
    #[test]
    fn test_unassigned_planes() {
        assert_eq!(BidiClass::of('\u{30000}'), L);
        assert_eq!(BidiClass::of('\u{40000}'), L);
        assert_eq!(BidiClass::of('\u{50000}'), L);
        assert_eq!(BidiClass::of('\u{60000}'), L);
        assert_eq!(BidiClass::of('\u{70000}'), L);
        assert_eq!(BidiClass::of('\u{80000}'), L);
        assert_eq!(BidiClass::of('\u{90000}'), L);
        assert_eq!(BidiClass::of('\u{a0000}'), L);
    }

    // `abbr_name()` returns the `abbr` identifier declared for the variant.
    #[test]
    fn test_abbr_name() {
        assert_eq!(AL.abbr_name(), "AL");
        assert_eq!(FSI.abbr_name(), "FSI");
    }

    // `long_name()` returns the `long` identifier declared for the variant.
    #[test]
    fn test_long_name() {
        assert_eq!(AL.long_name(), "Arabic_Letter");
        assert_eq!(FSI.long_name(), "First_Strong_Isolate");
    }

    // `human_name()` returns the `human` string declared for the variant.
    #[test]
    fn test_human_name() {
        assert_eq!(AL.human_name(), "Right-to-Left Arabic");
        assert_eq!(FSI.human_name(), "First Strong Isolate");
    }

    // Exercises the `CharBidiClass` methods on one neutral, one LTR and two RTL characters.
    #[test]
    fn test_char_trait() {
        use super::{BidiClass, BidiClassCategory, CharBidiClass};

        let ch = '\u{0028}'; // U+0028 LEFT PARENTHESIS "("
        assert_eq!(ch.bidi_class(), BidiClass::OtherNeutral);
        assert_eq!(ch.bidi_class().category(), BidiClassCategory::Neutral);
        assert!(!ch.is_ltr());
        assert!(!ch.is_rtl());

        let ch = '\u{0041}'; // U+0041 LATIN CAPITAL LETTER A "A"
        assert_eq!(ch.bidi_class(), BidiClass::LeftToRight);
        assert_eq!(ch.bidi_class().category(), BidiClassCategory::Strong);
        assert!(ch.is_ltr());
        assert!(!ch.is_rtl());

        let ch = '\u{05D0}'; // U+05D0 HEBREW LETTER ALEF "א"
        assert_eq!(ch.bidi_class(), BidiClass::RightToLeft);
        assert_eq!(ch.bidi_class().category(), BidiClassCategory::Strong);
        assert!(!ch.is_ltr());
        assert!(ch.is_rtl());

        let ch = '\u{0627}'; // U+0627 ARABIC LETTER ALEF "ا"
        assert_eq!(ch.bidi_class(), BidiClass::ArabicLetter);
        assert_eq!(ch.bidi_class().category(), BidiClassCategory::Strong);
        assert!(!ch.is_ltr());
        assert!(ch.is_rtl());
    }

    // Exercises the `StrBidiClass` methods on an empty string and on a mixed-direction string.
    #[test]
    fn test_str_trait() {
        use super::StrBidiClass;

        // An empty string has no characters, so every `has_*` query is false.
        let text = "";
        assert!(!text.has_bidi_explicit());
        assert!(!text.has_ltr());
        assert!(!text.has_rtl());

        // NOTE(review): this sample contains both an LTR letter (U+0041) and RTL letters
        // (U+05D0, U+0627), so it cannot tell `has_ltr` apart from `has_rtl`; a
        // single-direction sample would be needed for that.
        let text = "[\u{0041}\u{05D0}\u{0627}\u{200e}]";
        assert!(!text.has_bidi_explicit());
        assert!(text.has_ltr());
        assert!(text.has_rtl());
    }
}