unic_ucd_name/
name.rs

1// Copyright 2017 The UNIC Project Developers.
2//
3// See the COPYRIGHT file at the top-level directory of this distribution.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11use core::cmp::Ordering;
12use core::fmt;
13use unic_ucd_hangul::decompose_syllable;
14
/// Prefix of every Hangul syllable name (rule NR1); note the trailing space.
pub static PREFIX_HANGUL_SYLLABLE: &str = "HANGUL SYLLABLE ";
/// Prefix of CJK unified ideograph names (rule NR2); the hexadecimal code point follows.
pub static PREFIX_CJK_UNIFIED_IDEOGRAPH: &str = "CJK UNIFIED IDEOGRAPH-";
/// Prefix of Tangut ideograph names (rule NR2); the hexadecimal code point follows.
pub static PREFIX_TANGUT_IDEOGRAPH: &str = "TANGUT IDEOGRAPH-";
/// Prefix of Nushu character names (rule NR2); the hexadecimal code point follows.
pub static PREFIX_NUSHU_CHARACTER: &str = "NUSHU CHARACTER-";
/// Prefix of CJK compatibility ideograph names (rule NR2); the hexadecimal code point follows.
pub static PREFIX_CJK_COMPATIBILITY_IDEOGRAPH: &str = "CJK COMPATIBILITY IDEOGRAPH-";

/// Capacity of the fixed-size buffer used to collect the jamo short names
/// of one decomposed Hangul syllable.
const JAMO_BUFFER_SIZE: usize = 3;
22
/// Represents values of the Unicode character property
/// [*Name*](https://www.unicode.org/reports/tr44/#Name).
///
/// Each variant corresponds to one of the name derivation rules and stores
/// only what is needed to produce the name on demand; the text itself is
/// produced by the `Display` implementation.
///
/// Note: NR4 is omitted in this implementation because it can be represented by `None`.
///
/// See *Section 4.8* in [*Unicode*](http://www.unicode.org/versions/Unicode10.0.0/ch04.pdf)
/// for a full specification of all name derivation rules.
#[derive(Copy, Clone, Debug, Eq, PartialEq, Hash)]
pub enum Name {
    /// NR1: For Hangul syllables, the Name is derived by rule,
    /// as specified in *Section 3.12* in
    /// [*Unicode*](http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf)
    /// by concatenating a fixed prefix string "HANGUL SYLLABLE" and appropriate values of the
    /// [*Jamo_Short_Name*](http://www.unicode.org/Public/UCD/latest/ucd/Jamo.txt) property.
    ///
    /// The payload is the syllable itself.
    NR1(char),

    /// NR2: For most ideographs, the Name is derived by
    /// concatenating a script-specific prefix string, as specified in
    /// [*Unicode*](http://www.unicode.org/versions/Unicode10.0.0/ch04.pdf),
    /// to the code point, expressed in hexadecimal, with the usual
    /// 4- to 6-digit convention.
    ///
    /// The payload is the prefix string followed by the character whose
    /// code point is appended to it.
    NR2(&'static str, char),

    /// NR3: For all other Graphic characters and for all Format characters,
    /// the Name is as explicitly listed in Field 1 of
    /// [*UnicodeData.txt*](https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt).
    ///
    /// The payload is the list of name pieces; the full name is the pieces
    /// joined by single spaces.
    NR3(&'static [&'static str]),
}
51
52#[cfg_attr(feature = "cargo-clippy", allow(len_without_is_empty))]
53impl Name {
54    /// Find the character `Name` property value.
55    pub fn of(ch: char) -> Option<Name> {
56        match ch {
57            '\u{AC00}'..='\u{D7A3}' => Some(Name::NR1(ch)),
58            '\u{3400}'..='\u{4DB5}'
59            | '\u{4E00}'..='\u{9FEA}'
60            | '\u{20000}'..='\u{2A6D6}'
61            | '\u{2A700}'..='\u{2B734}'
62            | '\u{2B740}'..='\u{2B81D}'
63            | '\u{2B820}'..='\u{2CEA1}'
64            | '\u{2CEB0}'..='\u{2EBE0}' => Some(Name::NR2(PREFIX_CJK_UNIFIED_IDEOGRAPH, ch)),
65            '\u{17000}'..='\u{187EC}' => Some(Name::NR2(PREFIX_TANGUT_IDEOGRAPH, ch)),
66            '\u{1B170}'..='\u{1B2FB}' => Some(Name::NR2(PREFIX_NUSHU_CHARACTER, ch)),
67            '\u{F900}'..='\u{FA6D}' | '\u{FA70}'..='\u{FAD9}' | '\u{2F800}'..='\u{2FA1D}' => {
68                Some(Name::NR2(PREFIX_CJK_COMPATIBILITY_IDEOGRAPH, ch))
69            }
70            _ => data::NAMES.find(ch).map(|pieces| Name::NR3(pieces)),
71        }
72    }
73
74    /// Length of the name in bytes.
75    pub fn len(&self) -> usize {
76        match *self {
77            Name::NR1(ch) => {
78                let mut len = PREFIX_HANGUL_SYLLABLE.len();
79                {
80                    let mut count_jamos = |jamo| {
81                        let jamo_name = Name::jamo_short_name(jamo);
82                        len += jamo_name.len();
83                    };
84                    decompose_syllable(ch, &mut count_jamos);
85                }
86                len
87            }
88            Name::NR2(prefix, ch) => {
89                let mut len = prefix.len();
90                len += Name::number_of_hex_digits(ch);
91                len
92            }
93            Name::NR3(pieces) => {
94                // start with spaces
95                let mut len = pieces.len().saturating_sub(1);
96                for piece in pieces {
97                    len += piece.len();
98                }
99                len
100            }
101        }
102    }
103
104    #[cfg_attr(feature = "cargo-clippy", allow(inline_always))]
105    #[inline(always)]
106    fn number_of_hex_digits(ch: char) -> usize {
107        (32 - u32::leading_zeros(ch as u32) as usize + 3) / 4
108    }
109
110    fn jamo_short_name(ch: char) -> &'static str {
111        data::JAMO_SHORT_NAMES
112            .find(ch)
113            .expect("Unexpected jamo character")
114    }
115
116    fn collect_jamo_short_names(ch: char) -> [Option<&'static str>; JAMO_BUFFER_SIZE] {
117        let mut jamos = [None; JAMO_BUFFER_SIZE];
118        {
119            let mut index = 0;
120            let mut collect_jamos = |jamo| {
121                debug_assert!(
122                    index < JAMO_BUFFER_SIZE,
123                    "Decomposed hangul jamos exceed buffer size limit",
124                );
125                jamos[index] = Some(Name::jamo_short_name(jamo));
126                index += 1;
127            };
128            decompose_syllable(ch, &mut collect_jamos);
129        }
130        jamos
131    }
132}
133
134impl fmt::Display for Name {
135    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
136        match *self {
137            Name::NR1(ch) => {
138                f.write_str(PREFIX_HANGUL_SYLLABLE)?;
139                let mut result = Ok(());
140                {
141                    let mut write_jamos = |jamo| {
142                        let write_result = f.write_str(Name::jamo_short_name(jamo));
143                        if write_result.is_err() {
144                            result = write_result;
145                        }
146                    };
147                    decompose_syllable(ch, &mut write_jamos);
148                }
149                result
150            }
151            Name::NR2(prefix, ch) => {
152                f.write_str(prefix)?;
153                write!(f, "{:X}", ch as u32)
154            }
155            Name::NR3(pieces) => {
156                let (first, rest) = pieces.split_first().unwrap();
157                f.write_str(first)?;
158                for piece in rest {
159                    f.write_str(" ")?;
160                    f.write_str(piece)?;
161                }
162                Ok(())
163            }
164        }
165    }
166}
167
168impl Ord for Name {
169    fn cmp(&self, other: &Name) -> Ordering {
170        match *self {
171            Name::NR1(ch) => match *other {
172                Name::NR1(other_ch) => {
173                    let jamos = Name::collect_jamo_short_names(ch);
174                    let other_jamos = Name::collect_jamo_short_names(other_ch);
175                    jamos.cmp(&other_jamos)
176                }
177                Name::NR2(other_prefix, _) => PREFIX_HANGUL_SYLLABLE.cmp(other_prefix),
178                Name::NR3(other_pieces) => {
179                    let (first, _) = other_pieces.split_first().unwrap();
180                    PREFIX_HANGUL_SYLLABLE.cmp(first)
181                }
182            },
183            Name::NR2(prefix, ch) => match *other {
184                Name::NR1(_) => prefix.cmp(PREFIX_HANGUL_SYLLABLE),
185                Name::NR2(other_prefix, other_ch) => {
186                    if prefix == other_prefix {
187                        ch.cmp(&other_ch)
188                    } else {
189                        prefix.cmp(other_prefix)
190                    }
191                }
192                Name::NR3(other_pieces) => {
193                    let (first, _) = other_pieces.split_first().unwrap();
194                    prefix.cmp(first)
195                }
196            },
197            Name::NR3(pieces) => {
198                let (first, _) = pieces.split_first().unwrap();
199                match *other {
200                    Name::NR1(_) => first.cmp(&PREFIX_HANGUL_SYLLABLE),
201                    Name::NR2(other_prefix, _) => first.cmp(&other_prefix),
202                    Name::NR3(other_pieces) => pieces.cmp(other_pieces),
203                }
204            }
205        }
206    }
207}
208
209impl PartialOrd for Name {
210    fn partial_cmp(&self, other: &Name) -> Option<Ordering> {
211        Some(self.cmp(other))
212    }
213}
214
/// Lookup tables behind `Name::of` and the jamo short-name lookup. The table
/// contents are pulled in at compile time from files under `../tables/`.
mod data {
    use unic_char_property::tables::CharDataTable;
    // Textually included into this module, ahead of the tables below.
    // NOTE(review): presumably this defines the shared name-piece values that
    // `name_map.rsv` refers to — confirm against the table sources.
    include!("../tables/name_values.rsd");
    /// Maps a character to the pieces of its explicitly listed (NR3) name.
    pub const NAMES: CharDataTable<&[&str]> = include!("../tables/name_map.rsv");
    /// Maps a jamo to its short name.
    pub const JAMO_SHORT_NAMES: CharDataTable<&str> = include!("../tables/jamo.rsv");
}