1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
use crate::Error;
use std::path::Path;
use std::str::FromStr;
use ucd_parse::UcdFile;

/// A single row in the
/// [`HangulSyllableType`](http://www.unicode.org/reports/tr44/#HangulSyllableType.txt)
/// file.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct HangulSyllableType {
    /// The property parsed from this row of `HangulSyllableType.txt`. Its
    /// `codepoints` field holds the code points the row applies to.
    pub prop: ucd_parse::Property,
}

impl ucd_parse::UcdFile for HangulSyllableType {
    /// Returns the relative path under which `ucd_parse` looks up this
    /// file: `HangulSyllableType.txt`.
    fn relative_file_path() -> &'static Path {
        let file_name: &'static str = "HangulSyllableType.txt";
        Path::new(file_name)
    }
}

impl ucd_parse::UcdFileByCodepoint for HangulSyllableType {
    /// Returns an iterator over the code points stored in this row's
    /// property.
    fn codepoints(&self) -> ucd_parse::CodepointIter {
        let covered = self.prop.codepoints;
        covered.into_iter()
    }
}

impl FromStr for HangulSyllableType {
    type Err = ucd_parse::Error;

    /// Parses one line of the file as a `ucd_parse::Property` and wraps it
    /// in a `HangulSyllableType`. Any parse error is returned unchanged.
    fn from_str(line: &str) -> Result<HangulSyllableType, ucd_parse::Error> {
        ucd_parse::Property::from_str(line).map(|prop| HangulSyllableType { prop })
    }
}

/// A single row in the `DerivedJoiningType` file.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DerivedJoiningType {
    /// The property parsed from this row of `DerivedJoiningType.txt`. Its
    /// `codepoints` field holds the code points the row applies to.
    pub prop: ucd_parse::Property,
}

impl ucd_parse::UcdFile for DerivedJoiningType {
    /// Returns the relative path under which `ucd_parse` looks up this
    /// file: `extracted/DerivedJoiningType.txt`.
    fn relative_file_path() -> &'static Path {
        let file_name: &'static str = "extracted/DerivedJoiningType.txt";
        Path::new(file_name)
    }
}

impl ucd_parse::UcdFileByCodepoint for DerivedJoiningType {
    /// Returns an iterator over the code points stored in this row's
    /// property.
    fn codepoints(&self) -> ucd_parse::CodepointIter {
        let covered = self.prop.codepoints;
        covered.into_iter()
    }
}

impl FromStr for DerivedJoiningType {
    type Err = ucd_parse::Error;

    /// Parses one line of the file as a `ucd_parse::Property` and wraps it
    /// in a `DerivedJoiningType`. Any parse error is returned unchanged.
    fn from_str(line: &str) -> Result<DerivedJoiningType, ucd_parse::Error> {
        ucd_parse::Property::from_str(line).map(|prop| DerivedJoiningType { prop })
    }
}

/// Extension of the `UnicodeData` `struct` provided by the
/// [`ucd_parse`](https://docs.rs/ucd-parse) crate. Unlike the
/// original one, a value of this `struct` does not necessarily represent
/// a single line in the
/// [`UnicodeData`](https://www.unicode.org/reports/tr44/#UnicodeData.txt)
/// file: it may be the result of merging several lines so that it
/// describes a whole range of Unicode code points. Note that this file,
/// unlike others in the Unicode data files, represents a range as a start
/// line followed by an end line in order not to break compatibility with
/// existing parsers.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct UnicodeData {
    /// The code point, or range of code points, described by this entry.
    pub codepoints: ucd_parse::Codepoints,
    /// The name of this code point.
    pub name: String,
    /// The "general category" of this code point.
    pub general_category: String,
    /// The class of this code point used in the Canonical Ordering Algorithm.
    ///
    /// Note that some classes map to a particular symbol. See
    /// [`UAX44`, Table 15](https://www.unicode.org/reports/tr44/#Canonical_Combining_Class_Values).
    pub canonical_combining_class: u8,
    /// The bidirectional class of this code point.
    ///
    /// Possible values are listed in
    /// [`UAX44`, Table 13](https://www.unicode.org/reports/tr44/#Bidi_Class_Values).
    pub bidi_class: String,
    /// The decomposition mapping for this code point. This includes its
    /// formatting tag (if present).
    pub decomposition: ucd_parse::UnicodeDataDecomposition,
    /// A decimal numeric representation of this code point, if it has the
    /// property `Numeric_Type=Decimal`.
    pub numeric_type_decimal: Option<u8>,
    /// A decimal numeric representation of this code point, if it has the
    /// property `Numeric_Type=Digit`. Note that while this field is still
    /// populated for existing code points, no new code points will have this
    /// field populated.
    pub numeric_type_digit: Option<u8>,
    /// A decimal or rational numeric representation of this code point, if it
    /// has the property `Numeric_Type=Numeric`.
    pub numeric_type_numeric: Option<ucd_parse::UnicodeDataNumeric>,
    /// A Boolean indicating whether this code point is "mirrored" in
    /// bidirectional text.
    pub bidi_mirrored: bool,
    /// The "old" Unicode 1.0 or ISO 6429 name of this code point. Note that
    /// this field is empty unless it is significantly different from
    /// the `name` field.
    pub unicode1_name: String,
    /// The ISO 10646 comment field. This field no longer contains any non-NULL
    /// values.
    pub iso_comment: String,
    /// This code point's simple uppercase mapping, if it exists.
    pub simple_uppercase_mapping: Option<ucd_parse::Codepoint>,
    /// This code point's simple lowercase mapping, if it exists.
    pub simple_lowercase_mapping: Option<ucd_parse::Codepoint>,
    /// This code point's simple title case mapping, if it exists.
    pub simple_titlecase_mapping: Option<ucd_parse::Codepoint>,
}

impl UnicodeData {
    /// Parses the `UnicodeData` file found under `ucd_dir` into a sequence
    /// of entries, merging every range-start / range-end pair of rows into a
    /// single entry.
    ///
    /// A row that is neither a range start nor a range end yields an entry
    /// whose `codepoints` is `ucd_parse::Codepoints::Single`. A range-start
    /// row immediately followed by a range-end row yields one entry whose
    /// `codepoints` is `ucd_parse::Codepoints::Range`; all other fields of
    /// that entry are taken from the range-end row.
    ///
    /// # Errors
    ///
    /// Returns an error if `ucd_parse` fails to parse the file, or if the
    /// range markers are inconsistent:
    ///
    /// * a range start is not immediately followed by a range end,
    /// * a range end appears without a preceding range start,
    /// * the start of a range is greater than its end,
    /// * a row is flagged as both the end of one range and the start of
    ///   another,
    /// * the file ends while a range is still open.
    pub fn parse(ucd_dir: &Path) -> Result<Vec<UnicodeData>, Error> {
        let raws: Vec<ucd_parse::UnicodeData> = ucd_parse::parse(ucd_dir)?;

        // The file path is only needed for error messages, so build it on
        // demand. `Path::display` is used rather than `to_str().unwrap()` so
        // that a non-UTF-8 directory name cannot cause a panic.
        let file = || ucd_parse::UnicodeData::file_path(ucd_dir);

        let mut xs = Vec::with_capacity(raws.len());
        // First code point of a range whose start row has been read but
        // whose end row has not been seen yet.
        let mut open_start: Option<ucd_parse::Codepoint> = None;

        // Rows are consumed by value so their owned fields can be moved into
        // the output instead of being cloned.
        for udata in raws {
            let is_start = udata.is_range_start();
            let is_end = udata.is_range_end();

            // Validate the row against the currently open range (if any) and
            // work out the range it closes.
            let closed = match open_start {
                Some(start) => {
                    if !is_end {
                        return err!(
                            "Expected end range after codepoint {:#06x}. Current codepoint {:#06x}. File: {}",
                            start.value(),
                            udata.codepoint.value(),
                            file().display()
                        );
                    }
                    if start.value() > udata.codepoint.value() {
                        return err!(
                            "Start range {:#06x} is greater than end range {:#06x}. File: {}",
                            start.value(),
                            udata.codepoint.value(),
                            file().display()
                        );
                    }
                    Some(ucd_parse::CodepointRange {
                        start,
                        end: udata.codepoint,
                    })
                }
                None => {
                    if is_end {
                        return err!(
                            "Found end range without starting. Current codepoint {:#06x}. File: {}",
                            udata.codepoint.value(),
                            file().display()
                        );
                    }
                    None
                }
            };

            if is_start {
                // A row cannot close one range and open another at once.
                if let Some(previous) = closed {
                    return err!(
                        "Previous range started with codepoint {:#06x} has not yet finished. File: {}",
                        previous.start.value(),
                        file().display()
                    );
                }
                // A start row produces no entry by itself; the matching end
                // row supplies the data for the whole range.
                open_start = Some(udata.codepoint);
                continue;
            }

            let codepoints = match closed {
                Some(r) => ucd_parse::Codepoints::Range(r),
                None => ucd_parse::Codepoints::Single(udata.codepoint),
            };
            // Any open range has been closed by this row.
            open_start = None;

            xs.push(UnicodeData {
                codepoints,
                name: udata.name,
                general_category: udata.general_category,
                canonical_combining_class: udata.canonical_combining_class,
                bidi_class: udata.bidi_class,
                decomposition: udata.decomposition,
                numeric_type_decimal: udata.numeric_type_decimal,
                numeric_type_digit: udata.numeric_type_digit,
                numeric_type_numeric: udata.numeric_type_numeric,
                bidi_mirrored: udata.bidi_mirrored,
                unicode1_name: udata.unicode1_name,
                iso_comment: udata.iso_comment,
                simple_uppercase_mapping: udata.simple_uppercase_mapping,
                simple_lowercase_mapping: udata.simple_lowercase_mapping,
                simple_titlecase_mapping: udata.simple_titlecase_mapping,
            });
        }

        // A range start on the very last row would otherwise be dropped
        // silently.
        if let Some(start) = open_start {
            return err!(
                "Range started with codepoint {:#06x} has no end. File: {}",
                start.value(),
                file().display()
            );
        }

        Ok(xs)
    }
}