xi_unicode/
lib.rs

1// Copyright 2016 The xi-editor Authors.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Unicode utilities useful for text editing, including a line breaking iterator.
16#![no_std]
17
18extern crate alloc;
19
20mod tables;
21
22use core::cmp::Ordering;
23
24use crate::tables::*;
25
26/// The Unicode line breaking property of the given code point.
27///
28/// This is given as a numeric value which matches the ULineBreak
29/// enum value from ICU.
30pub fn linebreak_property(cp: char) -> u8 {
31    let cp = cp as usize;
32    if cp < 0x800 {
33        LINEBREAK_1_2[cp]
34    } else if cp < 0x10000 {
35        let child = LINEBREAK_3_ROOT[cp >> 6];
36        LINEBREAK_3_CHILD[(child as usize) * 0x40 + (cp & 0x3f)]
37    } else {
38        let mid = LINEBREAK_4_ROOT[cp >> 12];
39        let leaf = LINEBREAK_4_MID[(mid as usize) * 0x40 + ((cp >> 6) & 0x3f)];
40        LINEBREAK_4_LEAVES[(leaf as usize) * 0x40 + (cp & 0x3f)]
41    }
42}
43
44/// The Unicode line breaking property of the given code point.
45///
46/// Look up the line breaking property for the first code point in the
47/// string. Return the property as a numeric value, and also the utf-8
48/// length of the codepoint, for convenience.
49pub fn linebreak_property_str(s: &str, ix: usize) -> (u8, usize) {
50    let b = s.as_bytes()[ix];
51    if b < 0x80 {
52        (LINEBREAK_1_2[b as usize], 1)
53    } else if b < 0xe0 {
54        // 2 byte UTF-8 sequences
55        let cp = ((b as usize) << 6) + (s.as_bytes()[ix + 1] as usize) - 0x3080;
56        (LINEBREAK_1_2[cp], 2)
57    } else if b < 0xf0 {
58        // 3 byte UTF-8 sequences
59        let mid_ix = ((b as usize) << 6) + (s.as_bytes()[ix + 1] as usize) - 0x3880;
60        let mid = LINEBREAK_3_ROOT[mid_ix];
61        (LINEBREAK_3_CHILD[(mid as usize) * 0x40 + (s.as_bytes()[ix + 2] as usize) - 0x80], 3)
62    } else {
63        // 4 byte UTF-8 sequences
64        let mid_ix = ((b as usize) << 6) + (s.as_bytes()[ix + 1] as usize) - 0x3c80;
65        let mid = LINEBREAK_4_ROOT[mid_ix];
66        let leaf_ix = ((mid as usize) << 6) + (s.as_bytes()[ix + 2] as usize) - 0x80;
67        let leaf = LINEBREAK_4_MID[leaf_ix];
68        (LINEBREAK_4_LEAVES[(leaf as usize) * 0x40 + (s.as_bytes()[ix + 3] as usize) - 0x80], 4)
69    }
70}
71
/// An iterator which produces line breaks according to the UAX 14 line
/// breaking algorithm. For each break, return a tuple consisting of the offset
/// within the source string and a bool indicating whether it's a hard break.
///
/// There is never a break at the beginning of the string (thus, the empty string
/// produces no breaks). For non-empty strings, there is always a break at the
/// end. It is indicated as a hard break when the string is terminated with a
/// newline or other Unicode explicit line-end character.
#[derive(Copy, Clone, Debug)]
pub struct LineBreakIterator<'a> {
    /// The string being scanned for break opportunities.
    s: &'a str,
    /// Byte offset of the next code point to examine. It is moved one past
    /// `s.len()` once the end-of-text break has been reported.
    ix: usize,
    /// Current row of the line break state machine.
    state: u8,
}
86
87impl<'a> Iterator for LineBreakIterator<'a> {
88    type Item = (usize, bool);
89
90    // return break pos and whether it's a hard break
91    fn next(&mut self) -> Option<(usize, bool)> {
92        loop {
93            match self.ix.cmp(&self.s.len()) {
94                Ordering::Greater => {
95                    return None;
96                }
97                Ordering::Equal => {
98                    // LB3, break at EOT
99                    self.ix += 1;
100                    let i = (self.state as usize) * N_LINEBREAK_CATEGORIES;
101                    let new = LINEBREAK_STATE_MACHINE[i];
102                    return Some((self.s.len(), new >= 0xc0));
103                }
104                Ordering::Less => {
105                    let (lb, len) = linebreak_property_str(self.s, self.ix);
106                    let i = (self.state as usize) * N_LINEBREAK_CATEGORIES + (lb as usize);
107                    let new = LINEBREAK_STATE_MACHINE[i];
108                    //println!("{:?}[{}], state {} + lb {} -> {}", &self.s[self.ix..], self.ix, self.state, lb, new);
109                    let result = self.ix;
110                    self.ix += len;
111                    if (new as i8) < 0 {
112                        // break found
113                        self.state = new & 0x3f;
114                        return Some((result, new >= 0xc0));
115                    } else {
116                        self.state = new;
117                    }
118                }
119            }
120        }
121    }
122}
123
124impl<'a> LineBreakIterator<'a> {
125    /// Create a new iterator for the given string slice.
126    pub fn new(s: &str) -> LineBreakIterator {
127        if s.is_empty() {
128            LineBreakIterator {
129                s,
130                ix: 1, // LB2, don't break; sot takes priority for empty string
131                state: 0,
132            }
133        } else {
134            let (lb, len) = linebreak_property_str(s, 0);
135            LineBreakIterator { s, ix: len, state: lb }
136        }
137    }
138}
139
/// A struct useful for computing line breaks in a rope or other non-contiguous
/// string representation. This is a trickier problem than iterating in a string
/// for a few reasons, the trickiest of which is that in the general case,
/// line breaks require an indeterminate amount of look-behind.
///
/// This is something of an "expert-level" interface, and should only be used if
/// the caller is prepared to respect all the invariants. Otherwise, you might
/// get inconsistent breaks depending on start position and leaf boundaries.
#[derive(Copy, Clone, Debug)]
pub struct LineBreakLeafIter {
    /// Byte offset, within the current leaf, of the next code point to examine.
    ix: usize,
    /// Current row of the line break state machine; carried across leaves.
    state: u8,
}
153
154impl Default for LineBreakLeafIter {
155    // A default value. No guarantees on what happens when next() is called
156    // on this. Intended to be useful for empty ropes.
157    fn default() -> LineBreakLeafIter {
158        LineBreakLeafIter { ix: 0, state: 0 }
159    }
160}
161
162impl LineBreakLeafIter {
163    /// Create a new line break iterator suitable for leaves in a rope.
164    /// Precondition: ix is at a code point boundary within s.
165    pub fn new(s: &str, ix: usize) -> LineBreakLeafIter {
166        let (lb, len) = if ix == s.len() { (0, 0) } else { linebreak_property_str(s, ix) };
167        LineBreakLeafIter { ix: ix + len, state: lb }
168    }
169
170    /// Return break pos and whether it's a hard break. Note: hard break
171    /// indication may go away, this may not be useful in actual application.
172    /// If end of leaf is found, return leaf's len. This does not indicate
173    /// a break, as that requires at least one more codepoint of context.
174    /// If it is a break, then subsequent next call will return an offset of 0.
175    /// EOT is always a break, so in the EOT case it's up to the caller
176    /// to figure that out.
177    ///
178    /// For consistent results, always supply same `s` until end of leaf is
179    /// reached (and initially this should be the same as in the `new` call).
180    pub fn next(&mut self, s: &str) -> (usize, bool) {
181        loop {
182            if self.ix == s.len() {
183                self.ix = 0; // in preparation for next leaf
184                return (s.len(), false);
185            }
186            let (lb, len) = linebreak_property_str(s, self.ix);
187            let i = (self.state as usize) * N_LINEBREAK_CATEGORIES + (lb as usize);
188            let new = LINEBREAK_STATE_MACHINE[i];
189            //println!("\"{}\"[{}], state {} + lb {} -> {}", &s[self.ix..], self.ix, self.state, lb, new);
190            let result = self.ix;
191            self.ix += len;
192            if (new as i8) < 0 {
193                // break found
194                self.state = new & 0x3f;
195                return (result, new >= 0xc0);
196            } else {
197                self.state = new;
198            }
199        }
200    }
201}
202
/// Reports whether `c` occurs in `list[start..=end]`, where `list` is sorted
/// in ascending order.
///
/// `start` and `end` are inclusive indices into `list`. When the requested
/// window does not lie inside `list` (which includes every window of an
/// empty `list`, and any window with `start > end + 1`), nothing can match
/// and the result is `false`.
fn is_in_asc_list<T: core::cmp::PartialOrd>(c: T, list: &[T], start: usize, end: usize) -> bool {
    let window = match list.get(start..=end) {
        Some(window) => window,
        None => return false,
    };
    // `T` is only `PartialOrd`, so spell the three-way comparison out by hand
    // instead of relying on `Ord`.
    window
        .binary_search_by(|probe| {
            if *probe < c {
                Ordering::Less
            } else if *probe == c {
                Ordering::Equal
            } else {
                Ordering::Greater
            }
        })
        .is_ok()
}
219
/// Returns `true` if `c` is a variation selector, i.e. lies in
/// U+FE00..=U+FE0F or in U+E0100..=U+E01EF.
pub fn is_variation_selector(c: char) -> bool {
    matches!(c, '\u{FE00}'..='\u{FE0F}' | '\u{E0100}'..='\u{E01EF}')
}
223
/// Emoji-related classification predicates.
///
/// This file implements the trait for `char`; the code points and tables
/// quoted below are the ones used by that implementation.
pub trait EmojiExt {
    /// Whether this is a regional indicator symbol (U+1F1E6..=U+1F1FF).
    fn is_regional_indicator_symbol(self) -> bool;
    /// Whether this is an emoji modifier (U+1F3FB..=U+1F3FF).
    fn is_emoji_modifier(self) -> bool;
    /// Whether this is the combining enclosing keycap (U+20E3).
    fn is_emoji_combining_enclosing_keycap(self) -> bool;
    /// Whether this is an emoji, i.e. listed in `EMOJI_TABLE`.
    fn is_emoji(self) -> bool;
    /// Whether this is an emoji modifier base, i.e. listed in
    /// `EMOJI_MODIFIER_BASE_TABLE`.
    fn is_emoji_modifier_base(self) -> bool;
    /// Whether this is a tag character in U+E0020..=U+E007E.
    fn is_tag_spec_char(self) -> bool;
    /// Whether this is the cancel tag (U+E007F).
    fn is_emoji_cancel_tag(self) -> bool;
    /// Whether this is the zero width joiner (U+200D).
    fn is_zwj(self) -> bool;
}
234
235impl EmojiExt for char {
236    fn is_regional_indicator_symbol(self) -> bool {
237        self >= '\u{1F1E6}' && self <= '\u{1F1FF}'
238    }
239    fn is_emoji_modifier(self) -> bool {
240        self >= '\u{1F3FB}' && self <= '\u{1F3FF}'
241    }
242    fn is_emoji_combining_enclosing_keycap(self) -> bool {
243        self == '\u{20E3}'
244    }
245    fn is_emoji(self) -> bool {
246        is_in_asc_list(self, &EMOJI_TABLE, 0, EMOJI_TABLE.len() - 1)
247    }
248    fn is_emoji_modifier_base(self) -> bool {
249        is_in_asc_list(self, &EMOJI_MODIFIER_BASE_TABLE, 0, EMOJI_MODIFIER_BASE_TABLE.len() - 1)
250    }
251    fn is_tag_spec_char(self) -> bool {
252        '\u{E0020}' <= self && self <= '\u{E007E}'
253    }
254    fn is_emoji_cancel_tag(self) -> bool {
255        self == '\u{E007F}'
256    }
257    fn is_zwj(self) -> bool {
258        self == '\u{200D}'
259    }
260}
261
/// Returns `true` if `c` is an ASCII digit, `#` or `*`.
pub fn is_keycap_base(c: char) -> bool {
    matches!(c, '0'..='9' | '#' | '*')
}
265
266#[cfg(test)]
267mod tests {
268    use crate::linebreak_property;
269    use crate::linebreak_property_str;
270    use crate::LineBreakIterator;
271    use alloc::vec;
272    use alloc::vec::*;
273
    // Spot-checks `linebreak_property` on sampled code points, listed in
    // ascending order from U+0001 up to U+1064B4. The expected numbers are
    // the numeric line break property values the function documents.
    #[test]
    fn linebreak_prop() {
        // Code points below U+0800.
        assert_eq!(9, linebreak_property('\u{0001}'));
        assert_eq!(9, linebreak_property('\u{0003}'));
        assert_eq!(9, linebreak_property('\u{0004}'));
        assert_eq!(9, linebreak_property('\u{0008}'));
        assert_eq!(10, linebreak_property('\u{000D}'));
        assert_eq!(9, linebreak_property('\u{0010}'));
        assert_eq!(9, linebreak_property('\u{0015}'));
        assert_eq!(9, linebreak_property('\u{0018}'));
        assert_eq!(22, linebreak_property('\u{002B}'));
        assert_eq!(16, linebreak_property('\u{002C}'));
        assert_eq!(13, linebreak_property('\u{002D}'));
        assert_eq!(27, linebreak_property('\u{002F}'));
        assert_eq!(19, linebreak_property('\u{0030}'));
        assert_eq!(19, linebreak_property('\u{0038}'));
        assert_eq!(19, linebreak_property('\u{0039}'));
        assert_eq!(16, linebreak_property('\u{003B}'));
        assert_eq!(2, linebreak_property('\u{003E}'));
        assert_eq!(11, linebreak_property('\u{003F}'));
        assert_eq!(2, linebreak_property('\u{0040}'));
        assert_eq!(2, linebreak_property('\u{0055}'));
        assert_eq!(2, linebreak_property('\u{0056}'));
        assert_eq!(2, linebreak_property('\u{0058}'));
        assert_eq!(2, linebreak_property('\u{0059}'));
        assert_eq!(20, linebreak_property('\u{005B}'));
        assert_eq!(22, linebreak_property('\u{005C}'));
        assert_eq!(2, linebreak_property('\u{0062}'));
        assert_eq!(2, linebreak_property('\u{006C}'));
        assert_eq!(2, linebreak_property('\u{006D}'));
        assert_eq!(2, linebreak_property('\u{0071}'));
        assert_eq!(2, linebreak_property('\u{0074}'));
        assert_eq!(2, linebreak_property('\u{0075}'));
        assert_eq!(4, linebreak_property('\u{007C}'));
        assert_eq!(9, linebreak_property('\u{009D}'));
        assert_eq!(2, linebreak_property('\u{00D5}'));
        assert_eq!(2, linebreak_property('\u{00D8}'));
        assert_eq!(2, linebreak_property('\u{00E9}'));
        assert_eq!(2, linebreak_property('\u{0120}'));
        assert_eq!(2, linebreak_property('\u{0121}'));
        assert_eq!(2, linebreak_property('\u{015C}'));
        assert_eq!(2, linebreak_property('\u{016C}'));
        assert_eq!(2, linebreak_property('\u{017E}'));
        assert_eq!(2, linebreak_property('\u{01B0}'));
        assert_eq!(2, linebreak_property('\u{0223}'));
        assert_eq!(2, linebreak_property('\u{028D}'));
        assert_eq!(2, linebreak_property('\u{02BE}'));
        assert_eq!(1, linebreak_property('\u{02D0}'));
        assert_eq!(9, linebreak_property('\u{0337}'));
        assert_eq!(0, linebreak_property('\u{0380}'));
        assert_eq!(2, linebreak_property('\u{04AA}'));
        assert_eq!(2, linebreak_property('\u{04CE}'));
        assert_eq!(2, linebreak_property('\u{04F1}'));
        assert_eq!(2, linebreak_property('\u{0567}'));
        assert_eq!(2, linebreak_property('\u{0580}'));
        assert_eq!(9, linebreak_property('\u{05A1}'));
        assert_eq!(9, linebreak_property('\u{05B0}'));
        assert_eq!(38, linebreak_property('\u{05D4}'));
        assert_eq!(2, linebreak_property('\u{0643}'));
        assert_eq!(9, linebreak_property('\u{065D}'));
        assert_eq!(19, linebreak_property('\u{066C}'));
        assert_eq!(2, linebreak_property('\u{066E}'));
        assert_eq!(2, linebreak_property('\u{068A}'));
        assert_eq!(2, linebreak_property('\u{0776}'));
        assert_eq!(2, linebreak_property('\u{07A2}'));
        assert_eq!(0, linebreak_property('\u{07BB}'));
        // Code points from U+0800 up to U+FFFF.
        assert_eq!(19, linebreak_property('\u{1091}'));
        assert_eq!(19, linebreak_property('\u{1B53}'));
        assert_eq!(2, linebreak_property('\u{1EEA}'));
        assert_eq!(42, linebreak_property('\u{200D}'));
        assert_eq!(14, linebreak_property('\u{30C7}'));
        assert_eq!(14, linebreak_property('\u{318B}'));
        assert_eq!(14, linebreak_property('\u{3488}'));
        assert_eq!(14, linebreak_property('\u{3B6E}'));
        assert_eq!(14, linebreak_property('\u{475B}'));
        assert_eq!(14, linebreak_property('\u{490B}'));
        assert_eq!(14, linebreak_property('\u{5080}'));
        assert_eq!(14, linebreak_property('\u{7846}'));
        assert_eq!(14, linebreak_property('\u{7F3A}'));
        assert_eq!(14, linebreak_property('\u{8B51}'));
        assert_eq!(14, linebreak_property('\u{920F}'));
        assert_eq!(14, linebreak_property('\u{9731}'));
        assert_eq!(14, linebreak_property('\u{9F3A}'));
        assert_eq!(2, linebreak_property('\u{ABD2}'));
        assert_eq!(19, linebreak_property('\u{ABF6}'));
        assert_eq!(32, linebreak_property('\u{B2EA}'));
        assert_eq!(32, linebreak_property('\u{B3F5}'));
        assert_eq!(32, linebreak_property('\u{B796}'));
        assert_eq!(32, linebreak_property('\u{B9E8}'));
        assert_eq!(32, linebreak_property('\u{BD42}'));
        assert_eq!(32, linebreak_property('\u{C714}'));
        assert_eq!(32, linebreak_property('\u{CC25}'));
        assert_eq!(0, linebreak_property('\u{EA59}'));
        assert_eq!(0, linebreak_property('\u{F6C8}'));
        assert_eq!(0, linebreak_property('\u{F83C}'));
        assert_eq!(2, linebreak_property('\u{FC6A}'));
        // Code points from U+10000 upwards.
        assert_eq!(0, linebreak_property('\u{15199}'));
        assert_eq!(0, linebreak_property('\u{163AC}'));
        assert_eq!(0, linebreak_property('\u{1EF65}'));
        assert_eq!(14, linebreak_property('\u{235A7}'));
        assert_eq!(14, linebreak_property('\u{2E483}'));
        assert_eq!(14, linebreak_property('\u{2FFFA}'));
        assert_eq!(14, linebreak_property('\u{3613E}'));
        assert_eq!(14, linebreak_property('\u{3799A}'));
        assert_eq!(0, linebreak_property('\u{4DD35}'));
        assert_eq!(0, linebreak_property('\u{5858D}'));
        assert_eq!(0, linebreak_property('\u{585C2}'));
        assert_eq!(0, linebreak_property('\u{6CF38}'));
        assert_eq!(0, linebreak_property('\u{7573F}'));
        assert_eq!(0, linebreak_property('\u{7AABF}'));
        assert_eq!(0, linebreak_property('\u{87762}'));
        assert_eq!(0, linebreak_property('\u{90297}'));
        assert_eq!(0, linebreak_property('\u{9D037}'));
        assert_eq!(0, linebreak_property('\u{A0E65}'));
        assert_eq!(0, linebreak_property('\u{B8E7F}'));
        assert_eq!(0, linebreak_property('\u{BBEA5}'));
        assert_eq!(0, linebreak_property('\u{BE28C}'));
        assert_eq!(0, linebreak_property('\u{C1B57}'));
        assert_eq!(0, linebreak_property('\u{C2011}'));
        assert_eq!(0, linebreak_property('\u{CBF32}'));
        assert_eq!(0, linebreak_property('\u{DD9BD}'));
        assert_eq!(0, linebreak_property('\u{DF4A6}'));
        assert_eq!(0, linebreak_property('\u{E923D}'));
        assert_eq!(0, linebreak_property('\u{E94DB}'));
        assert_eq!(0, linebreak_property('\u{F90AB}'));
        assert_eq!(0, linebreak_property('\u{100EF6}'));
        assert_eq!(0, linebreak_property('\u{106487}'));
        assert_eq!(0, linebreak_property('\u{1064B4}'));
    }
403
    // Spot-checks `linebreak_property_str` on single-code-point strings read
    // at offset 0. Each expected tuple is (line break property value, UTF-8
    // length in bytes); the cases are grouped by that length.
    #[test]
    fn linebreak_prop_str() {
        // One-byte sequences.
        assert_eq!((9, 1), linebreak_property_str(&"\u{0004}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{0005}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{0008}", 0));
        assert_eq!((4, 1), linebreak_property_str(&"\u{0009}", 0));
        assert_eq!((17, 1), linebreak_property_str(&"\u{000A}", 0));
        assert_eq!((6, 1), linebreak_property_str(&"\u{000C}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{000E}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{0010}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{0013}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{0017}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{001C}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{001D}", 0));
        assert_eq!((9, 1), linebreak_property_str(&"\u{001F}", 0));
        assert_eq!((11, 1), linebreak_property_str(&"\u{0021}", 0));
        assert_eq!((23, 1), linebreak_property_str(&"\u{0027}", 0));
        assert_eq!((22, 1), linebreak_property_str(&"\u{002B}", 0));
        assert_eq!((13, 1), linebreak_property_str(&"\u{002D}", 0));
        assert_eq!((27, 1), linebreak_property_str(&"\u{002F}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{003C}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0043}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{004B}", 0));
        assert_eq!((36, 1), linebreak_property_str(&"\u{005D}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0060}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0065}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0066}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0068}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0069}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{006C}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{006D}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0077}", 0));
        assert_eq!((2, 1), linebreak_property_str(&"\u{0079}", 0));
        assert_eq!((4, 1), linebreak_property_str(&"\u{007C}", 0));
        // Two-byte sequences.
        assert_eq!((9, 2), linebreak_property_str(&"\u{008D}", 0));
        assert_eq!((1, 2), linebreak_property_str(&"\u{00D7}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{015C}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{01B5}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{0216}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{0234}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{026E}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{027C}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{02BB}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{0313}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{0343}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{034A}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{0358}", 0));
        assert_eq!((0, 2), linebreak_property_str(&"\u{0378}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{038C}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{03A4}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{03AC}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{041F}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{049A}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{04B4}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{04C6}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{0535}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{05B1}", 0));
        assert_eq!((0, 2), linebreak_property_str(&"\u{05FF}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{065D}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{067E}", 0));
        assert_eq!((19, 2), linebreak_property_str(&"\u{06F5}", 0));
        assert_eq!((19, 2), linebreak_property_str(&"\u{06F6}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{0735}", 0));
        assert_eq!((2, 2), linebreak_property_str(&"\u{074D}", 0));
        assert_eq!((9, 2), linebreak_property_str(&"\u{07A6}", 0));
        assert_eq!((0, 2), linebreak_property_str(&"\u{07B9}", 0));
        // Three-byte sequences.
        assert_eq!((2, 3), linebreak_property_str(&"\u{131F}", 0));
        assert_eq!((42, 3), linebreak_property_str(&"\u{200D}", 0));
        assert_eq!((2, 3), linebreak_property_str(&"\u{25DA}", 0));
        assert_eq!((2, 3), linebreak_property_str(&"\u{2C01}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{2EE5}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{4207}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{4824}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{491A}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{4C20}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{4D6A}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{50EB}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{521B}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{5979}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{5F9B}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{65AB}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{6B1F}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{7169}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{87CA}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{87FF}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{8A91}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{943A}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{9512}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{9D66}", 0));
        assert_eq!((9, 3), linebreak_property_str(&"\u{A928}", 0));
        assert_eq!((24, 3), linebreak_property_str(&"\u{AA7E}", 0));
        assert_eq!((2, 3), linebreak_property_str(&"\u{AAEA}", 0));
        assert_eq!((0, 3), linebreak_property_str(&"\u{AB66}", 0));
        assert_eq!((32, 3), linebreak_property_str(&"\u{B9FC}", 0));
        assert_eq!((32, 3), linebreak_property_str(&"\u{CD89}", 0));
        assert_eq!((32, 3), linebreak_property_str(&"\u{CDB2}", 0));
        assert_eq!((0, 3), linebreak_property_str(&"\u{F71D}", 0));
        assert_eq!((14, 3), linebreak_property_str(&"\u{F9DF}", 0));
        assert_eq!((2, 3), linebreak_property_str(&"\u{FEC3}", 0));
        // Four-byte sequences.
        assert_eq!((0, 4), linebreak_property_str(&"\u{13CC5}", 0));
        assert_eq!((2, 4), linebreak_property_str(&"\u{1D945}", 0));
        assert_eq!((40, 4), linebreak_property_str(&"\u{1F3C3}", 0));
        assert_eq!((41, 4), linebreak_property_str(&"\u{1F3FB}", 0));
        assert_eq!((14, 4), linebreak_property_str(&"\u{2BDCD}", 0));
        assert_eq!((14, 4), linebreak_property_str(&"\u{3898E}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{45C35}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{4EC30}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{58EE2}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{5E3E8}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{5FB7D}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{6A564}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{6C591}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{6CA82}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{83839}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{88F47}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{91CA0}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{95644}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{AC335}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{AE8BF}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{B282B}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{B4CFC}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{BBED0}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{CCC89}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{D40EB}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{D65F5}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{D8E0B}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{DF93A}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{E4E2C}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{F7935}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{F9DFF}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{1094B7}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{10C782}", 0));
        assert_eq!((0, 4), linebreak_property_str(&"\u{10E4D5}", 0));
    }
538
    // Runs `LineBreakIterator` over short strings and compares the complete
    // list of reported breaks. Each expected entry is (byte offset of the
    // break, hard-break flag); the last entry is always the end of the string.
    // The "LBnn" comments name the UAX #14 rule a case is aimed at.
    #[test]
    fn lb_iter_simple() {
        assert_eq!(
            vec![(6, false), (11, false)],
            LineBreakIterator::new("hello world").collect::<Vec<_>>()
        );

        // LB7, LB18
        assert_eq!(
            vec![(3, false), (4, false)],
            LineBreakIterator::new("a  b").collect::<Vec<_>>()
        );

        // LB5
        assert_eq!(vec![(2, true), (3, false)], LineBreakIterator::new("a\nb").collect::<Vec<_>>());
        assert_eq!(
            vec![(2, true), (4, true)],
            LineBreakIterator::new("\r\n\r\n").collect::<Vec<_>>()
        );

        // LB8a
        assert_eq!(
            vec![(7, false)],
            LineBreakIterator::new("\u{200D}\u{1F3FB}").collect::<Vec<_>>()
        );

        // LB10 combining mark after space
        assert_eq!(
            vec![(2, false), (4, false)],
            LineBreakIterator::new("a \u{301}").collect::<Vec<_>>()
        );

        // LB15
        assert_eq!(vec![(3, false)], LineBreakIterator::new("\" [").collect::<Vec<_>>());

        // LB17
        assert_eq!(
            vec![(2, false), (10, false), (11, false)],
            LineBreakIterator::new("a \u{2014} \u{2014} c").collect::<Vec<_>>()
        );

        // LB18
        assert_eq!(
            vec![(2, false), (6, false), (7, false)],
            LineBreakIterator::new("a \"b\" c").collect::<Vec<_>>()
        );

        // LB21
        assert_eq!(vec![(2, false), (3, false)], LineBreakIterator::new("a-b").collect::<Vec<_>>());

        // LB21a
        assert_eq!(
            vec![(5, false)],
            LineBreakIterator::new("\u{05D0}-\u{05D0}").collect::<Vec<_>>()
        );

        // LB23a
        assert_eq!(vec![(6, false)], LineBreakIterator::new("$\u{1F3FB}%").collect::<Vec<_>>());

        // LB30b
        assert_eq!(
            vec![(8, false)],
            LineBreakIterator::new("\u{1F466}\u{1F3FB}").collect::<Vec<_>>()
        );

        // LB31
        assert_eq!(
            vec![(8, false), (16, false)],
            LineBreakIterator::new("\u{1F1E6}\u{1F1E6}\u{1F1E6}\u{1F1E6}").collect::<Vec<_>>()
        );
    }
610
    #[test]
    // The final break is hard only when there is an explicit separator.
    fn lb_iter_eot() {
        // Trailing space: the end-of-text break is reported as soft.
        assert_eq!(vec![(4, false)], LineBreakIterator::new("abc ").collect::<Vec<_>>());

        // Trailing carriage return: the end-of-text break is reported as hard.
        assert_eq!(vec![(4, true)], LineBreakIterator::new("abc\r").collect::<Vec<_>>());

        // Trailing U+0085 (two bytes in UTF-8, hence offset 5): hard as well.
        assert_eq!(vec![(5, true)], LineBreakIterator::new("abc\u{0085}").collect::<Vec<_>>());
    }
620}