// unicode_normalization_alignments/stream_safe.rs

1use normalize::{
2    hangul_decomposition_length,
3    is_hangul_syllable,
4};
5use lookups::{
6    canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
7    stream_safe_trailing_nonstarters,
8};
9use tables::stream_safe_leading_nonstarters;
10
/// Largest number of consecutive non-starters (counted in NFKD) that may
/// follow a starter before a Combining Grapheme Joiner has to be inserted.
pub(crate) const MAX_NONSTARTERS: usize = 30;
/// U+034F COMBINING GRAPHEME JOINER, the character emitted to break up a run
/// of non-starters that would otherwise exceed `MAX_NONSTARTERS`.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
13
/// UAX15-D4: iterator adaptor that produces text in Stream-Safe form.
///
/// It tracks how many non-starters have been seen since the last starter, as
/// counted in *NFKD*, and emits a Combining Grapheme Joiner (U+034F) whenever
/// passing the next character through would push that count above 30.
#[derive(Clone, Debug)]
pub struct StreamSafe<I> {
    /// Underlying source of characters.
    iter: I,
    /// Length of the current run of non-starters, counted in NFKD.
    nonstarter_count: usize,
    /// Character that was held back because a Combining Grapheme Joiner had
    /// to be emitted in front of it; it is handed out on the following call.
    buffer: Option<char>,
}
22
23impl<I> StreamSafe<I> {
24    pub(crate) fn new(iter: I) -> Self {
25        Self { iter, nonstarter_count: 0, buffer: None }
26    }
27}
28
29impl<I: Iterator<Item=char>> Iterator for StreamSafe<I> {
30    type Item = char;
31
32    #[inline]
33    fn next(&mut self) -> Option<char> {
34        if let Some(ch) = self.buffer.take() {
35            return Some(ch);
36        }
37        let next_ch = match self.iter.next() {
38            None => return None,
39            Some(c) => c,
40        };
41        let d = classify_nonstarters(next_ch);
42        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
43            self.buffer = Some(next_ch);
44            self.nonstarter_count = 0;
45            return Some(COMBINING_GRAPHEME_JOINER);
46        }
47
48        // No starters in the decomposition, so keep accumulating
49        if d.leading_nonstarters == d.decomposition_len {
50            self.nonstarter_count += d.decomposition_len;
51        }
52        // Otherwise, restart the nonstarter counter.
53        else {
54            self.nonstarter_count = d.trailing_nonstarters;
55        }
56        Some(next_ch)
57    }
58}
59
/// Non-starter statistics for a single character, as produced by
/// `classify_nonstarters`.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) struct Decomposition {
    /// Number of non-starters at the beginning of the character's
    /// decomposition.
    pub(crate) leading_nonstarters: usize,
    /// Number of non-starters at the end of the character's decomposition.
    pub(crate) trailing_nonstarters: usize,
    /// Total number of characters in the decomposition.
    pub(crate) decomposition_len: usize,
}
66
67#[inline]
68pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
69    // As usual, fast path for ASCII (which is always a starter)
70    if c <= '\x7f' {
71        return Decomposition {
72            leading_nonstarters: 0,
73            trailing_nonstarters: 0,
74            decomposition_len: 1,
75        }
76    }
77    // Next, special case Hangul, since it's not handled by our tables.
78    if is_hangul_syllable(c) {
79        return Decomposition {
80            leading_nonstarters: 0,
81            trailing_nonstarters: 0,
82            decomposition_len: hangul_decomposition_length(c),
83        };
84    }
85    let decomp = compatibility_fully_decomposed(c)
86        .or_else(|| canonical_fully_decomposed(c));
87    match decomp {
88        Some(decomp) => {
89            Decomposition {
90                leading_nonstarters: stream_safe_leading_nonstarters(c),
91                trailing_nonstarters: stream_safe_trailing_nonstarters(c),
92                decomposition_len: decomp.len(),
93            }
94        },
95        None => {
96            let is_nonstarter = canonical_combining_class(c) != 0;
97            let nonstarter = if is_nonstarter { 1 } else { 0 };
98            Decomposition {
99                leading_nonstarters: nonstarter,
100                trailing_nonstarters: nonstarter,
101                decomposition_len: 1,
102            }
103        }
104    }
105}
106
#[cfg(test)]
mod tests {
    use super::{
        StreamSafe,
        classify_nonstarters,
    };
    use std::char;
    use normalize::decompose_compatible;
    use lookups::canonical_combining_class;

    /// Runs `s` through the `StreamSafe` adaptor and collects the output.
    fn stream_safe(s: &str) -> String {
        StreamSafe::new(s.chars()).collect()
    }

    #[test]
    fn test_simple() {
        // Exactly 30 non-starters after the `a`: still within the limit.
        let technically_okay = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(technically_okay), technically_okay);

        // One more non-starter (U+032E) pushes the run to 31.
        let too_much = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        assert_ne!(stream_safe(too_much), too_much);

        // Pin the exact output: the joiner goes immediately before the 31st
        // non-starter, which occurs only once in the input.
        let expected = too_much.replace("\u{032e}", "\u{034f}\u{032e}");
        assert_eq!(stream_safe(too_much), expected);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Highest character in the `compat_fully_decomp` table is 2FA1D
        for code_point in 0..0x2FA1E {
            // Skip values that are not valid `char`s.
            let ch = match char::from_u32(code_point) {
                Some(c) => c,
                None => continue,
            };
            let classified = classify_nonstarters(ch);
            let mut decomposed = vec![];
            decompose_compatible(ch, |d| decomposed.push(d));

            assert_eq!(decomposed.len(), classified.decomposition_len);

            // Recount the leading and trailing non-starters directly from the
            // decomposition and compare against the classification.
            let num_leading = decomposed
                .iter()
                .take_while(|&&d| canonical_combining_class(d) != 0)
                .count();
            let num_trailing = decomposed
                .iter()
                .rev()
                .take_while(|&&d| canonical_combining_class(d) != 0)
                .count();

            assert_eq!(num_leading, classified.leading_nonstarters);
            assert_eq!(num_trailing, classified.trailing_nonstarters);
        }
    }
}