unic_bidi/
prepare.rs

1// Copyright 2015 The Servo Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12//! 3.3.3 Preparations for Implicit Processing
13//!
14//! <https://www.unicode.org/reports/tr9/#Preparations_for_Implicit_Processing>
15
16use std::cmp::max;
17use std::ops::Range;
18
19use unic_ucd_bidi::bidi_class::abbr_names::*;
20use unic_ucd_bidi::BidiClass;
21
22use super::level::Level;
23
24/// A maximal substring of characters with the same embedding level.
25///
26/// Represented as a range of byte indices.
27pub type LevelRun = Range<usize>;
28
29/// Output of `isolating_run_sequences` (steps X9-X10)
30#[derive(Debug, PartialEq)]
31pub struct IsolatingRunSequence {
32    pub runs: Vec<LevelRun>,
33    pub sos: BidiClass, // Start-of-sequence type.
34    pub eos: BidiClass, // End-of-sequence type.
35}
36
37/// Compute the set of isolating run sequences.
38///
39/// An isolating run sequence is a maximal sequence of level runs such that for all level runs
40/// except the last one in the sequence, the last character of the run is an isolate initiator
41/// whose matching PDI is the first character of the next level run in the sequence.
42///
43/// Note: This function does *not* return the sequences in order by their first characters.
44pub fn isolating_run_sequences(
45    para_level: Level,
46    original_classes: &[BidiClass],
47    levels: &[Level],
48) -> Vec<IsolatingRunSequence> {
49    let runs = level_runs(levels, original_classes);
50
51    // Compute the set of isolating run sequences.
52    // <https://www.unicode.org/reports/tr9/#BD13>
53    let mut sequences = Vec::with_capacity(runs.len());
54
55    // When we encounter an isolate initiator, we push the current sequence onto the
56    // stack so we can resume it after the matching PDI.
57    let mut stack = vec![Vec::new()];
58
59    for run in runs {
60        assert!(run.len() > 0);
61        assert!(!stack.is_empty());
62
63        let start_class = original_classes[run.start];
64        let end_class = original_classes[run.end - 1];
65
66        let mut sequence = if start_class == PDI && stack.len() > 1 {
67            // Continue a previous sequence interrupted by an isolate.
68            stack.pop().unwrap()
69        } else {
70            // Start a new sequence.
71            Vec::new()
72        };
73
74        sequence.push(run);
75
76        if matches!(end_class, RLI | LRI | FSI) {
77            // Resume this sequence after the isolate.
78            stack.push(sequence);
79        } else {
80            // This sequence is finished.
81            sequences.push(sequence);
82        }
83    }
84    // Pop any remaning sequences off the stack.
85    sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty()));
86
87    // Determine the `sos` and `eos` class for each sequence.
88    // <https://www.unicode.org/reports/tr9/#X10>
89    sequences
90        .into_iter()
91        .map(|sequence: Vec<LevelRun>| {
92            assert!(!sequence.is_empty());
93
94            let start_of_seq = sequence[0].start;
95            let end_of_seq = sequence[sequence.len() - 1].end;
96            let seq_level = levels[start_of_seq];
97
98            #[cfg(test)]
99            for run in sequence.clone() {
100                for idx in run {
101                    if not_removed_by_x9(&original_classes[idx]) {
102                        assert_eq!(seq_level, levels[idx]);
103                    }
104                }
105            }
106
107            // Get the level of the last non-removed char before the runs.
108            let pred_level = match original_classes[..start_of_seq]
109                .iter()
110                .rposition(not_removed_by_x9)
111            {
112                Some(idx) => levels[idx],
113                None => para_level,
114            };
115
116            // Get the level of the next non-removed char after the runs.
117            let succ_level = if matches!(original_classes[end_of_seq - 1], RLI | LRI | FSI) {
118                para_level
119            } else {
120                match original_classes[end_of_seq..]
121                    .iter()
122                    .position(not_removed_by_x9)
123                {
124                    Some(idx) => levels[end_of_seq + idx],
125                    None => para_level,
126                }
127            };
128
129            IsolatingRunSequence {
130                runs: sequence,
131                sos: max(seq_level, pred_level).bidi_class(),
132                eos: max(seq_level, succ_level).bidi_class(),
133            }
134        })
135        .collect()
136}
137
138/// Finds the level runs in a paragraph.
139///
140/// <https://www.unicode.org/reports/tr9/#BD7>
141fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec<LevelRun> {
142    assert_eq!(levels.len(), original_classes.len());
143
144    let mut runs = Vec::new();
145    if levels.is_empty() {
146        return runs;
147    }
148
149    let mut current_run_level = levels[0];
150    let mut current_run_start = 0;
151    for i in 1..levels.len() {
152        if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
153            // End the last run and start a new one.
154            runs.push(current_run_start..i);
155            current_run_level = levels[i];
156            current_run_start = i;
157        }
158    }
159    runs.push(current_run_start..levels.len());
160
161    runs
162}
163
164/// Should this character be ignored in steps after X9?
165///
166/// <https://www.unicode.org/reports/tr9/#X9>
167pub fn removed_by_x9(class: BidiClass) -> bool {
168    matches!(class, RLE | LRE | RLO | LRO | PDF | BN)
169}
170
171// For use as a predicate for `position` / `rposition`
172pub fn not_removed_by_x9(class: &BidiClass) -> bool {
173    !removed_by_x9(*class)
174}
175
176#[cfg(test)]
177mod tests {
178    use super::*;
179
180    #[test]
181    fn test_level_runs() {
182        assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]);
183        assert_eq!(
184            level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]),
185            &[0..3, 3..5, 5..6, 6..8]
186        );
187    }
188
189    // From <https://www.unicode.org/reports/tr9/#BD13>
190    #[cfg_attr(rustfmt, rustfmt_skip)]
191    #[test]
192    fn test_isolating_run_sequences() {
193
194        // == Example 1 ==
195        // text1·RLE·text2·PDF·RLE·text3·PDF·text4
196        // index        0    1  2    3    4  5    6  7
197        let classes = &[L, RLE, L, PDF, RLE, L, PDF, L];
198        let levels =  &[0,   1, 1,   1,   1, 1,   1, 0];
199        let para_level = Level::ltr();
200        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
201        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
202        assert_eq!(
203            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
204            vec![vec![0..2], vec![2..7], vec![7..8]]
205        );
206
207        // == Example 2 ==
208        // text1·RLI·text2·PDI·RLI·text3·PDI·text4
209        // index        0    1  2    3    4  5    6  7
210        let classes = &[L, RLI, L, PDI, RLI, L, PDI, L];
211        let levels =  &[0,   0, 1,   0,   0, 1,   0, 0];
212        let para_level = Level::ltr();
213        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
214        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
215        assert_eq!(
216            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
217            vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]]
218        );
219
220        // == Example 3 ==
221        // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7
222        // index        0    1  2    3  4    5  6    7  8    9  10  11  12
223        let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI,  L];
224        let levels =  &[0,   0, 1,   1, 2,   3, 3,   3, 2,   1, 1,   0,  0];
225        let para_level = Level::ltr();
226        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
227        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
228        assert_eq!(
229            sequences.iter().map(|s| s.runs.clone()).collect::<Vec<_>>(),
230            vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]]
231        );
232    }
233
234    // From <https://www.unicode.org/reports/tr9/#X10>
235    #[cfg_attr(rustfmt, rustfmt_skip)]
236    #[test]
237    fn test_isolating_run_sequences_sos_and_eos() {
238
239        // == Example 1 ==
240        // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6
241        // index        0    1  2    3  4    5  6    7    8  9   10  11
242        let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF,  L];
243        let levels =  &[0,   1, 1,   2, 2,   2, 1,   1,   1, 1,   1,  0];
244        let para_level = Level::ltr();
245        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
246        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
247
248        // text1
249        assert_eq!(
250            &sequences[0],
251            &IsolatingRunSequence {
252                runs: vec![0..2],
253                sos: L,
254                eos: R,
255            }
256        );
257
258        // text2
259        assert_eq!(
260            &sequences[1],
261            &IsolatingRunSequence {
262                runs: vec![2..4],
263                sos: R,
264                eos: L,
265            }
266        );
267
268        // text3
269        assert_eq!(
270            &sequences[2],
271            &IsolatingRunSequence {
272                runs: vec![4..6],
273                sos: L,
274                eos: L,
275            }
276        );
277
278        // text4 text5
279        assert_eq!(
280            &sequences[3],
281            &IsolatingRunSequence {
282                runs: vec![6..11],
283                sos: L,
284                eos: R,
285            }
286        );
287
288        // text6
289        assert_eq!(
290            &sequences[4],
291            &IsolatingRunSequence {
292                runs: vec![11..12],
293                sos: R,
294                eos: L,
295            }
296        );
297
298        // == Example 2 ==
299        // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6
300        // index        0    1  2    3  4    5  6    7    8  9   10  11
301        let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI,  L];
302        let levels =  &[0,   0, 1,   1, 2,   1, 1,   0,   0, 1,   0,  0];
303        let para_level = Level::ltr();
304        let mut sequences = isolating_run_sequences(para_level, classes, &Level::vec(levels));
305        sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone()));
306
307        // text1·RLI·PDI·RLI·PDI·text6
308        assert_eq!(
309            &sequences[0],
310            &IsolatingRunSequence {
311                runs: vec![0..2, 7..9, 10..12],
312                sos: L,
313                eos: L,
314            }
315        );
316
317        // text2·LRI·PDI·text4
318        assert_eq!(
319            &sequences[1],
320            &IsolatingRunSequence {
321                runs: vec![2..4, 5..7],
322                sos: R,
323                eos: R,
324            }
325        );
326
327        // text3
328        assert_eq!(
329            &sequences[2],
330            &IsolatingRunSequence {
331                runs: vec![4..5],
332                sos: L,
333                eos: L,
334            }
335        );
336
337        // text5
338        assert_eq!(
339            &sequences[3],
340            &IsolatingRunSequence {
341                runs: vec![9..10],
342                sos: R,
343                eos: R,
344            }
345        );
346    }
347
348    #[test]
349    fn test_removed_by_x9() {
350        let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN];
351        let not_classes = &[L, RLI, AL, LRI, PDI];
352        for x in rem_classes {
353            assert_eq!(removed_by_x9(*x), true);
354        }
355        for x in not_classes {
356            assert_eq!(removed_by_x9(*x), false);
357        }
358    }
359
360    #[test]
361    fn test_not_removed_by_x9() {
362        let non_x9_classes = &[
363            L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI,
364        ];
365        for x in non_x9_classes {
366            assert_eq!(not_removed_by_x9(&x), true);
367        }
368    }
369}