wezterm_bidi/lib.rs
1use level::MAX_DEPTH;
2use level_stack::{LevelStack, Override};
3use log::trace;
4use std::borrow::Cow;
5use std::ops::Range;
6use wezterm_dynamic::{FromDynamic, ToDynamic};
7
8mod bidi_brackets;
9mod bidi_class;
10mod direction;
11mod level;
12mod level_stack;
13
14use bidi_brackets::BracketType;
15pub use bidi_class::BidiClass;
16pub use direction::Direction;
17pub use level::Level;
18
19/// Placeholder codepoint index that corresponds to NO_LEVEL
20const DELETED: usize = usize::max_value();
21
22#[derive(Debug, Clone, Copy, PartialEq, Eq, FromDynamic, ToDynamic)]
23pub enum ParagraphDirectionHint {
24 LeftToRight,
25 RightToLeft,
26 /// Attempt to auto-detect but fall back to LTR
27 AutoLeftToRight,
28 /// Attempt to auto-detect but fall back to RTL
29 AutoRightToLeft,
30}
31
32impl Default for ParagraphDirectionHint {
33 fn default() -> Self {
34 Self::LeftToRight
35 }
36}
37
38impl ParagraphDirectionHint {
39 /// Returns just the direction portion of the hint, independent
40 /// of the auto-detection state.
41 pub fn direction(self) -> Direction {
42 match self {
43 ParagraphDirectionHint::AutoLeftToRight | ParagraphDirectionHint::LeftToRight => {
44 Direction::LeftToRight
45 }
46 ParagraphDirectionHint::AutoRightToLeft | ParagraphDirectionHint::RightToLeft => {
47 Direction::RightToLeft
48 }
49 }
50 }
51}
52
53#[derive(Debug, Default)]
54pub struct BidiContext {
55 orig_char_types: Vec<BidiClass>,
56 char_types: Vec<BidiClass>,
57 levels: Vec<Level>,
58 base_level: Level,
59 runs: Vec<Run>,
60 reorder_nsm: bool,
61}
62
63/// Represents a formatting character that has been removed by the X9 rule
64pub const NO_LEVEL: i8 = -1;
65
66/// A `BidiRun` represents a run which is a contiguous sequence of codepoints
67/// from the original paragraph that have been resolved to the same embedding
68/// level, and that thus all have the same direction.
69///
70/// The `range` field encapsulates the starting and ending codepoint indices
71/// into the original paragraph.
72///
73/// Note: while the run sequence has the same level throughout, the X9 portion
74/// of the bidi algorithm can logically delete some control characters.
75/// I haven't been able to prove to myself that those control characters
76/// never manifest in the middle of a run, so it is recommended that you use the `indices`
77/// method to skip over any such elements if your shaper doesn't want them.
78#[derive(Debug, Clone, PartialEq, Eq)]
79pub struct BidiRun {
80 /// The direction for this run. Derived from the level.
81 pub direction: Direction,
82
83 /// Embedding level of this run.
84 pub level: Level,
85
86 /// The starting and ending codepoint indices for this run
87 pub range: Range<usize>,
88
89 /// the list of control codepoint indices that were removed from the text
90 /// by the X9 portion of the bidi algorithm.
91 // Expected to have low cardinality and be generally empty, so we're
92 // using a simple vec for this.
93 pub removed_by_x9: Vec<usize>,
94}
95
96impl BidiRun {
97 pub fn indices<'a>(&'a self) -> impl Iterator<Item = usize> + 'a {
98 struct Iter<'a> {
99 range: Range<usize>,
100 removed_by_x9: &'a [usize],
101 }
102
103 impl<'a> Iterator for Iter<'a> {
104 type Item = usize;
105 fn next(&mut self) -> Option<usize> {
106 for idx in self.range.by_ref() {
107 if self.removed_by_x9.iter().any(|&i| i == idx) {
108 // Skip it
109 continue;
110 }
111 return Some(idx);
112 }
113 None
114 }
115 }
116
117 Iter {
118 range: self.range.clone(),
119 removed_by_x9: &self.removed_by_x9,
120 }
121 }
122}
123
124struct RunIter<'a> {
125 pos: usize,
126 levels: Cow<'a, [Level]>,
127 line_range: Range<usize>,
128}
129
130impl<'a> Iterator for RunIter<'a> {
131 type Item = BidiRun;
132
133 fn next(&mut self) -> Option<BidiRun> {
134 loop {
135 if self.pos >= self.levels.len() {
136 return None;
137 }
138
139 let start = self.pos;
140 let len = span_len(start, &self.levels);
141 self.pos += len;
142
143 let level = self.levels[start];
144 if !level.removed_by_x9() {
145 let range = start..start + len;
146
147 let mut removed_by_x9 = vec![];
148 for idx in range.clone() {
149 if self.levels[idx].removed_by_x9() {
150 removed_by_x9.push(idx + self.line_range.start);
151 }
152 }
153
154 return Some(BidiRun {
155 direction: level.direction(),
156 level,
157 range: self.line_range.start + range.start..self.line_range.start + range.end,
158 removed_by_x9,
159 });
160 }
161 }
162 }
163}
164
165#[derive(Debug, Clone, PartialEq, Eq)]
166pub struct ReorderedRun {
167 /// The direction for this run. Derived from the level.
168 pub direction: Direction,
169
170 /// Embedding level of this run.
171 pub level: Level,
172
173 /// The starting and ending codepoint indices for this run
174 pub range: Range<usize>,
175
176 /// The indices in their adjusted order
177 pub indices: Vec<usize>,
178}
179
180fn span_len(start: usize, levels: &[Level]) -> usize {
181 let starting_level = levels[start];
182 levels
183 .iter()
184 .skip(start + 1)
185 .position(|&l| l != starting_level)
186 .unwrap_or(levels.len() - (start + 1))
187 + 1
188}
189
190impl BidiContext {
191 pub fn new() -> Self {
192 Self::default()
193 }
194
195 pub fn base_level(&self) -> Level {
196 self.base_level
197 }
198
199 /// When `reorder` is set to true, reordering will apply rule L3 to
200 /// non-spacing marks. This is likely more desirable for terminal
201 /// based applications than it is for more modern GUI applications
202 /// that feed into eg: harfbuzz.
203 pub fn set_reorder_non_spacing_marks(&mut self, reorder: bool) {
204 self.reorder_nsm = reorder;
205 }
206
207 /// Produces a sequence of `BidiRun` structs that represent runs of
208 /// text and their direction (and level) across the entire paragraph.
209 pub fn runs<'a>(&'a self) -> impl Iterator<Item = BidiRun> + 'a {
210 RunIter {
211 pos: 0,
212 levels: Cow::Borrowed(&self.levels),
213 line_range: 0..self.levels.len(),
214 }
215 }
216
217 /// Given a line_range (a subslice of the current paragraph that represents
218 /// a single wrapped line), this method resets whitespace levels for the line
219 /// boundaries, and then returns the set of runs for that line.
220 pub fn line_runs(&self, line_range: Range<usize>) -> impl Iterator<Item = BidiRun> {
221 let levels = self.reset_whitespace_levels(line_range.clone());
222
223 RunIter {
224 pos: 0,
225 levels: levels.into(),
226 line_range,
227 }
228 }
229
230 pub fn reordered_runs(&self, line_range: Range<usize>) -> Vec<ReorderedRun> {
231 // reorder_line's `level` result includes entries that were
232 // removed_by_x9() but `reordered` does NOT (for compatibility with
233 // the UCD test suite).
234 // We need to account for that when we reorder the levels here!
235 let (levels, reordered) = self.reorder_line(line_range);
236 let mut reordered_levels = vec![Level(NO_LEVEL); reordered.len()];
237
238 for (vis_idx, &log_idx) in reordered.iter().enumerate() {
239 reordered_levels[vis_idx] = levels[log_idx];
240 }
241
242 reordered_levels.retain(|l| !l.removed_by_x9());
243
244 let mut runs = vec![];
245
246 let mut idx = 0;
247 while idx < reordered_levels.len() {
248 let len = span_len(idx, &reordered_levels);
249 let level = reordered_levels[idx];
250 if !level.removed_by_x9() {
251 let idx_range = idx..idx + len;
252 let start = reordered[idx_range.clone()].iter().min().unwrap();
253 let end = reordered[idx_range.clone()].iter().max().unwrap();
254 runs.push(ReorderedRun {
255 direction: level.direction(),
256 level,
257 range: *start..*end + 1,
258 indices: reordered[idx_range].to_vec(),
259 });
260 }
261 idx += len;
262 }
263
264 runs
265 }
266
267 /// `line_range` indicates a contiguous range of character indices
268 /// in the paragraph set via `resolve_paragraph`.
269 /// This method returns the reordered set of indices for display
270 /// purposes.
271 pub fn reorder_line(&self, line_range: Range<usize>) -> (Vec<Level>, Vec<usize>) {
272 self.dump_state("before L1");
273 let mut levels = self.reset_whitespace_levels(line_range.clone());
274 assert_eq!(levels.len(), line_range.end - line_range.start);
275 let reordered = self.reverse_levels(line_range.start, &mut levels);
276
277 (levels, reordered)
278 }
279
280 /// Performs Rule L3.
281 /// This rule is optional and must be enabled by calling the
282 /// set_reorder_non_spacing_marks method
283 fn reorder_non_spacing_marks(&self, levels: &mut [Level], visual: &mut [usize]) {
284 let mut idx = levels.len() - 1;
285 loop {
286 if idx > 0
287 && !levels[idx].removed_by_x9()
288 && levels[idx].direction() == Direction::RightToLeft
289 && self.orig_char_types[visual[idx]] == BidiClass::NonspacingMark
290 {
291 // Keep scanning backwards within this level
292 let level = levels[idx];
293 let seq_end = idx;
294
295 idx -= 1;
296 while idx > 0 && levels[idx].removed_by_x9()
297 || (levels[idx] == level
298 && matches!(
299 self.orig_char_types[visual[idx]],
300 BidiClass::LeftToRightEmbedding
301 | BidiClass::RightToLeftEmbedding
302 | BidiClass::LeftToRightOverride
303 | BidiClass::RightToLeftOverride
304 | BidiClass::PopDirectionalFormat
305 | BidiClass::BoundaryNeutral
306 | BidiClass::NonspacingMark
307 ))
308 {
309 idx -= 1;
310 }
311
312 if levels[idx] != level {
313 idx += 1;
314 }
315
316 if seq_end > idx {
317 visual[idx..=seq_end].reverse();
318 levels[idx..=seq_end].reverse();
319 }
320 }
321
322 if idx == 0 {
323 return;
324 }
325 idx -= 1;
326 }
327 }
328
329 /// This function runs Rule L2.
330 ///
331 /// Find the highest level among the resolved levels.
332 /// Then from that highest level down to the lowest odd
333 /// level, reverse any contiguous runs at that level or higher.
334 fn reverse_levels(&self, first_cidx: usize, levels: &mut [Level]) -> Vec<usize> {
335 // Not typed as Level because the Step trait required by the loop
336 // below is nightly only
337 let mut highest_level = 0;
338 let mut lowest_odd_level = MAX_DEPTH as i8 + 1;
339 let mut no_levels = true;
340
341 for &level in levels.iter() {
342 if level.removed_by_x9() {
343 continue;
344 }
345
346 // Found something other than NO_LEVEL
347 no_levels = false;
348 highest_level = highest_level.max(level.0);
349 if level.0 % 2 == 1 && level.0 < lowest_odd_level {
350 lowest_odd_level = level.0;
351 }
352 }
353
354 if no_levels {
355 return vec![];
356 }
357
358 // Initial visual order
359 let mut visual = vec![];
360 for i in 0..levels.len() {
361 if levels[i].removed_by_x9() {
362 visual.push(DELETED);
363 } else {
364 visual.push(i + first_cidx);
365 }
366 }
367
368 // Apply L3. UAX9 has this occur after L2, but we do it
369 // before that for consistency with FriBidi's implementation.
370 if self.reorder_nsm {
371 self.reorder_non_spacing_marks(levels, &mut visual);
372 }
373
374 // Apply L2.
375 for level in (lowest_odd_level..=highest_level).rev() {
376 let level = Level(level);
377 let mut i = 0;
378 let mut in_range = false;
379 let mut significant_range = false;
380 let mut first_pos = None;
381 let mut last_pos = None;
382
383 while i < levels.len() {
384 if levels[i] >= level {
385 if !in_range {
386 in_range = true;
387 first_pos.replace(i);
388 } else {
389 // Hit a second explicit level
390 significant_range = true;
391 last_pos.replace(i);
392 }
393 } else if levels[i].removed_by_x9() {
394 // Don't break ranges for deleted controls
395 if in_range {
396 last_pos.replace(i);
397 }
398 } else {
399 // End of a range. Reset the range flag
400 // and rever the range.
401 in_range = false;
402 match (last_pos, first_pos, significant_range) {
403 (Some(last_pos), Some(first_pos), true) if last_pos > first_pos => {
404 visual[first_pos..=last_pos].reverse();
405 }
406 _ => {}
407 }
408 first_pos = None;
409 last_pos = None;
410 }
411 i += 1;
412 }
413
414 if in_range && significant_range {
415 match (last_pos, first_pos) {
416 (Some(last_pos), Some(first_pos)) if last_pos > first_pos => {
417 visual[first_pos..=last_pos].reverse();
418 }
419 _ => {}
420 }
421 }
422 }
423
424 visual.retain(|&i| i != DELETED);
425 visual
426 }
427
428 /// <http://unicode.org/reports/tr9/>
429 pub fn resolve_paragraph(&mut self, paragraph: &[char], hint: ParagraphDirectionHint) {
430 self.populate_char_types(paragraph);
431 self.resolve(hint, paragraph);
432 }
433
434 /// BD1: The bidirectional character types are values assigned to each
435 /// Unicode character, including unassigned characters
436 fn populate_char_types(&mut self, paragraph: &[char]) {
437 self.orig_char_types.clear();
438 self.orig_char_types.reserve(paragraph.len());
439 self.orig_char_types
440 .extend(paragraph.iter().map(|&c| bidi_class_for_char(c)));
441 }
442
443 pub fn set_char_types(&mut self, char_types: &[BidiClass], hint: ParagraphDirectionHint) {
444 self.orig_char_types.clear();
445 self.orig_char_types.extend(char_types);
446 self.resolve(hint, &[]);
447 }
448
449 fn resolve(&mut self, hint: ParagraphDirectionHint, paragraph: &[char]) {
450 trace!("\n**** resolve \n");
451 self.char_types.clear();
452 self.char_types.extend(self.orig_char_types.iter());
453
454 self.base_level = match hint {
455 ParagraphDirectionHint::LeftToRight => Level(0),
456 ParagraphDirectionHint::RightToLeft => Level(1),
457 ParagraphDirectionHint::AutoLeftToRight => {
458 paragraph_level(&self.char_types, false, Direction::LeftToRight)
459 }
460 ParagraphDirectionHint::AutoRightToLeft => {
461 paragraph_level(&self.char_types, false, Direction::RightToLeft)
462 }
463 };
464
465 self.dump_state("before X1-X8");
466 self.explicit_embedding_levels();
467 self.dump_state("before X9");
468 self.delete_format_characters();
469 self.dump_state("after X9");
470 self.identify_runs();
471 let iso_runs = self.identify_isolating_run_sequences();
472
473 self.dump_state("before W1");
474 self.resolve_combining_marks(&iso_runs); // W1
475 self.dump_state("before W2");
476 self.resolve_european_numbers(&iso_runs); // W2
477 self.dump_state("before W3");
478 self.resolve_arabic_letters(&iso_runs); // W3
479 self.dump_state("before W4");
480 self.resolve_separators(&iso_runs); // W4
481 self.dump_state("before W5");
482 self.resolve_terminators(&iso_runs); // W5
483 self.dump_state("before W6");
484 self.resolve_es_cs_et(&iso_runs); // W6
485 self.dump_state("before W7");
486 self.resolve_en(&iso_runs); // W7
487
488 self.dump_state("before N0");
489 self.resolve_paired_brackets(&iso_runs, paragraph); // N0
490
491 self.dump_state("before N1");
492 self.resolve_neutrals_by_context(&iso_runs); // N1
493 self.dump_state("before N2");
494 self.resolve_neutrals_by_level(&iso_runs); // N2
495
496 self.dump_state("before I1, I2");
497 self.resolve_implicit_levels();
498 }
499
500 fn dump_state(&self, label: &str) {
501 trace!("State: {}", label);
502 trace!("BidiClass: {:?}", self.char_types);
503 trace!("Levels: {:?}", self.levels);
504 trace!("");
505 }
506
507 /// This is the method for Rule W1.
508 ///
509 /// Resolve combining marks for a single text chain.
510 ///
511 /// For each character in the text chain, examine its
512 /// Bidi_Class. For characters of bc=NSM, change the Bidi_Class
513 /// value to that of the preceding character. Formatting characters
514 /// (Bidi_Class RLE, LRE, RLO, LRO, PDF) and boundary neutral (Bidi_Class BN)
515 /// are skipped over in this calculation, because they have been
516 /// "deleted" by Rule X9.
517 ///
518 /// If a bc=NSM character occurs at the start of a text chain, it is given
519 /// the Bidi_Class of sot (either R or L).
520 fn resolve_combining_marks(&mut self, iso_runs: &[IsolatingRunSequence]) {
521 for iso_run in iso_runs {
522 let mut prior_bc = iso_run.sos;
523 for &idx in &iso_run.indices {
524 if self.char_types[idx] == BidiClass::NonspacingMark {
525 self.char_types[idx] = prior_bc;
526 } else if !self.levels[idx].removed_by_x9() {
527 prior_bc = self.char_types[idx];
528 }
529 }
530 }
531 }
532
533 /// This is the method for Rule W2.
534 ///
535 /// Resolve European numbers for a single text chain.
536 ///
537 /// For each character in the text chain, examine its
538 /// Bidi_Class. For characters of bc=EN, scan back to find the first
539 /// character of strong type (or sot). If the strong type is bc=AL,
540 /// change the Bidi_Class EN to AN. Formatting characters
541 /// (Bidi_Class RLE, LRE, RLO, LRO, PDF) and boundary neutral (Bidi_Class BN)
542 /// are skipped over in this calculation, because they have been
543 /// "deleted" by Rule X9.
544 fn resolve_european_numbers(&mut self, iso_runs: &[IsolatingRunSequence]) {
545 for iso_run in iso_runs {
546 for (ridx, &cidx) in iso_run.indices.iter().enumerate() {
547 if self.char_types[cidx] == BidiClass::EuropeanNumber {
548 // Scan backwards to find the first strong type
549 let mut first_strong_bc = iso_run.sos;
550
551 if ridx > 0 {
552 for &pidx in iso_run.indices.get(0..ridx).unwrap().iter().rev() {
553 match self.char_types[pidx] {
554 bc @ BidiClass::LeftToRight
555 | bc @ BidiClass::RightToLeft
556 | bc @ BidiClass::ArabicLetter => {
557 first_strong_bc = bc;
558 break;
559 }
560 _ => {}
561 }
562 }
563 }
564
565 // Check if the first strong type is AL. If so
566 // reset this EN to AN.
567 if first_strong_bc == BidiClass::ArabicLetter {
568 self.char_types[cidx] = BidiClass::ArabicNumber;
569 }
570 }
571 }
572 }
573 }
574
575 /// This is the method for Rule W3.
576 ///
577 /// Resolve Bidi_Class=AL for a single text chain.
578 ///
579 /// For each character in the text chain, examine its
580 /// Bidi_Class. For characters of bc=AL, change the Bidi_Class
581 /// value to R.
582 fn resolve_arabic_letters(&mut self, iso_runs: &[IsolatingRunSequence]) {
583 for iso_run in iso_runs {
584 for &idx in &iso_run.indices {
585 if self.char_types[idx] == BidiClass::ArabicLetter {
586 self.char_types[idx] = BidiClass::RightToLeft;
587 }
588 }
589 }
590 }
591
592 /// Look back ahead of `index_idx` and return true if the
593 /// bidi class == bc. However, skip backwards over entries
594 /// that were removed by X9; they will have NO_LEVEL.
595 /// Returns the char index of the match.
596 fn is_prior_context(
597 &self,
598 index_idx: usize,
599 indices: &[usize],
600 bc: BidiClass,
601 ) -> Option<usize> {
602 if index_idx == 0 {
603 return None;
604 }
605 for &idx in indices[0..index_idx].iter().rev() {
606 if self.char_types[idx] == bc {
607 return Some(idx);
608 }
609 if !self.levels[idx].removed_by_x9() {
610 break;
611 }
612 }
613 None
614 }
615
616 /// Look ahead of `index_idx` and return true if the
617 /// bidi class == bc. However, skip over entries
618 /// that were removed by X9; they will have NO_LEVEL.
619 /// Returns the char index of the match.
620 fn is_following_context(
621 &self,
622 index_idx: usize,
623 indices: &[usize],
624 bc: BidiClass,
625 ) -> Option<usize> {
626 for &idx in &indices[index_idx + 1..] {
627 if self.char_types[idx] == bc {
628 return Some(idx);
629 }
630 if !self.levels[idx].removed_by_x9() {
631 break;
632 }
633 }
634 None
635 }
636
637 fn is_in_context(&self, index_idx: usize, indices: &[usize], bc: BidiClass) -> bool {
638 self.is_prior_context(index_idx, indices, bc).is_some()
639 && self.is_following_context(index_idx, indices, bc).is_some()
640 }
641
642 /// This is the method for Rule W4.
643 ///
644 /// Resolve Bidi_Class=ES and CS for a single text chain.
645 ///
646 /// For each character in the text chain, examine its
647 /// Bidi_Class.
648 ///
649 /// For characters of bc=ES, check if they are *between* EN.
650 /// If so, change their Bidi_Class to EN.
651 ///
652 /// For characters of bc=CS, check if they are *between* EN
653 /// or between AN. If so, change their Bidi_Class to match.
654 ///
655 fn resolve_separators(&mut self, iso_runs: &[IsolatingRunSequence]) {
656 for iso_run in iso_runs {
657 for (index_idx, &idx) in iso_run.indices.iter().enumerate() {
658 if self.char_types[idx] == BidiClass::EuropeanSeparator {
659 if self.is_in_context(index_idx, &iso_run.indices, BidiClass::EuropeanNumber) {
660 self.char_types[idx] = BidiClass::EuropeanNumber;
661 }
662 } else if self.char_types[idx] == BidiClass::CommonSeparator {
663 if self.is_in_context(index_idx, &iso_run.indices, BidiClass::EuropeanNumber) {
664 self.char_types[idx] = BidiClass::EuropeanNumber;
665 } else if self.is_in_context(
666 index_idx,
667 &iso_run.indices,
668 BidiClass::ArabicNumber,
669 ) {
670 self.char_types[idx] = BidiClass::ArabicNumber;
671 }
672 }
673 }
674 }
675 }
676
677 /// This is the method for Rule W5.
678 ///
679 /// Resolve Bidi_Class=ET for a single text chain.
680 ///
681 /// For each character in the text chain, examine its
682 /// Bidi_Class.
683 ///
684 /// For characters of bc=ET, check if they are *next to* EN.
685 /// If so, change their Bidi_Class to EN. This includes
686 /// ET on either side of EN, so the context on both sides
687 /// needs to be checked.
688 ///
689 /// Because this rule applies to indefinite sequences of ET,
690 /// and because the context which triggers any change is
691 /// adjacency to EN, the strategy taken here is to seek for
692 /// EN first. If found, scan backwards, changing any eligible
693 /// ET to EN. Then scan forwards, changing any eligible ET
694 /// to EN. Then continue the search from the point of the
695 /// last ET changed (if any).
696 ///
697 fn resolve_terminators(&mut self, iso_runs: &[IsolatingRunSequence]) {
698 for iso_run in iso_runs {
699 for (index_idx, &idx) in iso_run.indices.iter().enumerate() {
700 if self.char_types[idx] == BidiClass::EuropeanNumber {
701 for &prior_idx in iso_run.indices[0..index_idx].iter().rev() {
702 if self.char_types[prior_idx] == BidiClass::EuropeanTerminator {
703 self.char_types[prior_idx] = BidiClass::EuropeanNumber;
704 } else if !self.levels[prior_idx].removed_by_x9() {
705 break;
706 }
707 }
708 for &next_idx in &iso_run.indices[index_idx + 1..] {
709 if self.char_types[next_idx] == BidiClass::EuropeanTerminator {
710 self.char_types[next_idx] = BidiClass::EuropeanNumber;
711 } else if !self.levels[next_idx].removed_by_x9() {
712 break;
713 }
714 }
715 }
716 }
717 }
718 }
719
720 /// This is the method for Rule W6.
721 ///
722 /// Resolve remaining Bidi_Class=ES, CS, or ET for a single text chain.
723 ///
724 /// For each character in the text chain, examine its
725 /// Bidi_Class. For characters of bc=ES, bc=CS, or bc=ET, change
726 /// the Bidi_Class value to ON. This resolves any remaining
727 /// separators or terminators which were not already processed
728 /// by Rules W4 and W5.
729 fn resolve_es_cs_et(&mut self, iso_runs: &[IsolatingRunSequence]) {
730 for iso_run in iso_runs {
731 for &idx in &iso_run.indices {
732 match self.char_types[idx] {
733 BidiClass::EuropeanSeparator
734 | BidiClass::CommonSeparator
735 | BidiClass::EuropeanTerminator => {
736 self.char_types[idx] = BidiClass::OtherNeutral;
737 }
738 _ => {}
739 }
740 }
741 }
742 }
743
744 /// This is the method for Rule W7.
745 ///
746 /// Resolve Bidi_Class=EN for a single level text chain.
747 ///
748 /// Process the text chain in reverse order. For each character in the text chain, examine its
749 /// Bidi_Class. For characters of bc=EN, scan back to find the first strong
750 /// directional type. If that type is L, change the Bidi_Class
751 /// value of the number to L.
752 fn resolve_en(&mut self, iso_runs: &[IsolatingRunSequence]) {
753 for iso_run in iso_runs {
754 for (ridx, &cidx) in iso_run.indices.iter().enumerate().rev() {
755 if self.char_types[cidx] == BidiClass::EuropeanNumber {
756 // Scan backwards to find the first strong type
757 let mut first_strong_bc = iso_run.sos;
758
759 if ridx > 0 {
760 for &pidx in iso_run.indices.get(0..ridx).unwrap().iter().rev() {
761 match self.char_types[pidx] {
762 bc @ BidiClass::LeftToRight | bc @ BidiClass::RightToLeft => {
763 first_strong_bc = bc;
764 break;
765 }
766 _ => {}
767 }
768 }
769 }
770
771 if first_strong_bc == BidiClass::LeftToRight {
772 self.char_types[cidx] = BidiClass::LeftToRight;
773 }
774 }
775 }
776 }
777 }
778
779 /// This is the method for Rule N0. (New in UBA63)
780 /// Resolve paired brackets for a single text chain.
781 ///
782 /// For each character in the text chain, examine its
783 /// Bidi_Class. For any character with the bpt value open or close,
784 /// scan its context seeking a matching paired bracket. If found,
785 /// resolve the type of both brackets to match the embedding
786 /// direction.
787 ///
788 /// For UBA63 (and unchanged in UBA70), the error handling for
789 /// a stack overflow was unspecified for this rule.
790 ///
791 /// Starting with UBA80, the exact stack size is specified (63),
792 /// and the specification declares that if a stack overflow
793 /// condition is encountered, the BD16 processing for this
794 /// particular isolating run ceases immediately. This condition
795 /// does not treated as a fatal error, however, so the rule
796 /// should not return an error code here, which would stop
797 /// all processing for *all* runs of the input string.
798 fn resolve_paired_brackets(&mut self, iso_runs: &[IsolatingRunSequence], paragraph: &[char]) {
799 if paragraph.is_empty() {
800 // BidiTest cases don't populate the paragraph, but they
801 // also don't contain any bracket related tests either,
802 // so we have nothing to do here.
803 return;
804 }
805
806 let mut stack = BracketStack::new();
807 for iso_run in iso_runs {
808 stack.clear();
809 for (ridx, &cidx) in iso_run.indices.iter().enumerate() {
810 if let Some((closing_bracket, bpt)) = lookup_closing(paragraph[cidx]) {
811 trace!("ridx={} cidx={} {:?} bracket", ridx, cidx, paragraph[cidx]);
812 if self.char_types[cidx] == BidiClass::OtherNeutral {
813 if bpt == BracketType::Open {
814 trace!("push open ridx={}", ridx);
815 if !stack.push(closing_bracket, ridx) {
816 // Stack overflow: halt processing
817 return;
818 }
819 } else {
820 // a closing bracket
821 trace!("close at ridx={}, search for opener", ridx);
822 stack.seek_matching_open_bracket(paragraph[cidx], ridx);
823 }
824 }
825 }
826 }
827
828 if stack.pairs.is_empty() {
829 // The pairList pointer will still be NULL if no paired brackets
830 // were found. In this case, no further processing is necessary.
831 continue;
832 }
833
834 // Because of the way the stack
835 // processing works, the pairs may not be in the best order
836 // in the pair list for further processing. Sort them
837 // by position order of the opening bracket.
838 stack.pairs.sort_unstable_by_key(|p| p.opening_pos);
839 trace!("\nPairs: {:?}", stack.pairs);
840
841 for pair in &stack.pairs {
842 // Now for each pair, we have the first and last position
843 // of the substring in this isolating run sequence
844 // enclosed by those brackets (inclusive
845 // of the brackets). Resolve that individual pair.
846 self.resolve_one_pair(pair, &iso_run);
847 }
848 }
849 }
850
851 /// Set the Bidi_Class of a bracket pair, based on the
852 /// direction determined by the N0 rule processing in
853 /// br_ResolveOnePair().
854 ///
855 /// The direction passed in will either be BIDI_R or BIDI_L.
856 ///
857 /// This setting is abstracted in a function here, rather than
858 /// simply being done inline, because of
859 /// an edge case added to rule N0 as of UBA80. For UBA63 (and
860 /// UBA70), no special handling of combining marks following
861 /// either of the brackets is done. However, starting with UBA80,
862 /// there is an edge case fix-up done which echoes the processing
863 /// of rule W1. The text run needs to be scanned to find any
864 /// combining marks (orig_bc=NSM) following a bracket which has
865 /// its Bidi_Class changed by N0. Then those combining marks
866 /// can again be adjusted to match the Bidi_Class of the
867 /// bracket they apply to. This is an odd edge case, as combining
868 /// marks do not typically occur with brackets, but the UBA80
869 /// specification is now explicit about requiring this fix-up
870 /// to be done.
871 fn set_bracket_pair_bc(
872 pair: &Pair,
873 indices: &[usize],
874 direction: Direction,
875 char_types: &mut [BidiClass],
876 orig_char_types: &[BidiClass],
877 levels: &[Level],
878 ) {
879 let opening_pos = indices[pair.opening_pos];
880 let closing_pos = indices[pair.closing_pos];
881 let bc = match direction {
882 Direction::LeftToRight => BidiClass::LeftToRight,
883 Direction::RightToLeft => BidiClass::RightToLeft,
884 };
885 trace!(
886 "set_bracket_pair_bc index={} from {:?} -> {:?}",
887 opening_pos,
888 char_types[opening_pos],
889 bc
890 );
891 trace!(
892 "set_bracket_pair_bc index={} from {:?} -> {:?}",
893 closing_pos,
894 char_types[closing_pos],
895 bc
896 );
897 char_types[opening_pos] = bc;
898 char_types[closing_pos] = bc;
899
900 // Here is the tricky part.
901 //
902 // First scan from the opening bracket for any subsequent
903 // character whose *original* Bidi_Class was NSM, and set
904 // the current bc for it to direction also, to match the bracket.
905 // Break out of the loop at the first character with any other
906 // original Bidi_Class, so that this change only impacts
907 // actual combining mark sequences.
908 //
909 // 2020-03-27 note: This scanning for original combining marks
910 // must also scan past any intervening NO_LEVEL characters,
911 // typically bc=BN characters removed earlier by rule X9.
912 // Such sequences may, for example involve a ZWJ or ZWNJ,
913 // or in bizarre edge cases involve other bc=BN characters
914 // such as ZWSP. The latter would be defective combining character
915 // sequences, but also need to be handled here.
916 //
917 // Then repeat the process for the matching closing bracket.
918 //
919 // The processing for the opening bracket is bounded to the
920 // right by the position of the matching closing bracket.
921 // The processing for the closing bracket is bounded to the
922 // right by the end of the text run.
923 for &cidx in &indices[pair.opening_pos + 1..pair.closing_pos] {
924 if orig_char_types[cidx] == BidiClass::NonspacingMark {
925 char_types[cidx] = bc;
926 } else if !levels[cidx].removed_by_x9() {
927 break;
928 }
929 }
930 for &cidx in &indices[pair.closing_pos + 1..] {
931 if orig_char_types[cidx] == BidiClass::NonspacingMark {
932 char_types[cidx] = bc;
933 } else if !levels[cidx].removed_by_x9() {
934 break;
935 }
936 }
937 }
938
939 /// Resolve the embedding levels of one pair of matched brackets.
940 ///
941 /// This determination is based on the embedding direction.
942 /// See BD3 in the UBA specification.
943 ///
944 /// If embedding level is even, embedding direction = L.
945 /// If embedding level is odd, embedding direction = R.
946 fn resolve_one_pair(&mut self, pair: &Pair, iso_run: &IsolatingRunSequence) {
947 let embedding_direction = iso_run.level.direction();
948 let opposite_direction = embedding_direction.opposite();
949
950 let mut strong_type_found = false;
951 // Next check for a strong type (R or L)
952 // between the matched brackets. If a strong type is found
953 // which matches the embedding direction, then set the type of both
954 // brackets to match the embedding direction, too.
955 if pair.opening_pos < pair.closing_pos.saturating_sub(1) {
956 trace!("pair: {:?}", pair);
957 for &cidx in &iso_run.indices[pair.opening_pos + 1..pair.closing_pos] {
958 let direction = match self.char_types[cidx] {
959 BidiClass::RightToLeft
960 | BidiClass::EuropeanNumber
961 | BidiClass::ArabicNumber => Some(Direction::RightToLeft),
962 BidiClass::LeftToRight => Some(Direction::LeftToRight),
963 _ => None,
964 };
965
966 if direction == Some(embedding_direction) {
967 // N0 step b
968 trace!("Strong direction e between brackets");
969 Self::set_bracket_pair_bc(
970 pair,
971 &iso_run.indices,
972 embedding_direction,
973 &mut self.char_types,
974 &self.orig_char_types,
975 &self.levels,
976 );
977 return;
978 } else if direction == Some(opposite_direction) {
979 strong_type_found = true;
980 }
981 }
982 }
983
984 if strong_type_found {
985 // First attempt to resolve direction by checking the prior context for
986 // a strong type matching the opposite direction. N0 Step c1.
987 if (opposite_direction == Direction::LeftToRight
988 && self.is_prior_context_left(pair.opening_pos, &iso_run.indices, iso_run.sos))
989 || (opposite_direction == Direction::RightToLeft
990 && self.is_prior_context_right(pair.opening_pos, &iso_run.indices, iso_run.sos))
991 {
992 Self::set_bracket_pair_bc(
993 pair,
994 &iso_run.indices,
995 opposite_direction,
996 &mut self.char_types,
997 &self.orig_char_types,
998 &self.levels,
999 );
1000 return;
1001 } else {
1002 // No strong type matching the oppositedirection was found either
1003 // before or after these brackets in this text chain. Resolve the
1004 // brackets based on the embedding direction. N0 Step c2.
1005 Self::set_bracket_pair_bc(
1006 pair,
1007 &iso_run.indices,
1008 embedding_direction,
1009 &mut self.char_types,
1010 &self.orig_char_types,
1011 &self.levels,
1012 );
1013 return;
1014 }
1015 } else {
1016 // No strong type was found between the brackets. Leave
1017 // the brackets with unresolved direction.
1018 }
1019 }
1020
1021 /// This is the method for Rule N1.
1022 ///
1023 /// Resolve neutrals by context for a single text chain.
1024 ///
1025 /// For each character in the text chain, examine its
1026 /// Bidi_Class. For any character of neutral type, examine its
1027 /// context.
1028 ///
1029 /// L N L --> L L L
1030 /// R N R --> R R R [note that AN and EN count as R for this rule]
1031 ///
1032 /// Here "N" stands for "any sequence of neutrals", so the neutral
1033 /// does not have to be immediately adjacent to a strong type
1034 /// to be resolved this way.
1035 fn resolve_neutrals_by_context(&mut self, iso_runs: &[IsolatingRunSequence]) {
1036 for iso_run in iso_runs {
1037 for (ridx, &cidx) in iso_run.indices.iter().enumerate().rev() {
1038 if !self.char_types[cidx].is_neutral() {
1039 continue;
1040 }
1041
1042 if self.is_prior_context_left(ridx, &iso_run.indices, iso_run.sos)
1043 && self.is_following_context_left(ridx, &iso_run.indices, iso_run.eos)
1044 {
1045 trace!(
1046 "ridx={} cidx={} was {:?}, setting to LeftToRight",
1047 ridx,
1048 cidx,
1049 self.char_types[cidx]
1050 );
1051 self.char_types[cidx] = BidiClass::LeftToRight;
1052 } else if self.is_prior_context_right(ridx, &iso_run.indices, iso_run.sos)
1053 && self.is_following_context_right(ridx, &iso_run.indices, iso_run.eos)
1054 {
1055 trace!(
1056 "ridx={} cidx={} was {:?}, setting to RightToLeft",
1057 ridx,
1058 cidx,
1059 self.char_types[cidx]
1060 );
1061 self.char_types[cidx] = BidiClass::RightToLeft;
1062 }
1063 }
1064 }
1065 }
1066
1067 /// Scan backwards in a text chain, checking if the first non-neutral character
1068 /// is an "L" type. Skip over any "deleted" controls, which have NO_LEVEL,
1069 /// as well as any neutral types.
1070 fn is_prior_context_left(&self, index_idx: usize, indices: &[usize], sot: BidiClass) -> bool {
1071 if index_idx == 0 {
1072 trace!(
1073 "is_prior_context_left: short circuit because index_idx=0. sot is {:?}",
1074 sot
1075 );
1076 return sot == BidiClass::LeftToRight;
1077 }
1078 for &idx in indices[0..index_idx].iter().rev() {
1079 trace!(
1080 "is_prior_context_left considering idx={} {:?}",
1081 idx,
1082 self.char_types[idx]
1083 );
1084 if self.char_types[idx] == BidiClass::LeftToRight {
1085 return true;
1086 }
1087 if self.levels[idx].removed_by_x9() {
1088 continue;
1089 }
1090 if self.char_types[idx].is_neutral() {
1091 continue;
1092 }
1093 return false;
1094 }
1095 sot == BidiClass::LeftToRight
1096 }
1097
1098 /// Scan forwards in a text chain, checking if the first non-neutral character is an "L" type.
1099 /// Skip over any "deleted" controls, which have NO_LEVEL, as well as any neutral types.
1100 fn is_following_context_left(
1101 &self,
1102 index_idx: usize,
1103 indices: &[usize],
1104 eot: BidiClass,
1105 ) -> bool {
1106 trace!(
1107 "is_following_context_left index_idx={} vs. len {}",
1108 index_idx,
1109 indices.len()
1110 );
1111 for &idx in &indices[index_idx + 1..] {
1112 if self.char_types[idx] == BidiClass::LeftToRight {
1113 trace!("is_following_context_left true because idx={} is left", idx);
1114 return true;
1115 }
1116 if self.levels[idx].removed_by_x9() {
1117 continue;
1118 }
1119 if self.char_types[idx].is_neutral() {
1120 continue;
1121 }
1122 return false;
1123 }
1124 trace!(
1125 "is_following_context_left fall through to bottom, check against eot={:?}",
1126 eot
1127 );
1128 eot == BidiClass::LeftToRight
1129 }
1130
1131 /// Used by Rule N1.
1132 ///
1133 /// Scan backwards in a text chain, checking if the first non-neutral character is an "R" type.
1134 /// (BIDI_R, BIDI_AN, BIDI_EN) Skip over any "deleted" controls, which
1135 /// have NO_LEVEL, as well as any neutral types.
1136 fn is_prior_context_right(&self, index_idx: usize, indices: &[usize], sot: BidiClass) -> bool {
1137 if index_idx == 0 {
1138 return sot == BidiClass::RightToLeft;
1139 }
1140 for &idx in indices[0..index_idx].iter().rev() {
1141 match self.char_types[idx] {
1142 BidiClass::RightToLeft | BidiClass::ArabicNumber | BidiClass::EuropeanNumber => {
1143 return true;
1144 }
1145 _ => {}
1146 }
1147 if self.levels[idx].removed_by_x9() {
1148 continue;
1149 }
1150 if self.char_types[idx].is_neutral() {
1151 continue;
1152 }
1153 return false;
1154 }
1155 sot == BidiClass::RightToLeft
1156 }
1157
1158 fn is_following_context_right(
1159 &self,
1160 index_idx: usize,
1161 indices: &[usize],
1162 eot: BidiClass,
1163 ) -> bool {
1164 for &idx in &indices[index_idx + 1..] {
1165 match self.char_types[idx] {
1166 BidiClass::RightToLeft | BidiClass::ArabicNumber | BidiClass::EuropeanNumber => {
1167 return true;
1168 }
1169 _ => {}
1170 }
1171 if self.levels[idx].removed_by_x9() {
1172 continue;
1173 }
1174 if self.char_types[idx].is_neutral() {
1175 continue;
1176 }
1177 return false;
1178 }
1179 eot == BidiClass::RightToLeft
1180 }
1181
1182 /// This is the method for Rule N2.
1183 ///
1184 /// Resolve neutrals by level for a single text chain.
1185 ///
1186 /// For each character in the text chain, examine its
1187 /// Bidi_Class. For any character of neutral type, examine its
1188 /// embedding level and resolve accordingly.
1189 ///
1190 /// N --> e
1191 /// where e = L for an even level, R for an odd level
1192 fn resolve_neutrals_by_level(&mut self, iso_runs: &[IsolatingRunSequence]) {
1193 for iso_run in iso_runs {
1194 for &cidx in iso_run.indices.iter().rev() {
1195 if self.char_types[cidx].is_neutral() {
1196 self.char_types[cidx] = self.levels[cidx].as_bidi_class();
1197 }
1198 }
1199 }
1200 }
1201
1202 /// This function runs Rules I1 and I2 together.
1203 fn resolve_implicit_levels(&mut self) {
1204 for (idx, level) in self.levels.iter_mut().enumerate() {
1205 if level.removed_by_x9() {
1206 continue;
1207 }
1208
1209 match level.direction() {
1210 Direction::LeftToRight => {
1211 // I1
1212 match self.char_types[idx] {
1213 BidiClass::RightToLeft => {
1214 level.0 += 1;
1215 }
1216 BidiClass::ArabicNumber | BidiClass::EuropeanNumber => {
1217 level.0 += 2;
1218 }
1219 _ => {}
1220 }
1221 }
1222 Direction::RightToLeft => {
1223 // I2
1224 match self.char_types[idx] {
1225 BidiClass::LeftToRight
1226 | BidiClass::ArabicNumber
1227 | BidiClass::EuropeanNumber => {
1228 level.0 += 1;
1229 }
1230 _ => {}
1231 }
1232 }
1233 }
1234 }
1235 }
1236
1237 /// This function runs Rule L1.
1238 ///
1239 /// The strategy here for Rule L1 is to scan forward through
1240 /// the text searching for segment separators or paragraph
1241 /// separators. If a segment separator or paragraph
1242 /// separator is found, it is reset to the paragraph embedding
1243 /// level. Then scan backwards from the separator to
1244 /// find any contiguous stretch of whitespace characters
1245 /// and reset any which are found to the paragraph embedding
1246 /// level, as well. When we reach the *last* character in the
1247 /// text (which will also constitute, by definition, the last
1248 /// character in the line being processed here), check if it
1249 /// is whitespace. If so, reset it to the paragraph embedding
1250 /// level. Then scan backwards to find any contiguous stretch
1251 /// of whitespace characters and reset those as well.
1252 ///
1253 /// These checks for whitespace are done with the *original*
1254 /// Bidi_Class values for characters, not the resolved values.
1255 ///
1256 /// As for many rules, this rule simply ignores any character
1257 /// whose level has been set to NO_LEVEL, which is the way
1258 /// this reference algorithm "deletes" boundary neutrals and
1259 /// embedding and override controls from the text.
1260 fn reset_whitespace_levels(&self, line_range: Range<usize>) -> Vec<Level> {
1261 fn reset_contiguous_whitespace_before(
1262 line_range: Range<usize>,
1263 base_level: Level,
1264 orig_char_types: &[BidiClass],
1265 levels: &mut Vec<Level>,
1266 ) {
1267 for i in line_range.rev() {
1268 if orig_char_types[i] == BidiClass::WhiteSpace
1269 || orig_char_types[i].is_iso_control()
1270 {
1271 levels[i] = base_level;
1272 } else if levels[i].removed_by_x9() {
1273 // Skip over deleted entries
1274 } else {
1275 // end of contiguous section
1276 break;
1277 }
1278 }
1279 }
1280
1281 let mut levels = self.levels.clone();
1282
1283 for (idx, orig_bc) in self
1284 .orig_char_types
1285 .iter()
1286 .enumerate()
1287 .skip(line_range.start)
1288 .take(line_range.end - line_range.start)
1289 {
1290 match orig_bc {
1291 // Explicit boundary
1292 BidiClass::SegmentSeparator | BidiClass::ParagraphSeparator => {
1293 levels[idx] = self.base_level;
1294 reset_contiguous_whitespace_before(
1295 line_range.start..idx,
1296 self.base_level,
1297 &self.orig_char_types,
1298 &mut levels,
1299 );
1300 }
1301 _ => {}
1302 }
1303 }
1304
1305 reset_contiguous_whitespace_before(
1306 line_range.clone(),
1307 self.base_level,
1308 &self.orig_char_types,
1309 &mut levels,
1310 );
1311
1312 levels[line_range].to_vec()
1313 }
1314
1315 /// Rules X1 through X8
1316 fn explicit_embedding_levels(&mut self) {
1317 // X1: initialize stack and other variables
1318 let mut stack = LevelStack::new();
1319 stack.push(self.base_level, Override::Neutral, false);
1320
1321 let len = self.char_types.len();
1322 self.levels.resize(len, Level::default());
1323
1324 let mut overflow_isolate = 0;
1325 let mut overflow_embedding = 0;
1326 let mut valid_isolate = 0;
1327
1328 // X2..X8: process each character, setting embedding levels
1329 // and override status
1330 for idx in 0..len {
1331 let bc = self.char_types[idx];
1332 trace!("Considering idx={} {:?}", idx, bc);
1333 match bc {
1334 // X2
1335 BidiClass::RightToLeftEmbedding => {
1336 if let Some(level) = stack.embedding_level().least_greater_odd() {
1337 if overflow_isolate == 0 && overflow_embedding == 0 {
1338 stack.push(level, Override::Neutral, false);
1339 continue;
1340 }
1341 }
1342 if overflow_isolate == 0 {
1343 overflow_embedding += 1;
1344 }
1345 }
1346 // X3
1347 BidiClass::LeftToRightEmbedding => {
1348 if let Some(level) = stack.embedding_level().least_greater_even() {
1349 if overflow_isolate == 0 && overflow_embedding == 0 {
1350 stack.push(level, Override::Neutral, false);
1351 continue;
1352 }
1353 }
1354 if overflow_isolate == 0 {
1355 overflow_embedding += 1;
1356 }
1357 }
1358 // X4
1359 BidiClass::RightToLeftOverride => {
1360 if let Some(level) = stack.embedding_level().least_greater_odd() {
1361 if overflow_isolate == 0 && overflow_embedding == 0 {
1362 stack.push(level, Override::RTL, false);
1363 continue;
1364 }
1365 }
1366 if overflow_isolate == 0 {
1367 overflow_embedding += 1;
1368 }
1369 }
1370 // X5
1371 BidiClass::LeftToRightOverride => {
1372 if let Some(level) = stack.embedding_level().least_greater_even() {
1373 if overflow_isolate == 0 && overflow_embedding == 0 {
1374 stack.push(level, Override::LTR, false);
1375 continue;
1376 }
1377 }
1378 if overflow_isolate == 0 {
1379 overflow_embedding += 1;
1380 }
1381 }
1382 // X5a
1383 BidiClass::RightToLeftIsolate => {
1384 self.levels[idx] = stack.embedding_level();
1385 stack.apply_override(&mut self.char_types[idx]);
1386 if let Some(level) = stack.embedding_level().least_greater_odd() {
1387 if overflow_isolate == 0 && overflow_embedding == 0 {
1388 valid_isolate += 1;
1389 stack.push(level, Override::Neutral, true);
1390 continue;
1391 }
1392 }
1393 overflow_isolate += 1;
1394 }
1395 // X5b
1396 BidiClass::LeftToRightIsolate => {
1397 self.levels[idx] = stack.embedding_level();
1398 stack.apply_override(&mut self.char_types[idx]);
1399 if let Some(level) = stack.embedding_level().least_greater_even() {
1400 if overflow_isolate == 0 && overflow_embedding == 0 {
1401 valid_isolate += 1;
1402 stack.push(level, Override::Neutral, true);
1403 continue;
1404 }
1405 }
1406 overflow_isolate += 1;
1407 }
1408 // X5c
1409 BidiClass::FirstStrongIsolate => {
1410 let level =
1411 paragraph_level(&self.char_types[idx + 1..], true, Direction::LeftToRight);
1412 self.levels[idx] = stack.embedding_level();
1413 stack.apply_override(&mut self.char_types[idx]);
1414 let level = if level.0 == 1 {
1415 stack.embedding_level().least_greater_odd()
1416 } else {
1417 stack.embedding_level().least_greater_even()
1418 };
1419 trace!(
1420 "picked {:?} based on current stack level {:?}",
1421 level,
1422 stack.embedding_level()
1423 );
1424
1425 if let Some(level) = level {
1426 if overflow_isolate == 0 && overflow_embedding == 0 {
1427 valid_isolate += 1;
1428 stack.push(level, Override::Neutral, true);
1429 continue;
1430 }
1431 }
1432 overflow_isolate += 1;
1433 }
1434 // X6a
1435 BidiClass::PopDirectionalIsolate => {
1436 if overflow_isolate > 0 {
1437 overflow_isolate -= 1;
1438 } else if valid_isolate == 0 {
1439 // Do nothing
1440 } else {
1441 overflow_embedding = 0;
1442 loop {
1443 if stack.isolate_status() {
1444 break;
1445 }
1446 stack.pop();
1447 }
1448 stack.pop();
1449 valid_isolate -= 1;
1450 }
1451
1452 self.levels[idx] = stack.embedding_level();
1453 stack.apply_override(&mut self.char_types[idx]);
1454 }
1455 // X7
1456 BidiClass::PopDirectionalFormat => {
1457 if overflow_isolate > 0 {
1458 // Do nothing
1459 } else if overflow_embedding > 0 {
1460 overflow_embedding -= 1;
1461 } else {
1462 if !stack.isolate_status() {
1463 if stack.depth() >= 2 {
1464 stack.pop();
1465 }
1466 }
1467 }
1468 }
1469 BidiClass::BoundaryNeutral => {}
1470 // X8
1471 BidiClass::ParagraphSeparator => {
1472 // Terminates all embedding contexts.
1473 // Should only ever be the last character in
1474 // a paragraph if present at all.
1475 self.levels[idx] = self.base_level;
1476 }
1477 // X6
1478 _ => {
1479 self.levels[idx] = stack.embedding_level();
1480 stack.apply_override(&mut self.char_types[idx]);
1481 }
1482 }
1483 }
1484 }
1485
1486 /// X9
1487 fn delete_format_characters(&mut self) {
1488 for (bc, level) in self.char_types.iter().zip(&mut self.levels) {
1489 match bc {
1490 BidiClass::RightToLeftEmbedding
1491 | BidiClass::LeftToRightEmbedding
1492 | BidiClass::RightToLeftOverride
1493 | BidiClass::LeftToRightOverride
1494 | BidiClass::PopDirectionalFormat
1495 | BidiClass::BoundaryNeutral => {
1496 *level = Level(NO_LEVEL);
1497 }
1498 _ => {}
1499 }
1500 }
1501 }
1502
1503 /// X10
1504 fn identify_runs(&mut self) {
1505 let mut idx = 0;
1506 let len = self.char_types.len();
1507 self.runs.clear();
1508
1509 while idx < len {
1510 let (span_level, span_len) = span_one_run(&self.char_types, &self.levels, idx);
1511 if !span_level.removed_by_x9() {
1512 self.runs.push(Run {
1513 start: idx,
1514 end: idx + span_len,
1515 len: span_len,
1516 seq_id: 0,
1517 level: span_level,
1518 sor: BidiClass::OtherNeutral,
1519 eor: BidiClass::OtherNeutral,
1520 });
1521 }
1522
1523 assert!(span_len > 0);
1524 idx += span_len;
1525 }
1526
1527 self.calculate_sor_eor();
1528
1529 trace!("\nRuns: {:#?}", self.runs);
1530 }
1531
1532 fn calculate_sor_eor(&mut self) {
1533 let mut prior_run_level = self.base_level;
1534 let mut iter = self.runs.iter_mut().peekable();
1535 while let Some(run) = iter.next() {
1536 let next_run_level = match iter.peek() {
1537 Some(next) => next.level,
1538 None => self.base_level,
1539 };
1540
1541 // Set sor based on the higher of the prior_run_level and the current level.
1542 run.sor = prior_run_level.max(run.level).as_bidi_class();
1543 run.eor = next_run_level.max(run.level).as_bidi_class();
1544
1545 prior_run_level = run.level;
1546 }
1547 }
1548
1549 /// This function applies only to UBA63. Once the embedding
1550 /// levels are identified, UBA63 requires further processing
1551 /// to assign each of the level runs to an isolating run sequence.
1552 ///
1553 /// Each level run must be uniquely assigned to exactly one
1554 /// isolating run sequence. Each isolating run sequence must
1555 /// have at least one level run, but may have more.
1556 ///
1557 /// The exact details on how to match up isolating run sequences
1558 /// with level runs are specified in BD13.
1559 ///
1560 /// The strategy taken here is to scan the level runs in order.
1561 ///
1562 /// If a level run is not yet assigned to an isolating run sequence,
1563 /// its seqID will be zero. Create a new isolating run sequence
1564 /// and add this level run to it.
1565 ///
1566 /// If the last BIDIUNIT of *this* level run is an isolate
1567 /// initiator (LRI/RLI/FSI), then scan ahead in the list of
1568 /// level runs seeking the next level run which meets the
1569 /// following criteria:
1570 /// 1. seqID = 0 (not yet assigned to an isolating run sequence)
1571 /// 2. its level matches the level we are processing
1572 /// 3. the first BIDIUNIT is a PDI
1573 /// If all those conditions are met, assign that next level run
1574 /// to this isolating run sequence (set its seqID, and append to
1575 /// the list).
1576 ///
1577 /// Repeat until we hit a level run that doesn't terminate with
1578 /// an isolate initiator or we hit the end of the list of level
1579 /// runs.
1580 ///
1581 /// That terminates the definition of the isolating run sequence
1582 /// we are working on. Append it to the list of isolating run
1583 /// sequences in the UBACONTEXT.
1584 ///
1585 /// Then advance to the next level run which has not yet been
1586 /// assigned to an isolating run sequence and repeat the process.
1587 ///
1588 /// Continue until all level runs have been assigned to an
1589 /// isolating run sequence.
1590 fn identify_isolating_run_sequences(&mut self) -> Vec<IsolatingRunSequence> {
1591 let mut seq_id = 0;
1592 let mut iso_runs = vec![];
1593 let num_runs = self.runs.len();
1594
1595 for run_idx in 0..num_runs {
1596 let save_level;
1597
1598 {
1599 let run = &mut self.runs[run_idx];
1600 if run.seq_id != 0 {
1601 continue;
1602 }
1603 seq_id += 1;
1604 iso_runs.push(Self::new_iso_run_seq(run_idx, run));
1605 run.seq_id = seq_id;
1606
1607 if !self.char_types[run.end - 1].is_iso_init() {
1608 continue;
1609 }
1610 save_level = run.level;
1611 }
1612
1613 // Look ahead to find the run with the corresponding
1614 // PopDirectionalIsolate
1615 for idx in run_idx + 1..num_runs {
1616 let run = &mut self.runs[idx];
1617 if run.seq_id == 0
1618 && run.level == save_level
1619 && run.first_significant_bidi_class(&self.char_types, &self.levels)
1620 == Some(BidiClass::PopDirectionalIsolate)
1621 {
1622 // we matched the criteria for adding this run to the sequence.
1623 let iso_run = iso_runs.last_mut().unwrap();
1624 iso_run.runs.push(idx);
1625 iso_run.len += run.len;
1626 run.seq_id = seq_id;
1627
1628 // Check if the last char in this run is also an
1629 // isolate initiator. If not, this sequence is done.
1630 if !self.char_types[run.end - 1].is_iso_init() {
1631 break;
1632 }
1633 }
1634 }
1635 }
1636 self.calculate_sos_eos(&mut iso_runs);
1637 self.build_text_chains(&mut iso_runs);
1638 iso_runs
1639 }
1640
1641 /// In order to simplify later rule processing, assemble the indices
1642 /// of the characters in the isolating runs so that there is just a
1643 /// single list to iterate
1644 fn build_text_chains(&mut self, iso_runs: &mut [IsolatingRunSequence]) {
1645 for iso_run in iso_runs {
1646 for &run_idx in &iso_run.runs {
1647 let run = &self.runs[run_idx];
1648 iso_run.indices.extend(run.start..run.end);
1649 }
1650 }
1651 }
1652
1653 /// Process the isolating run sequence list, calculating sos and eos values for
1654 /// each sequence. Each needs to be set to either L or R.
1655 ///
1656 /// Strategy: Instead of recalculating all the sos and eos values from
1657 /// scratch, as specified in X10, we can take a shortcut here, because
1658 /// we already have sor and eor values assigned to all the level runs.
1659 /// For any isolating run sequence, simply assign sos to the value of
1660 /// sor for the *first* run in that sequence, and assign eos to the
1661 /// value of eor for the *last* run in that sequence. This provides
1662 /// equivalent values, and is more straightforward to implement and
1663 /// understand.
1664 ///
1665 /// This strategy has to be modified for defective isolating run sequences,
1666 /// where the sequence ends with an LRI/RLI/FSI.
1667 /// In those cases the eot needs to be calculated based on
1668 /// the paragraph embedding level, rather than from the level run.
1669 /// Note that this only applies when an isolating run sequence
1670 /// terminating in an LRI/RLI/FSI but with no matching PDI.
1671 /// An example would be:
1672 ///
1673 /// R RLI R
1674 /// <L-----R> <RR>
1675 /// <L------[ <== eot would be L, not R
1676 /// <RR>
1677 ///
1678 fn calculate_sos_eos(&mut self, iso_runs: &mut [IsolatingRunSequence]) {
1679 for iso_run in iso_runs {
1680 // First inherit the sos and eos values from the
1681 // first and last runs in the sequence.
1682 let first_run_idx = iso_run.runs.first().cloned().expect("at least 1 run");
1683 let last_run_idx = iso_run.runs.last().cloned().expect("at least 1 run");
1684 iso_run.sos = self.runs[first_run_idx].sor;
1685 iso_run.eos = self.runs[last_run_idx].eor;
1686 // Next adjust for the special case when an isolating
1687 // run sequence terminates in an unmatched isolate
1688 // initiator.
1689 if self.char_types[self.runs[last_run_idx].end - 1].is_iso_init() {
1690 let higher_level = self.base_level.max(iso_run.level);
1691 iso_run.eos = higher_level.as_bidi_class();
1692 }
1693 }
1694 }
1695
1696 fn new_iso_run_seq(run_idx: usize, run: &Run) -> IsolatingRunSequence {
1697 let len = run.len;
1698 let level = run.level;
1699 IsolatingRunSequence {
1700 runs: vec![run_idx],
1701 len,
1702 level,
1703 sos: BidiClass::OtherNeutral,
1704 eos: BidiClass::OtherNeutral,
1705 indices: vec![],
1706 }
1707 }
1708}
1709
1710impl BidiClass {
1711 pub fn is_iso_init(self) -> bool {
1712 match self {
1713 BidiClass::RightToLeftIsolate
1714 | BidiClass::LeftToRightIsolate
1715 | BidiClass::FirstStrongIsolate => true,
1716 _ => false,
1717 }
1718 }
1719
1720 pub fn is_iso_control(self) -> bool {
1721 match self {
1722 BidiClass::RightToLeftIsolate
1723 | BidiClass::LeftToRightIsolate
1724 | BidiClass::PopDirectionalIsolate
1725 | BidiClass::FirstStrongIsolate => true,
1726 _ => false,
1727 }
1728 }
1729
1730 pub fn is_neutral(self) -> bool {
1731 match self {
1732 BidiClass::OtherNeutral
1733 | BidiClass::WhiteSpace
1734 | BidiClass::SegmentSeparator
1735 | BidiClass::ParagraphSeparator => true,
1736 _ => self.is_iso_control(),
1737 }
1738 }
1739}
1740
1741#[derive(Debug)]
1742struct Run {
1743 /// char indices for start, end of run
1744 start: usize,
1745 end: usize,
1746 /// length of run
1747 len: usize,
1748 /// isolating run sequence id
1749 seq_id: usize,
1750 /// Embedding level of this run
1751 level: Level,
1752 /// Direction of start of run
1753 sor: BidiClass,
1754 /// Direction of end of run
1755 eor: BidiClass,
1756}
1757
1758impl Run {
1759 fn first_significant_bidi_class(
1760 &self,
1761 types: &[BidiClass],
1762 levels: &[Level],
1763 ) -> Option<BidiClass> {
1764 for idx in self.start..self.end {
1765 if !levels[idx].removed_by_x9() {
1766 return types.get(idx).cloned();
1767 }
1768 }
1769 None
1770 }
1771}
1772
1773#[derive(Debug)]
1774struct IsolatingRunSequence {
1775 /// List of the runs in this sequence. The values are indices
1776 /// into the runs array
1777 runs: Vec<usize>,
1778 /// length of the run
1779 len: usize,
1780 /// Embedding level of this run
1781 level: Level,
1782 /// Direction of start of run
1783 sos: BidiClass,
1784 /// Direction of end of run
1785 eos: BidiClass,
1786 /// The sequence of indices into the original paragraph,
1787 /// across the contained set of runs
1788 indices: Vec<usize>,
1789}
1790
1791/// Starting from `start`, extract the first run containing characters
1792/// all with the same level
1793fn span_one_run(types: &[BidiClass], levels: &[Level], start: usize) -> (Level, usize) {
1794 let mut span_level = Level(NO_LEVEL);
1795 let mut isolate_init_found = false;
1796 let mut span_len = 0;
1797
1798 trace!(
1799 "span_one_run called with types: {:?}, levels: {:?}, start={}",
1800 types,
1801 levels,
1802 start
1803 );
1804
1805 for (idx, (bc, level)) in types
1806 .iter()
1807 .skip(start)
1808 .zip(levels.iter().skip(start))
1809 .enumerate()
1810 {
1811 trace!(
1812 "span_one_run: consider idx={} bc={:?} level={:?}",
1813 idx,
1814 bc,
1815 level
1816 );
1817 if !level.removed_by_x9() {
1818 if bc.is_iso_init() {
1819 isolate_init_found = true;
1820 }
1821 if span_level.removed_by_x9() {
1822 span_level = *level;
1823 } else if *level != span_level {
1824 // End of run
1825 break;
1826 }
1827 }
1828 span_len = idx;
1829 if isolate_init_found {
1830 break;
1831 }
1832 }
1833
1834 (span_level, span_len + 1)
1835}
1836
1837/// 3.3.1 Paragraph level.
1838/// We've been fed a single paragraph, which takes care of rule P1.
1839/// This function implements rules P2 and P3.
1840fn paragraph_level(types: &[BidiClass], respect_pdi: bool, fallback: Direction) -> Level {
1841 let mut isolate_count = 0;
1842 for &t in types {
1843 match t {
1844 BidiClass::RightToLeftIsolate
1845 | BidiClass::LeftToRightIsolate
1846 | BidiClass::FirstStrongIsolate => isolate_count += 1,
1847 BidiClass::PopDirectionalIsolate => {
1848 if isolate_count > 0 {
1849 isolate_count -= 1;
1850 } else if respect_pdi {
1851 break;
1852 }
1853 }
1854 BidiClass::LeftToRight if isolate_count == 0 => return Level(0),
1855 BidiClass::RightToLeft | BidiClass::ArabicLetter if isolate_count == 0 => {
1856 return Level(1)
1857 }
1858 _ => {}
1859 }
1860 }
1861 if fallback == Direction::LeftToRight {
1862 Level(0)
1863 } else {
1864 Level(1)
1865 }
1866}
1867
1868struct Pair {
1869 opening_pos: usize,
1870 closing_pos: usize,
1871}
1872
1873impl std::fmt::Debug for Pair {
1874 fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
1875 write!(fmt, "Pair{{{},{}}}", self.opening_pos, self.closing_pos)
1876 }
1877}
1878
1879const MAX_PAIRING_DEPTH: usize = 63;
1880struct BracketStack {
1881 closing_bracket: [char; MAX_PAIRING_DEPTH],
1882 position: [usize; MAX_PAIRING_DEPTH],
1883 depth: usize,
1884 pairs: Vec<Pair>,
1885}
1886
1887impl std::fmt::Debug for BracketStack {
1888 fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
1889 fmt.debug_struct("BracketStack")
1890 .field("closing_bracket", &&self.closing_bracket[0..self.depth])
1891 .field("position", &&self.position[0..self.depth])
1892 .field("depth", &self.depth)
1893 .field("pairs", &self.pairs)
1894 .finish()
1895 }
1896}
1897
1898impl BracketStack {
1899 pub fn new() -> Self {
1900 Self {
1901 closing_bracket: [' '; MAX_PAIRING_DEPTH],
1902 position: [0; MAX_PAIRING_DEPTH],
1903 depth: 0,
1904 pairs: vec![],
1905 }
1906 }
1907
1908 pub fn clear(&mut self) {
1909 self.pairs.clear();
1910 self.depth = 0;
1911 }
1912
1913 pub fn push(&mut self, closing_bracket: char, pos: usize) -> bool {
1914 let depth = self.depth;
1915 if depth >= MAX_PAIRING_DEPTH {
1916 return false;
1917 }
1918 self.closing_bracket[depth] = closing_bracket;
1919 self.position[depth] = pos;
1920 self.depth += 1;
1921 true
1922 }
1923
1924 /// Seek an opening bracket pair for the closing bracket
1925 /// passed in.
1926 ///
1927 /// This is a stack based search.
1928 /// Start with the top element in the stack and search
1929 /// downwards until we either find a match or reach the
1930 /// bottom of the stack.
1931 ///
1932 /// If we find a match, construct and append the bracket
1933 /// pair to the pairList. Then pop the stack for all the
1934 /// levels down to the level where we found the match.
1935 /// (This approach is designed to discard pairs that
1936 /// are not cleanly nested.)
1937 ///
1938 /// If we search all the way to the bottom of the stack
1939 /// without finding a match, just return without changing
1940 /// state. This represents a closing bracket with no
1941 /// opening bracket to match it. Just discard and move on.
1942 pub fn seek_matching_open_bracket(&mut self, closing_bracket: char, pos: usize) -> bool {
1943 trace!(
1944 "seek_matching_open_bracket: closing_bracket={:?} pos={}\n{:?}",
1945 closing_bracket,
1946 pos,
1947 self
1948 );
1949 for depth in (0..self.depth).rev() {
1950 trace!("seek_matching_open_bracket: consider depth={}", depth);
1951 // The basic test is for the closingcp equal to the bpb value
1952 // stored in the bracketData. But to account for the canonical
1953 // equivalences for U+2329 and U+232A, tack on extra checks here
1954 // for the asymmetrical matches. This hard-coded check avoids
1955 // having to require full normalization of all the bracket code
1956 // points before checking. It is highly unlikely that additional
1957 // canonical singletons for bracket pairs will be added to future
1958 // versions of the UCD.
1959 if self.closing_bracket[depth] == closing_bracket
1960 || (self.closing_bracket[depth] == '\u{232a}' && closing_bracket == '\u{3009}')
1961 || (self.closing_bracket[depth] == '\u{3009}' && closing_bracket == '\u{232a}')
1962 {
1963 self.pairs.push(Pair {
1964 opening_pos: self.position[depth],
1965 closing_pos: pos,
1966 });
1967 // Pop back to this depth, pruning out any intermediates;
1968 // they are mismatched brackets
1969 self.depth = depth;
1970 return true;
1971 }
1972 }
1973 false
1974 }
1975}
1976
1977fn lookup_closing(c: char) -> Option<(char, BracketType)> {
1978 use bidi_brackets::BIDI_BRACKETS;
1979 if let Ok(idx) = BIDI_BRACKETS.binary_search_by_key(&c, |&(left, _, _)| left) {
1980 let entry = &BIDI_BRACKETS[idx];
1981 return Some((entry.1, entry.2));
1982 }
1983 None
1984}
1985
1986pub fn bidi_class_for_char(c: char) -> BidiClass {
1987 use std::cmp::Ordering;
1988 if let Ok(idx) = bidi_class::BIDI_CLASS.binary_search_by(|&(lower, upper, _)| {
1989 if c >= lower && c <= upper {
1990 Ordering::Equal
1991 } else if c < lower {
1992 Ordering::Greater
1993 } else if c > upper {
1994 Ordering::Less
1995 } else {
1996 unreachable!()
1997 }
1998 }) {
1999 let entry = &bidi_class::BIDI_CLASS[idx];
2000 if c >= entry.0 && c <= entry.1 {
2001 return entry.2;
2002 }
2003 }
2004 // extracted/DerivedBidiClass.txt says:
2005 // All code points not explicitly listed for Bidi_Class
2006 // have the value Left_To_Right (L).
2007 BidiClass::LeftToRight
2008}
2009
2010#[cfg(test)]
2011mod tests {
2012 use super::*;
2013 use k9::assert_equal as assert_eq;
2014
2015 #[test]
2016 fn runs() {
2017 let text = vec!['א', 'ב', 'ג', 'a', 'b', 'c'];
2018
2019 let mut context = BidiContext::new();
2020 context.resolve_paragraph(&text, ParagraphDirectionHint::AutoLeftToRight);
2021 k9::snapshot!(
2022 context.runs().collect::<Vec<_>>(),
2023 "
2024[
2025 BidiRun {
2026 direction: RightToLeft,
2027 level: Level(
2028 1,
2029 ),
2030 range: 0..3,
2031 removed_by_x9: [],
2032 },
2033 BidiRun {
2034 direction: LeftToRight,
2035 level: Level(
2036 2,
2037 ),
2038 range: 3..6,
2039 removed_by_x9: [],
2040 },
2041]
2042"
2043 );
2044 }
2045
2046 #[test]
2047 fn mirror() {
2048 assert_eq!(lookup_closing('{'), Some(('}', BracketType::Open)));
2049 assert_eq!(lookup_closing('['), Some((']', BracketType::Open)));
2050 assert_eq!(lookup_closing(']'), Some(('[', BracketType::Close)));
2051 }
2052
2053 #[test]
2054 fn bidi_class_resolve() {
2055 assert_eq!(bidi_class_for_char('\u{0}'), BidiClass::BoundaryNeutral);
2056 assert_eq!(bidi_class_for_char('\u{9}'), BidiClass::SegmentSeparator);
2057 assert_eq!(bidi_class_for_char(' '), BidiClass::WhiteSpace);
2058 assert_eq!(bidi_class_for_char('a'), BidiClass::LeftToRight);
2059 assert_eq!(bidi_class_for_char('\u{590}'), BidiClass::RightToLeft);
2060 assert_eq!(bidi_class_for_char('\u{5d0}'), BidiClass::RightToLeft);
2061 assert_eq!(bidi_class_for_char('\u{5d1}'), BidiClass::RightToLeft);
2062 }
2063
2064 /// This example is taken from
2065 /// <https://terminal-wg.pages.freedesktop.org/bidi/recommendation/combining.html>
2066 #[test]
2067 fn reorder_nsm() {
2068 let shalom: Vec<char> = vec![
2069 '\u{5e9}', '\u{5b8}', '\u{5c1}', '\u{5dc}', '\u{5d5}', '\u{05b9}', '\u{5dd}',
2070 ];
2071 let mut context = BidiContext::new();
2072 context.set_reorder_non_spacing_marks(true);
2073 context.resolve_paragraph(&shalom, ParagraphDirectionHint::LeftToRight);
2074
2075 let mut reordered = vec![];
2076 for run in context.reordered_runs(0..shalom.len()) {
2077 for idx in run.indices {
2078 reordered.push(shalom[idx]);
2079 }
2080 }
2081
2082 let explicit_ltr = vec![
2083 '\u{5dd}', '\u{5d5}', '\u{5b9}', '\u{5dc}', '\u{5e9}', '\u{5b8}', '\u{5c1}',
2084 ];
2085 assert_eq!(reordered, explicit_ltr);
2086 }
2087}