unic_segment/
grapheme.rs

1// Copyright 2012-2015 The Rust Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12//! Unicode Grapheme Clusters of a string.
13//!
14//! ## References
15//!
16//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
17
18use std::cmp;
19
20use unic_ucd_segment::GraphemeClusterBreak as GCB;
21
/// External iterator for grapheme clusters and byte offsets.
///
/// Each item is an `(offset, cluster)` pair, where `offset` is the byte offset of `cluster`
/// measured from the start of the string the iterator was created from.
#[derive(Clone, Debug)]
pub struct GraphemeIndices<'a> {
    /// Address of the first byte of the original string. It is subtracted from the address of
    /// each yielded cluster to recover that cluster's byte offset.
    start_offset: usize,
    /// The wrapped grapheme cluster iterator that does the actual segmentation.
    iter: Graphemes<'a>,
}
28
29impl<'a> GraphemeIndices<'a> {
30    /// Create new iterator for *extended grapheme clusters*.
31    #[inline]
32    pub fn new(s: &str) -> GraphemeIndices<'_> {
33        GraphemeIndices {
34            start_offset: s.as_ptr() as usize,
35            iter: Graphemes::new(s),
36        }
37    }
38
39    /// Create new iterator for *legacy grapheme clusters*.
40    #[inline]
41    pub fn new_legacy(s: &str) -> GraphemeIndices<'_> {
42        GraphemeIndices {
43            start_offset: s.as_ptr() as usize,
44            iter: Graphemes::new_legacy(s),
45        }
46    }
47
48    #[inline]
49    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
50    ///
51    /// ```rust
52    /// # use unic_segment::GraphemeIndices;
53    /// let mut iter = GraphemeIndices::new("abc");
54    /// assert_eq!(iter.as_str(), "abc");
55    /// iter.next();
56    /// assert_eq!(iter.as_str(), "bc");
57    /// iter.next();
58    /// iter.next();
59    /// assert_eq!(iter.as_str(), "");
60    /// ```
61    pub fn as_str(&self) -> &'a str {
62        self.iter.as_str()
63    }
64}
65
66impl<'a> Iterator for GraphemeIndices<'a> {
67    type Item = (usize, &'a str);
68
69    #[inline]
70    fn next(&mut self) -> Option<(usize, &'a str)> {
71        self.iter
72            .next()
73            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
74    }
75
76    #[inline]
77    fn size_hint(&self) -> (usize, Option<usize>) {
78        self.iter.size_hint()
79    }
80}
81
82impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
83    #[inline]
84    fn next_back(&mut self) -> Option<(usize, &'a str)> {
85        self.iter
86            .next_back()
87            .map(|s| (s.as_ptr() as usize - self.start_offset, s))
88    }
89}
90
/// External iterator for a string's
/// [grapheme clusters](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
///
/// The iterator is double-ended: it keeps one cursor for each end of the string and stops
/// when the two cursors meet.
#[derive(Clone, Debug)]
pub struct Graphemes<'a> {
    /// The complete string being segmented.
    string: &'a str,
    /// Cursor advanced by forward iteration; created at offset 0.
    cursor: GraphemeCursor,
    /// Cursor moved by backward iteration; created at the end of the string.
    cursor_back: GraphemeCursor,
}
99
100impl<'a> Graphemes<'a> {
101    /// Create new iterator for *extended grapheme clusters*.
102    #[inline]
103    pub fn new(s: &str) -> Graphemes<'_> {
104        let len = s.len();
105        Graphemes {
106            string: s,
107            cursor: GraphemeCursor::new(0, len),
108            cursor_back: GraphemeCursor::new(len, len),
109        }
110    }
111
112    /// Create new iterator for *legacy grapheme clusters*.
113    #[inline]
114    pub fn new_legacy(s: &str) -> Graphemes<'_> {
115        let len = s.len();
116        Graphemes {
117            string: s,
118            cursor: GraphemeCursor::new_legacy(0, len),
119            cursor_back: GraphemeCursor::new_legacy(len, len),
120        }
121    }
122
123    #[inline]
124    /// View the underlying data (the part yet to be iterated) as a slice of the original string.
125    ///
126    /// ```rust
127    /// # use unic_segment::Graphemes;
128    /// let mut iter = Graphemes::new("abc");
129    /// assert_eq!(iter.as_str(), "abc");
130    /// iter.next();
131    /// assert_eq!(iter.as_str(), "bc");
132    /// iter.next();
133    /// iter.next();
134    /// assert_eq!(iter.as_str(), "");
135    /// ```
136    pub fn as_str(&self) -> &'a str {
137        &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
138    }
139}
140
141impl<'a> Iterator for Graphemes<'a> {
142    type Item = &'a str;
143
144    #[inline]
145    fn size_hint(&self) -> (usize, Option<usize>) {
146        let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
147        (cmp::min(slen, 1), Some(slen))
148    }
149
150    #[inline]
151    fn next(&mut self) -> Option<&'a str> {
152        let start = self.cursor.cur_cursor();
153        if start == self.cursor_back.cur_cursor() {
154            return None;
155        }
156        let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
157        Some(&self.string[start..next])
158    }
159}
160
161impl<'a> DoubleEndedIterator for Graphemes<'a> {
162    #[inline]
163    fn next_back(&mut self) -> Option<&'a str> {
164        let end = self.cursor_back.cur_cursor();
165        if end == self.cursor.cur_cursor() {
166            return None;
167        }
168        let prev = self
169            .cursor_back
170            .prev_boundary(self.string, 0)
171            .unwrap()
172            .unwrap();
173        Some(&self.string[prev..end])
174    }
175}
176
// maybe unify with PairResult?
// An enum describing what is currently known about the potential boundary at the cursor's
// offset. `Break` and `NotBreak` are final answers; `Regional` and `Emoji` record which kind of
// pre-context scan is still pending.
#[derive(Clone, Debug, Eq, PartialEq)]
enum GraphemeState {
    // No information is known.
    Unknown,
    // It is known to not be a boundary.
    NotBreak,
    // It is known to be a boundary.
    Break,
    // The codepoint after is a Regional Indicator Symbol, so a boundary iff
    // it is preceded by an even number of RIS codepoints. (GB12, GB13)
    Regional,
    // The codepoint after is in the E_Modifier category, so whether it's a boundary
    // depends on pre-context according to GB10: it is not a boundary when the preceding
    // codepoints are an Emoji Base followed by zero or more Extend codepoints.
    Emoji,
}
194
/// Cursor-based segmenter for grapheme clusters.
///
/// The cursor stores only offsets and classification state, never the text itself. The text is
/// supplied chunk by chunk to `is_boundary`, `next_boundary`, `prev_boundary` and
/// `provide_context`.
#[derive(Clone, Debug)]
pub struct GraphemeCursor {
    /// Current cursor position, as a byte offset into the whole string.
    offset: usize,

    /// Total length of the string, in bytes.
    len: usize,

    /// A config flag indicating whether this cursor computes legacy or extended grapheme cluster
    /// boundaries (enables GB9a and GB9b if set).
    is_extended: bool,

    /// Information about the potential boundary at `offset`.
    state: GraphemeState,

    /// Category of codepoint immediately preceding cursor, if known.
    cat_before: Option<GCB>,

    /// Category of codepoint immediately after cursor, if known.
    cat_after: Option<GCB>,

    /// If set, at least one more codepoint immediately preceding this offset is needed to resolve
    /// whether there's a boundary at `offset`. This is the value reported to the caller in
    /// `GraphemeIncomplete::PreContext`.
    pre_context_offset: Option<usize>,

    /// The number of RIS codepoints preceding `offset`. If `pre_context_offset` is set, then counts
    /// the number of RIS between that and `offset`, otherwise is an accurate count relative to the
    /// string.
    ris_count: Option<usize>,

    /// Set if a call to `prev_boundary` or `next_boundary` was suspended due to needing more input.
    /// While set, the next such call re-examines the current offset instead of moving first.
    resuming: bool,
}
229
/// An error return indicating that not enough content was available in the
/// provided chunk to satisfy the query, and that more content must be provided.
///
/// Except for `InvalidOffset`, each variant is a request for more input rather than a failure:
/// the caller supplies the requested text and retries the same query.
#[derive(Debug, Eq, PartialEq)]
pub enum GraphemeIncomplete {
    /// More pre-context is needed. The caller should call `provide_context`
    /// with a chunk ending at the offset given, then retry the query. This
    /// will only be returned if the `chunk_start` parameter is nonzero.
    PreContext(usize),

    /// When requesting `prev_boundary`, the cursor is moving past the beginning
    /// of the current chunk, so the chunk before that is requested. This will
    /// only be returned if the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// When requesting `next_boundary`, the cursor is moving past the end of the
    /// current chunk, so the chunk after that is requested. This will only be
    /// returned if the chunk ends before the `len` parameter provided on
    /// creation of the cursor.
    NextChunk, // requesting chunk following the one given

    /// An error returned when the chunk given does not contain the cursor position.
    InvalidOffset,
}
253
// An enum describing the result from lookup of a pair of categories: the category of the
// codepoint before the candidate boundary and the category of the codepoint after it.
#[derive(Eq, PartialEq)]
enum PairResult {
    /// definitely not a break
    NotBreak,

    /// definitely a break
    Break,

    /// a break iff not in extended mode
    Extended,

    /// a break iff preceded by an even number of Regional Indicators
    Regional,

    /// not a break iff the preceding codepoints are an Emoji Base followed by (Extend)*;
    /// a break otherwise
    Emoji,
}
272
273fn check_pair(before: GCB, after: GCB) -> PairResult {
274    use self::PairResult::*;
275
276    #[cfg_attr(feature = "cargo-clippy", allow(match_same_arms))]
277    match (before, after) {
278        // Do not break between a CR and LF. Otherwise, break before and after controls.
279        (GCB::CR, GCB::LF) => NotBreak, // GB3
280        (GCB::Control, _) => Break,     // GB4
281        (GCB::CR, _) => Break,          // GB4
282        (GCB::LF, _) => Break,          // GB4
283        (_, GCB::Control) => Break,     // GB5
284        (_, GCB::CR) => Break,          // GB5
285        (_, GCB::LF) => Break,          // GB5
286
287        // Do not break Hangul syllable sequences.
288        (GCB::L, GCB::L) => NotBreak,   // GB6
289        (GCB::L, GCB::V) => NotBreak,   // GB6
290        (GCB::L, GCB::LV) => NotBreak,  // GB6
291        (GCB::L, GCB::LVT) => NotBreak, // GB6
292        (GCB::LV, GCB::V) => NotBreak,  // GB7
293        (GCB::LV, GCB::T) => NotBreak,  // GB7
294        (GCB::V, GCB::V) => NotBreak,   // GB7
295        (GCB::V, GCB::T) => NotBreak,   // GB7
296        (GCB::LVT, GCB::T) => NotBreak, // GB8
297        (GCB::T, GCB::T) => NotBreak,   // GB8
298
299        // Do not break before extending characters or ZWJ.
300        (_, GCB::Extend) => NotBreak, // GB9
301        (_, GCB::ZWJ) => NotBreak,    // GB9
302
303        // Only for extended grapheme clusters:
304        // Do not break before SpacingMarks, or after Prepend characters.
305        (_, GCB::SpacingMark) => Extended, // GB9a
306        (GCB::Prepend, _) => Extended,     // GB9b
307
308        // Do not break within Emoji Modifier Sequences or Emoji ZWJ Sequences.
309        (GCB::EBase, GCB::EModifier) => NotBreak,    // GB10
310        (GCB::EBaseGAZ, GCB::EModifier) => NotBreak, // GB10
311        (GCB::Extend, GCB::EModifier) => Emoji,      // GB10
312        (GCB::ZWJ, GCB::GlueAfterZwj) => NotBreak,   // GB11
313        (GCB::ZWJ, GCB::EBaseGAZ) => NotBreak,       // GB11
314
315        // Do not break within emoji flag sequences. That is, do not break between regional
316        // indicator (RI) symbols if there is an odd number of RI characters before the break point.
317        (GCB::RegionalIndicator, GCB::RegionalIndicator) => Regional, // GB12, GB13
318
319        // Otherwise, break everywhere.
320        (_, _) => Break, // GB999
321    }
322}
323
324impl GraphemeCursor {
325    /// Create a new cursor. The string and initial offset are given at creation
326    /// time, but the contents of the string are not.
327    ///
328    /// The `offset` parameter must be on a codepoint boundary.
329    ///
330    /// ```rust
331    /// # use unic_segment::GraphemeCursor;
332    /// let s = "हिन्दी";
333    /// let mut extended = GraphemeCursor::new(0, s.len());
334    /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
335    /// ```
336    pub fn new(offset: usize, len: usize) -> GraphemeCursor {
337        let state = if offset == 0 || offset == len {
338            GraphemeState::Break
339        } else {
340            GraphemeState::Unknown
341        };
342        GraphemeCursor {
343            offset,
344            len,
345            state,
346            is_extended: true,
347            cat_before: None,
348            cat_after: None,
349            pre_context_offset: None,
350            ris_count: None,
351            resuming: false,
352        }
353    }
354
355    /// Create a new cursor. The string and initial offset are given at creation
356    /// time, but the contents of the string are not.
357    ///
358    /// The `offset` parameter must be on a codepoint boundary.
359    ///
360    /// ```rust
361    /// # use unic_segment::GraphemeCursor;
362    /// let s = "हिन्दी";
363    /// let mut legacy = GraphemeCursor::new_legacy(0, s.len());
364    /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
365    /// ```
366    pub fn new_legacy(offset: usize, len: usize) -> GraphemeCursor {
367        let state = if offset == 0 || offset == len {
368            GraphemeState::Break
369        } else {
370            GraphemeState::Unknown
371        };
372        GraphemeCursor {
373            offset,
374            len,
375            state,
376            is_extended: false,
377            cat_before: None,
378            cat_after: None,
379            pre_context_offset: None,
380            ris_count: None,
381            resuming: false,
382        }
383    }
384
385    // FIXME: Not sure I'm gonna keep this, the advantage over new() seems thin.
386    /// Set the cursor to a new location in the same string.
387    ///
388    /// ```rust
389    /// # use unic_segment::GraphemeCursor;
390    /// let s = "abcd";
391    /// let mut cursor = GraphemeCursor::new(0, s.len());
392    /// assert_eq!(cursor.cur_cursor(), 0);
393    /// cursor.set_cursor(2);
394    /// assert_eq!(cursor.cur_cursor(), 2);
395    /// ```
396    pub fn set_cursor(&mut self, offset: usize) {
397        if offset != self.offset {
398            self.offset = offset;
399            self.state = if offset == 0 || offset == self.len {
400                GraphemeState::Break
401            } else {
402                GraphemeState::Unknown
403            };
404            // reset state derived from text around cursor
405            self.cat_before = None;
406            self.cat_after = None;
407            self.ris_count = None;
408        }
409    }
410
411    /// The current offset of the cursor. Equal to the last value provided to
412    /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
413    /// `prev_boundary()`.
414    ///
415    /// ```rust
416    /// # use unic_segment::GraphemeCursor;
417    /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
418    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
419    /// let mut cursor = GraphemeCursor::new(4, flags.len());
420    /// assert_eq!(cursor.cur_cursor(), 4);
421    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
422    /// assert_eq!(cursor.cur_cursor(), 8);
423    /// ```
424    pub fn cur_cursor(&self) -> usize {
425        self.offset
426    }
427
428    /// Provide additional pre-context when it is needed to decide a boundary.
429    /// The end of the chunk must coincide with the value given in the
430    /// `GraphemeIncomplete::PreContext` request.
431    ///
432    /// ```rust
433    /// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
434    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
435    /// let mut cursor = GraphemeCursor::new(8, flags.len());
436    ///
437    /// // Not enough pre-context to decide if there's a boundary between the two flags.
438    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
439    ///
440    /// // Provide one more Regional Indicator Symbol of pre-context
441    /// cursor.provide_context(&flags[4..8], 4);
442    ///
443    /// // Still not enough context to decide.
444    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
445    ///
446    /// // Provide additional requested context.
447    /// cursor.provide_context(&flags[0..4], 0);
448    ///
449    /// // That's enough to decide (it always is when context goes to the start of the string)
450    /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
451    /// ```
452    pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
453        assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
454        self.pre_context_offset = None;
455        if self.is_extended && chunk_start + chunk.len() == self.offset {
456            let ch = chunk.chars().rev().next().unwrap();
457            if GCB::of(ch) == GCB::Prepend {
458                self.decide(false); // GB9b
459                return;
460            }
461        }
462        match self.state {
463            GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
464            GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
465            _ => panic!("invalid state"),
466        }
467    }
468
469    fn decide(&mut self, is_break: bool) {
470        self.state = if is_break {
471            GraphemeState::Break
472        } else {
473            GraphemeState::NotBreak
474        };
475    }
476
477    fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
478        self.decide(is_break);
479        Ok(is_break)
480    }
481
482    fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
483        if self.state == GraphemeState::Break {
484            Ok(true)
485        } else if self.state == GraphemeState::NotBreak {
486            Ok(false)
487        } else if let Some(pre_context_offset) = self.pre_context_offset {
488            Err(GraphemeIncomplete::PreContext(pre_context_offset))
489        } else {
490            unreachable!("inconsistent state");
491        }
492    }
493
494    fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
495        let mut ris_count = self.ris_count.unwrap_or(0);
496        for ch in chunk.chars().rev() {
497            if GCB::of(ch) != GCB::RegionalIndicator {
498                self.ris_count = Some(ris_count);
499                self.decide((ris_count % 2) == 0);
500                return;
501            }
502            ris_count += 1;
503        }
504        self.ris_count = Some(ris_count);
505        if chunk_start == 0 {
506            self.decide((ris_count % 2) == 0);
507            return;
508        }
509        self.pre_context_offset = Some(chunk_start);
510    }
511
512    fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
513        for ch in chunk.chars().rev() {
514            match GCB::of(ch) {
515                GCB::Extend => (),
516                GCB::EBase | GCB::EBaseGAZ => {
517                    self.decide(false);
518                    return;
519                }
520                _ => {
521                    self.decide(true);
522                    return;
523                }
524            }
525        }
526        if chunk_start == 0 {
527            self.decide(true);
528            return;
529        }
530        self.pre_context_offset = Some(chunk_start);
531    }
532
533    // TODO(clippy): Fix clippy warning or leave it as allowed if really needed.
534    // `warning: methods called `is_*` usually take self by reference or no self; consider choosing
535    // a less ambiguous name`
536    #[cfg_attr(feature = "cargo-clippy", allow(wrong_self_convention))]
537    /// Determine whether the current cursor location is a grapheme cluster boundary.
538    /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
539    /// the length of `chunk` is not equal to `len` on creation, then this method
540    /// may return `GraphemeIncomplete::PreContext`. The caller should then
541    /// call `provide_context` with the requested chunk, then retry calling this
542    /// method.
543    ///
544    /// For partial chunks, if the cursor is not at the beginning or end of the
545    /// string, the chunk should contain at least the codepoint following the cursor.
546    /// If the string is nonempty, the chunk must be nonempty.
547    ///
548    /// All calls should have consistent chunk contents (ie, if a chunk provides
549    /// content for a given slice, all further chunks covering that slice must have
550    /// the same content for it).
551    ///
552    /// ```rust
553    /// # use unic_segment::GraphemeCursor;
554    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
555    /// let mut cursor = GraphemeCursor::new(8, flags.len());
556    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
557    /// cursor.set_cursor(12);
558    /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
559    /// ```
560    pub fn is_boundary(
561        &mut self,
562        chunk: &str,
563        chunk_start: usize,
564    ) -> Result<bool, GraphemeIncomplete> {
565        if self.state == GraphemeState::Break {
566            return Ok(true);
567        }
568        if self.state == GraphemeState::NotBreak {
569            return Ok(false);
570        }
571        if (self.offset < chunk_start || self.offset >= chunk_start + chunk.len())
572            && (self.offset > chunk_start + chunk.len() || self.cat_after.is_none())
573        {
574            return Err(GraphemeIncomplete::InvalidOffset);
575        }
576        if let Some(pre_context_offset) = self.pre_context_offset {
577            return Err(GraphemeIncomplete::PreContext(pre_context_offset));
578        }
579        let offset_in_chunk = self.offset - chunk_start;
580        if self.cat_after.is_none() {
581            let ch = chunk[offset_in_chunk..].chars().next().unwrap();
582            self.cat_after = Some(GCB::of(ch));
583        }
584        if self.offset == chunk_start {
585            let mut need_pre_context = true;
586            match self.cat_after.unwrap() {
587                GCB::RegionalIndicator => self.state = GraphemeState::Regional,
588                GCB::EModifier => self.state = GraphemeState::Emoji,
589                _ => need_pre_context = self.cat_before.is_none(),
590            }
591            if need_pre_context {
592                self.pre_context_offset = Some(chunk_start);
593                return Err(GraphemeIncomplete::PreContext(chunk_start));
594            }
595        }
596        if self.cat_before.is_none() {
597            let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
598            self.cat_before = Some(GCB::of(ch));
599        }
600        match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
601            PairResult::NotBreak => self.decision(false),
602            PairResult::Break => self.decision(true),
603            PairResult::Extended => {
604                let is_extended = self.is_extended;
605                self.decision(!is_extended)
606            }
607            PairResult::Regional => {
608                if let Some(ris_count) = self.ris_count {
609                    return self.decision((ris_count % 2) == 0);
610                }
611                self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
612                self.is_boundary_result()
613            }
614            PairResult::Emoji => {
615                self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
616                self.is_boundary_result()
617            }
618        }
619    }
620
621    /// Find the next boundary after the current cursor position. Only a part of
622    /// the string need be supplied. If the chunk is incomplete, then this
623    /// method might return `GraphemeIncomplete::PreContext` or
624    /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
625    /// call `provide_context` with the requested chunk, then retry. In the
626    /// latter case, the caller should provide the chunk following the one
627    /// given, then retry.
628    ///
629    /// See `is_boundary` for expectations on the provided chunk.
630    ///
631    /// ```rust
632    /// # use unic_segment::GraphemeCursor;
633    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
634    /// let mut cursor = GraphemeCursor::new(4, flags.len());
635    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
636    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
637    /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
638    /// ```
639    ///
640    /// And an example that uses partial strings:
641    ///
642    /// ```rust
643    /// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
644    /// let s = "abcd";
645    /// let mut cursor = GraphemeCursor::new(0, s.len());
646    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
647    /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
648    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
649    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
650    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
651    /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
652    /// ```
653    pub fn next_boundary(
654        &mut self,
655        chunk: &str,
656        chunk_start: usize,
657    ) -> Result<Option<usize>, GraphemeIncomplete> {
658        if self.offset == self.len {
659            return Ok(None);
660        }
661        let mut iter = chunk[self.offset - chunk_start..].chars();
662        let mut ch = iter.next().unwrap();
663        loop {
664            if self.resuming {
665                if self.cat_after.is_none() {
666                    self.cat_after = Some(GCB::of(ch));
667                }
668            } else {
669                self.offset += ch.len_utf8();
670                self.state = GraphemeState::Unknown;
671                self.cat_before = self.cat_after.take();
672                if self.cat_before.is_none() {
673                    self.cat_before = Some(GCB::of(ch));
674                }
675                if self.cat_before == Some(GCB::RegionalIndicator) {
676                    self.ris_count = self.ris_count.map(|c| c + 1);
677                } else {
678                    self.ris_count = Some(0);
679                }
680                if let Some(next_ch) = iter.next() {
681                    ch = next_ch;
682                    self.cat_after = Some(GCB::of(ch));
683                } else if self.offset == self.len {
684                    self.decide(true);
685                } else {
686                    self.resuming = true;
687                    return Err(GraphemeIncomplete::NextChunk);
688                }
689            }
690            self.resuming = true;
691            if self.is_boundary(chunk, chunk_start)? {
692                self.resuming = false;
693                return Ok(Some(self.offset));
694            }
695            self.resuming = false;
696        }
697    }
698
699    /// Find the previous boundary after the current cursor position. Only a part
700    /// of the string need be supplied. If the chunk is incomplete, then this
701    /// method might return `GraphemeIncomplete::PreContext` or
702    /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
703    /// call `provide_context` with the requested chunk, then retry. In the
704    /// latter case, the caller should provide the chunk preceding the one
705    /// given, then retry.
706    ///
707    /// See `is_boundary` for expectations on the provided chunk.
708    ///
709    /// ```rust
710    /// # use unic_segment::GraphemeCursor;
711    /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
712    /// let mut cursor = GraphemeCursor::new(12, flags.len());
713    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
714    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
715    /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
716    /// ```
717    ///
718    /// And an example that uses partial strings (note the exact return is not
719    /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
720    ///
721    /// ```rust
722    /// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
723    /// let s = "abcd";
724    /// let mut cursor = GraphemeCursor::new(4, s.len());
725    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
726    /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
727    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
728    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
729    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
730    /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
731    /// ```
732    pub fn prev_boundary(
733        &mut self,
734        chunk: &str,
735        chunk_start: usize,
736    ) -> Result<Option<usize>, GraphemeIncomplete> {
737        if self.offset == 0 {
738            return Ok(None);
739        }
740        let mut iter = chunk[..self.offset - chunk_start].chars().rev();
741        let mut ch = iter.next().unwrap();
742        loop {
743            if self.offset == chunk_start {
744                self.resuming = true;
745                return Err(GraphemeIncomplete::PrevChunk);
746            }
747            if self.resuming {
748                self.cat_before = Some(GCB::of(ch));
749            } else {
750                self.offset -= ch.len_utf8();
751                self.cat_after = self.cat_before.take();
752                self.state = GraphemeState::Unknown;
753                if let Some(ris_count) = self.ris_count {
754                    self.ris_count = if ris_count > 0 {
755                        Some(ris_count - 1)
756                    } else {
757                        None
758                    };
759                }
760                if let Some(prev_ch) = iter.next() {
761                    ch = prev_ch;
762                    self.cat_before = Some(GCB::of(ch));
763                } else if self.offset == 0 {
764                    self.decide(true);
765                } else {
766                    self.resuming = true;
767                    return Err(GraphemeIncomplete::PrevChunk);
768                }
769            }
770            self.resuming = true;
771            if self.is_boundary(chunk, chunk_start)? {
772                self.resuming = false;
773                return Ok(Some(self.offset));
774            }
775            self.resuming = false;
776        }
777    }
778}
779
#[cfg(test)]
mod tests {
    use super::{GraphemeIndices, Graphemes};

    // Four clusters, 13 bytes in total: "a" + U+0310, "e" + U+0301, "o" + U+0308 + U+0332, and
    // a CR LF pair. Written with escapes so the combining marks are visible in the source.
    const SAMPLE: &str = "a\u{310}e\u{301}o\u{308}\u{332}\r\n";

    #[test]
    fn test_grapheme_indices() {
        // Front to back.
        let forward: Vec<(usize, &str)> = GraphemeIndices::new(SAMPLE).collect();
        assert_eq!(
            forward,
            &[
                (0, "a\u{310}"),
                (3, "e\u{301}"),
                (6, "o\u{308}\u{332}"),
                (11, "\r\n")
            ]
        );

        // Back to front: same clusters and offsets, reversed order.
        let backward: Vec<(usize, &str)> = GraphemeIndices::new(SAMPLE).rev().collect();
        assert_eq!(
            backward,
            &[
                (11, "\r\n"),
                (6, "o\u{308}\u{332}"),
                (3, "e\u{301}"),
                (0, "a\u{310}")
            ]
        );

        // The size hint is derived from the remaining byte count and collapses to zero once
        // the iterator has been drained.
        let mut indices = GraphemeIndices::new(SAMPLE);
        assert_eq!(indices.size_hint(), (1, Some(13)));
        assert_eq!(indices.by_ref().count(), 4);
        assert_eq!(indices.size_hint(), (0, Some(0)));
    }

    #[test]
    fn test_graphemes() {
        let clusters: Vec<&str> = Graphemes::new(SAMPLE).collect();
        assert_eq!(
            clusters,
            &["a\u{310}", "e\u{301}", "o\u{308}\u{332}", "\r\n"]
        );

        // Make sure the reverse iterator does the right thing with "\n" at beginning of string.
        let line_breaks = "\n\r\n\r";
        let reversed: Vec<&str> = Graphemes::new(line_breaks).rev().collect();
        assert_eq!(reversed, &["\r", "\r\n", "\n"]);
    }
}
823}