unic_segment/grapheme.rs
1// Copyright 2012-2015 The Rust Project Developers.
2// Copyright 2017 The UNIC Project Developers.
3//
4// See the COPYRIGHT file at the top-level directory of this distribution.
5//
6// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
7// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
8// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
9// option. This file may not be copied, modified, or distributed
10// except according to those terms.
11
12//! Unicode Grapheme Clusters of a string.
13//!
14//! ## References
15//!
16//! * <https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries>
17
18use std::cmp;
19
20use unic_ucd_segment::GraphemeClusterBreak as GCB;
21
22/// External iterator for grapheme clusters and byte offsets.
23#[derive(Clone, Debug)]
24pub struct GraphemeIndices<'a> {
25 start_offset: usize,
26 iter: Graphemes<'a>,
27}
28
29impl<'a> GraphemeIndices<'a> {
30 /// Create new iterator for *extended grapheme clusters*.
31 #[inline]
32 pub fn new(s: &str) -> GraphemeIndices<'_> {
33 GraphemeIndices {
34 start_offset: s.as_ptr() as usize,
35 iter: Graphemes::new(s),
36 }
37 }
38
39 /// Create new iterator for *legacy grapheme clusters*.
40 #[inline]
41 pub fn new_legacy(s: &str) -> GraphemeIndices<'_> {
42 GraphemeIndices {
43 start_offset: s.as_ptr() as usize,
44 iter: Graphemes::new_legacy(s),
45 }
46 }
47
48 #[inline]
49 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
50 ///
51 /// ```rust
52 /// # use unic_segment::GraphemeIndices;
53 /// let mut iter = GraphemeIndices::new("abc");
54 /// assert_eq!(iter.as_str(), "abc");
55 /// iter.next();
56 /// assert_eq!(iter.as_str(), "bc");
57 /// iter.next();
58 /// iter.next();
59 /// assert_eq!(iter.as_str(), "");
60 /// ```
61 pub fn as_str(&self) -> &'a str {
62 self.iter.as_str()
63 }
64}
65
66impl<'a> Iterator for GraphemeIndices<'a> {
67 type Item = (usize, &'a str);
68
69 #[inline]
70 fn next(&mut self) -> Option<(usize, &'a str)> {
71 self.iter
72 .next()
73 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
74 }
75
76 #[inline]
77 fn size_hint(&self) -> (usize, Option<usize>) {
78 self.iter.size_hint()
79 }
80}
81
82impl<'a> DoubleEndedIterator for GraphemeIndices<'a> {
83 #[inline]
84 fn next_back(&mut self) -> Option<(usize, &'a str)> {
85 self.iter
86 .next_back()
87 .map(|s| (s.as_ptr() as usize - self.start_offset, s))
88 }
89}
90
91/// External iterator for a string's
92/// [grapheme clusters](https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries).
93#[derive(Clone, Debug)]
94pub struct Graphemes<'a> {
95 string: &'a str,
96 cursor: GraphemeCursor,
97 cursor_back: GraphemeCursor,
98}
99
100impl<'a> Graphemes<'a> {
101 /// Create new iterator for *extended grapheme clusters*.
102 #[inline]
103 pub fn new(s: &str) -> Graphemes<'_> {
104 let len = s.len();
105 Graphemes {
106 string: s,
107 cursor: GraphemeCursor::new(0, len),
108 cursor_back: GraphemeCursor::new(len, len),
109 }
110 }
111
112 /// Create new iterator for *legacy grapheme clusters*.
113 #[inline]
114 pub fn new_legacy(s: &str) -> Graphemes<'_> {
115 let len = s.len();
116 Graphemes {
117 string: s,
118 cursor: GraphemeCursor::new_legacy(0, len),
119 cursor_back: GraphemeCursor::new_legacy(len, len),
120 }
121 }
122
123 #[inline]
124 /// View the underlying data (the part yet to be iterated) as a slice of the original string.
125 ///
126 /// ```rust
127 /// # use unic_segment::Graphemes;
128 /// let mut iter = Graphemes::new("abc");
129 /// assert_eq!(iter.as_str(), "abc");
130 /// iter.next();
131 /// assert_eq!(iter.as_str(), "bc");
132 /// iter.next();
133 /// iter.next();
134 /// assert_eq!(iter.as_str(), "");
135 /// ```
136 pub fn as_str(&self) -> &'a str {
137 &self.string[self.cursor.cur_cursor()..self.cursor_back.cur_cursor()]
138 }
139}
140
141impl<'a> Iterator for Graphemes<'a> {
142 type Item = &'a str;
143
144 #[inline]
145 fn size_hint(&self) -> (usize, Option<usize>) {
146 let slen = self.cursor_back.cur_cursor() - self.cursor.cur_cursor();
147 (cmp::min(slen, 1), Some(slen))
148 }
149
150 #[inline]
151 fn next(&mut self) -> Option<&'a str> {
152 let start = self.cursor.cur_cursor();
153 if start == self.cursor_back.cur_cursor() {
154 return None;
155 }
156 let next = self.cursor.next_boundary(self.string, 0).unwrap().unwrap();
157 Some(&self.string[start..next])
158 }
159}
160
161impl<'a> DoubleEndedIterator for Graphemes<'a> {
162 #[inline]
163 fn next_back(&mut self) -> Option<&'a str> {
164 let end = self.cursor_back.cur_cursor();
165 if end == self.cursor.cur_cursor() {
166 return None;
167 }
168 let prev = self
169 .cursor_back
170 .prev_boundary(self.string, 0)
171 .unwrap()
172 .unwrap();
173 Some(&self.string[prev..end])
174 }
175}
176
// maybe unify with PairResult?
/// What is currently known about the potential boundary at the cursor offset.
#[derive(Debug, Clone, PartialEq, Eq)]
enum GraphemeState {
    /// Nothing has been determined yet.
    Unknown,

    /// The offset is known not to be a boundary.
    NotBreak,

    /// The offset is known to be a boundary.
    Break,

    /// The code point after the offset is a Regional Indicator Symbol, so the offset is a
    /// boundary iff an even number of RIS code points precedes it (GB12, GB13).
    Regional,

    /// The code point after the offset is in the E_Modifier category, so whether the offset is a
    /// boundary depends on the pre-context, as described by GB10.
    Emoji,
}
194
195/// Cursor-based segmenter for grapheme clusters.
196#[derive(Clone, Debug)]
197pub struct GraphemeCursor {
198 /// Current cursor position.
199 offset: usize,
200
201 /// Total length of the string.
202 len: usize,
203
204 /// A config flag indicating whether this cursor computes legacy or extended grapheme cluster
205 /// boundaries (enables GB9a and GB9b if set).
206 is_extended: bool,
207
208 /// Information about the potential boundary at `offset`.
209 state: GraphemeState,
210
211 /// Category of codepoint immediately preceding cursor, if known.
212 cat_before: Option<GCB>,
213
214 /// Category of codepoint immediately after cursor, if known.
215 cat_after: Option<GCB>,
216
217 /// If set, at least one more codepoint immediately preceding this offset is needed to resolve
218 /// whether there's a boundary at `offset`.
219 pre_context_offset: Option<usize>,
220
221 /// The number of RIS codepoints preceding `offset`. If `pre_context_offset` is set, then counts
222 /// the number of RIS between that and `offset`, otherwise is an accurate count relative to the
223 /// string.
224 ris_count: Option<usize>,
225
226 /// Set if a call to `prev_boundary` or `next_boundary` was suspended due to needing more input.
227 resuming: bool,
228}
229
/// Error value signalling that the supplied chunk did not hold enough text to answer the query,
/// so the caller has to provide more.
#[derive(Debug, PartialEq, Eq)]
pub enum GraphemeIncomplete {
    /// More pre-context is required. The caller should call `provide_context` with a chunk that
    /// ends at the given offset and then repeat the query. Only returned when the `chunk_start`
    /// parameter is nonzero.
    PreContext(usize),

    /// `prev_boundary` has to look before the start of the current chunk, so the preceding chunk
    /// is requested. Only returned when the `chunk_start` parameter is nonzero.
    PrevChunk,

    /// `next_boundary` has to look past the end of the current chunk, so the chunk following the
    /// one given is requested. Only returned when the chunk ends before the `len` passed when the
    /// cursor was created.
    NextChunk,

    /// The chunk that was supplied does not contain the cursor position.
    InvalidOffset,
}
253
/// Outcome of looking up a pair of adjacent break categories.
#[derive(PartialEq, Eq)]
enum PairResult {
    /// Never a break.
    NotBreak,

    /// Always a break.
    Break,

    /// A break only when the cursor is not in extended mode.
    Extended,

    /// A break if an even number of Regional Indicators precedes the position.
    Regional,

    /// A break unless the position is preceded by Emoji Base and (Extend)*.
    Emoji,
}
272
273fn check_pair(before: GCB, after: GCB) -> PairResult {
274 use self::PairResult::*;
275
276 #[cfg_attr(feature = "cargo-clippy", allow(match_same_arms))]
277 match (before, after) {
278 // Do not break between a CR and LF. Otherwise, break before and after controls.
279 (GCB::CR, GCB::LF) => NotBreak, // GB3
280 (GCB::Control, _) => Break, // GB4
281 (GCB::CR, _) => Break, // GB4
282 (GCB::LF, _) => Break, // GB4
283 (_, GCB::Control) => Break, // GB5
284 (_, GCB::CR) => Break, // GB5
285 (_, GCB::LF) => Break, // GB5
286
287 // Do not break Hangul syllable sequences.
288 (GCB::L, GCB::L) => NotBreak, // GB6
289 (GCB::L, GCB::V) => NotBreak, // GB6
290 (GCB::L, GCB::LV) => NotBreak, // GB6
291 (GCB::L, GCB::LVT) => NotBreak, // GB6
292 (GCB::LV, GCB::V) => NotBreak, // GB7
293 (GCB::LV, GCB::T) => NotBreak, // GB7
294 (GCB::V, GCB::V) => NotBreak, // GB7
295 (GCB::V, GCB::T) => NotBreak, // GB7
296 (GCB::LVT, GCB::T) => NotBreak, // GB8
297 (GCB::T, GCB::T) => NotBreak, // GB8
298
299 // Do not break before extending characters or ZWJ.
300 (_, GCB::Extend) => NotBreak, // GB9
301 (_, GCB::ZWJ) => NotBreak, // GB9
302
303 // Only for extended grapheme clusters:
304 // Do not break before SpacingMarks, or after Prepend characters.
305 (_, GCB::SpacingMark) => Extended, // GB9a
306 (GCB::Prepend, _) => Extended, // GB9b
307
308 // Do not break within Emoji Modifier Sequences or Emoji ZWJ Sequences.
309 (GCB::EBase, GCB::EModifier) => NotBreak, // GB10
310 (GCB::EBaseGAZ, GCB::EModifier) => NotBreak, // GB10
311 (GCB::Extend, GCB::EModifier) => Emoji, // GB10
312 (GCB::ZWJ, GCB::GlueAfterZwj) => NotBreak, // GB11
313 (GCB::ZWJ, GCB::EBaseGAZ) => NotBreak, // GB11
314
315 // Do not break within emoji flag sequences. That is, do not break between regional
316 // indicator (RI) symbols if there is an odd number of RI characters before the break point.
317 (GCB::RegionalIndicator, GCB::RegionalIndicator) => Regional, // GB12, GB13
318
319 // Otherwise, break everywhere.
320 (_, _) => Break, // GB999
321 }
322}
323
324impl GraphemeCursor {
325 /// Create a new cursor. The string and initial offset are given at creation
326 /// time, but the contents of the string are not.
327 ///
328 /// The `offset` parameter must be on a codepoint boundary.
329 ///
330 /// ```rust
331 /// # use unic_segment::GraphemeCursor;
332 /// let s = "हिन्दी";
333 /// let mut extended = GraphemeCursor::new(0, s.len());
334 /// assert_eq!(extended.next_boundary(s, 0), Ok(Some("हि".len())));
335 /// ```
336 pub fn new(offset: usize, len: usize) -> GraphemeCursor {
337 let state = if offset == 0 || offset == len {
338 GraphemeState::Break
339 } else {
340 GraphemeState::Unknown
341 };
342 GraphemeCursor {
343 offset,
344 len,
345 state,
346 is_extended: true,
347 cat_before: None,
348 cat_after: None,
349 pre_context_offset: None,
350 ris_count: None,
351 resuming: false,
352 }
353 }
354
355 /// Create a new cursor. The string and initial offset are given at creation
356 /// time, but the contents of the string are not.
357 ///
358 /// The `offset` parameter must be on a codepoint boundary.
359 ///
360 /// ```rust
361 /// # use unic_segment::GraphemeCursor;
362 /// let s = "हिन्दी";
363 /// let mut legacy = GraphemeCursor::new_legacy(0, s.len());
364 /// assert_eq!(legacy.next_boundary(s, 0), Ok(Some("ह".len())));
365 /// ```
366 pub fn new_legacy(offset: usize, len: usize) -> GraphemeCursor {
367 let state = if offset == 0 || offset == len {
368 GraphemeState::Break
369 } else {
370 GraphemeState::Unknown
371 };
372 GraphemeCursor {
373 offset,
374 len,
375 state,
376 is_extended: false,
377 cat_before: None,
378 cat_after: None,
379 pre_context_offset: None,
380 ris_count: None,
381 resuming: false,
382 }
383 }
384
385 // FIXME: Not sure I'm gonna keep this, the advantage over new() seems thin.
386 /// Set the cursor to a new location in the same string.
387 ///
388 /// ```rust
389 /// # use unic_segment::GraphemeCursor;
390 /// let s = "abcd";
391 /// let mut cursor = GraphemeCursor::new(0, s.len());
392 /// assert_eq!(cursor.cur_cursor(), 0);
393 /// cursor.set_cursor(2);
394 /// assert_eq!(cursor.cur_cursor(), 2);
395 /// ```
396 pub fn set_cursor(&mut self, offset: usize) {
397 if offset != self.offset {
398 self.offset = offset;
399 self.state = if offset == 0 || offset == self.len {
400 GraphemeState::Break
401 } else {
402 GraphemeState::Unknown
403 };
404 // reset state derived from text around cursor
405 self.cat_before = None;
406 self.cat_after = None;
407 self.ris_count = None;
408 }
409 }
410
411 /// The current offset of the cursor. Equal to the last value provided to
412 /// `new()` or `set_cursor()`, or returned from `next_boundary()` or
413 /// `prev_boundary()`.
414 ///
415 /// ```rust
416 /// # use unic_segment::GraphemeCursor;
417 /// // Two flags (🇷🇸🇮🇴), each flag is two RIS codepoints, each RIS is 4 bytes.
418 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
419 /// let mut cursor = GraphemeCursor::new(4, flags.len());
420 /// assert_eq!(cursor.cur_cursor(), 4);
421 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
422 /// assert_eq!(cursor.cur_cursor(), 8);
423 /// ```
424 pub fn cur_cursor(&self) -> usize {
425 self.offset
426 }
427
428 /// Provide additional pre-context when it is needed to decide a boundary.
429 /// The end of the chunk must coincide with the value given in the
430 /// `GraphemeIncomplete::PreContext` request.
431 ///
432 /// ```rust
433 /// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
434 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
435 /// let mut cursor = GraphemeCursor::new(8, flags.len());
436 ///
437 /// // Not enough pre-context to decide if there's a boundary between the two flags.
438 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(8)));
439 ///
440 /// // Provide one more Regional Indicator Symbol of pre-context
441 /// cursor.provide_context(&flags[4..8], 4);
442 ///
443 /// // Still not enough context to decide.
444 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Err(GraphemeIncomplete::PreContext(4)));
445 ///
446 /// // Provide additional requested context.
447 /// cursor.provide_context(&flags[0..4], 0);
448 ///
449 /// // That's enough to decide (it always is when context goes to the start of the string)
450 /// assert_eq!(cursor.is_boundary(&flags[8..], 8), Ok(true));
451 /// ```
452 pub fn provide_context(&mut self, chunk: &str, chunk_start: usize) {
453 assert!(chunk_start + chunk.len() == self.pre_context_offset.unwrap());
454 self.pre_context_offset = None;
455 if self.is_extended && chunk_start + chunk.len() == self.offset {
456 let ch = chunk.chars().rev().next().unwrap();
457 if GCB::of(ch) == GCB::Prepend {
458 self.decide(false); // GB9b
459 return;
460 }
461 }
462 match self.state {
463 GraphemeState::Regional => self.handle_regional(chunk, chunk_start),
464 GraphemeState::Emoji => self.handle_emoji(chunk, chunk_start),
465 _ => panic!("invalid state"),
466 }
467 }
468
469 fn decide(&mut self, is_break: bool) {
470 self.state = if is_break {
471 GraphemeState::Break
472 } else {
473 GraphemeState::NotBreak
474 };
475 }
476
477 fn decision(&mut self, is_break: bool) -> Result<bool, GraphemeIncomplete> {
478 self.decide(is_break);
479 Ok(is_break)
480 }
481
482 fn is_boundary_result(&self) -> Result<bool, GraphemeIncomplete> {
483 if self.state == GraphemeState::Break {
484 Ok(true)
485 } else if self.state == GraphemeState::NotBreak {
486 Ok(false)
487 } else if let Some(pre_context_offset) = self.pre_context_offset {
488 Err(GraphemeIncomplete::PreContext(pre_context_offset))
489 } else {
490 unreachable!("inconsistent state");
491 }
492 }
493
494 fn handle_regional(&mut self, chunk: &str, chunk_start: usize) {
495 let mut ris_count = self.ris_count.unwrap_or(0);
496 for ch in chunk.chars().rev() {
497 if GCB::of(ch) != GCB::RegionalIndicator {
498 self.ris_count = Some(ris_count);
499 self.decide((ris_count % 2) == 0);
500 return;
501 }
502 ris_count += 1;
503 }
504 self.ris_count = Some(ris_count);
505 if chunk_start == 0 {
506 self.decide((ris_count % 2) == 0);
507 return;
508 }
509 self.pre_context_offset = Some(chunk_start);
510 }
511
512 fn handle_emoji(&mut self, chunk: &str, chunk_start: usize) {
513 for ch in chunk.chars().rev() {
514 match GCB::of(ch) {
515 GCB::Extend => (),
516 GCB::EBase | GCB::EBaseGAZ => {
517 self.decide(false);
518 return;
519 }
520 _ => {
521 self.decide(true);
522 return;
523 }
524 }
525 }
526 if chunk_start == 0 {
527 self.decide(true);
528 return;
529 }
530 self.pre_context_offset = Some(chunk_start);
531 }
532
533 // TODO(clippy): Fix clippy warning or leave it as allowed if really needed.
534 // `warning: methods called `is_*` usually take self by reference or no self; consider choosing
535 // a less ambiguous name`
536 #[cfg_attr(feature = "cargo-clippy", allow(wrong_self_convention))]
537 /// Determine whether the current cursor location is a grapheme cluster boundary.
538 /// Only a part of the string need be supplied. If `chunk_start` is nonzero or
539 /// the length of `chunk` is not equal to `len` on creation, then this method
540 /// may return `GraphemeIncomplete::PreContext`. The caller should then
541 /// call `provide_context` with the requested chunk, then retry calling this
542 /// method.
543 ///
544 /// For partial chunks, if the cursor is not at the beginning or end of the
545 /// string, the chunk should contain at least the codepoint following the cursor.
546 /// If the string is nonempty, the chunk must be nonempty.
547 ///
548 /// All calls should have consistent chunk contents (ie, if a chunk provides
549 /// content for a given slice, all further chunks covering that slice must have
550 /// the same content for it).
551 ///
552 /// ```rust
553 /// # use unic_segment::GraphemeCursor;
554 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
555 /// let mut cursor = GraphemeCursor::new(8, flags.len());
556 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(true));
557 /// cursor.set_cursor(12);
558 /// assert_eq!(cursor.is_boundary(flags, 0), Ok(false));
559 /// ```
560 pub fn is_boundary(
561 &mut self,
562 chunk: &str,
563 chunk_start: usize,
564 ) -> Result<bool, GraphemeIncomplete> {
565 if self.state == GraphemeState::Break {
566 return Ok(true);
567 }
568 if self.state == GraphemeState::NotBreak {
569 return Ok(false);
570 }
571 if (self.offset < chunk_start || self.offset >= chunk_start + chunk.len())
572 && (self.offset > chunk_start + chunk.len() || self.cat_after.is_none())
573 {
574 return Err(GraphemeIncomplete::InvalidOffset);
575 }
576 if let Some(pre_context_offset) = self.pre_context_offset {
577 return Err(GraphemeIncomplete::PreContext(pre_context_offset));
578 }
579 let offset_in_chunk = self.offset - chunk_start;
580 if self.cat_after.is_none() {
581 let ch = chunk[offset_in_chunk..].chars().next().unwrap();
582 self.cat_after = Some(GCB::of(ch));
583 }
584 if self.offset == chunk_start {
585 let mut need_pre_context = true;
586 match self.cat_after.unwrap() {
587 GCB::RegionalIndicator => self.state = GraphemeState::Regional,
588 GCB::EModifier => self.state = GraphemeState::Emoji,
589 _ => need_pre_context = self.cat_before.is_none(),
590 }
591 if need_pre_context {
592 self.pre_context_offset = Some(chunk_start);
593 return Err(GraphemeIncomplete::PreContext(chunk_start));
594 }
595 }
596 if self.cat_before.is_none() {
597 let ch = chunk[..offset_in_chunk].chars().rev().next().unwrap();
598 self.cat_before = Some(GCB::of(ch));
599 }
600 match check_pair(self.cat_before.unwrap(), self.cat_after.unwrap()) {
601 PairResult::NotBreak => self.decision(false),
602 PairResult::Break => self.decision(true),
603 PairResult::Extended => {
604 let is_extended = self.is_extended;
605 self.decision(!is_extended)
606 }
607 PairResult::Regional => {
608 if let Some(ris_count) = self.ris_count {
609 return self.decision((ris_count % 2) == 0);
610 }
611 self.handle_regional(&chunk[..offset_in_chunk], chunk_start);
612 self.is_boundary_result()
613 }
614 PairResult::Emoji => {
615 self.handle_emoji(&chunk[..offset_in_chunk], chunk_start);
616 self.is_boundary_result()
617 }
618 }
619 }
620
621 /// Find the next boundary after the current cursor position. Only a part of
622 /// the string need be supplied. If the chunk is incomplete, then this
623 /// method might return `GraphemeIncomplete::PreContext` or
624 /// `GraphemeIncomplete::NextChunk`. In the former case, the caller should
625 /// call `provide_context` with the requested chunk, then retry. In the
626 /// latter case, the caller should provide the chunk following the one
627 /// given, then retry.
628 ///
629 /// See `is_boundary` for expectations on the provided chunk.
630 ///
631 /// ```rust
632 /// # use unic_segment::GraphemeCursor;
633 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
634 /// let mut cursor = GraphemeCursor::new(4, flags.len());
635 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(8)));
636 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(Some(16)));
637 /// assert_eq!(cursor.next_boundary(flags, 0), Ok(None));
638 /// ```
639 ///
640 /// And an example that uses partial strings:
641 ///
642 /// ```rust
643 /// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
644 /// let s = "abcd";
645 /// let mut cursor = GraphemeCursor::new(0, s.len());
646 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Ok(Some(1)));
647 /// assert_eq!(cursor.next_boundary(&s[..2], 0), Err(GraphemeIncomplete::NextChunk));
648 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(2)));
649 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(3)));
650 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(Some(4)));
651 /// assert_eq!(cursor.next_boundary(&s[2..4], 2), Ok(None));
652 /// ```
653 pub fn next_boundary(
654 &mut self,
655 chunk: &str,
656 chunk_start: usize,
657 ) -> Result<Option<usize>, GraphemeIncomplete> {
658 if self.offset == self.len {
659 return Ok(None);
660 }
661 let mut iter = chunk[self.offset - chunk_start..].chars();
662 let mut ch = iter.next().unwrap();
663 loop {
664 if self.resuming {
665 if self.cat_after.is_none() {
666 self.cat_after = Some(GCB::of(ch));
667 }
668 } else {
669 self.offset += ch.len_utf8();
670 self.state = GraphemeState::Unknown;
671 self.cat_before = self.cat_after.take();
672 if self.cat_before.is_none() {
673 self.cat_before = Some(GCB::of(ch));
674 }
675 if self.cat_before == Some(GCB::RegionalIndicator) {
676 self.ris_count = self.ris_count.map(|c| c + 1);
677 } else {
678 self.ris_count = Some(0);
679 }
680 if let Some(next_ch) = iter.next() {
681 ch = next_ch;
682 self.cat_after = Some(GCB::of(ch));
683 } else if self.offset == self.len {
684 self.decide(true);
685 } else {
686 self.resuming = true;
687 return Err(GraphemeIncomplete::NextChunk);
688 }
689 }
690 self.resuming = true;
691 if self.is_boundary(chunk, chunk_start)? {
692 self.resuming = false;
693 return Ok(Some(self.offset));
694 }
695 self.resuming = false;
696 }
697 }
698
699 /// Find the previous boundary after the current cursor position. Only a part
700 /// of the string need be supplied. If the chunk is incomplete, then this
701 /// method might return `GraphemeIncomplete::PreContext` or
702 /// `GraphemeIncomplete::PrevChunk`. In the former case, the caller should
703 /// call `provide_context` with the requested chunk, then retry. In the
704 /// latter case, the caller should provide the chunk preceding the one
705 /// given, then retry.
706 ///
707 /// See `is_boundary` for expectations on the provided chunk.
708 ///
709 /// ```rust
710 /// # use unic_segment::GraphemeCursor;
711 /// let flags = "\u{1F1F7}\u{1F1F8}\u{1F1EE}\u{1F1F4}";
712 /// let mut cursor = GraphemeCursor::new(12, flags.len());
713 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(8)));
714 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(Some(0)));
715 /// assert_eq!(cursor.prev_boundary(flags, 0), Ok(None));
716 /// ```
717 ///
718 /// And an example that uses partial strings (note the exact return is not
719 /// guaranteed, and may be `PrevChunk` or `PreContext` arbitrarily):
720 ///
721 /// ```rust
722 /// # use unic_segment::{GraphemeCursor, GraphemeIncomplete};
723 /// let s = "abcd";
724 /// let mut cursor = GraphemeCursor::new(4, s.len());
725 /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Ok(Some(3)));
726 /// assert_eq!(cursor.prev_boundary(&s[2..4], 2), Err(GraphemeIncomplete::PrevChunk));
727 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(2)));
728 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(1)));
729 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(Some(0)));
730 /// assert_eq!(cursor.prev_boundary(&s[0..2], 0), Ok(None));
731 /// ```
732 pub fn prev_boundary(
733 &mut self,
734 chunk: &str,
735 chunk_start: usize,
736 ) -> Result<Option<usize>, GraphemeIncomplete> {
737 if self.offset == 0 {
738 return Ok(None);
739 }
740 let mut iter = chunk[..self.offset - chunk_start].chars().rev();
741 let mut ch = iter.next().unwrap();
742 loop {
743 if self.offset == chunk_start {
744 self.resuming = true;
745 return Err(GraphemeIncomplete::PrevChunk);
746 }
747 if self.resuming {
748 self.cat_before = Some(GCB::of(ch));
749 } else {
750 self.offset -= ch.len_utf8();
751 self.cat_after = self.cat_before.take();
752 self.state = GraphemeState::Unknown;
753 if let Some(ris_count) = self.ris_count {
754 self.ris_count = if ris_count > 0 {
755 Some(ris_count - 1)
756 } else {
757 None
758 };
759 }
760 if let Some(prev_ch) = iter.next() {
761 ch = prev_ch;
762 self.cat_before = Some(GCB::of(ch));
763 } else if self.offset == 0 {
764 self.decide(true);
765 } else {
766 self.resuming = true;
767 return Err(GraphemeIncomplete::PrevChunk);
768 }
769 }
770 self.resuming = true;
771 if self.is_boundary(chunk, chunk_start)? {
772 self.resuming = false;
773 return Ok(Some(self.offset));
774 }
775 self.resuming = false;
776 }
777 }
778}
779
#[cfg(test)]
mod tests {
    use super::{GraphemeIndices, Graphemes};

    /// Forward and reverse iteration yield the same clusters with the same byte offsets, and
    /// the size hint tracks the number of bytes still pending.
    #[test]
    fn test_grapheme_indices() {
        let input = "a̐éö̲\r\n";

        let forward: Vec<(usize, &str)> = GraphemeIndices::new(input).collect();
        assert_eq!(forward, &[(0, "a̐"), (3, "é"), (6, "ö̲"), (11, "\r\n")]);

        let backward: Vec<(usize, &str)> = GraphemeIndices::new(input).rev().collect();
        assert_eq!(backward, &[(11, "\r\n"), (6, "ö̲"), (3, "é"), (0, "a̐")]);

        let mut iter = GraphemeIndices::new(input);
        {
            // Borrow the iterator so it can be inspected again after being drained.
            let borrowed = iter.by_ref();
            assert_eq!(borrowed.size_hint(), (1, Some(13)));
            assert_eq!(borrowed.count(), 4);
        }
        assert_eq!(iter.size_hint(), (0, Some(0)));
    }

    /// Clusters built from combining marks stay together, and CRLF is a single cluster in
    /// both directions.
    #[test]
    fn test_graphemes() {
        let input = "a̐éö̲\r\n";
        let forward: Vec<&str> = Graphemes::new(input).collect();
        assert_eq!(forward, &["a̐", "é", "ö̲", "\r\n"]);

        // Make sure the reverse iterator does the right thing with "\n" at beginning of string.
        let input = "\n\r\n\r";
        let backward: Vec<&str> = Graphemes::new(input).rev().collect();
        assert_eq!(backward, &["\r", "\r\n", "\n"]);
    }
}
823}