// unicode_normalization_alignments/stream_safe.rs
use normalize::{
hangul_decomposition_length,
is_hangul_syllable,
};
use lookups::{
canonical_combining_class, canonical_fully_decomposed, compatibility_fully_decomposed,
stream_safe_trailing_nonstarters,
};
use tables::stream_safe_leading_nonstarters;
/// Upper bound on the running nonstarter count kept by `StreamSafe`: a character is
/// only passed through while the count plus its leading nonstarters stays at or below
/// this value.
pub(crate) const MAX_NONSTARTERS: usize = 30;
/// U+034F, the character `StreamSafe` emits in place of a character that would push
/// the running nonstarter count above `MAX_NONSTARTERS`.
const COMBINING_GRAPHEME_JOINER: char = '\u{034F}';
/// Iterator adaptor over a stream of `char`s that limits runs of nonstarters.
///
/// It tracks a running count of consecutive nonstarters and yields a
/// `COMBINING_GRAPHEME_JOINER` whenever letting the next character through would
/// take that count above `MAX_NONSTARTERS`.
pub struct StreamSafe<I> {
    /// The wrapped source of characters.
    iter: I,
    /// Running tally of consecutive nonstarters seen so far.
    nonstarter_count: usize,
    /// Character held back when a joiner was emitted in its place; it is yielded on
    /// the following call to `next`.
    buffer: Option<char>,
}

impl<I> StreamSafe<I> {
    /// Wraps `iter`, starting with a zero nonstarter tally and nothing held back.
    pub(crate) fn new(iter: I) -> Self {
        StreamSafe {
            buffer: None,
            nonstarter_count: 0,
            iter,
        }
    }
}
impl<I: Iterator<Item = char>> Iterator for StreamSafe<I> {
    type Item = char;

    /// Returns the next character of the stream-safe output.
    ///
    /// Characters from the wrapped iterator are passed through unchanged. When passing
    /// one through would take the running nonstarter count above `MAX_NONSTARTERS`, a
    /// `COMBINING_GRAPHEME_JOINER` is returned instead and the character is held back
    /// for the following call.
    ///
    /// Returns `None` once nothing is held back and the wrapped iterator yields `None`.
    #[inline]
    fn next(&mut self) -> Option<char> {
        // A held-back character takes priority over the wrapped iterator, and it is
        // classified and counted exactly like a freshly read one. Returning it without
        // counting it would let the run that follows a joiner grow to
        // MAX_NONSTARTERS + 1 nonstarters before the next joiner is inserted.
        let next_ch = match self.buffer.take().or_else(|| self.iter.next()) {
            Some(c) => c,
            None => return None,
        };
        let d = classify_nonstarters(next_ch);
        if self.nonstarter_count + d.leading_nonstarters > MAX_NONSTARTERS {
            // The joiner is emitted in place of `next_ch`, so the count restarts at
            // zero and `next_ch` is put back to be reclassified on the next call.
            // NOTE(review): this relies on no single character having more than
            // MAX_NONSTARTERS leading nonstarters, otherwise it would be put back
            // forever; confirm against the lookup tables.
            self.nonstarter_count = 0;
            self.buffer = Some(next_ch);
            return Some(COMBINING_GRAPHEME_JOINER);
        }

        if d.leading_nonstarters == d.decomposition_len {
            // The whole decomposition is nonstarters: it extends the current run.
            self.nonstarter_count += d.decomposition_len;
        } else {
            // The decomposition contains a starter: the run restarts with whatever
            // nonstarters trail that starter.
            self.nonstarter_count = d.trailing_nonstarters;
        }
        Some(next_ch)
    }
}
/// Nonstarter summary of one character's decomposition, as returned by
/// `classify_nonstarters`.
#[derive(Debug)]
pub(crate) struct Decomposition {
/// Number of nonstarters (characters with a nonzero canonical combining class) at
/// the start of the decomposition.
pub(crate) leading_nonstarters: usize,
/// Number of nonstarters at the end of the decomposition.
pub(crate) trailing_nonstarters: usize,
/// Total number of characters in the decomposition; 1 when the character does not
/// decompose.
pub(crate) decomposition_len: usize,
}
/// Summarises how `c` contributes to a run of nonstarters.
///
/// The result is chosen from four cases, checked in order:
///
/// 1. ASCII (`c <= U+007F`): reported as a single character with no nonstarters.
/// 2. Hangul syllables: no nonstarters; the length comes from
///    `hangul_decomposition_length`.
/// 3. Characters with a compatibility (or, failing that, canonical) full
///    decomposition: leading/trailing counts come from the stream-safe lookup
///    tables, the length from the decomposition itself.
/// 4. Everything else: a single character that counts as one leading and one
///    trailing nonstarter exactly when its canonical combining class is nonzero.
#[inline]
pub(crate) fn classify_nonstarters(c: char) -> Decomposition {
    let (leading, trailing, len) = if (c as u32) < 0x80 {
        (0, 0, 1)
    } else if is_hangul_syllable(c) {
        (0, 0, hangul_decomposition_length(c))
    } else if let Some(expansion) =
        compatibility_fully_decomposed(c).or_else(|| canonical_fully_decomposed(c))
    {
        (
            stream_safe_leading_nonstarters(c),
            stream_safe_trailing_nonstarters(c),
            expansion.len(),
        )
    } else {
        let count = if canonical_combining_class(c) == 0 { 0 } else { 1 };
        (count, count, 1)
    };

    Decomposition {
        leading_nonstarters: leading,
        trailing_nonstarters: trailing,
        decomposition_len: len,
    }
}
#[cfg(test)]
mod tests {
    use super::{classify_nonstarters, StreamSafe};
    use lookups::canonical_combining_class;
    use normalize::decompose_compatible;
    use std::char;

    /// Runs `s` through `StreamSafe` and gathers the output into a `String`.
    fn stream_safe(s: &str) -> String {
        let mut out = String::new();
        out.extend(StreamSafe::new(s.chars()));
        out
    }

    /// True when `c` has a nonzero canonical combining class.
    fn is_nonstarter(c: char) -> bool {
        canonical_combining_class(c) != 0
    }

    #[test]
    fn test_simple() {
        // Thirty characters from U+0300..=U+031D follow the 'a': expected to pass
        // through untouched.
        let at_limit = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}ngerzone";
        assert_eq!(stream_safe(at_limit), at_limit);

        // One more (U+032E) is appended to the same run: the output must differ.
        let over_limit = "Da\u{0300}\u{0301}\u{0302}\u{0303}\u{0304}\u{0305}\u{0306}\u{0307}\u{0308}\u{0309}\u{030a}\u{030b}\u{030c}\u{030d}\u{030e}\u{030f}\u{0310}\u{0311}\u{0312}\u{0313}\u{0314}\u{0315}\u{0316}\u{0317}\u{0318}\u{0319}\u{031a}\u{031b}\u{031c}\u{031d}\u{032e}ngerzone";
        assert_ne!(stream_safe(over_limit), over_limit);
    }

    #[test]
    fn test_classify_nonstarters() {
        // Every valid scalar value below U+2FA1E is checked against the result of
        // actually decomposing it.
        for ch in (0..0x2FA1E).filter_map(char::from_u32) {
            let classified = classify_nonstarters(ch);

            let mut decomposed = Vec::new();
            decompose_compatible(ch, |piece| decomposed.push(piece));
            assert_eq!(decomposed.len(), classified.decomposition_len);

            let expected_leading = decomposed
                .iter()
                .take_while(|&&piece| is_nonstarter(piece))
                .count();
            let expected_trailing = decomposed
                .iter()
                .rev()
                .take_while(|&&piece| is_nonstarter(piece))
                .count();
            assert_eq!(expected_leading, classified.leading_nonstarters);
            assert_eq!(expected_trailing, classified.trailing_nonstarters);
        }
    }
}