use crate::common;
/// Returns the character that follows character position `offset` in `s`.
///
/// Positions count `char`s (Unicode scalar values), not bytes. The result is
/// `None` when `s` has no character at position `offset + 1`, including the
/// case where `offset + 1` is not representable as a `usize`.
#[inline]
fn after(s: &str, offset: usize) -> Option<char> {
    // `checked_add` keeps an extreme offset from panicking (debug builds) or
    // wrapping around to the first character (release builds).
    s.chars().nth(offset.checked_add(1)?)
}
/// Returns the character located just before character position `offset` in `s`.
///
/// Positions count `char`s rather than bytes. The result is `None` when
/// `offset` is zero or when `s` has no character at position `offset - 1`.
#[inline]
fn before(s: &str, offset: usize) -> Option<char> {
    // `checked_sub` yields `None` for offset 0, which ends the lookup early.
    let index = offset.checked_sub(1)?;
    s.chars().nth(index)
}
/// Error returned by the contextual rule functions in this module.
#[derive(Debug, PartialEq, Eq)]
pub enum ContextRuleError {
    /// The character at the requested offset is not one the invoked rule
    /// handles.
    NotApplicable,
    /// There is no character at the requested offset, or a neighbouring
    /// character the rule needs to inspect does not exist.
    Undefined,
}
pub fn rule_zero_width_nonjoiner(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
if 0x200c != s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32 {
return Err(ContextRuleError::NotApplicable);
}
let mut prev = before(s, offset).ok_or(ContextRuleError::Undefined)?;
let mut cp = prev as u32;
if common::is_virama(cp) {
return Ok(true);
}
let mut i = offset - 1;
while common::is_transparent(cp) {
prev = before(s, i).ok_or(ContextRuleError::Undefined)?;
cp = prev as u32;
i -= 1;
}
if !(common::is_left_joining(cp) || common::is_dual_joining(cp)) {
return Ok(false);
}
let mut next = after(s, offset).ok_or(ContextRuleError::Undefined)?;
cp = next as u32;
i = offset + 1;
while common::is_transparent(cp) {
next = after(s, i).ok_or(ContextRuleError::Undefined)?;
cp = next as u32;
i += 1;
}
Ok(common::is_right_joining(cp) || common::is_dual_joining(cp))
}
pub fn rule_zero_width_joiner(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
if 0x200d != s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32 {
return Err(ContextRuleError::NotApplicable);
}
let prev = before(s, offset).ok_or(ContextRuleError::Undefined)?;
Ok(common::is_virama(prev as u32))
}
pub fn rule_middle_dot(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
if 0x00b7 != s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32 {
return Err(ContextRuleError::NotApplicable);
}
let prev = before(s, offset).ok_or(ContextRuleError::Undefined)?;
let next = after(s, offset).ok_or(ContextRuleError::Undefined)?;
Ok(prev as u32 == 0x006c && next as u32 == 0x006c)
}
pub fn rule_greek_lower_numeral_sign_keraia(
s: &str,
offset: usize,
) -> Result<bool, ContextRuleError> {
if 0x0375 != s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32 {
return Err(ContextRuleError::NotApplicable);
}
let after = after(s, offset).ok_or(ContextRuleError::Undefined)?;
Ok(common::is_greek(after as u32))
}
pub fn rule_hebrew_punctuation(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
let cp = s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32;
if cp != 0x05f3 && cp != 0x05f4 {
return Err(ContextRuleError::NotApplicable);
}
let prev = before(s, offset).ok_or(ContextRuleError::Undefined)?;
Ok(common::is_hebrew(prev as u32))
}
pub fn rule_katakana_middle_dot(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
if 0x30fb != s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32 {
return Err(ContextRuleError::NotApplicable);
}
for c in s.chars() {
let cp = c as u32;
if common::is_hiragana(cp) || common::is_katakana(cp) || common::is_han(cp) {
return Ok(true);
}
}
Ok(false)
}
pub fn rule_arabic_indic_digits(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
let cp = s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32;
if !(0x0660..=0x0669).contains(&cp) {
return Err(ContextRuleError::NotApplicable);
}
let range = 0x06f0..=0x06f9;
for c in s.chars() {
if range.contains(&(c as u32)) {
return Ok(false);
}
}
Ok(true)
}
pub fn rule_extended_arabic_indic_digits(s: &str, offset: usize) -> Result<bool, ContextRuleError> {
let cp = s.chars().nth(offset).ok_or(ContextRuleError::Undefined)? as u32;
if !(0x06f0..=0x06f9).contains(&cp) {
return Err(ContextRuleError::NotApplicable);
}
let range = 0x0660..=0x0669;
for c in s.chars() {
if range.contains(&(c as u32)) {
return Ok(false);
}
}
Ok(true)
}
/// Signature shared by the contextual rule functions in this module: given a
/// string `s` and a character position `offset`, a rule returns its boolean
/// verdict or a [`ContextRuleError`].
pub type ContextRule = fn(s: &str, offset: usize) -> Result<bool, ContextRuleError>;
/// Looks up the contextual rule registered for code point `cp`.
///
/// Returns `Some(rule)` for the code points handled by one of the `rule_*`
/// functions in this module, and `None` for every other code point.
pub fn get_context_rule(cp: u32) -> Option<ContextRule> {
    // Arms are listed in ascending code point order; they are disjoint, so the
    // order has no effect on the result.
    let rule: ContextRule = match cp {
        0x00b7 => rule_middle_dot,
        0x0375 => rule_greek_lower_numeral_sign_keraia,
        0x05f3 | 0x05f4 => rule_hebrew_punctuation,
        0x0660..=0x0669 => rule_arabic_indic_digits,
        0x06f0..=0x06f9 => rule_extended_arabic_indic_digits,
        0x200c => rule_zero_width_nonjoiner,
        0x200d => rule_zero_width_joiner,
        0x30fb => rule_katakana_middle_dot,
        _ => return None,
    };
    Some(rule)
}
#[cfg(test)]
mod tests {
    use crate::context::*;

    /// Result type produced by every contextual rule.
    type Outcome = Result<bool, ContextRuleError>;
    /// Function-pointer shape of a contextual rule.
    type Rule = fn(&str, usize) -> Outcome;

    const NOT_APPLICABLE: Outcome = Err(ContextRuleError::NotApplicable);
    const UNDEFINED: Outcome = Err(ContextRuleError::Undefined);

    /// Applies `rule` to each `(label, offset, expected)` case and reports the
    /// offending label and offset when a result differs.
    fn check_rule(rule: Rule, cases: &[(&str, usize, Outcome)]) {
        for (label, offset, expected) in cases {
            assert_eq!(
                rule(*label, *offset),
                *expected,
                "label {:?} at offset {}",
                label,
                offset
            );
        }
    }

    /// Asserts that `cp` is dispatched to exactly the rule `expected`.
    fn assert_rule(cp: u32, expected: Rule) {
        let found =
            get_context_rule(cp).unwrap_or_else(|| panic!("no rule registered for {:#06x}", cp));
        assert_eq!(
            found as usize, expected as usize,
            "wrong rule for {:#06x}",
            cp
        );
    }

    /// Asserts that no rule is registered for `cp`.
    fn assert_no_rule(cp: u32) {
        assert!(
            get_context_rule(cp).is_none(),
            "unexpected rule for {:#06x}",
            cp
        );
    }

    #[test]
    fn check_after() {
        let cases: [(&str, usize, Option<char>); 7] = [
            ("", 0, None),
            ("", 5, None),
            ("a", 0, None),
            ("a", 5, None),
            ("ab", 0, Some('b')),
            ("ab", 1, None),
            ("abc", 1, Some('c')),
        ];
        for (label, offset, expected) in cases.iter() {
            assert_eq!(after(*label, *offset), *expected, "{:?} @ {}", label, offset);
        }
    }

    #[test]
    fn check_before() {
        let cases: [(&str, usize, Option<char>); 7] = [
            ("", 0, None),
            ("", 5, None),
            ("a", 0, None),
            ("a", 5, None),
            ("ab", 1, Some('a')),
            ("ab", 0, None),
            ("abc", 2, Some('b')),
        ];
        for (label, offset, expected) in cases.iter() {
            assert_eq!(before(*label, *offset), *expected, "{:?} @ {}", label, offset);
        }
    }

    #[test]
    fn check_rule_zero_width_nonjoiner() {
        check_rule(
            rule_zero_width_nonjoiner,
            &[
                ("A", 0, NOT_APPLICABLE),
                ("", 2, UNDEFINED),
                ("\u{200c}", 0, UNDEFINED),
                ("\u{94d}\u{200c}", 1, Ok(true)),
                ("A\u{94d}\u{200c}B", 2, Ok(true)),
                ("A\u{200c}", 1, Ok(false)),
                ("\u{5bf}\u{200c}", 1, UNDEFINED),
                ("A\u{5bf}\u{200c}", 2, Ok(false)),
                ("\u{a872}\u{5bf}\u{200c}", 2, UNDEFINED),
                ("\u{a872}\u{5bf}\u{200c}\u{5bf}", 2, UNDEFINED),
                ("\u{a872}\u{5bf}\u{200c}\u{5bf}\u{629}", 2, Ok(true)),
                ("\u{a872}\u{5bf}\u{200c}\u{5bf}A", 2, Ok(false)),
                ("\u{a872}\u{5bf}\u{200c}A", 2, Ok(false)),
                (
                    "A\u{5bf}\u{5bf}\u{200c}\u{5bf}\u{5bf}\u{5bf}\u{5bf}\u{626}",
                    3,
                    Ok(false),
                ),
                ("\u{626}\u{200c}\u{5bf}\u{626}", 1, Ok(true)),
                ("\u{626}\u{200c}\u{626}", 1, Ok(true)),
                (
                    "\u{626}\u{5bf}\u{5bf}\u{200c}\u{5bf}\u{5bf}\u{5bf}\u{5bf}\u{626}",
                    3,
                    Ok(true),
                ),
            ],
        );
    }

    #[test]
    fn check_rule_zero_width_joiner() {
        check_rule(
            rule_zero_width_joiner,
            &[
                ("", 3, UNDEFINED),
                ("A", 0, NOT_APPLICABLE),
                ("\u{200d}", 0, UNDEFINED),
                ("\u{200d}A", 0, UNDEFINED),
                ("\u{94d}\u{200d}", 1, Ok(true)),
                ("A\u{200d}", 1, Ok(false)),
                ("A\u{94d}\u{200d}B", 2, Ok(true)),
            ],
        );
    }

    #[test]
    fn check_rule_middle_dot() {
        check_rule(
            rule_middle_dot,
            &[
                ("", 3, UNDEFINED),
                ("A", 0, NOT_APPLICABLE),
                ("\u{00b7}", 0, UNDEFINED),
                ("\u{006c}\u{00b7}", 1, UNDEFINED),
                ("\u{006c}\u{00b7}\u{006c}", 1, Ok(true)),
                ("\u{006c}\u{00b7}A", 1, Ok(false)),
                ("A\u{00b7}A", 1, Ok(false)),
            ],
        );
    }

    #[test]
    fn check_rule_greek_lower_numeral_sign_keraia() {
        check_rule(
            rule_greek_lower_numeral_sign_keraia,
            &[
                ("", 3, UNDEFINED),
                ("A", 0, NOT_APPLICABLE),
                ("\u{0375}", 0, UNDEFINED),
                ("\u{0375}\u{0384}", 0, Ok(true)),
                ("A\u{0375}\u{0384}", 1, Ok(true)),
                ("\u{0375}A", 0, Ok(false)),
            ],
        );
    }

    #[test]
    fn check_rule_hebrew_punctuation() {
        check_rule(
            rule_hebrew_punctuation,
            &[
                ("", 3, UNDEFINED),
                ("A", 0, NOT_APPLICABLE),
                ("\u{05F3}", 0, UNDEFINED),
                ("\u{5f0}\u{05F3}", 1, Ok(true)),
                ("\u{5f0}\u{05F4}", 1, Ok(true)),
                ("A\u{05F4}", 1, Ok(false)),
                ("YYY\u{5f0}\u{05F4}XXX", 4, Ok(true)),
            ],
        );
    }

    #[test]
    fn check_rule_katakana_middle_dot() {
        check_rule(
            rule_katakana_middle_dot,
            &[
                ("", 3, UNDEFINED),
                ("A", 0, NOT_APPLICABLE),
                ("\u{30fb}", 0, Ok(false)),
                ("a\u{30fb}b", 1, Ok(false)),
                ("a\u{30fb}b\u{1b001}c", 1, Ok(true)),
                ("a\u{30fb}bc\u{3357}", 1, Ok(true)),
                ("\u{3007}\u{30fb}bc", 1, Ok(true)),
            ],
        );
    }

    #[test]
    fn check_rule_arabic_indic_digits() {
        check_rule(
            rule_arabic_indic_digits,
            &[
                ("", 3, UNDEFINED),
                ("\u{065f}", 0, NOT_APPLICABLE),
                ("\u{066a}", 0, NOT_APPLICABLE),
                ("\u{0660}", 0, Ok(true)),
                ("\u{0665}", 0, Ok(true)),
                ("\u{0669}", 0, Ok(true)),
                ("ab\u{0669}cd", 2, Ok(true)),
                ("ab\u{0669}c\u{06ef}", 2, Ok(true)),
                ("ab\u{0669}c\u{06fa}", 2, Ok(true)),
                ("ab\u{0669}c\u{06f0}", 2, Ok(false)),
                ("ab\u{0669}c\u{06f9}", 2, Ok(false)),
            ],
        );
    }

    #[test]
    fn check_rule_extended_arabic_indic_digits() {
        check_rule(
            rule_extended_arabic_indic_digits,
            &[
                ("", 3, UNDEFINED),
                ("\u{06ef}", 0, NOT_APPLICABLE),
                ("\u{06fa}", 0, NOT_APPLICABLE),
                ("\u{06f0}", 0, Ok(true)),
                ("\u{06f5}", 0, Ok(true)),
                ("\u{06f9}", 0, Ok(true)),
                ("ab\u{06f0}cd", 2, Ok(true)),
                ("ab\u{06f0}c\u{065f}", 2, Ok(true)),
                ("ab\u{06f0}c\u{066a}", 2, Ok(true)),
                ("ab\u{06f0}c\u{0660}", 2, Ok(false)),
                ("ab\u{06f0}c\u{0669}", 2, Ok(false)),
            ],
        );
    }

    #[test]
    fn check_get_context_rule() {
        // Code points bordering the handled ranges must not be dispatched.
        for &cp in &[0x013u32, 0x065f, 0x066a, 0x06ef, 0x06fa] {
            assert_no_rule(cp);
        }

        assert_rule(0x00b7, rule_middle_dot);
        assert_rule(0x200c, rule_zero_width_nonjoiner);
        // Previously missing: the ZERO WIDTH JOINER dispatch was never checked.
        assert_rule(0x200d, rule_zero_width_joiner);
        assert_rule(0x0375, rule_greek_lower_numeral_sign_keraia);
        assert_rule(0x05f3, rule_hebrew_punctuation);
        assert_rule(0x05f4, rule_hebrew_punctuation);
        assert_rule(0x30fb, rule_katakana_middle_dot);
        assert_rule(0x0660, rule_arabic_indic_digits);
        assert_rule(0x0669, rule_arabic_indic_digits);
        assert_rule(0x06f0, rule_extended_arabic_indic_digits);
        assert_rule(0x06f9, rule_extended_arabic_indic_digits);
    }
}