lexical_sort/
iter.rs

//! Iterators to transliterate Unicode to ASCII. Note that only alphanumeric
//! characters are transliterated, and not all of them are supported.
//!
//! A single character can be transliterated to multiple ASCII characters. For
//! example, `æ` is converted to `ae`, and `½` is converted to `1/2`.
//!
//! The iterators don't allocate memory on the heap. I haven't benchmarked them,
//! but I believe that they are quite efficient.
9
10use any_ascii::any_ascii_char;
11use core::iter::FusedIterator;
12
/// An iterator over one `char`, converted to lowercase
/// and transliterated to ASCII, if it is an alphanumeric character.
///
/// This iterator can be created by calling `iterate_lexical_char()` or
/// `iterate_lexical_char_only_alnum()`.
///
/// It yields either one stored `char` (unchanged), or the bytes of a static
/// slice one at a time, each converted to a `char` and ASCII-lowercased.
pub struct LexicalChar(CharOrSlice);

impl LexicalChar {
    /// Creates an iterator that yields exactly `c`, as is.
    #[inline]
    fn from_char(c: char) -> Self {
        LexicalChar(CharOrSlice::Char(c))
    }

    /// Creates an iterator over the bytes of `s`; each byte is lowercased
    /// (ASCII rules) when it is yielded.
    #[inline]
    fn from_slice(s: &'static [u8]) -> Self {
        LexicalChar(CharOrSlice::Slice(s))
    }

    /// Creates an iterator that yields nothing.
    #[inline]
    fn empty() -> Self {
        LexicalChar(CharOrSlice::Slice(&[]))
    }

    /// Shared access to the wrapped state.
    #[inline]
    fn inner(&self) -> &CharOrSlice {
        &self.0
    }

    /// Exclusive access to the wrapped state.
    #[inline]
    fn inner_mut(&mut self) -> &mut CharOrSlice {
        &mut self.0
    }
}

/// Remaining output of a `LexicalChar`.
enum CharOrSlice {
    /// A single character that has not been yielded yet.
    Char(char),
    /// The bytes that are still to be yielded (possibly none).
    Slice(&'static [u8]),
}

impl Iterator for LexicalChar {
    type Item = char;

    #[inline]
    fn next(&mut self) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                // The single char is consumed; an empty slice marks exhaustion.
                *self = LexicalChar::empty();
                Some(c)
            }
            CharOrSlice::Slice(slice) => {
                let bytes: &'static [u8] = *slice;
                let (&first, rest) = bytes.split_first()?;
                *slice = rest;
                Some((first as char).to_ascii_lowercase())
            }
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining = match self.inner() {
            CharOrSlice::Char(_) => 1,
            CharOrSlice::Slice(s) => s.len(),
        };
        (remaining, Some(remaining))
    }

    /// Returns the `n`th remaining element (zero-based).
    ///
    /// As required by `Iterator::nth`, the `n` skipped elements and the
    /// returned one are all consumed; if fewer than `n + 1` elements remain,
    /// the iterator is exhausted and `None` is returned.
    #[inline]
    fn nth(&mut self, n: usize) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                // Whether or not the char is the requested element, it is consumed.
                *self = LexicalChar::empty();
                if n == 0 {
                    Some(c)
                } else {
                    None
                }
            }
            CharOrSlice::Slice(slice) => {
                let bytes: &'static [u8] = *slice;
                // `get(n..)` is `None` when `n > len`, and an empty tail when `n == len`.
                match bytes.get(n..).and_then(|tail| tail.split_first()) {
                    Some((&byte, rest)) => {
                        *slice = rest;
                        Some((byte as char).to_ascii_lowercase())
                    }
                    None => {
                        *slice = &[];
                        None
                    }
                }
            }
        }
    }
}

impl FusedIterator for LexicalChar {}

impl ExactSizeIterator for LexicalChar {}

impl DoubleEndedIterator for LexicalChar {
    #[inline]
    fn next_back(&mut self) -> Option<Self::Item> {
        match self.inner_mut() {
            &mut CharOrSlice::Char(c) => {
                *self = LexicalChar::empty();
                Some(c)
            }
            CharOrSlice::Slice(slice) => {
                let bytes: &'static [u8] = *slice;
                // Read the last byte *before* shrinking the stored slice;
                // indexing after the shrink would be out of bounds.
                let (&last, rest) = bytes.split_last()?;
                *slice = rest;
                Some((last as char).to_ascii_lowercase())
            }
        }
    }
}
122
123/// Returns an iterator over one `char`, converted to lowercase
124/// and transliterated to ASCII, if it is alphanumeric
125#[inline]
126pub fn iterate_lexical_char(c: char) -> LexicalChar {
127    if c.is_ascii() {
128        LexicalChar::from_char(c.to_ascii_lowercase())
129    } else if c.is_alphanumeric() {
130        match any_ascii_char(c) {
131            s if s.is_empty() => LexicalChar::from_char(c),
132            s => LexicalChar::from_slice(s.as_bytes()),
133        }
134    } else if combining_diacritical(&c) {
135        LexicalChar::empty()
136    } else {
137        LexicalChar::from_char(c)
138    }
139}
140
141/// Returns an iterator over one `char`, converted to lowercase
142/// and transliterated to ASCII, if it is alphanumeric
143#[inline]
144pub fn iterate_lexical_char_only_alnum(c: char) -> LexicalChar {
145    if c.is_ascii() {
146        if c.is_ascii_alphanumeric() {
147            LexicalChar::from_char(c.to_ascii_lowercase())
148        } else {
149            LexicalChar::empty()
150        }
151    } else if c.is_alphanumeric() {
152        match any_ascii_char(c) {
153            s if s.is_empty() => LexicalChar::from_char(c),
154            s => LexicalChar::from_slice(s.as_bytes()),
155        }
156    } else {
157        LexicalChar::empty()
158    }
159}
160
/// Reports whether `c` is a combining diacritical mark, i.e. whether it lies
/// in the inclusive range U+0300 to U+036F.
#[inline]
fn combining_diacritical(c: &char) -> bool {
    matches!(*c, '\u{300}'..='\u{36F}')
}
166
167/// Returns an iterator over the characters of a string, converted to lowercase
168/// and transliterated to ASCII, if they're alphanumeric
169pub fn iterate_lexical(s: &'_ str) -> impl Iterator<Item = char> + '_ {
170    s.chars().flat_map(iterate_lexical_char)
171}
172
173/// Returns an iterator over the characters of a string, converted to lowercase
174/// and transliterated to ASCII. Non-alphanumeric characters are skipped
175pub fn iterate_lexical_only_alnum(s: &'_ str) -> impl Iterator<Item = char> + '_ {
176    s.chars().flat_map(iterate_lexical_char_only_alnum)
177}
178
/// Checks `iterate_lexical` on ASCII text, transliterated letters, fractions,
/// symbols that are passed through, and a combining diacritical mark.
#[test]
#[cfg(feature = "std")]
fn test_iteration() {
    fn it(s: &'static str) -> String {
        iterate_lexical(s).collect()
    }

    assert_eq!(&it("Hello, world!"), "hello, world!");
    assert_eq!(&it("Ω A æ b ö ß é"), "o a ae b o ss e");
    assert_eq!(&it("3½/⅝ £ → € ®™"), "31/2/5/8 £ → € ®™");
    assert_eq!(&it("»@« 15% ¡¹!"), "»@« 15% ¡1!");
    assert_eq!(&it("🎉🦄☣"), "🎉🦄☣");
    assert_eq!(&it("北亰"), "beijing");
    assert_eq!(&it("ΣΣΣ"), "sss");
    // 'a' followed by the combining grave accent. Written as an escape so the
    // input cannot be silently normalized to the precomposed letter, which
    // would bypass the combining-mark branch.
    assert_eq!(&it("a\u{300}"), "a");
}
195
/// Checks `iterate_lexical_only_alnum`: same inputs as `test_iteration`, but
/// every non-alphanumeric character must be dropped from the output.
#[test]
#[cfg(feature = "std")]
fn test_iteration_only_alnum() {
    fn it(s: &'static str) -> String {
        iterate_lexical_only_alnum(s).collect()
    }

    assert_eq!(&it("Hello, world!"), "helloworld");
    assert_eq!(&it("Ω A æ b ö ß é"), "oaaebosse");
    assert_eq!(&it("3½/⅝ £ → € ®™"), "31/25/8");
    assert_eq!(&it("»@« 15% ¡¹!"), "151");
    assert_eq!(&it("🎉🦄☣"), "");
    assert_eq!(&it("北亰"), "beijing");
    assert_eq!(&it("ΣΣΣ"), "sss");
    // 'a' followed by the combining grave accent. Written as an escape so the
    // input cannot be silently normalized to the precomposed letter.
    assert_eq!(&it("a\u{300}"), "a");
}