utf8_ranges/
lib.rs

1/*!
2Crate `utf8-ranges` converts ranges of Unicode scalar values to equivalent
3ranges of UTF-8 bytes. This is useful for constructing byte based automatons
4that need to embed UTF-8 decoding.
5
6See the documentation on the `Utf8Sequences` iterator for more details and
7an example.
8
9# Wait, what is this?
10
11This is simplest to explain with an example. Let's say you wanted to test
12whether a particular byte sequence was a Cyrillic character. One possible
13scalar value range is `[0400-04FF]`. The set of allowed bytes for this
14range can be expressed as a sequence of byte ranges:
15
16```ignore
17[D0-D3][80-BF]
18```
19
20This is simple enough: simply encode the boundaries, `0400` encodes to
21`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
22corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
23
24However, what if you wanted to add the Cyrillic Supplementary characters to
25your range? Your range might then become `[0400-052F]`. The same procedure
26as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
27you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
28this isn't quite correct because this range doesn't capture many characters,
29for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
30
31Instead, you need multiple sequences of byte ranges:
32
33```ignore
34[D0-D3][80-BF]  # matches codepoints 0400-04FF
35[D4][80-AF]     # matches codepoints 0500-052F
36```
37
38This gets even more complicated if you want bigger ranges, particularly if
39they naively contain surrogate codepoints. For example, the sequence of byte
40ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
41
42```ignore
43[0-7F]
44[C2-DF][80-BF]
45[E0][A0-BF][80-BF]
46[E1-EC][80-BF][80-BF]
47[ED][80-9F][80-BF]
48[EE-EF][80-BF][80-BF]
49```
50
51Note that the byte ranges above will *not* match any erroneous encoding of
52UTF-8, including encodings of surrogate codepoints.
53
54And, of course, for all of Unicode (`[000000-10FFFF]`):
55
56```ignore
57[0-7F]
58[C2-DF][80-BF]
59[E0][A0-BF][80-BF]
60[E1-EC][80-BF][80-BF]
61[ED][80-9F][80-BF]
62[EE-EF][80-BF][80-BF]
63[F0][90-BF][80-BF][80-BF]
64[F1-F3][80-BF][80-BF][80-BF]
65[F4][80-8F][80-BF][80-BF]
66```
67
68This crate automates the process of creating these byte ranges from ranges of
69Unicode scalar values.
70
71# Why would I ever use this?
72
73You probably won't ever need this. In 99% of cases, you just decode the byte
74sequence into a Unicode scalar value and compare scalar values directly.
75However, this explicit decoding step isn't always possible. For example, the
76construction of some finite state machines may benefit from converting ranges
77of scalar values into UTF-8 decoder automata (e.g., for character classes in
78regular expressions).
79
80# Lineage
81
82I got the idea and general implementation strategy from Russ Cox in his
83[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
84Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
85I also got the idea from
86[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
87which uses it for executing automata on their term index.
88*/
89
90#![deny(missing_docs)]
91
92#[cfg(test)]
93extern crate quickcheck;
94
95use std::char;
96use std::fmt;
97use std::slice;
98
99use char_utf8::encode_utf8;
100
101const MAX_UTF8_BYTES: usize = 4;
102
103mod char_utf8;
104
105/// Utf8Sequence represents a sequence of byte ranges.
106///
107/// To match a Utf8Sequence, a candidate byte sequence must match each
108/// successive range.
109///
110/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
111/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
112#[derive(Copy, Clone, Eq, PartialEq)]
113pub enum Utf8Sequence {
114    /// One byte range.
115    One(Utf8Range),
116    /// Two successive byte ranges.
117    Two([Utf8Range; 2]),
118    /// Three successive byte ranges.
119    Three([Utf8Range; 3]),
120    /// Four successive byte ranges.
121    Four([Utf8Range; 4]),
122}
123
124impl Utf8Sequence {
125    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
126    /// range.
127    ///
128    /// This assumes that `start` and `end` have the same length.
129    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
130        assert_eq!(start.len(), end.len());
131        match start.len() {
132            2 => Utf8Sequence::Two([
133                Utf8Range::new(start[0], end[0]),
134                Utf8Range::new(start[1], end[1]),
135            ]),
136            3 => Utf8Sequence::Three([
137                Utf8Range::new(start[0], end[0]),
138                Utf8Range::new(start[1], end[1]),
139                Utf8Range::new(start[2], end[2]),
140            ]),
141            4 => Utf8Sequence::Four([
142                Utf8Range::new(start[0], end[0]),
143                Utf8Range::new(start[1], end[1]),
144                Utf8Range::new(start[2], end[2]),
145                Utf8Range::new(start[3], end[3]),
146            ]),
147            n => unreachable!("invalid encoded length: {}", n),
148        }
149    }
150
151    /// Returns the underlying sequence of byte ranges as a slice.
152    pub fn as_slice(&self) -> &[Utf8Range] {
153        use self::Utf8Sequence::*;
154        match *self {
155            One(ref r) => slice::from_ref(r),
156            Two(ref r) => &r[..],
157            Three(ref r) => &r[..],
158            Four(ref r) => &r[..],
159        }
160    }
161
162    /// Returns the number of byte ranges in this sequence.
163    ///
164    /// The length is guaranteed to be in the closed interval `[1, 4]`.
165    pub fn len(&self) -> usize {
166        self.as_slice().len()
167    }
168
169    /// Returns true if and only if a prefix of `bytes` matches this sequence
170    /// of byte ranges.
171    pub fn matches(&self, bytes: &[u8]) -> bool {
172        if bytes.len() < self.len() {
173            return false;
174        }
175        for (&b, r) in bytes.iter().zip(self) {
176            if !r.matches(b) {
177                return false;
178            }
179        }
180        true
181    }
182}
183
184impl<'a> IntoIterator for &'a Utf8Sequence {
185    type IntoIter = slice::Iter<'a, Utf8Range>;
186    type Item = &'a Utf8Range;
187
188    fn into_iter(self) -> Self::IntoIter {
189        self.as_slice().into_iter()
190    }
191}
192
193impl fmt::Debug for Utf8Sequence {
194    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
195        use self::Utf8Sequence::*;
196        match *self {
197            One(ref r) => write!(f, "{:?}", r),
198            Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
199            Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
200            Four(ref r) => write!(f, "{:?}{:?}{:?}{:?}",
201                                  r[0], r[1], r[2], r[3]),
202        }
203    }
204}
205
206/// A single inclusive range of UTF-8 bytes.
207#[derive(Clone, Copy, PartialEq, Eq)]
208pub struct Utf8Range {
209    /// Start of byte range (inclusive).
210    pub start: u8,
211    /// End of byte range (inclusive).
212    pub end: u8,
213}
214
215impl Utf8Range {
216    fn new(start: u8, end: u8) -> Self {
217        Utf8Range { start: start, end: end }
218    }
219
220    /// Returns true if and only if the given byte is in this range.
221    pub fn matches(&self, b: u8) -> bool {
222        self.start <= b && b <= self.end
223    }
224}
225
226impl fmt::Debug for Utf8Range {
227    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
228        if self.start == self.end {
229            write!(f, "[{:X}]", self.start)
230        } else {
231            write!(f, "[{:X}-{:X}]", self.start, self.end)
232        }
233    }
234}
235
236/// An iterator over ranges of matching UTF-8 byte sequences.
237///
238/// The iteration represents an alternation of comprehensive byte sequences
239/// that match precisely the set of UTF-8 encoded scalar values.
240///
241/// A byte sequence corresponds to one of the scalar values in the range given
242/// if and only if it completely matches exactly one of the sequences of byte
243/// ranges produced by this iterator.
244///
245/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
246/// sequences will match the same bytes.
247///
248/// # Example
249///
250/// This shows how to match an arbitrary byte sequence against a range of
251/// scalar values.
252///
253/// ```rust
254/// use utf8_ranges::{Utf8Sequences, Utf8Sequence};
255///
256/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
257///     for range in seqs {
258///         if range.matches(bytes) {
259///             return true;
260///         }
261///     }
262///     false
263/// }
264///
265/// // Test the basic multilingual plane.
266/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
267///
268/// // UTF-8 encoding of 'a'.
269/// assert!(matches(&seqs, &[0x61]));
270/// // UTF-8 encoding of '☃' (`\u{2603}`).
271/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
272/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
273/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
274/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
275/// // which is invalid UTF-8, and therefore fails, despite the fact that
276/// // the corresponding codepoint (0xD800) falls in the range given.
277/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
278/// // And fails against plain old invalid UTF-8.
279/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
280/// ```
281///
282/// If this example seems circuitous, that's because it is! It's meant to be
283/// illustrative. In practice, you could just try to decode your byte sequence
284/// and compare it with the scalar value range directly. However, this is not
285/// always possible (for example, in a byte based automaton).
286pub struct Utf8Sequences {
287    range_stack: Vec<ScalarRange>,
288}
289
290impl Utf8Sequences {
291    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
292    /// given.
293    pub fn new(start: char, end: char) -> Self {
294        let mut it = Utf8Sequences { range_stack: vec![] };
295        it.push(start as u32, end as u32);
296        it
297    }
298
299    /// reset resets the scalar value range.
300    /// Any existing state is cleared, but resources may be reused.
301    ///
302    /// N.B. Benchmarks say that this method is dubious.
303    #[doc(hidden)]
304    pub fn reset(&mut self, start: char, end: char) {
305        self.range_stack.clear();
306        self.push(start as u32, end as u32);
307    }
308
309    fn push(&mut self, start: u32, end: u32) {
310        self.range_stack.push(ScalarRange { start: start, end: end });
311    }
312}
313
314struct ScalarRange {
315    start: u32,
316    end: u32,
317}
318
319impl fmt::Debug for ScalarRange {
320    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
321        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
322    }
323}
324
325impl Iterator for Utf8Sequences {
326    type Item = Utf8Sequence;
327
328    fn next(&mut self) -> Option<Self::Item> {
329    'TOP:
330        while let Some(mut r) = self.range_stack.pop() {
331        'INNER:
332            loop {
333                if let Some((r1, r2)) = r.split() {
334                    self.push(r2.start, r2.end);
335                    r.start = r1.start;
336                    r.end = r1.end;
337                    continue 'INNER;
338                }
339                if !r.is_valid() {
340                    continue 'TOP;
341                }
342                for i in 1..MAX_UTF8_BYTES {
343                    let max = max_scalar_value(i);
344                    if r.start <= max && max < r.end {
345                        self.push(max + 1, r.end);
346                        r.end = max;
347                        continue 'INNER;
348                    }
349                }
350                if let Some(ascii_range) = r.as_ascii() {
351                    return Some(Utf8Sequence::One(ascii_range));
352                }
353                for i in 1..MAX_UTF8_BYTES {
354                    let m = (1 << (6 * i)) - 1;
355                    if (r.start & !m) != (r.end & !m) {
356                        if (r.start & m) != 0 {
357                            self.push((r.start | m) + 1, r.end);
358                            r.end = r.start | m;
359                            continue 'INNER;
360                        }
361                        if (r.end & m) != m {
362                            self.push(r.end & !m, r.end);
363                            r.end = (r.end & !m) - 1;
364                            continue 'INNER;
365                        }
366                    }
367                }
368                let mut start = [0; MAX_UTF8_BYTES];
369                let mut end = [0; MAX_UTF8_BYTES];
370                let n = r.encode(&mut start, &mut end);
371                return Some(Utf8Sequence::from_encoded_range(
372                    &start[0..n], &end[0..n]));
373            }
374        }
375        None
376    }
377}
378
379impl ScalarRange {
380    /// split splits this range if it overlaps with a surrogate codepoint.
381    ///
382    /// Either or both ranges may be invalid.
383    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
384        if self.start < 0xE000 && self.end > 0xD7FF {
385            Some((ScalarRange {
386                start: self.start,
387                end: 0xD7FF,
388            }, ScalarRange {
389                start: 0xE000,
390                end: self.end,
391            }))
392        } else {
393            None
394        }
395    }
396
397    /// is_valid returns true if and only if start <= end.
398    fn is_valid(&self) -> bool {
399        self.start <= self.end
400    }
401
402    /// as_ascii returns this range as a Utf8Range if and only if all scalar
403    /// values in this range can be encoded as a single byte.
404    fn as_ascii(&self) -> Option<Utf8Range> {
405        if self.is_ascii() {
406            Some(Utf8Range::new(self.start as u8, self.end as u8))
407        } else {
408            None
409        }
410    }
411
412    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
413    /// byte to encode any scalar value).
414    fn is_ascii(&self) -> bool {
415        self.is_valid() && self.end <= 0x7f
416    }
417
418    /// encode writes the UTF-8 encoding of the start and end of this range
419    /// to the corresponding destination slices.
420    ///
421    /// The slices should have room for at least `MAX_UTF8_BYTES`.
422    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
423        let cs = char::from_u32(self.start).unwrap();
424        let ce = char::from_u32(self.end).unwrap();
425        let n = encode_utf8(cs, start).unwrap();
426        let m = encode_utf8(ce, end).unwrap();
427        assert_eq!(n, m);
428        n
429    }
430}
431
432fn max_scalar_value(nbytes: usize) -> u32 {
433    match nbytes {
434        1 => 0x007F,
435        2 => 0x07FF,
436        3 => 0xFFFF,
437        4 => 0x10FFFF,
438        _ => unreachable!("invalid UTF-8 byte sequence size"),
439    }
440}
441
442#[cfg(test)]
443mod tests {
444    use std::char;
445
446    use quickcheck::{TestResult, quickcheck};
447
448    use {Utf8Range, Utf8Sequences};
449
450    fn rutf8(s: u8, e: u8) -> Utf8Range {
451        Utf8Range::new(s, e)
452    }
453
454    fn never_accepts_surrogate_codepoints(start: char, end: char) {
455        for cp in 0xD800..0xE000 {
456            let buf = encode_surrogate(cp);
457            for r in Utf8Sequences::new(start, end) {
458                if r.matches(&buf) {
459                    panic!(
460                        "Sequence ({:X}, {:X}) contains range {:?}, \
461                         which matches surrogate code point {:X} \
462                         with encoded bytes {:?}",
463                        start as u32, end as u32, r, cp, buf,
464                    );
465                }
466            }
467        }
468    }
469
470    #[test]
471    fn codepoints_no_surrogates() {
472        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
473        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
474        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
475        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
476        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
477    }
478
479    #[test]
480    fn single_codepoint_one_sequence() {
481        // Tests that every range of scalar values that contains a single
482        // scalar value is recognized by one sequence of byte ranges.
483        for i in 0x0..(0x10FFFF + 1) {
484            let c = match char::from_u32(i) {
485                None => continue,
486                Some(c) => c,
487            };
488            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
489            assert_eq!(seqs.len(), 1);
490        }
491    }
492
493    #[test]
494    fn qc_codepoints_no_surrogate() {
495        fn p(s: char, e: char) -> TestResult {
496            if s > e {
497                return TestResult::discard();
498            }
499            never_accepts_surrogate_codepoints(s, e);
500            TestResult::passed()
501        }
502        quickcheck(p as fn(char, char) -> TestResult);
503    }
504
505    #[test]
506    fn bmp() {
507        use Utf8Sequence::*;
508
509        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}')
510                                 .collect::<Vec<_>>();
511        assert_eq!(seqs, vec![
512            One(rutf8(0x0, 0x7F)),
513            Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
514            Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]),
515            Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
516            Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]),
517            Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
518        ]);
519    }
520
521    fn encode_surrogate(cp: u32) -> [u8; 3] {
522        const TAG_CONT: u8 = 0b1000_0000;
523        const TAG_THREE_B: u8 = 0b1110_0000;
524
525        assert!(0xD800 <= cp && cp < 0xE000);
526        let mut dst = [0; 3];
527        dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
528        dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
529        dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
530        dst
531    }
532}