utf8_ranges/lib.rs
1/*!
2Crate `utf8-ranges` converts ranges of Unicode scalar values to equivalent
3ranges of UTF-8 bytes. This is useful for constructing byte based automatons
4that need to embed UTF-8 decoding.
5
6See the documentation on the `Utf8Sequences` iterator for more details and
7an example.
8
9# Wait, what is this?
10
11This is simplest to explain with an example. Let's say you wanted to test
12whether a particular byte sequence was a Cyrillic character. One possible
13scalar value range is `[0400-04FF]`. The set of allowed bytes for this
14range can be expressed as a sequence of byte ranges:
15
16```ignore
17[D0-D3][80-BF]
18```
19
20This is simple enough: simply encode the boundaries, `0400` encodes to
21`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
22corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
23
24However, what if you wanted to add the Cyrillic Supplementary characters to
25your range? Your range might then become `[0400-052F]`. The same procedure
26as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
27you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
28this isn't quite correct because this range doesn't capture many characters,
29for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
30
31Instead, you need multiple sequences of byte ranges:
32
33```ignore
34[D0-D3][80-BF] # matches codepoints 0400-04FF
35[D4][80-AF] # matches codepoints 0500-052F
36```
37
38This gets even more complicated if you want bigger ranges, particularly if
39they naively contain surrogate codepoints. For example, the sequence of byte
40ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
41
42```ignore
43[0-7F]
44[C2-DF][80-BF]
45[E0][A0-BF][80-BF]
46[E1-EC][80-BF][80-BF]
47[ED][80-9F][80-BF]
48[EE-EF][80-BF][80-BF]
49```
50
51Note that the byte ranges above will *not* match any erroneous encoding of
52UTF-8, including encodings of surrogate codepoints.
53
54And, of course, for all of Unicode (`[000000-10FFFF]`):
55
56```ignore
57[0-7F]
58[C2-DF][80-BF]
59[E0][A0-BF][80-BF]
60[E1-EC][80-BF][80-BF]
61[ED][80-9F][80-BF]
62[EE-EF][80-BF][80-BF]
63[F0][90-BF][80-BF][80-BF]
64[F1-F3][80-BF][80-BF][80-BF]
65[F4][80-8F][80-BF][80-BF]
66```
67
68This crate automates the process of creating these byte ranges from ranges of
69Unicode scalar values.
70
71# Why would I ever use this?
72
73You probably won't ever need this. In 99% of cases, you just decode the byte
74sequence into a Unicode scalar value and compare scalar values directly.
75However, this explicit decoding step isn't always possible. For example, the
76construction of some finite state machines may benefit from converting ranges
77of scalar values into UTF-8 decoder automata (e.g., for character classes in
78regular expressions).
79
80# Lineage
81
82I got the idea and general implementation strategy from Russ Cox in his
83[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
84Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
85I also got the idea from
86[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
87which uses it for executing automata on their term index.
88*/
89
90#![deny(missing_docs)]
91
92#[cfg(test)]
93extern crate quickcheck;
94
95use std::char;
96use std::fmt;
97use std::slice;
98
99use char_utf8::encode_utf8;
100
101const MAX_UTF8_BYTES: usize = 4;
102
103mod char_utf8;
104
105/// Utf8Sequence represents a sequence of byte ranges.
106///
107/// To match a Utf8Sequence, a candidate byte sequence must match each
108/// successive range.
109///
110/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
111/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
112#[derive(Copy, Clone, Eq, PartialEq)]
113pub enum Utf8Sequence {
114 /// One byte range.
115 One(Utf8Range),
116 /// Two successive byte ranges.
117 Two([Utf8Range; 2]),
118 /// Three successive byte ranges.
119 Three([Utf8Range; 3]),
120 /// Four successive byte ranges.
121 Four([Utf8Range; 4]),
122}
123
124impl Utf8Sequence {
125 /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
126 /// range.
127 ///
128 /// This assumes that `start` and `end` have the same length.
129 fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
130 assert_eq!(start.len(), end.len());
131 match start.len() {
132 2 => Utf8Sequence::Two([
133 Utf8Range::new(start[0], end[0]),
134 Utf8Range::new(start[1], end[1]),
135 ]),
136 3 => Utf8Sequence::Three([
137 Utf8Range::new(start[0], end[0]),
138 Utf8Range::new(start[1], end[1]),
139 Utf8Range::new(start[2], end[2]),
140 ]),
141 4 => Utf8Sequence::Four([
142 Utf8Range::new(start[0], end[0]),
143 Utf8Range::new(start[1], end[1]),
144 Utf8Range::new(start[2], end[2]),
145 Utf8Range::new(start[3], end[3]),
146 ]),
147 n => unreachable!("invalid encoded length: {}", n),
148 }
149 }
150
151 /// Returns the underlying sequence of byte ranges as a slice.
152 pub fn as_slice(&self) -> &[Utf8Range] {
153 use self::Utf8Sequence::*;
154 match *self {
155 One(ref r) => slice::from_ref(r),
156 Two(ref r) => &r[..],
157 Three(ref r) => &r[..],
158 Four(ref r) => &r[..],
159 }
160 }
161
162 /// Returns the number of byte ranges in this sequence.
163 ///
164 /// The length is guaranteed to be in the closed interval `[1, 4]`.
165 pub fn len(&self) -> usize {
166 self.as_slice().len()
167 }
168
169 /// Returns true if and only if a prefix of `bytes` matches this sequence
170 /// of byte ranges.
171 pub fn matches(&self, bytes: &[u8]) -> bool {
172 if bytes.len() < self.len() {
173 return false;
174 }
175 for (&b, r) in bytes.iter().zip(self) {
176 if !r.matches(b) {
177 return false;
178 }
179 }
180 true
181 }
182}
183
184impl<'a> IntoIterator for &'a Utf8Sequence {
185 type IntoIter = slice::Iter<'a, Utf8Range>;
186 type Item = &'a Utf8Range;
187
188 fn into_iter(self) -> Self::IntoIter {
189 self.as_slice().into_iter()
190 }
191}
192
193impl fmt::Debug for Utf8Sequence {
194 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
195 use self::Utf8Sequence::*;
196 match *self {
197 One(ref r) => write!(f, "{:?}", r),
198 Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
199 Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
200 Four(ref r) => write!(f, "{:?}{:?}{:?}{:?}",
201 r[0], r[1], r[2], r[3]),
202 }
203 }
204}
205
206/// A single inclusive range of UTF-8 bytes.
207#[derive(Clone, Copy, PartialEq, Eq)]
208pub struct Utf8Range {
209 /// Start of byte range (inclusive).
210 pub start: u8,
211 /// End of byte range (inclusive).
212 pub end: u8,
213}
214
215impl Utf8Range {
216 fn new(start: u8, end: u8) -> Self {
217 Utf8Range { start: start, end: end }
218 }
219
220 /// Returns true if and only if the given byte is in this range.
221 pub fn matches(&self, b: u8) -> bool {
222 self.start <= b && b <= self.end
223 }
224}
225
226impl fmt::Debug for Utf8Range {
227 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
228 if self.start == self.end {
229 write!(f, "[{:X}]", self.start)
230 } else {
231 write!(f, "[{:X}-{:X}]", self.start, self.end)
232 }
233 }
234}
235
236/// An iterator over ranges of matching UTF-8 byte sequences.
237///
238/// The iteration represents an alternation of comprehensive byte sequences
239/// that match precisely the set of UTF-8 encoded scalar values.
240///
241/// A byte sequence corresponds to one of the scalar values in the range given
242/// if and only if it completely matches exactly one of the sequences of byte
243/// ranges produced by this iterator.
244///
245/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
246/// sequences will match the same bytes.
247///
248/// # Example
249///
250/// This shows how to match an arbitrary byte sequence against a range of
251/// scalar values.
252///
253/// ```rust
254/// use utf8_ranges::{Utf8Sequences, Utf8Sequence};
255///
256/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
257/// for range in seqs {
258/// if range.matches(bytes) {
259/// return true;
260/// }
261/// }
262/// false
263/// }
264///
265/// // Test the basic multilingual plane.
266/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
267///
268/// // UTF-8 encoding of 'a'.
269/// assert!(matches(&seqs, &[0x61]));
270/// // UTF-8 encoding of '☃' (`\u{2603}`).
271/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
272/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
273/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
274/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
275/// // which is invalid UTF-8, and therefore fails, despite the fact that
276/// // the corresponding codepoint (0xD800) falls in the range given.
277/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
278/// // And fails against plain old invalid UTF-8.
279/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
280/// ```
281///
282/// If this example seems circuitous, that's because it is! It's meant to be
283/// illustrative. In practice, you could just try to decode your byte sequence
284/// and compare it with the scalar value range directly. However, this is not
285/// always possible (for example, in a byte based automaton).
286pub struct Utf8Sequences {
287 range_stack: Vec<ScalarRange>,
288}
289
290impl Utf8Sequences {
291 /// Create a new iterator over UTF-8 byte ranges for the scalar value range
292 /// given.
293 pub fn new(start: char, end: char) -> Self {
294 let mut it = Utf8Sequences { range_stack: vec![] };
295 it.push(start as u32, end as u32);
296 it
297 }
298
299 /// reset resets the scalar value range.
300 /// Any existing state is cleared, but resources may be reused.
301 ///
302 /// N.B. Benchmarks say that this method is dubious.
303 #[doc(hidden)]
304 pub fn reset(&mut self, start: char, end: char) {
305 self.range_stack.clear();
306 self.push(start as u32, end as u32);
307 }
308
309 fn push(&mut self, start: u32, end: u32) {
310 self.range_stack.push(ScalarRange { start: start, end: end });
311 }
312}
313
314struct ScalarRange {
315 start: u32,
316 end: u32,
317}
318
319impl fmt::Debug for ScalarRange {
320 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
321 write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
322 }
323}
324
325impl Iterator for Utf8Sequences {
326 type Item = Utf8Sequence;
327
328 fn next(&mut self) -> Option<Self::Item> {
329 'TOP:
330 while let Some(mut r) = self.range_stack.pop() {
331 'INNER:
332 loop {
333 if let Some((r1, r2)) = r.split() {
334 self.push(r2.start, r2.end);
335 r.start = r1.start;
336 r.end = r1.end;
337 continue 'INNER;
338 }
339 if !r.is_valid() {
340 continue 'TOP;
341 }
342 for i in 1..MAX_UTF8_BYTES {
343 let max = max_scalar_value(i);
344 if r.start <= max && max < r.end {
345 self.push(max + 1, r.end);
346 r.end = max;
347 continue 'INNER;
348 }
349 }
350 if let Some(ascii_range) = r.as_ascii() {
351 return Some(Utf8Sequence::One(ascii_range));
352 }
353 for i in 1..MAX_UTF8_BYTES {
354 let m = (1 << (6 * i)) - 1;
355 if (r.start & !m) != (r.end & !m) {
356 if (r.start & m) != 0 {
357 self.push((r.start | m) + 1, r.end);
358 r.end = r.start | m;
359 continue 'INNER;
360 }
361 if (r.end & m) != m {
362 self.push(r.end & !m, r.end);
363 r.end = (r.end & !m) - 1;
364 continue 'INNER;
365 }
366 }
367 }
368 let mut start = [0; MAX_UTF8_BYTES];
369 let mut end = [0; MAX_UTF8_BYTES];
370 let n = r.encode(&mut start, &mut end);
371 return Some(Utf8Sequence::from_encoded_range(
372 &start[0..n], &end[0..n]));
373 }
374 }
375 None
376 }
377}
378
379impl ScalarRange {
380 /// split splits this range if it overlaps with a surrogate codepoint.
381 ///
382 /// Either or both ranges may be invalid.
383 fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
384 if self.start < 0xE000 && self.end > 0xD7FF {
385 Some((ScalarRange {
386 start: self.start,
387 end: 0xD7FF,
388 }, ScalarRange {
389 start: 0xE000,
390 end: self.end,
391 }))
392 } else {
393 None
394 }
395 }
396
397 /// is_valid returns true if and only if start <= end.
398 fn is_valid(&self) -> bool {
399 self.start <= self.end
400 }
401
402 /// as_ascii returns this range as a Utf8Range if and only if all scalar
403 /// values in this range can be encoded as a single byte.
404 fn as_ascii(&self) -> Option<Utf8Range> {
405 if self.is_ascii() {
406 Some(Utf8Range::new(self.start as u8, self.end as u8))
407 } else {
408 None
409 }
410 }
411
412 /// is_ascii returns true if the range is ASCII only (i.e., takes a single
413 /// byte to encode any scalar value).
414 fn is_ascii(&self) -> bool {
415 self.is_valid() && self.end <= 0x7f
416 }
417
418 /// encode writes the UTF-8 encoding of the start and end of this range
419 /// to the corresponding destination slices.
420 ///
421 /// The slices should have room for at least `MAX_UTF8_BYTES`.
422 fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
423 let cs = char::from_u32(self.start).unwrap();
424 let ce = char::from_u32(self.end).unwrap();
425 let n = encode_utf8(cs, start).unwrap();
426 let m = encode_utf8(ce, end).unwrap();
427 assert_eq!(n, m);
428 n
429 }
430}
431
432fn max_scalar_value(nbytes: usize) -> u32 {
433 match nbytes {
434 1 => 0x007F,
435 2 => 0x07FF,
436 3 => 0xFFFF,
437 4 => 0x10FFFF,
438 _ => unreachable!("invalid UTF-8 byte sequence size"),
439 }
440}
441
442#[cfg(test)]
443mod tests {
444 use std::char;
445
446 use quickcheck::{TestResult, quickcheck};
447
448 use {Utf8Range, Utf8Sequences};
449
450 fn rutf8(s: u8, e: u8) -> Utf8Range {
451 Utf8Range::new(s, e)
452 }
453
454 fn never_accepts_surrogate_codepoints(start: char, end: char) {
455 for cp in 0xD800..0xE000 {
456 let buf = encode_surrogate(cp);
457 for r in Utf8Sequences::new(start, end) {
458 if r.matches(&buf) {
459 panic!(
460 "Sequence ({:X}, {:X}) contains range {:?}, \
461 which matches surrogate code point {:X} \
462 with encoded bytes {:?}",
463 start as u32, end as u32, r, cp, buf,
464 );
465 }
466 }
467 }
468 }
469
470 #[test]
471 fn codepoints_no_surrogates() {
472 never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
473 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
474 never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
475 never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
476 never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
477 }
478
479 #[test]
480 fn single_codepoint_one_sequence() {
481 // Tests that every range of scalar values that contains a single
482 // scalar value is recognized by one sequence of byte ranges.
483 for i in 0x0..(0x10FFFF + 1) {
484 let c = match char::from_u32(i) {
485 None => continue,
486 Some(c) => c,
487 };
488 let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
489 assert_eq!(seqs.len(), 1);
490 }
491 }
492
493 #[test]
494 fn qc_codepoints_no_surrogate() {
495 fn p(s: char, e: char) -> TestResult {
496 if s > e {
497 return TestResult::discard();
498 }
499 never_accepts_surrogate_codepoints(s, e);
500 TestResult::passed()
501 }
502 quickcheck(p as fn(char, char) -> TestResult);
503 }
504
505 #[test]
506 fn bmp() {
507 use Utf8Sequence::*;
508
509 let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}')
510 .collect::<Vec<_>>();
511 assert_eq!(seqs, vec![
512 One(rutf8(0x0, 0x7F)),
513 Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
514 Three([rutf8(0xE0, 0xE0), rutf8(0xA0, 0xBF), rutf8(0x80, 0xBF)]),
515 Three([rutf8(0xE1, 0xEC), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
516 Three([rutf8(0xED, 0xED), rutf8(0x80, 0x9F), rutf8(0x80, 0xBF)]),
517 Three([rutf8(0xEE, 0xEF), rutf8(0x80, 0xBF), rutf8(0x80, 0xBF)]),
518 ]);
519 }
520
521 fn encode_surrogate(cp: u32) -> [u8; 3] {
522 const TAG_CONT: u8 = 0b1000_0000;
523 const TAG_THREE_B: u8 = 0b1110_0000;
524
525 assert!(0xD800 <= cp && cp < 0xE000);
526 let mut dst = [0; 3];
527 dst[0] = (cp >> 12 & 0x0F) as u8 | TAG_THREE_B;
528 dst[1] = (cp >> 6 & 0x3F) as u8 | TAG_CONT;
529 dst[2] = (cp & 0x3F) as u8 | TAG_CONT;
530 dst
531 }
532}