utf8_width/
lib.rs

/*!
# UTF-8 Width

Determines the width (in bytes) of a UTF-8 character from its first byte.

References: https://tools.ietf.org/html/rfc3629

## Examples

```rust
assert_eq!(1, utf8_width::get_width(b'1'));
assert_eq!(3, utf8_width::get_width("中".as_bytes()[0]));
```

## Benchmark

```bash
cargo bench
```
*/
21
22#![no_std]
23
/// Lowest first byte of a one-byte UTF-8 character.
pub const MIN_1: u8 = 0x00;
/// Highest first byte of a one-byte UTF-8 character.
pub const MAX_1: u8 = 0x7F;

/// Lowest first byte of a two-byte UTF-8 character.
pub const MIN_2: u8 = 0xC2;
/// Highest first byte of a two-byte UTF-8 character.
pub const MAX_2: u8 = 0xDF;

/// Lowest first byte of a three-byte UTF-8 character.
pub const MIN_3: u8 = 0xE0;
/// Highest first byte of a three-byte UTF-8 character.
pub const MAX_3: u8 = 0xEF;

/// Lowest first byte of a four-byte UTF-8 character.
pub const MIN_4: u8 = 0xF0;
/// Highest first byte of a four-byte UTF-8 character.
pub const MAX_4: u8 = 0xF4;

/// Start of the lower range of bytes that cannot begin a UTF-8 character
/// (the gap between the one-byte and two-byte ranges).
pub const MIN_0_1: u8 = 0x80;
/// End of the lower range of bytes that cannot begin a UTF-8 character.
pub const MAX_0_1: u8 = 0xC1;

/// Start of the upper range of bytes that cannot begin a UTF-8 character
/// (everything above the four-byte range).
pub const MIN_0_2: u8 = 0xF5;
/// End of the upper range of bytes that cannot begin a UTF-8 character.
pub const MAX_0_2: u8 = 0xFF;
36
37#[inline]
38pub const fn is_width_1(byte: u8) -> bool {
39    byte <= MAX_1 // no need to check `MIN_1 <= byte`
40}
41
42#[inline]
43pub const fn is_width_2(byte: u8) -> bool {
44    byte >= MIN_2 && byte <= MAX_2
45}
46
47#[inline]
48pub const fn is_width_3(byte: u8) -> bool {
49    byte >= MIN_3 && byte <= MAX_3
50}
51
52#[inline]
53pub const fn is_width_4(byte: u8) -> bool {
54    byte >= MIN_4 && byte <= MAX_4
55}
56
57#[inline]
58pub const fn is_width_0(byte: u8) -> bool {
59    byte >= MIN_0_1 && byte <= MAX_0_1 || MIN_0_2 <= byte // no need to check `byte <= MAX_0_2`
60}
61
62/// Given a first byte, determine how many bytes are in this UTF-8 character. If the UTF-8 character is invalid, return `0`; otherwise, return `1` to `4`.
63#[inline]
64pub const fn get_width(byte: u8) -> usize {
65    if is_width_1(byte) {
66        1
67    } else if is_width_2(byte) {
68        2
69    } else if byte <= MAX_3 {
70        // no need to check `MIN_3 <= byte`
71        3
72    } else if byte <= MAX_4 {
73        // no need to check `MIN_4 <= byte`
74        4
75    } else {
76        0
77    }
78}
79
80/// *Assuming the input first byte is from a valid UTF-8 character*, determine how many bytes are in this UTF-8 character. It returns `1` to `4`.
81///
82/// # Safety
83///
84/// You must ensure that the input byte is a valid UTF-8 first byte on your own.
85#[inline]
86pub const unsafe fn get_width_assume_valid(byte: u8) -> usize {
87    if byte <= MAX_1 {
88        1
89    } else if byte <= MAX_2 {
90        2
91    } else if byte <= MAX_3 {
92        3
93    } else {
94        4
95    }
96}