arf_strings/
rustix.rs

1use std::borrow::Cow;
2use std::ffi::{CStr, CString, OsStr};
3#[cfg(unix)]
4use std::os::unix::ffi::OsStrExt;
5#[cfg(target_os = "wasi")]
6use std::os::wasi::ffi::OsStrExt;
7use std::{io, str};
8
9/// Convert a byte sequence which is either plain UTF-8 or an ARF encoding into
10/// a `CString` ready for use in POSIX-style APIs.
11pub fn bytes_to_host(bytes: &[u8]) -> io::Result<CString> {
12    let s = str::from_utf8(bytes).map_err(|_| encoding_error())?;
13    str_to_host(s)
14}
15
16/// Convert a `&str` which is either plain UTF-8 or an ARF encoding into a
17/// `CString` ready for use in POSIX-style APIs.
18pub fn str_to_host(s: &str) -> io::Result<CString> {
19    match CString::new(s) {
20        Ok(c_string) => Ok(c_string),
21        Err(e) => from_arf(s, e.nul_position()),
22    }
23}
24
25/// Convert an `&OsStr` produced by POSIX-style APIs into a `Cow<str>` which
26/// is either plain UTF-8 or an ARF encoding. Returns an error if the input
27/// string contains NUL bytes.
28pub fn host_os_str_to_str(host: &OsStr) -> io::Result<Cow<str>> {
29    if host.as_bytes().contains(&b'\0') {
30        return Err(encoding_error());
31    }
32    Ok(if let Ok(s) = str::from_utf8(host.as_bytes()) {
33        Cow::Borrowed(s)
34    } else {
35        Cow::Owned(to_arf(host.as_bytes()))
36    })
37}
38
39/// Convert an `&OsStr` produced by POSIX-style APIs into a `Cow<[u8]>` which
40/// is either plain UTF-8 or an ARF encoding. Returns an error if the input
41/// string contains NUL bytes.
42pub fn host_os_str_to_bytes(host: &OsStr) -> io::Result<Cow<[u8]>> {
43    Ok(match host_os_str_to_str(host)? {
44        Cow::Borrowed(b) => Cow::Borrowed(b.as_bytes()),
45        Cow::Owned(b) => Cow::Owned(b.into_bytes()),
46    })
47}
48
49/// Convert an `&CStr` produced by POSIX-style APIs into a `Cow<str>` which
50/// is either plain UTF-8 or an ARF encoding.
51pub fn host_c_str_to_str(host: &CStr) -> Cow<str> {
52    if let Ok(s) = str::from_utf8(host.to_bytes()) {
53        Cow::Borrowed(s)
54    } else {
55        Cow::Owned(to_arf(host.to_bytes()))
56    }
57}
58
59/// Convert an `&CStr` produced by POSIX-style APIs into a `Cow<[u8]>` which
60/// is either plain UTF-8 or an ARF encoding.
61pub fn host_c_str_to_bytes(host: &CStr) -> Cow<[u8]> {
62    let bytes = host_c_str_to_str(host);
63    match bytes {
64        Cow::Borrowed(b) => Cow::Borrowed(b.as_bytes()),
65        Cow::Owned(b) => Cow::Owned(b.into_bytes()),
66    }
67}
68
69/// Slow path for `str_to_host`.
70#[cold]
71fn from_arf(s: &str, nul: usize) -> io::Result<CString> {
72    if !s.starts_with('\u{feff}') {
73        return Err(encoding_error());
74    }
75
76    let mut lossy = s.bytes().skip('\u{feff}'.len_utf8());
77    let mut nul_escaped = s.bytes().skip(nul + 1);
78    let mut any_invalid = false;
79    let mut vec = Vec::new();
80    while let Some(b) = nul_escaped.next() {
81        if b == b'\0' {
82            let more = nul_escaped.next().ok_or_else(encoding_error)?;
83            if (more & 0x80) != 0 {
84                return Err(encoding_error());
85            }
86            // Test for U+FFFD.
87            let l0 = lossy.next().ok_or_else(encoding_error)?;
88            let l1 = lossy.next().ok_or_else(encoding_error)?;
89            let l2 = lossy.next().ok_or_else(encoding_error)?;
90            if [l0, l1, l2] != [0xef, 0xbf, 0xbd] {
91                return Err(encoding_error());
92            }
93            any_invalid = true;
94            vec.push(more | 0x80);
95        } else {
96            if lossy.next() != Some(b) {
97                return Err(encoding_error());
98            }
99            vec.push(b);
100        }
101    }
102    if !any_invalid {
103        return Err(encoding_error());
104    }
105    if lossy.next() != Some(b'\0') {
106        return Err(encoding_error());
107    }
108
109    // Validation succeeded.
110    Ok(unsafe { CString::from_vec_unchecked(vec) })
111}
112
/// Slow path for `host_os_str_to_str` and `host_c_str_to_str`: encode host
/// bytes as an ARF string.
///
/// The result is laid out as U+FEFF, a lossy portion, a NUL, and a
/// NUL-escaped portion:
///
/// - the lossy portion is `bytes` with every invalid byte replaced by U+FFFD;
/// - the NUL-escaped portion is `bytes` with every invalid byte replaced by a
///   NUL followed by that byte with its high bit cleared.
///
/// Invalid bytes are handled one at a time, so a truncated multi-byte
/// sequence yields one replacement/escape per byte. This function does not
/// check `bytes` for NUL bytes itself.
#[cold]
fn to_arf(bytes: &[u8]) -> String {
    const BOM: char = '\u{feff}';

    // Both portions are at least as long as the input, so reserve that much
    // up front; they are built side by side in a single validation pass.
    let mut data = String::with_capacity(BOM.len_utf8() + bytes.len());
    let mut escaped = String::with_capacity(bytes.len());

    data.push(BOM);

    // This loop and `unsafe` follow the example in the documentation:
    // <https://doc.rust-lang.org/std/str/struct.Utf8Error.html#examples>
    let mut input = bytes;
    loop {
        match str::from_utf8(input) {
            Ok(valid) => {
                data.push_str(valid);
                escaped.push_str(valid);
                break;
            }
            Err(error) => {
                let (valid, after_valid) = input.split_at(error.valid_up_to());

                // SAFETY: `valid_up_to` is the length of the prefix of
                // `input` that `from_utf8` has just verified to be UTF-8.
                let valid = unsafe { str::from_utf8_unchecked(valid) };
                data.push_str(valid);
                escaped.push_str(valid);

                data.push('\u{FFFD}');

                // Consume exactly one offending byte and resume validation
                // right after it.
                match after_valid.split_first() {
                    Some((&byte, remaining)) => {
                        escaped.push('\0');
                        escaped.push(char::from(byte & 0x7f));
                        input = remaining;
                    }
                    None => break,
                }
            }
        }
    }

    // Separator between the lossy and the NUL-escaped portions.
    data.reserve(1 + escaped.len());
    data.push('\0');
    data.push_str(&escaped);

    data
}
172
173#[cold]
174fn encoding_error() -> io::Error {
175    ::rustix::io::Errno::ILSEQ.into()
176}
177
/// NUL-free UTF-8 passes through `str_to_host` byte-for-byte, including text
/// that merely contains U+FFFD or begins with U+FEFF.
#[test]
fn utf8_inputs() {
    let inputs = [
        "",
        "f",
        "foo",
        "\u{fffd}",
        "\u{fffd}foo",
        "\u{feff}foo",
    ];
    for &text in &inputs {
        assert_eq!(str_to_host(text).unwrap().to_bytes(), text.as_bytes());
    }
}
196
/// Well-formed ARF strings decode to the original host bytes.
#[test]
fn arf_inputs() {
    let cases: [(&str, &[u8]); 2] = [
        (
            "\u{feff}hello\u{fffd}world\0hello\0\x05world",
            b"hello\x85world",
        ),
        ("\u{feff}hello\u{fffd}\0hello\0\x05", b"hello\x85"),
    ];
    for &(arf, expected) in &cases {
        assert_eq!(str_to_host(arf).unwrap().to_bytes(), expected);
    }
}
212
/// Byte sequences that are not valid UTF-8 are rejected by `bytes_to_host`.
#[test]
fn errors_from_bytes() {
    let invalid: [&[u8]; 2] = [b"\xfe", b"\xc0\xff"];
    for &bytes in &invalid {
        assert!(bytes_to_host(bytes).is_err());
    }
}
218
/// Strings that contain a NUL but are not well-formed ARF are rejected by
/// `str_to_host`.
#[test]
fn errors_from_str() {
    let malformed = [
        // No escape anywhere in the NUL-escaped portion.
        "\u{feff}hello world\0hello world",
        // An escape whose lossy counterpart is not U+FFFD.
        "\u{feff}hello world\0\0hello world\0",
        "\u{feff}hello\u{fffd}world\0\0hello\0\x05world\0",
        // Starts with U+FFFE rather than U+FEFF.
        "\u{fffe}hello\u{fffd}world\0hello\0\x05world",
        // Escape NUL with nothing after it.
        "\u{feff}hello\u{fffd}\0hello\0",
    ];
    for &arf in &malformed {
        assert!(str_to_host(arf).is_err());
    }
}
227
/// Valid UTF-8 host strings convert to themselves, for both `OsStr` and
/// `CStr` inputs.
#[test]
fn valid_utf8() {
    for &text in &["", "foo"] {
        let host = OsStr::from_bytes(text.as_bytes());
        assert_eq!(host_os_str_to_str(host).unwrap(), text);
    }

    // Same thing, now with `CStr`s.
    let c_cases: [(&[u8], &str); 2] = [(b"\0", ""), (b"foo\0", "foo")];
    for &(with_nul, text) in &c_cases {
        let host = CStr::from_bytes_with_nul(with_nul).unwrap();
        assert_eq!(host_c_str_to_str(host), text);
    }
}
246
/// Host strings are converted to the expected text, mostly ARF encodings of
/// invalid UTF-8, for both `OsStr` and `CStr` inputs.
#[test]
fn not_utf8() {
    // Each input carries a trailing NUL so that one table can drive both the
    // `OsStr` checks (terminator stripped) and the `CStr` checks.
    let cases: [(&[u8], &str); 7] = [
        (b"\xfe\0", "\u{feff}\u{fffd}\0\0\u{7e}"),
        (b"\xc0\xff\0", "\u{feff}\u{fffd}\u{fffd}\0\0\u{40}\0\u{7f}"),
        // A lone U+FEFF is valid UTF-8 and comes back unchanged.
        (b"\xef\xbb\xbf\0", "\u{feff}"),
        (
            b"\xef\xbb\xbf\xfd\0",
            "\u{feff}\u{feff}\u{fffd}\0\u{feff}\0\x7d",
        ),
        (b"\xe2\x98\0", "\u{feff}\u{fffd}\u{fffd}\0\0\u{62}\0\u{18}"),
        (b"\xf0\x9f\0", "\u{feff}\u{fffd}\u{fffd}\0\0\u{70}\0\u{1f}"),
        (
            b"\xf0\x9f\x92\0",
            "\u{feff}\u{fffd}\u{fffd}\u{fffd}\0\0\u{70}\0\u{1f}\0\u{12}",
        ),
    ];

    for &(with_nul, expected) in &cases {
        let body = &with_nul[..with_nul.len() - 1];
        assert_eq!(
            host_os_str_to_str(OsStr::from_bytes(body)).unwrap(),
            expected
        );
    }

    // Same thing, now with `CStr`s.
    for &(with_nul, expected) in &cases {
        let host = CStr::from_bytes_with_nul(with_nul).unwrap();
        assert_eq!(host_c_str_to_str(host), expected);
    }
}
308
/// Converting in one direction and then back yields the starting value, for
/// plain text as well as for host bytes that are not valid UTF-8.
#[test]
fn round_trip() {
    // Text -> host -> text.
    for &text in &["", "hello"] {
        let host = bytes_to_host(text.as_bytes()).unwrap();
        assert_eq!(
            host_os_str_to_str(OsStr::from_bytes(host.as_bytes())).unwrap(),
            text
        );
    }

    // Host bytes -> text -> host bytes. Each entry carries a trailing NUL so
    // the same table serves the `CStr` half below.
    let cases: [&[u8]; 5] = [
        b"hello\0",
        b"h\xc0ello\xc1\0",
        b"\xf5\xff\0",
        b"\0",
        b"\xe6\x96\0",
    ];

    for &with_nul in &cases {
        let body = &with_nul[..with_nul.len() - 1];
        let text = host_os_str_to_str(OsStr::from_bytes(body)).unwrap();
        assert_eq!(str_to_host(&text).unwrap().as_bytes(), body);
    }

    // Same thing, now with `CStr`s.
    for &with_nul in &cases {
        let body = &with_nul[..with_nul.len() - 1];
        let text = host_c_str_to_str(CStr::from_bytes_with_nul(with_nul).unwrap());
        assert_eq!(str_to_host(&text).unwrap().as_bytes(), body);
    }
}