simd_json/
stringparse.rs

1use std::ops::Range;
2
3use crate::charutils::{codepoint_to_utf8, hex_to_u32_nocheck};
4use crate::error::ErrorType;
5use crate::safer_unchecked::GetSaferUnchecked;
6
/// Lookup table for single-character JSON escapes, indexed by the byte that
/// follows the backslash.
///
/// - `"`, `\` and `/` map to themselves.
/// - `b` -> backspace, `f` -> form feed, `n` -> newline, `r` -> carriage
///   return, `t` -> horizontal tab.
/// - Every other entry, including `u`, is 0. `\u` escapes are not a
///   one-byte substitution, so they are not represented in this table.
pub(crate) const ESCAPE_MAP: [u8; 256] = {
    let mut table = [0_u8; 256];

    // Escapes that stand for themselves.
    table[b'"' as usize] = b'"';
    table[b'/' as usize] = b'/';
    table[b'\\' as usize] = b'\\';

    // Escapes that stand for control characters.
    table[b'b' as usize] = 0x08;
    table[b'f' as usize] = 0x0c;
    table[b'n' as usize] = b'\n';
    table[b'r' as usize] = b'\r';
    table[b't' as usize] = b'\t';

    table
};
24
/// UTF-16 high (leading) surrogate code units: `0xD800` up to, but not
/// including, `0xDC00`.
const HIGH_SURROGATES: Range<u32> = Range {
    start: 0xd800,
    end: 0xdc00,
};
/// UTF-16 low (trailing) surrogate code units: `0xDC00` up to, but not
/// including, `0xE000`.
const LOW_SURROGATES: Range<u32> = Range {
    start: 0xdc00,
    end: 0xe000,
};
27
28/// handle a unicode codepoint
29/// write appropriate values into dest
30#[cfg_attr(not(feature = "no-inline"), inline)]
31#[allow(dead_code)]
32pub(crate) fn handle_unicode_codepoint(
33    src_ptr: &[u8],
34    dst_ptr: &mut [u8],
35) -> Result<(usize, usize), ErrorType> {
36    let (code_point, src_offset) = get_unicode_codepoint(src_ptr)?;
37    let offset: usize = codepoint_to_utf8(code_point, dst_ptr);
38    Ok((offset, src_offset))
39}
40
41/// handle a unicode codepoint
42/// write appropriate values into dest
43/// src will advance 6 bytes or 12 bytes
44/// dest will advance a variable amount (return via pointer)
45/// return true if the unicode codepoint was valid
46/// We work in little-endian then swap at write time
47#[cfg_attr(not(feature = "no-inline"), inline)]
48pub(crate) fn get_unicode_codepoint(mut src_ptr: &[u8]) -> Result<(u32, usize), ErrorType> {
49    // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
50    // conversion isn't valid; we defer the check for this to inside the
51    // multilingual plane check
52    let mut code_point: u32 = hex_to_u32_nocheck(unsafe { src_ptr.get_kinda_unchecked(2..) });
53    src_ptr = unsafe { src_ptr.get_kinda_unchecked(6..) };
54    let mut src_offset = 6;
55    // check for low surrogate for characters outside the Basic
56    // Multilingual Plane.
57    if HIGH_SURROGATES.contains(&code_point) {
58        if (unsafe { *src_ptr.get_kinda_unchecked(0) } != b'\\')
59            || unsafe { *src_ptr.get_kinda_unchecked(1) } != b'u'
60        {
61            return Ok((0, src_offset));
62        }
63
64        let code_point_2: u32 = hex_to_u32_nocheck(unsafe { src_ptr.get_kinda_unchecked(2..) });
65
66        // if the first code point is invalid we will get here, as we will go past
67        // the check for being outside the Basic Multilingual plane. If we don't
68        // find a \u immediately afterwards we fail out anyhow, but if we do,
69        // this check catches both the case of the first code point being invalid
70        // or the second code point being invalid.
71        if ((code_point | code_point_2) >> 16) != 0 {
72            return Ok((0, src_offset));
73        }
74        let Some(c1) = code_point.checked_sub(0xd800) else {
75            return Err(ErrorType::InvalidUtf8);
76        };
77        let Some(c2) = code_point_2.checked_sub(0xdc00) else {
78            return Err(ErrorType::InvalidUtf8);
79        };
80        code_point = ((c1 << 10) | c2) + 0x10000;
81        src_offset += 6;
82    } else if LOW_SURROGATES.contains(&code_point) {
83        // This is a low surrogate on it's own, which is invalid.
84        return Err(ErrorType::InvalidUtf8);
85    }
86    Ok((code_point, src_offset))
87}