1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
use crate::charutils::*;
use crate::error::*;
//use crate::Result;

// begin copypasta
// These chars yield themselves: " \ /
// b -> backspace, f -> formfeed, n -> newline, r -> cr, t -> horizontal tab
// u not handled in this table as it's complex
pub const ESCAPE_MAP: [u8; 256] = [
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x0.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0x2f, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, // 0x4.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x5c, 0, 0, 0, // 0x5.
    0, 0, 0x08, 0, 0, 0, 0x0c, 0, 0, 0, 0, 0, 0, 0, 0x0a, 0, // 0x6.
    0, 0, 0x0d, 0, 0x09, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x7.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];

// handle a unicode codepoint
// write appropriate values into dest
// src will advance 6 bytes or 12 bytes
// dest will advance a variable amount (return via pointer)
// return true if the unicode codepoint was valid
// We work in little-endian then swap at write time

#[cfg_attr(not(feature = "no-inline"), inline(always))]
pub fn handle_unicode_codepoint(
    mut src_ptr: &[u8],
    dst_ptr: &mut [u8],
) -> Result<(usize, usize), ErrorType> {
    // hex_to_u32_nocheck fills high 16 bits of the return value with 1s if the
    // conversion isn't valid; we defer the check for this to inside the
    // multilingual plane check
    let mut code_point: u32 = hex_to_u32_nocheck(unsafe { src_ptr.get_unchecked(2..) });
    src_ptr = unsafe { src_ptr.get_unchecked(6..) };
    let mut src_offset = 6;
    // check for low surrogate for characters outside the Basic
    // Multilingual Plane.
    if code_point >= 0xd800 && code_point < 0xdc00 {
        if (unsafe { *src_ptr.get_unchecked(0) } != b'\\')
            || unsafe { *src_ptr.get_unchecked(1) } != b'u'
        {
            return Ok((0, src_offset));
        }

        let code_point_2: u32 = hex_to_u32_nocheck(unsafe { src_ptr.get_unchecked(2..) });

        // if the first code point is invalid we will get here, as we will go past
        // the check for being outside the Basic Multilingual plane. If we don't
        // find a \u immediately afterwards we fail out anyhow, but if we do,
        // this check catches both the case of the first code point being invalid
        // or the second code point being invalid.
        if ((code_point | code_point_2) >> 16) != 0 {
            return Ok((0, src_offset));
        }
        let c1 = if let Some(c) = code_point.checked_sub(0xd800) {
            c
        } else {
            return Err(ErrorType::InvalidUTF8);
        };
        let c2 = if let Some(c) = code_point_2.checked_sub(0xdc00) {
            c
        } else {
            return Err(ErrorType::InvalidUTF8);
        };
        code_point = ((c1 << 10) | c2) + 0x10000;
        src_offset += 6;
    }
    let offset: usize = codepoint_to_utf8(code_point, dst_ptr);
    Ok((offset, src_offset))
}