1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
// Copyright (C) 2019-2023 Aleo Systems Inc.
// This file is part of the snarkVM library.

// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at:
// http://www.apache.org/licenses/LICENSE-2.0

// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/// From https://github.com/Geal/nom/blob/main/examples/string.rs
pub mod string_parser {
    //! This module shows how to parse an escaped string. The
    //! rules for the string are similar to JSON and Rust. A string is:
    //!
    //! - Enclosed by double quotes
    //! - Can contain any raw unescaped supported code point (see `is_char_supported`) besides \ and "
    //! - Matches the following escape sequences: \b, \f, \n, \r, \t, \", \\, \/
    //! - Matches code points like Rust: \u{XXXX}, where XXXX can be up to 6
    //!   hex characters
    //! - an escape followed by whitespace consumes all whitespace between the
    //!   escape and the next non-whitespace character

    use nom::{
        branch::alt,
        bytes::streaming::{is_not, take_while_m_n},
        character::streaming::{char, multispace1},
        combinator::{map, map_opt, map_res, value, verify},
        error::{ErrorKind, FromExternalError, ParseError},
        multi::fold_many0,
        sequence::{delimited, preceded},
        Err::Error,
        IResult,
    };

    /// Checks for supported code points.
    ///
    /// Returns `true` if `c` may appear in the text being parsed, and `false` otherwise.
    /// This is the exact negation of `is_char_unsupported`.
    ///
    /// We regard the following characters as safe:
    /// - Horizontal tab (code 9).
    /// - Line feed (code 10).
    /// - Carriage return (code 13).
    /// - Space (code 32).
    /// - Visible ASCII (codes 33-126).
    /// - Non-ASCII Unicode scalar values (codes 128+) except
    ///   * bidi embeddings, overrides and their termination (codes U+202A-U+202E)
    ///   * isolates (codes U+2066-U+2069)
    ///
    /// The Unicode bidi characters are well-known for presenting Trojan Source dangers.
    /// The ASCII backspace (code 8) can also be used to make text look different from what it is,
    /// and a similar danger may apply to delete (code 127).
    /// Other ASCII control characters
    /// (except for horizontal tab, space, line feed, and carriage return, which are allowed)
    /// may or may not present dangers, but we see no good reason for allowing them.
    /// At some point we may want to disallow additional non-ASCII characters,
    /// if we see no good reason to allow them.
    ///
    /// Note that we say 'Unicode scalar values' above,
    /// because we read UTF-8-decoded characters,
    /// and thus we will never encounter surrogate code points,
    /// and we do not need to explicitly exclude them in this function.
    pub fn is_char_supported(c: char) -> bool {
        !is_char_unsupported(c)
    }

    /// Checks for unsupported "invisible" code points.
    ///
    /// Returns `true` for exactly these code points, and `false` for everything else:
    /// - ASCII control characters 0-31, except horizontal tab (9), line feed (10)
    ///   and carriage return (13);
    /// - delete (127);
    /// - bidi embeddings, overrides and their termination (U+202A-U+202E);
    /// - bidi isolates (U+2066-U+2069).
    fn is_char_unsupported(c: char) -> bool {
        matches!(
            c as u32,
            // Control characters other than tab, line feed and carriage return.
            0..=8 | 11 | 12 | 14..=31
            // Delete.
            | 127
            // Bidi embeddings, overrides and their termination.
            | 0x202a..=0x202e
            // Bidi isolates.
            | 0x2066..=0x2069
        )
    }

    /// Parses the `u{XXXX}` part of a Unicode escape, where `XXXX` is 1 to 6
    /// hexadecimal digits, and returns the character it denotes.
    ///
    /// The leading backslash is not handled here; `parse_escaped_char` consumes
    /// it before trying this parser, so that together they accept sequences
    /// such as `\u{00AC}`.
    ///
    /// Fails if the digits do not form a `u32`, or if that `u32` is not a valid
    /// `char` according to `std::char::from_u32`.
    fn parse_unicode<'a, E>(input: &'a str) -> IResult<&'a str, char, E>
    where
        E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>,
    {
        // Innermost piece: between 1 and 6 (inclusive) ASCII hexadecimal digits.
        let hex_digits = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit());

        // Wrap the digits in braces; `delimited` discards the braces and keeps
        // only the digits in between.
        let braced_hex = delimited(char('{'), hex_digits, char('}'));

        // Require the `u` marker in front; `preceded` discards the marker and
        // yields whatever the body parser produced.
        let escape_body = preceded(char('u'), braced_hex);

        // Turn the digit text into a number. `map_res` converts a failed
        // conversion into a parse error.
        let code_point = map_res(escape_body, |digits| u32::from_str_radix(digits, 16));

        // Not every `u32` is a Unicode scalar value, so the last step is
        // fallible too: `map_opt` reports an error when `from_u32` yields `None`.
        map_opt(code_point, std::char::from_u32)(input)
    }

    /// Parses one escape sequence and returns the character it stands for:
    /// `\u{XXXX}`, `\n`, `\r`, `\t`, `\b`, `\f`, `\\`, `\/` or `\"`.
    fn parse_escaped_char<'a, E>(input: &'a str) -> IResult<&'a str, char, E>
    where
        E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>,
    {
        // Every escape sequence starts with a backslash.
        let backslash = char('\\');

        // What may follow the backslash. `alt` tries the alternatives in the
        // order listed and returns the first success. Each `value` parser
        // recognizes a marker character and yields the character it denotes.
        let escape_code = alt((
            parse_unicode,
            value('\n', char('n')),
            value('\r', char('r')),
            value('\t', char('t')),
            // Backspace.
            value('\x08', char('b')),
            // Form feed.
            value('\x0C', char('f')),
            value('\\', char('\\')),
            value('/', char('/')),
            value('"', char('"')),
        ));

        // Drop the backslash and keep the decoded character.
        preceded(backslash, escape_code)(input)
    }

    /// Parses a backslash followed by one or more whitespace characters and
    /// returns the whitespace that was consumed. `parse_fragment` uses this to
    /// discard escaped whitespace.
    fn parse_escaped_whitespace<'a, E: ParseError<&'a str>>(input: &'a str) -> IResult<&'a str, &'a str, E> {
        let backslash = char('\\');
        // `preceded` throws the backslash away and yields the whitespace run.
        preceded(backslash, multispace1)(input)
    }

    /// Parse a non-empty block of text that doesn't include \ or "
    fn parse_literal<'a, E: ParseError<&'a str>>(input: &'a str) -> IResult<&'a str, &'a str, E> {
        // Return an error if the literal contains an unsupported code point.
        if input.chars().any(is_char_unsupported) {
            return Err(Error(E::from_error_kind("String literal contains invalid codepoint", ErrorKind::Char)));
        }

        // `is_not` parses a string of 0 or more characters that aren't one of the
        // given characters.
        let not_quote_slash = is_not("\"\\");

        // `verify` runs a parser, then runs a verification function on the output of
        // the parser. The verification function accepts out output only if it
        // returns true. In this case, we want to ensure that the output of is_not
        // is non-empty.
        verify(not_quote_slash, |s: &str| !s.is_empty())(input)
    }

    /// A string fragment contains a fragment of a string being parsed: either
    /// a non-empty Literal (a series of non-escaped characters), a single
    /// parsed escaped character, or a block of escaped whitespace.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    enum StringFragment<'a> {
        /// A non-empty run of non-escaped characters, borrowed from the input.
        Literal(&'a str),
        /// The character denoted by a single escape sequence (e.g. `\n` or `\u{00AC}`).
        EscapedChar(char),
        /// A backslash followed by whitespace; it contributes nothing to the parsed string.
        EscapedWS,
    }

    /// Parses the next piece of string content as a `StringFragment`, trying
    /// `parse_literal`, then `parse_escaped_char`, then
    /// `parse_escaped_whitespace`.
    fn parse_fragment<'a, E>(input: &'a str) -> IResult<&'a str, StringFragment<'a>, E>
    where
        E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>,
    {
        // `map` runs a parser and then applies a function to its output; here
        // the function is the enum variant constructor.
        let literal = map(parse_literal, StringFragment::Literal);
        let escaped_char = map(parse_escaped_char, StringFragment::EscapedChar);
        // The consumed whitespace is irrelevant, so `value` replaces it with
        // the unit-like variant.
        let escaped_whitespace = value(StringFragment::EscapedWS, parse_escaped_whitespace);

        // The alternatives are tried in this order; the first success wins.
        alt((literal, escaped_char, escaped_whitespace))(input)
    }

    /// Parse a string. Use a loop of parse_fragment and push all of the fragments
    /// into an output string.
    pub fn parse_string<'a, E>(input: &'a str) -> IResult<&'a str, String, E>
    where
        E: ParseError<&'a str> + FromExternalError<&'a str, std::num::ParseIntError>,
    {
        // fold_many0 is the equivalent of iterator::fold. It runs a parser in a loop,
        // and for each output value, calls a folding function on each output value.
        let build_string = fold_many0(
            // Our parser function– parses a single string fragment
            parse_fragment,
            // Our init value, an empty string
            String::new,
            // Our folding function. For each fragment, append the fragment to the
            // string.
            |mut string, fragment| {
                match fragment {
                    StringFragment::Literal(s) => string.push_str(s),
                    StringFragment::EscapedChar(c) => string.push(c),
                    StringFragment::EscapedWS => {}
                }
                string
            },
        );

        // Finally, parse the string. Note that, if `build_string` could accept a raw
        // " character, the closing delimiter " would never match. When using
        // `delimited` with a looping parser (like fold_many0), be sure that the
        // loop won't accidentally match your closing delimiter!
        delimited(char('"'), build_string, char('"'))(input)
    }
}

#[test]
fn test_parse_string() {
    // Fixes the parser's generic error type through `crate::ParserResult`, so the checks
    // below do not have to spell out
    // `string_parser::parse_string::<nom::error::VerboseError<&str>>` each time.
    fn parse_string_wrapper(input: &str) -> crate::ParserResult<String> {
        string_parser::parse_string(input)
    }

    // Correct string literals: (source text, expected remaining input, expected contents).
    let accepted = [
        ("\"\"", "", ""),
        ("\"abc\"", "", "abc"),
        ("\"abc\" and more", " and more", "abc"),
        ("\"\r\"", "", "\r"),
        ("\"4\u{4141}x\x09\"", "", "4\u{4141}x\x09"),
    ];
    for &(source, remainder, contents) in accepted.iter() {
        assert_eq!((remainder, String::from(contents)), parse_string_wrapper(source).unwrap());
    }

    // Literals containing disallowed characters must be rejected.
    let rejected = ["\"hel\x08lo\"", "\"hel\x1flo\"", "\"hel\u{2069}lo\""];
    for &source in rejected.iter() {
        assert!(parse_string_wrapper(source).is_err());
    }
}