gix_config/parse/nom/
mod.rs

1use std::borrow::Cow;
2
3use bstr::{BStr, ByteSlice};
4use winnow::{
5    combinator::{alt, delimited, opt, preceded, repeat},
6    error::{ErrorKind, InputError as NomError, ParserError as _},
7    prelude::*,
8    stream::{Offset as _, Stream as _},
9    token::{one_of, take_till, take_while},
10};
11
12use crate::parse::{error::ParseNode, section, Comment, Error, Event};
13
14/// Attempt to zero-copy parse the provided bytes, passing results to `dispatch`.
15pub fn from_bytes<'i>(mut input: &'i [u8], dispatch: &mut dyn FnMut(Event<'i>)) -> Result<(), Error> {
16    let start = input.checkpoint();
17
18    let bom = unicode_bom::Bom::from(input);
19    input.next_slice(bom.len());
20
21    repeat(
22        0..,
23        alt((
24            comment.map(Event::Comment),
25            take_spaces1.map(|whitespace| Event::Whitespace(Cow::Borrowed(whitespace))),
26            |i: &mut &'i [u8]| {
27                let newline = take_newlines1.parse_next(i)?;
28                let o = Event::Newline(Cow::Borrowed(newline));
29                Ok(o)
30            },
31        )),
32    )
33    .fold(|| (), |_acc, event| dispatch(event))
34    .parse_next(&mut input)
35    // I don't think this can panic. many0 errors if the child parser returns
36    // a success where the input was not consumed, but alt will only return Ok
37    // if one of its children succeed. However, all of it's children are
38    // guaranteed to consume something if they succeed, so the Ok(i) == i case
39    // can never occur.
40    .expect("many0(alt(...)) panicked. Likely a bug in one of the children parsers.");
41
42    if input.is_empty() {
43        return Ok(());
44    }
45
46    let mut node = ParseNode::SectionHeader;
47
48    let res = repeat(1.., |i: &mut &'i [u8]| section(i, &mut node, dispatch))
49        .map(|()| ())
50        .parse_next(&mut input);
51    res.map_err(|_| {
52        let newlines = newlines_from(input, start);
53        Error {
54            line_number: newlines,
55            last_attempted_parser: node,
56            parsed_until: input.as_bstr().into(),
57        }
58    })?;
59
60    // This needs to happen after we collect sections, otherwise the line number
61    // will be off.
62    if !input.is_empty() {
63        let newlines = newlines_from(input, start);
64        return Err(Error {
65            line_number: newlines,
66            last_attempted_parser: node,
67            parsed_until: input.as_bstr().into(),
68        });
69    }
70
71    Ok(())
72}
73
74fn newlines_from(input: &[u8], start: winnow::stream::Checkpoint<&[u8], &[u8]>) -> usize {
75    let offset = input.offset_from(&start);
76    let mut start_input = input;
77    start_input.reset(&start);
78    start_input.next_slice(offset).iter().filter(|c| **c == b'\n').count()
79}
80
81fn comment<'i>(i: &mut &'i [u8]) -> PResult<Comment<'i>, NomError<&'i [u8]>> {
82    (
83        one_of([';', '#']),
84        take_till(0.., |c| c == b'\n').map(|text: &[u8]| Cow::Borrowed(text.as_bstr())),
85    )
86        .map(|(tag, text)| Comment { tag, text })
87        .parse_next(i)
88}
89
90#[cfg(test)]
91mod tests;
92
93fn section<'i>(
94    i: &mut &'i [u8],
95    node: &mut ParseNode,
96    dispatch: &mut dyn FnMut(Event<'i>),
97) -> PResult<(), NomError<&'i [u8]>> {
98    let start = i.checkpoint();
99    let header = section_header(i).map_err(|e| {
100        i.reset(&start);
101        e
102    })?;
103    dispatch(Event::SectionHeader(header));
104
105    // This would usually be a many0(alt(...)), the manual loop allows us to
106    // optimize vec insertions
107    loop {
108        let start = i.checkpoint();
109
110        if let Some(v) = opt(take_spaces1).parse_next(i)? {
111            dispatch(Event::Whitespace(Cow::Borrowed(v.as_bstr())));
112        }
113
114        if let Some(v) = opt(take_newlines1).parse_next(i)? {
115            dispatch(Event::Newline(Cow::Borrowed(v.as_bstr())));
116        }
117
118        key_value_pair(i, node, dispatch)?;
119
120        if let Some(comment) = opt(comment).parse_next(i)? {
121            dispatch(Event::Comment(comment));
122        }
123
124        if i.offset_from(&start) == 0 {
125            break;
126        }
127    }
128
129    Ok(())
130}
131
132fn section_header<'i>(i: &mut &'i [u8]) -> PResult<section::Header<'i>, NomError<&'i [u8]>> {
133    // No spaces must be between section name and section start
134    let name = preceded('[', take_while(1.., is_section_char).map(bstr::ByteSlice::as_bstr)).parse_next(i)?;
135
136    if opt(one_of::<_, _, NomError<&[u8]>>(']')).parse_next(i)?.is_some() {
137        // Either section does not have a subsection or using deprecated
138        // subsection syntax at this point.
139        let header = match memchr::memrchr(b'.', name.as_bytes()) {
140            Some(index) => section::Header {
141                name: section::Name(Cow::Borrowed(name[..index].as_bstr())),
142                separator: name.get(index..=index).map(|s| Cow::Borrowed(s.as_bstr())),
143                subsection_name: name.get(index + 1..).map(|s| Cow::Borrowed(s.as_bstr())),
144            },
145            None => section::Header {
146                name: section::Name(Cow::Borrowed(name.as_bstr())),
147                separator: None,
148                subsection_name: None,
149            },
150        };
151
152        if header.name.is_empty() {
153            return Err(winnow::error::ErrMode::from_error_kind(i, ErrorKind::Fail));
154        }
155        return Ok(header);
156    }
157
158    // Section header must be using modern subsection syntax at this point.
159    (take_spaces1, delimited('"', opt(sub_section), "\"]"))
160        .map(|(whitespace, subsection_name)| section::Header {
161            name: section::Name(Cow::Borrowed(name)),
162            separator: Some(Cow::Borrowed(whitespace)),
163            subsection_name,
164        })
165        .parse_next(i)
166}
167
/// Return `true` if `c` may be part of a section name as it appears between `[` and `]`:
/// an ASCII letter or digit, a dash, or a dot.
fn is_section_char(c: u8) -> bool {
    matches!(c, b'-' | b'.') || c.is_ascii_alphanumeric()
}
171
172fn sub_section<'i>(i: &mut &'i [u8]) -> PResult<Cow<'i, BStr>, NomError<&'i [u8]>> {
173    let mut output = Cow::Borrowed(Default::default());
174    if let Some(sub) = opt(subsection_subset).parse_next(i)? {
175        output = Cow::Borrowed(sub.as_bstr());
176    }
177    while let Some(sub) = opt(subsection_subset).parse_next(i)? {
178        output.to_mut().extend(sub);
179    }
180
181    Ok(output)
182}
183
184fn subsection_subset<'i>(i: &mut &'i [u8]) -> PResult<&'i [u8], NomError<&'i [u8]>> {
185    alt((subsection_unescaped, subsection_escaped_char)).parse_next(i)
186}
187
188fn subsection_unescaped<'i>(i: &mut &'i [u8]) -> PResult<&'i [u8], NomError<&'i [u8]>> {
189    take_while(1.., is_subsection_unescaped_char).parse_next(i)
190}
191
192fn subsection_escaped_char<'i>(i: &mut &'i [u8]) -> PResult<&'i [u8], NomError<&'i [u8]>> {
193    preceded('\\', one_of(is_subsection_escapable_char).take()).parse_next(i)
194}
195
/// Return `true` if `c` may follow a backslash inside a subsection name, which is the case
/// for every byte but a newline.
fn is_subsection_escapable_char(c: u8) -> bool {
    !matches!(c, b'\n')
}
199
/// Return `true` if `c` may appear verbatim inside a subsection name. A double quote, a
/// backslash, a newline and the NUL byte may not.
fn is_subsection_unescaped_char(c: u8) -> bool {
    !matches!(c, b'"' | b'\\' | b'\n' | 0)
}
203
204fn key_value_pair<'i>(
205    i: &mut &'i [u8],
206    node: &mut ParseNode,
207    dispatch: &mut dyn FnMut(Event<'i>),
208) -> PResult<(), NomError<&'i [u8]>> {
209    *node = ParseNode::Name;
210    if let Some(name) = opt(config_name).parse_next(i)? {
211        dispatch(Event::SectionValueName(section::ValueName(Cow::Borrowed(name))));
212
213        if let Some(whitespace) = opt(take_spaces1).parse_next(i)? {
214            dispatch(Event::Whitespace(Cow::Borrowed(whitespace)));
215        }
216
217        *node = ParseNode::Value;
218        config_value(i, dispatch)
219    } else {
220        Ok(())
221    }
222}
223
224/// Parses the config name of a config pair. Assumes the input has already been
225/// trimmed of any leading whitespace.
226fn config_name<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> {
227    (
228        one_of(|c: u8| c.is_ascii_alphabetic()),
229        take_while(0.., |c: u8| c.is_ascii_alphanumeric() || c == b'-'),
230    )
231        .take()
232        .map(bstr::ByteSlice::as_bstr)
233        .parse_next(i)
234}
235
236fn config_value<'i>(i: &mut &'i [u8], dispatch: &mut dyn FnMut(Event<'i>)) -> PResult<(), NomError<&'i [u8]>> {
237    if opt('=').parse_next(i)?.is_some() {
238        dispatch(Event::KeyValueSeparator);
239        if let Some(whitespace) = opt(take_spaces1).parse_next(i)? {
240            dispatch(Event::Whitespace(Cow::Borrowed(whitespace)));
241        }
242        value_impl(i, dispatch)
243    } else {
244        // This is a special way of denoting 'empty' values which a lot of code depends on.
245        // Hence, rather to fix this everywhere else, leave it here and fix it where it matters, namely
246        // when it's about differentiating between a missing key-value separator, and one followed by emptiness.
247        dispatch(Event::Value(Cow::Borrowed("".into())));
248        Ok(())
249    }
250}
251
252/// Handles parsing of known-to-be values. This function handles both single
253/// line values as well as values that are continuations.
254fn value_impl<'i>(i: &mut &'i [u8], dispatch: &mut dyn FnMut(Event<'i>)) -> PResult<(), NomError<&'i [u8]>> {
255    let start_checkpoint = i.checkpoint();
256    let mut value_start_checkpoint = i.checkpoint();
257    let mut value_end = None;
258
259    // This is required to ignore comment markers if they're in a quote.
260    let mut is_in_quotes = false;
261    // Used to determine if we return a Value or Value{Not,}Done
262    let mut partial_value_found = false;
263
264    loop {
265        let _ = take_while(0.., |c| !matches!(c, b'\n' | b'\\' | b'"' | b';' | b'#')).parse_next(i)?;
266        if let Some(c) = i.next_token() {
267            match c {
268                b'\n' => {
269                    value_end = Some(i.offset_from(&value_start_checkpoint) - 1);
270                    break;
271                }
272                b';' | b'#' if !is_in_quotes => {
273                    value_end = Some(i.offset_from(&value_start_checkpoint) - 1);
274                    break;
275                }
276                b'\\' => {
277                    let escaped_index = i.offset_from(&value_start_checkpoint);
278                    let escape_index = escaped_index - 1;
279                    let Some(mut c) = i.next_token() else {
280                        i.reset(&start_checkpoint);
281                        return Err(winnow::error::ErrMode::from_error_kind(i, ErrorKind::Token));
282                    };
283                    let mut consumed = 1;
284                    if c == b'\r' {
285                        c = i.next_token().ok_or_else(|| {
286                            i.reset(&start_checkpoint);
287                            winnow::error::ErrMode::from_error_kind(i, ErrorKind::Token)
288                        })?;
289                        if c != b'\n' {
290                            i.reset(&start_checkpoint);
291                            return Err(winnow::error::ErrMode::from_error_kind(i, ErrorKind::Slice));
292                        }
293                        consumed += 1;
294                    }
295
296                    match c {
297                        b'\n' => {
298                            partial_value_found = true;
299
300                            i.reset(&value_start_checkpoint);
301
302                            let value = i.next_slice(escape_index).as_bstr();
303                            dispatch(Event::ValueNotDone(Cow::Borrowed(value)));
304
305                            i.next_token();
306
307                            let nl = i.next_slice(consumed).as_bstr();
308                            dispatch(Event::Newline(Cow::Borrowed(nl)));
309
310                            value_start_checkpoint = i.checkpoint();
311                            value_end = None;
312                        }
313                        b'n' | b't' | b'\\' | b'b' | b'"' => {}
314                        _ => {
315                            i.reset(&start_checkpoint);
316                            return Err(winnow::error::ErrMode::from_error_kind(i, ErrorKind::Token));
317                        }
318                    }
319                }
320                b'"' => is_in_quotes = !is_in_quotes,
321                _ => {}
322            }
323        } else {
324            break;
325        }
326    }
327    if is_in_quotes {
328        i.reset(&start_checkpoint);
329        return Err(winnow::error::ErrMode::from_error_kind(i, ErrorKind::Slice));
330    }
331
332    let value_end = match value_end {
333        None => {
334            let last_value_index = i.offset_from(&value_start_checkpoint);
335            if last_value_index == 0 {
336                dispatch(Event::Value(Cow::Borrowed("".into())));
337                return Ok(());
338            } else {
339                last_value_index
340            }
341        }
342        Some(idx) => idx,
343    };
344
345    i.reset(&value_start_checkpoint);
346    let value_end_no_trailing_whitespace = i[..value_end]
347        .iter()
348        .enumerate()
349        .rev()
350        .find_map(|(idx, b)| (!b.is_ascii_whitespace()).then_some(idx + 1))
351        .unwrap_or(0);
352    let remainder_value = i.next_slice(value_end_no_trailing_whitespace);
353
354    if partial_value_found {
355        dispatch(Event::ValueDone(Cow::Borrowed(remainder_value.as_bstr())));
356    } else {
357        dispatch(Event::Value(Cow::Borrowed(remainder_value.as_bstr())));
358    }
359
360    Ok(())
361}
362
363fn take_spaces1<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> {
364    take_while(1.., winnow::stream::AsChar::is_space)
365        .map(bstr::ByteSlice::as_bstr)
366        .parse_next(i)
367}
368
369fn take_newlines1<'i>(i: &mut &'i [u8]) -> PResult<&'i BStr, NomError<&'i [u8]>> {
370    repeat(1..1024, alt(("\r\n", "\n")))
371        .map(|()| ())
372        .take()
373        .map(bstr::ByteSlice::as_bstr)
374        .parse_next(i)
375}