quick_xml/parser/
element.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
//! Contains a parser for an XML element.

use crate::errors::SyntaxError;
use crate::parser::Parser;

/// A parser that searches for a `>` symbol in the slice outside of quoted regions.
///
/// The parser considers two kinds of quoted regions: double-quoted (`"..."`) and
/// single-quoted (`'...'`). Matches found inside those regions are not
/// considered as results. Each region starts and ends with its quote symbol,
/// which cannot be escaped. (A quote can be encoded as an XML character entity
/// or a named entity, but such an encoding does not contain literal quotes.)
///
/// To use the parser, create an instance and [`feed`] data into it.
/// After a successful search the parser returns [`Some`] with the position of
/// the found symbol. If the search is unsuccessful, [`None`] is returned. You
/// would typically expect the search to succeed eventually, so you should keep
/// feeding new data until you get a result.
///
/// NOTE: after a successful match the parser does not return to the initial
/// state and should not be used anymore. Create a new parser if you want to
/// perform a new search.
///
/// # Example
///
/// ```
/// # use pretty_assertions::assert_eq;
/// use quick_xml::parser::{ElementParser, Parser};
///
/// let mut parser = ElementParser::default();
///
/// // Parse `<my-element with = 'some > inside'>and the text follow...`
/// // split into three chunks
/// assert_eq!(parser.feed(b"<my-element"), None);
/// // ...get new chunk of data
/// assert_eq!(parser.feed(b" with = 'some >"), None);
/// // ...get another chunk of data
/// assert_eq!(parser.feed(b" inside'>and the text follow..."), Some(8));
/// //                       ^       ^
/// //                       0       8
/// ```
///
/// [`feed`]: Self::feed()
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ElementParser {
    /// The initial state (inside an element, but outside of an attribute value).
    Outside,
    /// Inside a single-quoted region (`'...'`).
    SingleQ,
    /// Inside a double-quoted region (`"..."`).
    DoubleQ,
}

impl Parser for ElementParser {
    /// Scans `bytes` for the first `>` that lies outside of a quoted region.
    ///
    /// Returns `Some(index)` with the position of that `>` within `bytes`, or
    /// `None` if this chunk does not contain one. In the latter case the quote
    /// state reached at the end of `bytes` stays stored in `self`, so the
    /// search can be continued by feeding the next chunk.
    #[inline]
    fn feed(&mut self, bytes: &[u8]) -> Option<usize> {
        // Only `>`, `'` and `"` can influence the result, so hop directly from
        // one of those bytes to the next.
        for pos in memchr::memchr3_iter(b'>', b'\'', b'"', bytes) {
            let byte = bytes[pos];
            match *self {
                // `>` terminates the search only while no quote is open;
                // a quote character opens the corresponding region.
                Self::Outside => match byte {
                    b'>' => return Some(pos),
                    b'\'' => *self = Self::SingleQ,
                    b'"' => *self = Self::DoubleQ,
                    _ => {}
                },
                // A quoted region is closed only by the same quote character
                // that opened it; every other byte leaves the state untouched.
                Self::SingleQ => {
                    if byte == b'\'' {
                        *self = Self::Outside;
                    }
                }
                Self::DoubleQ => {
                    if byte == b'"' {
                        *self = Self::Outside;
                    }
                }
            }
        }
        None
    }

    /// Returns [`SyntaxError::UnclosedTag`], the error this parser supplies
    /// for the end-of-input case.
    #[inline]
    fn eof_error() -> SyntaxError {
        SyntaxError::UnclosedTag
    }
}

impl Default for ElementParser {
    /// Creates a parser in the [`Outside`](Self::Outside) state, i.e. with no
    /// quoted region currently open.
    #[inline]
    fn default() -> Self {
        Self::Outside
    }
}

#[test]
fn parse() {
    use pretty_assertions::assert_eq;
    use ElementParser::*;

    /// Feeds `bytes` to `parser` once.
    ///
    /// Returns `Ok(pos)` with the position in the buffer where the element ends,
    /// or `Err(state)` with the parser state reached if no end was found.
    fn parse_element(bytes: &[u8], mut parser: ElementParser) -> Result<usize, ElementParser> {
        // `feed` runs before `parser` is copied into `ok_or`, so the error
        // value carries the state *after* `bytes` has been consumed.
        parser.feed(bytes).ok_or(parser)
    }

    /// Checks one input against all three initial states. `expected` lists the
    /// outcomes for a parser starting in `Outside`, `SingleQ` and `DoubleQ`,
    /// in that order.
    fn check(bytes: &[u8], expected: [Result<usize, ElementParser>; 3]) {
        let initial = [Outside, SingleQ, DoubleQ];
        for (start, want) in initial.iter().zip(expected.iter()) {
            assert_eq!(
                parse_element(bytes, *start),
                *want,
                "input {:?}, initial state {:?}",
                bytes,
                start
            );
        }
    }

    //    input      from Outside  from SingleQ  from DoubleQ
    check(b"", [Err(Outside), Err(SingleQ), Err(DoubleQ)]);
    check(b"'", [Err(SingleQ), Err(Outside), Err(DoubleQ)]);
    check(b"\"", [Err(DoubleQ), Err(SingleQ), Err(Outside)]);
    check(b">", [Ok(0), Err(SingleQ), Err(DoubleQ)]);
    check(b"''>", [Ok(2), Err(SingleQ), Err(DoubleQ)]);
}