// quick_xml/reader/mod.rs

//! Contains the high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9use crate::errors::{Error, SyntaxError};
10use crate::events::Event;
11use crate::parser::{ElementParser, Parser, PiParser};
12use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// The current parser configuration can be retrieved by calling [`Reader::config()`]
/// and changed by modifying the properties of the object returned by a call to
/// [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid comment
    /// [`Error::IllFormed(DoubleHyphenInComment)`] is returned from read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation will
    /// be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations that
    /// are needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}
177
178impl Config {
179    /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
180    ///
181    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
182    ///
183    /// WARNING: With this option every text events will be trimmed which is
184    /// incorrect behavior when text events delimited by comments, processing
185    /// instructions or CDATA sections. To correctly trim data manually apply
186    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
187    /// only to necessary events.
188    /// </div>
189    ///
190    /// [`trim_text_start`]: Self::trim_text_start
191    /// [`trim_text_end`]: Self::trim_text_end
192    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
193    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
194    #[inline]
195    pub fn trim_text(&mut self, trim: bool) {
196        self.trim_text_start = trim;
197        self.trim_text_end = trim;
198    }
199
200    /// Turn on or off all checks for well-formedness. Currently it is that settings:
201    /// - [`check_comments`](Self::check_comments)
202    /// - [`check_end_names`](Self::check_end_names)
203    #[inline]
204    pub fn enable_all_checks(&mut self, enable: bool) {
205        self.check_comments = enable;
206        self.check_end_names = enable;
207    }
208}
209
210impl Default for Config {
211    fn default() -> Self {
212        Self {
213            allow_unmatched_ends: false,
214            check_comments: false,
215            check_end_names: true,
216            expand_empty_elements: false,
217            trim_markup_names_in_closing_tags: true,
218            trim_text_start: false,
219            trim_text_end: false,
220        }
221    }
222}
223
224////////////////////////////////////////////////////////////////////////////////////////////////////
225
/// Generalization of the event-reading loop shared by the different reader
/// flavours (the optional `$await` token is appended to every read call).
///
/// Parameters:
/// - `$self`: the reader; `$self.state` holds the parse state, the
///   configuration, the offset and (with the `encoding` feature) the encoding
/// - `$buf`: buffer passed to the low-level read methods of `$reader`
/// - `$reader`: expression that yields the underlying source of bytes
/// - `$read_until_close`: name of the method of `$self` that is called with
///   `$buf` while the parser is in the `InsideMarkup` state
/// - `$await`: optional `await` token
///
/// Evaluates to the result of reading one event. After `Event::Eof` and after
/// any error other than `Error::IllFormed` the parser is put into the `Done`
/// state.
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let event = loop {
            break match $self.state.state {
                ParseState::Init => { // Go to InsideText state
                    // If the encoding was set explicitly, we do not need to detect it.
                    // For example, explicit UTF-8 is set automatically if the Reader
                    // was created using `from_str`. But we still need to remove the
                    // BOM for consistency with the path where the `encoding` feature
                    // is disabled
                    #[cfg(feature = "encoding")]
                    if let Some(encoding) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(encoding);
                        }
                    }

                    // Removes UTF-8 BOM if it is present
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                ParseState::InsideText => { // Go to InsideMarkup or Done state
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Markup(buf) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // Pass `buf` to the next iteration of the parsing loop
                            $buf = buf;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(bytes) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(bytes)))
                        }
                        ReadTextResult::UpToEof(bytes) => {
                            $self.state.state = ParseState::Done;
                            // Trim bytes from end if required
                            let event = $self.state.emit_text(bytes);
                            if event.is_empty() {
                                Ok(Event::Eof)
                            } else {
                                Ok(Event::Text(event))
                            }
                        }
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                    }
                },
                // Go to InsideText state in next two arms
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        match event {
            // #513: In case of ill-formed errors we have already consumed the wrong
            // data and changed the state. We can continue parsing if we wish
            Err(Error::IllFormed(_)) => {}
            Err(_) | Ok(Event::Eof) => $self.state.state = ParseState::Done,
            _ => {}
        }
        event
    }};
}
303
/// Read bytes up to the `>` and skip it. This macro is expected to be used
/// after seeing the `<` symbol and skipping it. Inspects the next (current)
/// symbol and returns an appropriate [`Event`]:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// Moves parser to the `InsideText` state.
///
/// If the markup cannot be read, `last_error_offset` of the reader state is
/// set to the position of the `<` that opened it.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::InsideText;

        let start = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok((bang_type, bytes)) => $self.state.emit_bang(bang_type, bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `</` - closing tag
            // #776: We parse using ElementParser which allows us to have attributes
            // in close tags. While such tags are not allowed by the specification,
            // we allow parsing them anyway because:
            // - we do not check constraints during parsing. This is performed by the
            //   optional validate step which the user should call manually
            // - if we just looked for `>` we would parse `</tag attr=">" >` as end tag
            //   `</tag attr=">` and text `" >`, which probably no existing parser
            //   does. This is malformed XML, however it is tolerated by some parsers
            //   (e.g. the one used by Adobe Flash) and such documents do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_end(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => $self.state.emit_question_mark(bytes),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Ok(bytes) => Ok($self.state.emit_start(bytes)),
                Err(e) => {
                    // We want to report error at `<`, but offset was increased,
                    // so return it back (-1 for `<`)
                    $self.state.last_error_offset = start - 1;
                    Err(e)
                }
            },
            // `<` - syntax error, tag not closed
            Ok(None) => {
                // We want to report error at `<`, but offset was increased,
                // so return it back (-1 for `<`)
                $self.state.last_error_offset = start - 1;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            Err(e) => Err(Error::Io(e.into())),
        }
    }};
}
406
/// Generalization of `read_to_end` method for buffered and borrowed readers.
///
/// Reads events with `$self.$read_event($buf)` until the `End` event that
/// closes the element named `$end` is found, counting nested `Start`/`End`
/// pairs with the same name. Evaluates to the range from the position at
/// which the macro was entered up to the position just before that `End`
/// event.
///
/// On a read error, or when `Eof` is reached first, the macro `return`s an
/// `Err` from the enclosing function. In every case the original value of
/// `trim_text_start` is restored before leaving.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that performs clearing of internal buffer after read of each event
        $clear:block
        $(, $await:ident)?
    ) => {{
        // Because we take the position after the event that precedes the End
        // event, it is important that this position indicates the beginning of
        // the End event. If there were only spaces between the last event and
        // the End event, we would take the position before the spaces, but the
        // spaces would be skipped without generating an event if
        // `trim_text_start` is set to `true`. To prevent that we temporarily
        // disable start text trimming.
        //
        // We also cannot take the position after getting the End event, because
        // if `trim_markup_names_in_closing_tags` is set to `true` (which is the
        // default), we do not know the real size that the End event occupies in
        // the source and cannot correct the position after the End event.
        // So, in any case we have to tweak the parser configuration.
        let config = $self.config_mut();
        let trim = config.trim_text_start;
        config.trim_text_start = false;

        let start = $self.buffer_position();
        // Number of currently open nested elements named `$end`
        let mut depth = 0;
        loop {
            $clear
            // Position before the next event; becomes the end of the span if
            // that event is the matching End
            let end = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                Err(e) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(e);
                }

                Ok(Event::Start(e)) if e.name() == $end => depth += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if depth == 0 {
                        $self.config_mut().trim_text_start = trim;
                        break start..end;
                    }
                    depth -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                _ => (),
            }
        }
    }};
}
461
462#[cfg(feature = "async-tokio")]
463mod async_tokio;
464mod buffered_reader;
465mod ns_reader;
466mod slice_reader;
467mod state;
468
469pub use ns_reader::NsReader;
470
/// Range of input, in bytes, that corresponds to some piece of XML.
pub type Span = Range<u64>;
473
474////////////////////////////////////////////////////////////////////////////////////////////////////
475
/// Possible reader states. The state transition diagram (`true` and `false` show
/// the value of the [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init         -- "(no event)"\n                                       --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText   -- "#lt;false#gt;\n(no event)"\nText                    --> InsideMarkup
///   end
///   InsideText     -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty    -- End                   --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. On the first
    /// read the encoding is detected (or, without the `encoding` feature, the
    /// UTF-8 BOM is removed) and the reader moves to the `InsideText` state
    /// without emitting an event. The reader will never return to this state.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// After generating one event the reader moves to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup. All
    /// bytes before that symbol will be returned in the [`Event::Text`] event.
    /// After that the reader moves to the `InsideMarkup` state, or to the `Done`
    /// state if the end of input was reached first.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in the `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when the `Eof` event is generated or an error
    /// other than an ill-formed error has occurred. This is the last state, the
    /// reader stays in it forever.
    Done,
}
520
/// A reference to an encoding together with information about how it was retrieved.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// Encoding was implicitly assumed to have a specified value. It can be refined
    /// using BOM or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// Encoding was explicitly set to the desired value. It cannot be changed
    /// either by BOM or by parsing the XML declaration (`<?xml encoding=... ?>`)
    Explicit(&'static Encoding),
    /// Encoding was detected from a byte order mark (BOM) or by the first bytes
    /// of the content. It can be refined by the XML declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// Encoding was detected using XML declaration event (`<?xml encoding=... ?>`).
    /// It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding regardless of how it was retrieved.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match self {
            Self::Implicit(e) => e,
            Self::Explicit(e) => e,
            Self::BomDetected(e) => e,
            Self::XmlDetected(e) => e,
        }
    }
    /// Returns `true` for the `Implicit` and `BomDetected` variants and
    /// `false` for the `Explicit` and `XmlDetected` variants.
    #[inline]
    const fn can_be_refined(&self) -> bool {
        match self {
            Self::Implicit(_) | Self::BomDetected(_) => true,
            Self::Explicit(_) | Self::XmlDetected(_) => false,
        }
    }
}
567
568////////////////////////////////////////////////////////////////////////////////////////////////////
569
/// A direct stream to the underlying [`Reader`]s reader which updates
/// [`Reader::buffer_position()`] when read from it.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The wrapped source of bytes
    inner: &'r mut R,
    /// Position counter that is advanced by every byte read or consumed
    /// through this stream
    offset: &'r mut u64,
}
578
579impl<'r, R> BinaryStream<'r, R> {
580    /// Returns current position in bytes in the original source.
581    #[inline]
582    pub const fn offset(&self) -> u64 {
583        *self.offset
584    }
585
586    /// Gets a reference to the underlying reader.
587    #[inline]
588    pub const fn get_ref(&self) -> &R {
589        self.inner
590    }
591
592    /// Gets a mutable reference to the underlying reader.
593    ///
594    /// Avoid read from this reader because this will not update reader's position
595    /// and will lead to incorrect positions of errors. Read from this stream instead.
596    #[inline]
597    pub fn get_mut(&mut self) -> &mut R {
598        self.inner
599    }
600}
601
602impl<'r, R> io::Read for BinaryStream<'r, R>
603where
604    R: io::Read,
605{
606    #[inline]
607    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
608        let amt = self.inner.read(buf)?;
609        *self.offset += amt as u64;
610        Ok(amt)
611    }
612}
613
614impl<'r, R> io::BufRead for BinaryStream<'r, R>
615where
616    R: io::BufRead,
617{
618    #[inline]
619    fn fill_buf(&mut self) -> io::Result<&[u8]> {
620        self.inner.fill_buf()
621    }
622
623    #[inline]
624    fn consume(&mut self, amt: usize) {
625        self.inner.consume(amt);
626        *self.offset += amt as u64;
627    }
628}
629
630////////////////////////////////////////////////////////////////////////////////////////////////////
631
632/// A low level encoding-agnostic XML event reader.
633///
634/// Consumes bytes and streams XML [`Event`]s.
635///
636/// This reader does not manage namespace declarations and not able to resolve
637/// prefixes. If you want these features, use the [`NsReader`].
638///
639/// # Examples
640///
641/// ```
642/// use quick_xml::events::Event;
643/// use quick_xml::reader::Reader;
644///
645/// let xml = r#"<tag1 att1 = "test">
646///                 <tag2><!--Test comment-->Test</tag2>
647///                 <tag2>Test 2</tag2>
648///              </tag1>"#;
649/// let mut reader = Reader::from_str(xml);
650/// reader.config_mut().trim_text(true);
651///
652/// let mut count = 0;
653/// let mut txt = Vec::new();
654/// let mut buf = Vec::new();
655///
656/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
657/// loop {
658///     // NOTE: this is the generic case when we don't know about the input BufRead.
659///     // when the input is a &str or a &[u8], we don't actually need to use another
660///     // buffer, we could directly call `reader.read_event()`
661///     match reader.read_event_into(&mut buf) {
662///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
663///         // exits the loop when reaching end of file
664///         Ok(Event::Eof) => break,
665///
666///         Ok(Event::Start(e)) => {
667///             match e.name().as_ref() {
668///                 b"tag1" => println!("attributes values: {:?}",
669///                                     e.attributes().map(|a| a.unwrap().value)
670///                                     .collect::<Vec<_>>()),
671///                 b"tag2" => count += 1,
672///                 _ => (),
673///             }
674///         }
675///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
676///
677///         // There are several other `Event`s we do not consider here
678///         _ => (),
679///     }
680///     // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
681///     buf.clear();
682/// }
683/// ```
684///
685/// [`NsReader`]: crate::reader::NsReader
686#[derive(Clone)]
687pub struct Reader<R> {
688    /// Source of data for parse
689    reader: R,
690    /// Configuration and current parse state
691    state: ReaderState,
692}
693
694/// Builder methods
695impl<R> Reader<R> {
696    /// Creates a `Reader` that reads from a given reader.
697    pub fn from_reader(reader: R) -> Self {
698        Self {
699            reader,
700            state: ReaderState::default(),
701        }
702    }
703
704    /// Returns reference to the parser configuration
705    pub const fn config(&self) -> &Config {
706        &self.state.config
707    }
708
709    /// Returns mutable reference to the parser configuration
710    pub fn config_mut(&mut self) -> &mut Config {
711        &mut self.state.config
712    }
713}
714
715/// Getters
716impl<R> Reader<R> {
717    /// Consumes `Reader` returning the underlying reader
718    ///
719    /// Can be used to compute line and column of a parsing error position
720    ///
721    /// # Examples
722    ///
723    /// ```
724    /// # use pretty_assertions::assert_eq;
725    /// use std::{str, io::Cursor};
726    /// use quick_xml::events::Event;
727    /// use quick_xml::reader::Reader;
728    ///
729    /// let xml = r#"<tag1 att1 = "test">
730    ///                 <tag2><!--Test comment-->Test</tag2>
731    ///                 <tag3>Test 2</tag3>
732    ///              </tag1>"#;
733    /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
734    /// let mut buf = Vec::new();
735    ///
736    /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
737    ///     // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
738    ///     let end_pos = reader.buffer_position() as usize;
739    ///     let mut cursor = reader.into_inner();
740    ///     let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
741    ///         .expect("can't make a string");
742    ///     let mut line = 1;
743    ///     let mut column = 0;
744    ///     for c in s.chars() {
745    ///         if c == '\n' {
746    ///             line += 1;
747    ///             column = 0;
748    ///         } else {
749    ///             column += 1;
750    ///         }
751    ///     }
752    ///     (line, column)
753    /// }
754    ///
755    /// loop {
756    ///     match reader.read_event_into(&mut buf) {
757    ///         Ok(Event::Start(ref e)) => match e.name().as_ref() {
758    ///             b"tag1" | b"tag2" => (),
759    ///             tag => {
760    ///                 assert_eq!(b"tag3", tag);
761    ///                 assert_eq!((3, 22), into_line_and_column(reader));
762    ///                 break;
763    ///             }
764    ///         },
765    ///         Ok(Event::Eof) => unreachable!(),
766    ///         _ => (),
767    ///     }
768    ///     buf.clear();
769    /// }
770    /// ```
771    pub fn into_inner(self) -> R {
772        self.reader
773    }
774
775    /// Gets a reference to the underlying reader.
776    pub const fn get_ref(&self) -> &R {
777        &self.reader
778    }
779
780    /// Gets a mutable reference to the underlying reader.
781    ///
782    /// Avoid read from this reader because this will not update reader's position
783    /// and will lead to incorrect positions of errors. If you want to read, use
784    /// [`stream()`] instead.
785    ///
786    /// [`stream()`]: Self::stream
787    pub fn get_mut(&mut self) -> &mut R {
788        &mut self.reader
789    }
790
791    /// Gets the current byte position in the input data.
792    pub const fn buffer_position(&self) -> u64 {
793        // when internal state is InsideMarkup, we have actually read until '<',
794        // which we don't want to show
795        if let ParseState::InsideMarkup = self.state.state {
796            self.state.offset - 1
797        } else {
798            self.state.offset
799        }
800    }
801
802    /// Gets the last error byte position in the input data. If there is no errors
803    /// yet, returns `0`.
804    ///
805    /// Unlike `buffer_position` it will point to the place where it is rational
806    /// to report error to the end user. For example, all [`SyntaxError`]s are
807    /// reported when the parser sees EOF inside of some kind of markup. The
808    /// `buffer_position()` will point to the last byte of input which is not
809    /// very useful. `error_position()` will point to the start of corresponding
810    /// markup element (i. e. to the `<` character).
811    ///
812    /// This position is always `<= buffer_position()`.
813    pub const fn error_position(&self) -> u64 {
814        self.state.last_error_offset
815    }
816
817    /// Get the decoder, used to decode bytes, read by this reader, to the strings.
818    ///
819    /// If [`encoding`] feature is enabled, the used encoding may change after
820    /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
821    ///
822    /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
823    /// defaults to UTF-8.
824    ///
825    /// [`encoding`]: ../index.html#encoding
826    #[inline]
827    pub const fn decoder(&self) -> Decoder {
828        self.state.decoder()
829    }
830
831    /// Get the direct access to the underlying reader, but tracks the amount of
832    /// read data and update [`Reader::buffer_position()`] accordingly.
833    ///
834    /// Note, that this method gives you access to the internal reader and read
835    /// data will not be returned in any subsequent events read by `read_event`
836    /// family of methods.
837    ///
838    /// # Example
839    ///
840    /// This example demonstrates how to read stream raw bytes from an XML document.
841    /// This could be used to implement streaming read of text, or to read raw binary
842    /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
843    /// valid XML, but XML-derived file formats exist where such documents are valid).
844    ///
845    /// ```
846    /// # use pretty_assertions::assert_eq;
847    /// use std::io::{BufRead, Read};
848    /// use quick_xml::events::{BytesEnd, BytesStart, Event};
849    /// use quick_xml::reader::Reader;
850    ///
851    /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
852    /// //                                 ^    ^               ^     ^
853    /// //                                 0    5              21    27
854    ///
855    /// assert_eq!(
856    ///     (reader.read_event().unwrap(), reader.buffer_position()),
857    ///     // 5 - end of the `<tag>`
858    ///     (Event::Start(BytesStart::new("tag")), 5)
859    /// );
860    ///
861    /// // Reading directly from underlying reader will not update position
862    /// // let mut inner = reader.get_mut();
863    ///
864    /// // Reading from the stream() advances position
865    /// let mut inner = reader.stream();
866    ///
867    /// // Read binary data. We must know its size
868    /// let mut binary = [0u8; 16];
869    /// inner.read_exact(&mut binary).unwrap();
870    /// assert_eq!(&binary, b"binary << data&>");
871    /// // 21 - end of the `binary << data&>`
872    /// assert_eq!(inner.offset(), 21);
873    /// assert_eq!(reader.buffer_position(), 21);
874    ///
875    /// assert_eq!(
876    ///     (reader.read_event().unwrap(), reader.buffer_position()),
877    ///     // 27 - end of the `</tag>`
878    ///     (Event::End(BytesEnd::new("tag")), 27)
879    /// );
880    ///
881    /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
882    /// ```
883    #[inline]
884    pub fn stream(&mut self) -> BinaryStream<R> {
885        BinaryStream {
886            inner: &mut self.reader,
887            offset: &mut self.state.offset,
888        }
889    }
890}
891
/// Private sync reading methods
impl<R> Reader<R> {
    /// Read text into the given buffer, and return an event that borrows from
    /// either that buffer or from the input itself, based on the type of the
    /// reader.
    ///
    /// The body is generated by the `read_event_impl!` macro, which receives
    /// `self`, the buffer, the underlying reader and the name of the
    /// [`read_until_close`](Self::read_until_close) method.
    ///
    /// # Parameters
    /// - `buf`: buffer of type `B` accepted by the [`XmlSource`] implementation
    ///   of `R`; the returned event lives for `'i`
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// The body is generated by the `read_until_close!` macro, which receives
    /// `self`, the buffer and the underlying reader.
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
913
914////////////////////////////////////////////////////////////////////////////////////////////////////
915
/// Result of an attempt to read XML textual data from the reader.
///
/// # Parameters
/// - `'r`: lifetime of the byte slices carried by the `UpTo*` variants
/// - `B`: type of the buffer that is handed back by the `Markup` variant
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte.
    /// Contains the buffer that should be passed back to the next iteration cycle
    /// to satisfy borrow checker requirements.
    Markup(B),
    /// Contains the text block up to the start of markup (`<` character).
    UpToMarkup(&'r [u8]),
    /// Contains the text block up to EOF; the start of markup (`<` character) was not found.
    UpToEof(&'r [u8]),
    /// An IO error occurred.
    Err(io::Error),
}
929
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait. The generic one reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`; instead of copying data it returns
/// data borrowed from `Self`. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes the UTF-8 BOM if it is present.
    ///
    /// Only available when the `encoding` feature is disabled.
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> io::Result<()>;

    /// Determines the encoding from the start of input and removes the BOM if it is present.
    ///
    /// Only available when the `encoding` feature is enabled.
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;

    /// Read input until start of markup (the `<`) is found or end of input is reached.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

    /// Read input until the thing recognized by `parser` (for example, a
    /// processing instruction) is finished.
    ///
    /// This method expects that the start sequence of the thing being parsed
    /// was already read.
    ///
    /// Returns a slice of data read up to the end of the thing being parsed.
    /// The end of the thing and the returned content are determined by the used parser.
    ///
    /// If input (`Self`) is exhausted and no bytes were read, or if the specified
    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
    ///
    /// # Parameters
    /// - `parser`: Parser that decides where the thing being read ends
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// A `P` type parameter is used to preserve state between calls to the underlying
    /// reader which provides bytes fed into the parser.
    ///
    /// [events]: crate::events::Event
    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
    where
        P: Parser;

    /// Read input until a comment, CDATA or DOCTYPE is finished.
    ///
    /// This method expects that `<` was already read.
    ///
    /// Returns the kind of markup that was read together with a slice of the
    /// data read up to its end (`>`); the `>` itself is not included in the result.
    ///
    /// If the markup is not finished before the input (`Self`) is exhausted, an
    /// error is returned. The unit tests in this file expect an `Error::Syntax`
    /// carrying the `Unclosed*` variant that matches the markup kind.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut u64,
    ) -> Result<(BangType, &'r [u8]), Error>;

    /// Consume and discard all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

    /// Return one character without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, return `None`.
    fn peek_one(&mut self) -> io::Result<Option<u8>>;
}
1018
/// Possible elements started with `<!`
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Contains the balance of `<` (+1) and `>` (-1) seen so
    /// far; a `>` encountered while the balance is zero ends the declaration.
    DocType(i32),
}
1029impl BangType {
1030    #[inline(always)]
1031    const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1032        Ok(match byte {
1033            Some(b'[') => Self::CData,
1034            Some(b'-') => Self::Comment,
1035            Some(b'D') | Some(b'd') => Self::DocType(0),
1036            _ => return Err(SyntaxError::InvalidBangMarkup),
1037        })
1038    }
1039
1040    /// If element is finished, returns its content up to `>` symbol and
1041    /// an index of this symbol, otherwise returns `None`
1042    ///
1043    /// # Parameters
1044    /// - `buf`: buffer with data consumed on previous iterations
1045    /// - `chunk`: data read on current iteration and not yet consumed from reader
1046    #[inline(always)]
1047    fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1048        match self {
1049            Self::Comment => {
1050                for i in memchr::memchr_iter(b'>', chunk) {
1051                    // Need to read at least 6 symbols (`!---->`) for properly finished comment
1052                    // <!----> - XML comment
1053                    //  012345 - i
1054                    if buf.len() + i > 4 {
1055                        if chunk[..i].ends_with(b"--") {
1056                            // We cannot strip last `--` from the buffer because we need it in case of
1057                            // check_comments enabled option. XML standard requires that comment
1058                            // will not end with `--->` sequence because this is a special case of
1059                            // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1060                            return Some((&chunk[..i], i + 1)); // +1 for `>`
1061                        }
1062                        // End sequence `-|->` was splitted at |
1063                        //        buf --/   \-- chunk
1064                        if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1065                            return Some((&chunk[..i], i + 1)); // +1 for `>`
1066                        }
1067                        // End sequence `--|>` was splitted at |
1068                        //         buf --/   \-- chunk
1069                        if i == 0 && buf.ends_with(b"--") {
1070                            return Some((&[], i + 1)); // +1 for `>`
1071                        }
1072                    }
1073                }
1074            }
1075            Self::CData => {
1076                for i in memchr::memchr_iter(b'>', chunk) {
1077                    if chunk[..i].ends_with(b"]]") {
1078                        return Some((&chunk[..i], i + 1)); // +1 for `>`
1079                    }
1080                    // End sequence `]|]>` was splitted at |
1081                    //        buf --/   \-- chunk
1082                    if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1083                        return Some((&chunk[..i], i + 1)); // +1 for `>`
1084                    }
1085                    // End sequence `]]|>` was splitted at |
1086                    //         buf --/   \-- chunk
1087                    if i == 0 && buf.ends_with(b"]]") {
1088                        return Some((&[], i + 1)); // +1 for `>`
1089                    }
1090                }
1091            }
1092            Self::DocType(ref mut balance) => {
1093                for i in memchr::memchr2_iter(b'<', b'>', chunk) {
1094                    if chunk[i] == b'<' {
1095                        *balance += 1;
1096                    } else {
1097                        if *balance == 0 {
1098                            return Some((&chunk[..i], i + 1)); // +1 for `>`
1099                        }
1100                        *balance -= 1;
1101                    }
1102                }
1103            }
1104        }
1105        None
1106    }
1107    #[inline]
1108    const fn to_err(&self) -> SyntaxError {
1109        match self {
1110            Self::CData => SyntaxError::UnclosedCData,
1111            Self::Comment => SyntaxError::UnclosedComment,
1112            Self::DocType(_) => SyntaxError::UnclosedDoctype,
1113        }
1114    }
1115}
1116
1117////////////////////////////////////////////////////////////////////////////////////////////////////
1118
1119#[cfg(test)]
1120mod test {
1121    /// Checks the internal implementation of the various reader methods
1122    macro_rules! check {
1123        (
1124            #[$test:meta]
1125            $read_event:ident,
1126            $read_until_close:ident,
1127            // constructor of the XML source on which internal functions will be called
1128            $source:path,
1129            // constructor of the buffer to which read data will stored
1130            $buf:expr
1131            $(, $async:ident, $await:ident)?
1132        ) => {
1133            mod read_bang_element {
1134                use super::*;
1135                use crate::errors::{Error, SyntaxError};
1136                use crate::reader::BangType;
1137                use crate::utils::Bytes;
1138
                /// Checks that reading CDATA content works correctly.
                ///
                /// Every test feeds input that starts at the `!` and uses an initial
                /// `position` of 1, i.e. the state right after the leading `<`.
                mod cdata {
                    use super::*;
                    use pretty_assertions::assert_eq;

                    /// Checks that if input begins like CDATA element, but CDATA start sequence
                    /// is not finished, parsing ends with an error
                    #[$test]
                    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
                    $($async)? fn not_properly_start() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"![]]>other content".as_ref();
                        //                ^= 1

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        // Nothing is expected to be consumed
                        assert_eq!(position, 1);
                    }

                    /// Checks that if CDATA startup sequence was matched, but an end sequence
                    /// is not found, parsing ends with an error
                    #[$test]
                    $($async)? fn not_closed() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"![CDATA[other content".as_ref();
                        //                ^= 1                 ^= 22

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        // The whole input is expected to be consumed
                        assert_eq!(position, 22);
                    }

                    /// Checks that CDATA element without content inside parsed successfully
                    #[$test]
                    $($async)? fn empty() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"![CDATA[]]>other content".as_ref();
                        //                ^= 1       ^= 12

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        // The returned bytes start at `!` and do not include the closing `>`
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::CData, Bytes(b"![CDATA[]]"))
                        );
                        assert_eq!(position, 12);
                    }

                    /// Checks that CDATA element with content parsed successfully.
                    /// Additionally checks that sequences inside CDATA that may look like
                    /// a CDATA end sequence do not interrupt CDATA parsing
                    #[$test]
                    $($async)? fn with_content() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
                        //                ^= 1                        ^= 29

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        // `]] ]>` is not an end sequence, the first real `]]>` is
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))
                        );
                        assert_eq!(position, 29);
                    }
                }
1223
                /// Checks that reading XML comments works correctly. According to the [specification],
                /// comment data can contain any sequence except `--`:
                ///
                /// ```peg
                /// comment = '<!--' (!'--' char)* '-->';
                /// char = [#x1-#x2C]
                ///      / [#x2E-#xD7FF]
                ///      / [#xE000-#xFFFD]
                ///      / [#x10000-#x10FFFF]
                /// ```
                ///
                /// The presence of this limitation, however, is simply a poorly designed specification
                /// (maybe for the purpose of building an LL(1) XML parser) and quick-xml does not check for
                /// the presence of these sequences by default. These tests allow such content.
                ///
                /// Every test feeds input that starts at the `!` and uses an initial
                /// `position` of 1, i.e. the state right after the leading `<`.
                ///
                /// [specification]: https://www.w3.org/TR/xml11/#dt-comment
                mod comment {
                    use super::*;
                    use pretty_assertions::assert_eq;

                    /// Checks that if input begins like a comment, but the comment start
                    /// sequence is not finished, parsing ends with an error
                    #[$test]
                    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
                    $($async)? fn not_properly_start() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!- -->other content".as_ref();
                        //                ^= 1

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        // Nothing is expected to be consumed
                        assert_eq!(position, 1);
                    }

                    /// Checks that `->` is not accepted as the end of a comment: the whole
                    /// input is consumed and an unclosed comment is reported
                    #[$test]
                    $($async)? fn not_properly_end() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!->other content".as_ref();
                        //                ^= 1            ^= 17

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 17);
                    }

                    /// Checks that a comment without any `>` after the start sequence is
                    /// reported as unclosed once the whole input is consumed
                    #[$test]
                    $($async)? fn not_closed1() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!--other content".as_ref();
                        //                ^= 1            ^= 17

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 17);
                    }

                    /// Checks that `!-->` is not a finished comment: the `--` of the start
                    /// sequence must not be reused as the `--` of the end sequence
                    #[$test]
                    $($async)? fn not_closed2() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!-->other content".as_ref();
                        //                ^= 1             ^= 18

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 18);
                    }

                    /// Checks that `!--->` is not a finished comment: it is still one
                    /// symbol shorter than the shortest finished comment, `!---->`
                    #[$test]
                    $($async)? fn not_closed3() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!--->other content".as_ref();
                        //                ^= 1              ^= 19

                        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
                            x => panic!(
                                "Expected `Err(Syntax(_))`, but got `{:?}`",
                                x
                            ),
                        }
                        assert_eq!(position, 19);
                    }

                    /// Checks that a comment without content inside is parsed successfully
                    #[$test]
                    $($async)? fn empty() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!---->other content".as_ref();
                        //                ^= 1  ^= 7

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        // The returned bytes start at `!` and do not include the closing `>`
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::Comment, Bytes(b"!----"))
                        );
                        assert_eq!(position, 7);
                    }

                    /// Checks that a comment with content is parsed successfully.
                    /// Additionally checks that the early `--->` right after `!` does not
                    /// interrupt comment parsing
                    #[$test]
                    $($async)? fn with_content() {
                        let buf = $buf;
                        let mut position = 1;
                        let mut input = b"!--->comment<--->other content".as_ref();
                        //                ^= 1             ^= 18

                        let (ty, bytes) = $source(&mut input)
                            .read_bang_element(buf, &mut position)
                            $(.$await)?
                            .unwrap();
                        assert_eq!(
                            (ty, Bytes(bytes)),
                            (BangType::Comment, Bytes(b"!--->comment<---"))
                        );
                        assert_eq!(position, 18);
                    }
                }
1366
1367                /// Checks that reading DOCTYPE definition works correctly
1368                mod doctype {
1369                    use super::*;
1370
                    /// Checks DOCTYPE markup whose keyword is written in upper case.
                    ///
                    /// Every test feeds input that starts at the `!` and uses an initial
                    /// `position` of 1, i.e. the state right after the leading `<`.
                    mod uppercase {
                        use super::*;
                        use pretty_assertions::assert_eq;

                        /// Checks that input starting with `!D` but without any closing `>`
                        /// is reported as an unclosed DOCTYPE after the whole input is consumed
                        #[$test]
                        $($async)? fn not_properly_start() {
                            let buf = $buf;
                            let mut position = 1;
                            let mut input = b"!D other content".as_ref();
                            //                ^= 1            ^= 17

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 17);
                        }

                        /// Checks that `!DOCTYPE` immediately followed by other text and
                        /// never closed is reported as an unclosed DOCTYPE
                        #[$test]
                        $($async)? fn without_space() {
                            let buf = $buf;
                            let mut position = 1;
                            let mut input = b"!DOCTYPEother content".as_ref();
                            //                ^= 1                 ^= 22

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 22);
                        }

                        /// Checks that a DOCTYPE without content is parsed successfully
                        #[$test]
                        $($async)? fn empty() {
                            let buf = $buf;
                            let mut position = 1;
                            let mut input = b"!DOCTYPE>other content".as_ref();
                            //                ^= 1     ^= 10

                            let (ty, bytes) = $source(&mut input)
                                .read_bang_element(buf, &mut position)
                                $(.$await)?
                                .unwrap();
                            // The returned bytes start at `!` and do not include the closing `>`
                            assert_eq!(
                                (ty, Bytes(bytes)),
                                (BangType::DocType(0), Bytes(b"!DOCTYPE"))
                            );
                            assert_eq!(position, 10);
                        }

                        /// Checks that a DOCTYPE with content but without the closing `>`
                        /// is reported as unclosed after the whole input is consumed
                        #[$test]
                        $($async)? fn not_closed() {
                            let buf = $buf;
                            let mut position = 1;
                            let mut input = b"!DOCTYPE other content".as_ref();
                            //                ^= 1                  ^23

                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                                x => panic!(
                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                                    x
                                ),
                            }
                            assert_eq!(position, 23);
                        }
                    }
1444
1445                    mod lowercase {
1446                        use super::*;
1447                        use pretty_assertions::assert_eq;
1448
1449                        #[$test]
1450                        $($async)? fn not_properly_start() {
1451                            let buf = $buf;
1452                            let mut position = 1;
1453                            let mut input = b"!d other content".as_ref();
1454                            //                ^= 1            ^= 17
1455
1456                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1457                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1458                                x => panic!(
1459                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1460                                    x
1461                                ),
1462                            }
1463                            assert_eq!(position, 17);
1464                        }
1465
1466                        #[$test]
1467                        $($async)? fn without_space() {
1468                            let buf = $buf;
1469                            let mut position = 1;
1470                            let mut input = b"!doctypeother content".as_ref();
1471                            //                ^= 1                 ^= 22
1472
1473                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1474                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1475                                x => panic!(
1476                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1477                                    x
1478                                ),
1479                            }
1480                            assert_eq!(position, 22);
1481                        }
1482
1483                        #[$test]
1484                        $($async)? fn empty() {
1485                            let buf = $buf;
1486                            let mut position = 1;
1487                            let mut input = b"!doctype>other content".as_ref();
1488                            //                ^= 1     ^= 10
1489
1490                            let (ty, bytes) = $source(&mut input)
1491                                .read_bang_element(buf, &mut position)
1492                                $(.$await)?
1493                                .unwrap();
1494                            assert_eq!(
1495                                (ty, Bytes(bytes)),
1496                                (BangType::DocType(0), Bytes(b"!doctype"))
1497                            );
1498                            assert_eq!(position, 10);
1499                        }
1500
1501                        #[$test]
1502                        $($async)? fn not_closed() {
1503                            let buf = $buf;
1504                            let mut position = 1;
1505                            let mut input = b"!doctype other content".as_ref();
1506                            //                ^= 1                  ^= 23
1507
1508                            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
1509                                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
1510                                x => panic!(
1511                                    "Expected `Err(Syntax(_))`, but got `{:?}`",
1512                                    x
1513                                ),
1514                            }
1515                            assert_eq!(position, 23);
1516                        }
1517                    }
1518                }
1519            }
1520
1521            mod read_element {
1522                use super::*;
1523                use crate::errors::{Error, SyntaxError};
1524                use crate::parser::ElementParser;
1525                use crate::utils::Bytes;
1526                use pretty_assertions::assert_eq;
1527
1528                /// Checks that nothing was read from empty buffer
1529                #[$test]
1530                $($async)? fn empty() {
1531                    let buf = $buf;
1532                    let mut position = 1;
1533                    let mut input = b"".as_ref();
1534                    //                ^= 1
1535
1536                    match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
1537                        Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
1538                        x => panic!(
1539                            "Expected `Err(Syntax(_))`, but got `{:?}`",
1540                            x
1541                        ),
1542                    }
1543                    assert_eq!(position, 1);
1544                }
1545
1546                mod open {
1547                    use super::*;
1548                    use pretty_assertions::assert_eq;
1549
1550                    #[$test]
1551                    $($async)? fn empty_tag() {
1552                        let buf = $buf;
1553                        let mut position = 1;
1554                        let mut input = b">".as_ref();
1555                        //                 ^= 2
1556
1557                        assert_eq!(
1558                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1559                            Bytes(b"")
1560                        );
1561                        assert_eq!(position, 2);
1562                    }
1563
1564                    #[$test]
1565                    $($async)? fn normal() {
1566                        let buf = $buf;
1567                        let mut position = 1;
1568                        let mut input = b"tag>".as_ref();
1569                        //                    ^= 5
1570
1571                        assert_eq!(
1572                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1573                            Bytes(b"tag")
1574                        );
1575                        assert_eq!(position, 5);
1576                    }
1577
1578                    #[$test]
1579                    $($async)? fn empty_ns_empty_tag() {
1580                        let buf = $buf;
1581                        let mut position = 1;
1582                        let mut input = b":>".as_ref();
1583                        //                  ^= 3
1584
1585                        assert_eq!(
1586                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1587                            Bytes(b":")
1588                        );
1589                        assert_eq!(position, 3);
1590                    }
1591
1592                    #[$test]
1593                    $($async)? fn empty_ns() {
1594                        let buf = $buf;
1595                        let mut position = 1;
1596                        let mut input = b":tag>".as_ref();
1597                        //                     ^= 6
1598
1599                        assert_eq!(
1600                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1601                            Bytes(b":tag")
1602                        );
1603                        assert_eq!(position, 6);
1604                    }
1605
1606                    #[$test]
1607                    $($async)? fn with_attributes() {
1608                        let buf = $buf;
1609                        let mut position = 1;
1610                        let mut input = br#"tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
1611                        //                                                        ^= 39
1612
1613                        assert_eq!(
1614                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1615                            Bytes(br#"tag  attr-1=">"  attr2  =  '>'  3attr"#)
1616                        );
1617                        assert_eq!(position, 39);
1618                    }
1619                }
1620
1621                mod self_closed {
1622                    use super::*;
1623                    use pretty_assertions::assert_eq;
1624
1625                    #[$test]
1626                    $($async)? fn empty_tag() {
1627                        let buf = $buf;
1628                        let mut position = 1;
1629                        let mut input = b"/>".as_ref();
1630                        //                  ^= 3
1631
1632                        assert_eq!(
1633                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1634                            Bytes(b"/")
1635                        );
1636                        assert_eq!(position, 3);
1637                    }
1638
1639                    #[$test]
1640                    $($async)? fn normal() {
1641                        let buf = $buf;
1642                        let mut position = 1;
1643                        let mut input = b"tag/>".as_ref();
1644                        //                     ^= 6
1645
1646                        assert_eq!(
1647                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1648                            Bytes(b"tag/")
1649                        );
1650                        assert_eq!(position, 6);
1651                    }
1652
1653                    #[$test]
1654                    $($async)? fn empty_ns_empty_tag() {
1655                        let buf = $buf;
1656                        let mut position = 1;
1657                        let mut input = b":/>".as_ref();
1658                        //                   ^= 4
1659
1660                        assert_eq!(
1661                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1662                            Bytes(b":/")
1663                        );
1664                        assert_eq!(position, 4);
1665                    }
1666
1667                    #[$test]
1668                    $($async)? fn empty_ns() {
1669                        let buf = $buf;
1670                        let mut position = 1;
1671                        let mut input = b":tag/>".as_ref();
1672                        //                      ^= 7
1673
1674                        assert_eq!(
1675                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1676                            Bytes(b":tag/")
1677                        );
1678                        assert_eq!(position, 7);
1679                    }
1680
1681                    #[$test]
1682                    $($async)? fn with_attributes() {
1683                        let buf = $buf;
1684                        let mut position = 1;
1685                        let mut input = br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/>"#.as_ref();
1686                        //                                                           ^= 42
1687
1688                        assert_eq!(
1689                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1690                            Bytes(br#"tag  attr-1="/>"  attr2  =  '/>'  3attr/"#)
1691                        );
1692                        assert_eq!(position, 42);
1693                    }
1694                }
1695
1696                mod close {
1697                    use super::*;
1698                    use pretty_assertions::assert_eq;
1699
1700                    #[$test]
1701                    $($async)? fn empty_tag() {
1702                        let buf = $buf;
1703                        let mut position = 1;
1704                        let mut input = b"/ >".as_ref();
1705                        //                   ^= 4
1706
1707                        assert_eq!(
1708                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1709                            Bytes(b"/ ")
1710                        );
1711                        assert_eq!(position, 4);
1712                    }
1713
1714                    #[$test]
1715                    $($async)? fn normal() {
1716                        let buf = $buf;
1717                        let mut position = 1;
1718                        let mut input = b"/tag>".as_ref();
1719                        //                     ^= 6
1720
1721                        assert_eq!(
1722                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1723                            Bytes(b"/tag")
1724                        );
1725                        assert_eq!(position, 6);
1726                    }
1727
1728                    #[$test]
1729                    $($async)? fn empty_ns_empty_tag() {
1730                        let buf = $buf;
1731                        let mut position = 1;
1732                        let mut input = b"/:>".as_ref();
1733                        //                   ^= 4
1734
1735                        assert_eq!(
1736                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1737                            Bytes(b"/:")
1738                        );
1739                        assert_eq!(position, 4);
1740                    }
1741
1742                    #[$test]
1743                    $($async)? fn empty_ns() {
1744                        let buf = $buf;
1745                        let mut position = 1;
1746                        let mut input = b"/:tag>".as_ref();
1747                        //                      ^= 7
1748
1749                        assert_eq!(
1750                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1751                            Bytes(b"/:tag")
1752                        );
1753                        assert_eq!(position, 7);
1754                    }
1755
1756                    #[$test]
1757                    $($async)? fn with_attributes() {
1758                        let buf = $buf;
1759                        let mut position = 1;
1760                        let mut input = br#"/tag  attr-1=">"  attr2  =  '>'  3attr>"#.as_ref();
1761                        //                                                         ^= 40
1762
1763                        assert_eq!(
1764                            Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1765                            Bytes(br#"/tag  attr-1=">"  attr2  =  '>'  3attr"#)
1766                        );
1767                        assert_eq!(position, 40);
1768                    }
1769                }
1770            }
1771
1772            /// Ensures, that no empty `Text` events are generated
1773            mod $read_event {
1774                use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
1775                use crate::reader::Reader;
1776                use pretty_assertions::assert_eq;
1777
1778                /// When `encoding` feature is enabled, encoding should be detected
1779                /// from BOM (UTF-8) and BOM should be stripped.
1780                ///
1781                /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
1782                /// character should be stripped for consistency
1783                #[$test]
1784                $($async)? fn bom_from_reader() {
1785                    let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());
1786
1787                    assert_eq!(
1788                        reader.$read_event($buf) $(.$await)? .unwrap(),
1789                        Event::Text(BytesText::from_escaped("\u{feff}"))
1790                    );
1791
1792                    assert_eq!(
1793                        reader.$read_event($buf) $(.$await)? .unwrap(),
1794                        Event::Eof
1795                    );
1796                }
1797
1798                /// When parsing from &str, encoding is fixed (UTF-8), so
1799                /// - when `encoding` feature is disabled, the behavior the
1800                ///   same as in `bom_from_reader` text
1801                /// - when `encoding` feature is enabled, the behavior should
1802                ///   stay consistent, so the first BOM character is stripped
1803                #[$test]
1804                $($async)? fn bom_from_str() {
1805                    let mut reader = Reader::from_str("\u{feff}\u{feff}");
1806
1807                    assert_eq!(
1808                        reader.$read_event($buf) $(.$await)? .unwrap(),
1809                        Event::Text(BytesText::from_escaped("\u{feff}"))
1810                    );
1811
1812                    assert_eq!(
1813                        reader.$read_event($buf) $(.$await)? .unwrap(),
1814                        Event::Eof
1815                    );
1816                }
1817
1818                #[$test]
1819                $($async)? fn declaration() {
1820                    let mut reader = Reader::from_str("<?xml ?>");
1821
1822                    assert_eq!(
1823                        reader.$read_event($buf) $(.$await)? .unwrap(),
1824                        Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
1825                    );
1826                }
1827
1828                #[$test]
1829                $($async)? fn doctype() {
1830                    let mut reader = Reader::from_str("<!DOCTYPE x>");
1831
1832                    assert_eq!(
1833                        reader.$read_event($buf) $(.$await)? .unwrap(),
1834                        Event::DocType(BytesText::from_escaped("x"))
1835                    );
1836                }
1837
1838                #[$test]
1839                $($async)? fn processing_instruction() {
1840                    let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");
1841
1842                    assert_eq!(
1843                        reader.$read_event($buf) $(.$await)? .unwrap(),
1844                        Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
1845                    );
1846                }
1847
1848                /// Lone closing tags are not allowed, so testing it together with start tag
1849                #[$test]
1850                $($async)? fn start_and_end() {
1851                    let mut reader = Reader::from_str("<tag></tag>");
1852
1853                    assert_eq!(
1854                        reader.$read_event($buf) $(.$await)? .unwrap(),
1855                        Event::Start(BytesStart::new("tag"))
1856                    );
1857
1858                    assert_eq!(
1859                        reader.$read_event($buf) $(.$await)? .unwrap(),
1860                        Event::End(BytesEnd::new("tag"))
1861                    );
1862                }
1863
1864                #[$test]
1865                $($async)? fn empty() {
1866                    let mut reader = Reader::from_str("<tag/>");
1867
1868                    assert_eq!(
1869                        reader.$read_event($buf) $(.$await)? .unwrap(),
1870                        Event::Empty(BytesStart::new("tag"))
1871                    );
1872                }
1873
1874                #[$test]
1875                $($async)? fn text() {
1876                    let mut reader = Reader::from_str("text");
1877
1878                    assert_eq!(
1879                        reader.$read_event($buf) $(.$await)? .unwrap(),
1880                        Event::Text(BytesText::from_escaped("text"))
1881                    );
1882                }
1883
1884                #[$test]
1885                $($async)? fn cdata() {
1886                    let mut reader = Reader::from_str("<![CDATA[]]>");
1887
1888                    assert_eq!(
1889                        reader.$read_event($buf) $(.$await)? .unwrap(),
1890                        Event::CData(BytesCData::new(""))
1891                    );
1892                }
1893
1894                #[$test]
1895                $($async)? fn comment() {
1896                    let mut reader = Reader::from_str("<!---->");
1897
1898                    assert_eq!(
1899                        reader.$read_event($buf) $(.$await)? .unwrap(),
1900                        Event::Comment(BytesText::from_escaped(""))
1901                    );
1902                }
1903
1904                #[$test]
1905                $($async)? fn eof() {
1906                    let mut reader = Reader::from_str("");
1907
1908                    assert_eq!(
1909                        reader.$read_event($buf) $(.$await)? .unwrap(),
1910                        Event::Eof
1911                    );
1912                }
1913            }
1914        };
1915    }
1916
1917    // Export macros for the child modules:
1918    // - buffered_reader
1919    // - slice_reader
1920    pub(super) use check;
1921}