quick_xml/reader/mod.rs
1//! Contains high-level interface for a pull-based XML parser.
2
3#[cfg(feature = "encoding")]
4use encoding_rs::Encoding;
5use std::io;
6use std::ops::Range;
7
8use crate::encoding::Decoder;
9use crate::errors::{Error, SyntaxError};
10use crate::events::Event;
11use crate::parser::{ElementParser, Parser, PiParser};
12use crate::reader::state::ReaderState;
13
/// A struct that holds a parser configuration.
///
/// The current parser configuration can be retrieved by calling
/// [`Reader::config()`] and changed by modifying properties of the object
/// returned by a call to [`Reader::config_mut()`].
///
/// [`Reader::config()`]: crate::reader::Reader::config
/// [`Reader::config_mut()`]: crate::reader::Reader::config_mut
#[derive(Debug, Clone, PartialEq, Eq)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
#[non_exhaustive]
pub struct Config {
    /// Whether unmatched closing tag names should be allowed. Unless enabled,
    /// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
    /// is returned from read methods.
    ///
    /// When set to `true`, it won't check if a closing tag has a corresponding
    /// opening tag at all. For example, `<a></a></b>` will be permitted.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is enabled,
    /// i.e. it will contain the data of the unmatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations
    /// needed to store the tag name for an [`End`] event.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(UnmatchedEndTag)`]: crate::errors::IllFormedError::UnmatchedEndTag
    /// [`End`]: crate::events::Event::End
    pub allow_unmatched_ends: bool,

    /// Whether comments should be validated. If enabled, in case of an invalid
    /// comment [`Error::IllFormed(DoubleHyphenInComment)`] is returned from
    /// read methods.
    ///
    /// When set to `true`, every [`Comment`] event will be checked for not
    /// containing `--`, which [is not allowed] in XML comments. Most of the time
    /// we don't want comments at all so we don't really care about comment
    /// correctness, thus the default value is `false` to improve performance.
    ///
    /// Default: `false`
    ///
    /// [`Error::IllFormed(DoubleHyphenInComment)`]: crate::errors::IllFormedError::DoubleHyphenInComment
    /// [`Comment`]: crate::events::Event::Comment
    /// [is not allowed]: https://www.w3.org/TR/xml11/#sec-comments
    pub check_comments: bool,

    /// Whether mismatched closing tag names should be detected. If enabled, in
    /// case of mismatch the [`Error::IllFormed(MismatchedEndTag)`] is returned from
    /// read methods.
    ///
    /// Note that start and end tags [should match literally][spec], they cannot
    /// have different prefixes even if both prefixes resolve to the same namespace.
    /// The XML
    ///
    /// ```xml
    /// <outer xmlns="namespace" xmlns:p="namespace">
    /// </p:outer>
    /// ```
    ///
    /// is not valid, even though semantically the start tag is the same as the
    /// end tag. The reason is that namespaces are an extension of the original
    /// XML specification (without namespaces) and it should be backward-compatible.
    ///
    /// When set to `false`, it won't check if a closing tag matches the corresponding
    /// opening tag. For example, `<mytag></different_tag>` will be permitted.
    ///
    /// If the XML is known to be sane (already processed, etc.) this saves extra time.
    ///
    /// Note that the emitted [`End`] event will not be modified if this is disabled,
    /// i.e. it will contain the data of the mismatched end tag.
    ///
    /// Note that setting this to `true` will lead to additional allocations
    /// needed to store the tag name for an [`End`] event. However, if
    /// [`expand_empty_elements`] is also set, only one additional allocation
    /// will be performed that supports both these options.
    ///
    /// Default: `true`
    ///
    /// [`Error::IllFormed(MismatchedEndTag)`]: crate::errors::IllFormedError::MismatchedEndTag
    /// [spec]: https://www.w3.org/TR/xml11/#dt-etag
    /// [`End`]: crate::events::Event::End
    /// [`expand_empty_elements`]: Self::expand_empty_elements
    pub check_end_names: bool,

    /// Whether empty elements should be split into a [`Start`] and an [`End`] event.
    ///
    /// When set to `true`, all [`Empty`] events produced by a self-closing tag
    /// like `<tag/>` are expanded into a [`Start`] event followed by an [`End`]
    /// event. When set to `false` (the default), those tags are represented by
    /// an [`Empty`] event instead.
    ///
    /// Note that setting this to `true` will lead to additional allocations
    /// needed to store the tag name for an [`End`] event. However, if
    /// [`check_end_names`] is also set, only one additional allocation will be
    /// performed that supports both these options.
    ///
    /// Default: `false`
    ///
    /// [`Empty`]: crate::events::Event::Empty
    /// [`Start`]: crate::events::Event::Start
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub expand_empty_elements: bool,

    /// Whether trailing whitespace after the markup name is trimmed in closing
    /// tags `</a >`.
    ///
    /// If `true` the emitted [`End`] event is stripped of trailing whitespace
    /// after the markup name.
    ///
    /// Note that if set to `false` and [`check_end_names`] is `true` the comparison
    /// of markup names is going to fail erroneously if a closing tag contains
    /// trailing whitespace.
    ///
    /// Default: `true`
    ///
    /// [`End`]: crate::events::Event::End
    /// [`check_end_names`]: Self::check_end_names
    pub trim_markup_names_in_closing_tags: bool,

    /// Whether whitespace before character data should be removed.
    ///
    /// When set to `true`, leading whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_start: bool,

    /// Whether whitespace after character data should be removed.
    ///
    /// When set to `true`, trailing whitespace is trimmed in [`Text`] events.
    /// If after that the event is empty it will not be pushed.
    ///
    /// Default: `false`
    ///
    /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
    ///
    /// WARNING: With this option every text event will be trimmed, which is
    /// incorrect behavior when text events are delimited by comments, processing
    /// instructions or CDATA sections. To correctly trim data manually apply
    /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
    /// only to necessary events.
    /// </div>
    ///
    /// [`Text`]: crate::events::Event::Text
    /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
    /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
    pub trim_text_end: bool,
}
177
178impl Config {
179 /// Set both [`trim_text_start`] and [`trim_text_end`] to the same value.
180 ///
181 /// <div style="background:rgba(80, 240, 100, 0.20);padding:0.75em;">
182 ///
183 /// WARNING: With this option every text events will be trimmed which is
184 /// incorrect behavior when text events delimited by comments, processing
185 /// instructions or CDATA sections. To correctly trim data manually apply
186 /// [`BytesText::inplace_trim_start`] and [`BytesText::inplace_trim_end`]
187 /// only to necessary events.
188 /// </div>
189 ///
190 /// [`trim_text_start`]: Self::trim_text_start
191 /// [`trim_text_end`]: Self::trim_text_end
192 /// [`BytesText::inplace_trim_start`]: crate::events::BytesText::inplace_trim_start
193 /// [`BytesText::inplace_trim_end`]: crate::events::BytesText::inplace_trim_end
194 #[inline]
195 pub fn trim_text(&mut self, trim: bool) {
196 self.trim_text_start = trim;
197 self.trim_text_end = trim;
198 }
199
200 /// Turn on or off all checks for well-formedness. Currently it is that settings:
201 /// - [`check_comments`](Self::check_comments)
202 /// - [`check_end_names`](Self::check_end_names)
203 #[inline]
204 pub fn enable_all_checks(&mut self, enable: bool) {
205 self.check_comments = enable;
206 self.check_end_names = enable;
207 }
208}
209
210impl Default for Config {
211 fn default() -> Self {
212 Self {
213 allow_unmatched_ends: false,
214 check_comments: false,
215 check_end_names: true,
216 expand_empty_elements: false,
217 trim_markup_names_in_closing_tags: true,
218 trim_text_start: false,
219 trim_text_end: false,
220 }
221 }
222}
223
224////////////////////////////////////////////////////////////////////////////////////////////////////
225
/// Shared body of the `read_event` family: drives the [`ParseState`] machine
/// until one event (or error) is produced and evaluates to that `Result`.
///
/// - `$self`: the reader whose `state` is advanced
/// - `$buf`: buffer binding handed to the source; it is re-assigned when the
///   source gives the buffer back without producing text
/// - `$reader`: expression evaluating to the underlying source
/// - `$read_until_close`: name of the method that parses one markup element
/// - `$await`: optional `await` token for the asynchronous flavour
macro_rules! read_event_impl {
    (
        $self:ident, $buf:ident,
        $reader:expr,
        $read_until_close:ident
        $(, $await:ident)?
    ) => {{
        let outcome = loop {
            break match $self.state.state {
                // Emits nothing: handles the BOM / encoding and moves to `InsideText`
                ParseState::Init => {
                    // An explicitly set encoding (for example UTF-8 for readers
                    // created by `from_str`) is never overridden, but detection
                    // is still run so that the BOM is consumed in the same way
                    // as in the build without the `encoding` feature
                    #[cfg(feature = "encoding")]
                    if let Some(detected) = $reader.detect_encoding() $(.$await)? ? {
                        if $self.state.encoding.can_be_refined() {
                            $self.state.encoding = crate::reader::EncodingRef::BomDetected(detected);
                        }
                    }

                    // Without encoding support only the UTF-8 BOM is stripped
                    #[cfg(not(feature = "encoding"))]
                    $reader.remove_utf8_bom() $(.$await)? ?;

                    $self.state.state = ParseState::InsideText;
                    continue;
                },
                // Moves to `InsideMarkup` (a `<` was found) or to `Done` (input ended)
                ParseState::InsideText => {
                    if $self.state.config.trim_text_start {
                        $reader.skip_whitespace(&mut $self.state.offset) $(.$await)? ?;
                    }

                    match $reader.read_text($buf, &mut $self.state.offset) $(.$await)? {
                        ReadTextResult::Err(e) => Err(Error::Io(e.into())),
                        ReadTextResult::Markup(returned) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // No text before `<`: take the buffer back and let the
                            // next loop iteration parse the markup
                            $buf = returned;
                            continue;
                        }
                        ReadTextResult::UpToMarkup(raw) => {
                            $self.state.state = ParseState::InsideMarkup;
                            // FIXME: Can produce an empty event if:
                            // - event contains only spaces
                            // - trim_text_start = false
                            // - trim_text_end = true
                            Ok(Event::Text($self.state.emit_text(raw)))
                        }
                        ReadTextResult::UpToEof(raw) => {
                            $self.state.state = ParseState::Done;
                            // `emit_text` trims the end if that is configured;
                            // nothing left afterwards means plain end of input
                            let text = $self.state.emit_text(raw);
                            Ok(if text.is_empty() {
                                Event::Eof
                            } else {
                                Event::Text(text)
                            })
                        }
                    }
                },
                // Both arms below are expected to leave the reader in `InsideText`
                ParseState::InsideMarkup => $self.$read_until_close($buf) $(.$await)?,
                ParseState::InsideEmpty => Ok(Event::End($self.state.close_expanded_empty())),
                ParseState::Done => Ok(Event::Eof),
            };
        };
        // #513: an ill-formed error has already consumed the offending data and
        // updated the state, so parsing may go on. Any other error, as well as
        // `Eof`, finishes the reader.
        let finished = match &outcome {
            Err(Error::IllFormed(_)) => false,
            Err(_) | Ok(Event::Eof) => true,
            Ok(_) => false,
        };
        if finished {
            $self.state.state = ParseState::Done;
        }
        outcome
    }};
}
303
/// Reads everything up to and including the closing `>`. It must be invoked
/// right after the `<` symbol was seen and skipped. The next (current) symbol
/// decides which [`Event`] is produced:
///
/// |Symbol |Event
/// |-------|-------------------------------------
/// |`!`    |[`Comment`], [`CData`] or [`DocType`]
/// |`/`    |[`End`]
/// |`?`    |[`PI`]
/// |_other_|[`Start`] or [`Empty`]
///
/// The parser is switched to the `InsideText` state before anything is read.
///
/// [`Comment`]: Event::Comment
/// [`CData`]: Event::CData
/// [`DocType`]: Event::DocType
/// [`End`]: Event::End
/// [`PI`]: Event::PI
/// [`Start`]: Event::Start
/// [`Empty`]: Event::Empty
macro_rules! read_until_close {
    (
        $self:ident, $buf:ident,
        $reader:expr
        $(, $await:ident)?
    ) => {{
        $self.state.state = ParseState::InsideText;

        // Offset just behind the already consumed `<`. Read errors are reported
        // at the `<` itself, hence the `- 1` in every error branch below.
        let after_lt = $self.state.offset;
        match $reader.peek_one() $(.$await)? {
            Err(e) => Err(Error::Io(e.into())),
            // `<` followed by the end of input - syntax error, tag not closed
            Ok(None) => {
                $self.state.last_error_offset = after_lt - 1;
                Err(Error::Syntax(SyntaxError::UnclosedTag))
            }
            // `<!` - comment, CDATA or DOCTYPE declaration
            Ok(Some(b'!')) => match $reader
                .read_bang_element($buf, &mut $self.state.offset)
                $(.$await)?
            {
                Err(e) => {
                    $self.state.last_error_offset = after_lt - 1;
                    Err(e)
                }
                Ok((kind, content)) => $self.state.emit_bang(kind, content),
            },
            // `</` - closing tag
            // #776: `ElementParser` is used here, so attributes inside of close
            // tags are accepted. The specification forbids them, but they are
            // parsed anyway because:
            // - constraints are not checked during parsing; that is the job of
            //   the optional validation step which the user calls manually
            // - searching just for `>` would split `</tag attr=">" >` into the
            //   end tag `</tag attr=">` and the text `" >`, which probably no
            //   existing parser does. Such XML is malformed, but it is tolerated
            //   by some parsers (e.g. the one used by Adobe Flash) and documents
            //   like that do exist in the wild.
            Ok(Some(b'/')) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Err(e) => {
                    $self.state.last_error_offset = after_lt - 1;
                    Err(e)
                }
                Ok(content) => $self.state.emit_end(content),
            },
            // `<?` - processing instruction
            Ok(Some(b'?')) => match $reader
                .read_with(PiParser(false), $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Err(e) => {
                    $self.state.last_error_offset = after_lt - 1;
                    Err(e)
                }
                Ok(content) => $self.state.emit_question_mark(content),
            },
            // `<...` - opening or self-closed tag
            Ok(Some(_)) => match $reader
                .read_with(ElementParser::Outside, $buf, &mut $self.state.offset)
                $(.$await)?
            {
                Err(e) => {
                    $self.state.last_error_offset = after_lt - 1;
                    Err(e)
                }
                Ok(content) => Ok($self.state.emit_start(content)),
            },
        }
    }};
}
406
/// Common implementation of the `read_to_end` methods of buffered and borrowed
/// readers. Evaluates to the span between the current position and the start
/// of the matching `End` event; returns from the enclosing function on errors.
macro_rules! read_to_end {
    (
        // $self: &mut Reader
        $self:expr, $end:expr, $buf:expr,
        $read_event:ident,
        // Code block that clears the internal buffer after each event is read
        $clear:block
        $(, $await:ident)?
    ) => {{
        // The end of the span is the position recorded right before the End
        // event is read, so that position has to be the real beginning of the
        // End event. With `trim_text_start` enabled, whitespace in front of
        // the End event would be skipped without any event and the recorded
        // position would lie before that whitespace. Therefore start trimming
        // is switched off while this macro runs.
        //
        // Taking the position after the End event is not an option either:
        // with `trim_markup_names_in_closing_tags` (enabled by default) the
        // real size of the End event in the source is unknown, so the position
        // could not be corrected. The configuration has to be tweaked anyway.
        let saved_trim = std::mem::replace(&mut $self.config_mut().trim_text_start, false);

        let span_start = $self.buffer_position();
        let mut nesting = 0;
        loop {
            $clear
            let event_start = $self.buffer_position();
            match $self.$read_event($buf) $(.$await)? {
                // A nested element with the same name was opened
                Ok(Event::Start(e)) if e.name() == $end => nesting += 1,
                Ok(Event::End(e)) if e.name() == $end => {
                    if nesting == 0 {
                        $self.config_mut().trim_text_start = saved_trim;
                        break span_start..event_start;
                    }
                    nesting -= 1;
                }
                Ok(Event::Eof) => {
                    $self.config_mut().trim_text_start = saved_trim;
                    return Err(Error::missed_end($end, $self.decoder()));
                }
                Err(e) => {
                    $self.config_mut().trim_text_start = saved_trim;
                    return Err(e);
                }
                _ => (),
            }
        }
    }};
}
461
462#[cfg(feature = "async-tokio")]
463mod async_tokio;
464mod buffered_reader;
465mod ns_reader;
466mod slice_reader;
467mod state;
468
469pub use ns_reader::NsReader;
470
/// Half-open byte range of the input that is occupied by some piece of XML.
pub type Span = Range<u64>;
473
474////////////////////////////////////////////////////////////////////////////////////////////////////
475
/// Possible reader states. The state transition diagram (`true` and `false` shows
/// value of [`Config::expand_empty_elements`] option):
///
/// ```mermaid
/// flowchart LR
///   subgraph _
///     direction LR
///
///     Init         -- "(no event)"\n                                       --> InsideText
///     InsideMarkup -- Decl, DocType, PI\nComment, CData\nStart, Empty, End --> InsideText
///     InsideText   -- "#lt;false#gt;\n(no event)"\nText                    --> InsideMarkup
///   end
///   InsideText     -- "#lt;true#gt;"\nStart --> InsideEmpty
///   InsideEmpty    -- End                   --> InsideText
///   _ -. Eof .-> Done
/// ```
#[derive(Clone, Debug)]
enum ParseState {
    /// Initial state in which the reader stays after creation. Leaving this
    /// state does not emit any event: the reader only detects the encoding
    /// (or removes the UTF-8 BOM) and then moves to the `InsideText` state.
    /// The reader never returns to this state.
    Init,
    /// State after seeing the `<` symbol. Depending on the next symbol all other
    /// events could be generated.
    ///
    /// Before the markup is parsed the reader is moved to the `InsideText` state.
    InsideMarkup,
    /// State in which the reader searches for the `<` symbol of a markup. All
    /// bytes before that symbol will be returned in the [`Event::Text`] event.
    /// After that the reader moves to the `InsideMarkup` state, or to the `Done`
    /// state if the input ended before any `<` was found.
    InsideText,
    /// This state is used only if option [`expand_empty_elements`] is set to `true`.
    /// Reader enters this state when it is in an `InsideText` state and emits an
    /// [`Event::Start`] event. The next event emitted will be an [`Event::End`],
    /// after which the reader returns to the `InsideText` state.
    ///
    /// [`expand_empty_elements`]: Config::expand_empty_elements
    InsideEmpty,
    /// Reader enters this state when an `Eof` event is generated or when an
    /// error other than `Error::IllFormed` occurred (ill-formed errors leave
    /// the state untouched so that parsing can continue).
    /// This is the last state, the reader stays in it forever and reports `Eof`.
    Done,
}
520
/// An encoding paired with the knowledge of where it came from.
///
/// The state transition diagram:
///
/// ```mermaid
/// flowchart LR
///   Implicit    -- from_str       --> Explicit
///   Implicit    -- BOM            --> BomDetected
///   Implicit    -- "encoding=..." --> XmlDetected
///   BomDetected -- "encoding=..." --> XmlDetected
/// ```
#[cfg(feature = "encoding")]
#[derive(Clone, Copy, Debug)]
enum EncodingRef {
    /// The encoding is only an assumption. It may still be refined by a BOM
    /// or by the XML declaration event (`<?xml encoding=... ?>`)
    Implicit(&'static Encoding),
    /// The encoding was set explicitly to the desired value. Neither a BOM
    /// nor the XML declaration (`<?xml encoding=... ?>`) can change it
    Explicit(&'static Encoding),
    /// The encoding was detected from a byte order mark (BOM) or from the
    /// first bytes of the content. It may still be refined by the XML
    /// declaration event (`<?xml encoding=... ?>`)
    BomDetected(&'static Encoding),
    /// The encoding was taken from the XML declaration event
    /// (`<?xml encoding=... ?>`). It can no longer change
    XmlDetected(&'static Encoding),
}
#[cfg(feature = "encoding")]
impl EncodingRef {
    /// Returns the wrapped encoding, regardless of how it was obtained.
    #[inline]
    const fn encoding(&self) -> &'static Encoding {
        match self {
            Self::Implicit(e)
            | Self::Explicit(e)
            | Self::BomDetected(e)
            | Self::XmlDetected(e) => *e,
        }
    }

    /// Returns `true` while the encoding may still be replaced by a more
    /// reliable source, i.e. for the `Implicit` and `BomDetected` variants.
    #[inline]
    const fn can_be_refined(&self) -> bool {
        matches!(self, Self::Implicit(_) | Self::BomDetected(_))
    }
}
567
568////////////////////////////////////////////////////////////////////////////////////////////////////
569
/// A direct stream to the reader that underlies a [`Reader`]. Reading from
/// the stream keeps [`Reader::buffer_position()`] up to date.
#[derive(Debug)]
#[must_use = "streams do nothing unless read or polled"]
pub struct BinaryStream<'r, R> {
    /// The underlying reader the bytes are pulled from
    inner: &'r mut R,
    /// Position counter that is advanced by every read through the stream
    offset: &'r mut u64,
}
578
579impl<'r, R> BinaryStream<'r, R> {
580 /// Returns current position in bytes in the original source.
581 #[inline]
582 pub const fn offset(&self) -> u64 {
583 *self.offset
584 }
585
586 /// Gets a reference to the underlying reader.
587 #[inline]
588 pub const fn get_ref(&self) -> &R {
589 self.inner
590 }
591
592 /// Gets a mutable reference to the underlying reader.
593 ///
594 /// Avoid read from this reader because this will not update reader's position
595 /// and will lead to incorrect positions of errors. Read from this stream instead.
596 #[inline]
597 pub fn get_mut(&mut self) -> &mut R {
598 self.inner
599 }
600}
601
602impl<'r, R> io::Read for BinaryStream<'r, R>
603where
604 R: io::Read,
605{
606 #[inline]
607 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
608 let amt = self.inner.read(buf)?;
609 *self.offset += amt as u64;
610 Ok(amt)
611 }
612}
613
614impl<'r, R> io::BufRead for BinaryStream<'r, R>
615where
616 R: io::BufRead,
617{
618 #[inline]
619 fn fill_buf(&mut self) -> io::Result<&[u8]> {
620 self.inner.fill_buf()
621 }
622
623 #[inline]
624 fn consume(&mut self, amt: usize) {
625 self.inner.consume(amt);
626 *self.offset += amt as u64;
627 }
628}
629
630////////////////////////////////////////////////////////////////////////////////////////////////////
631
632/// A low level encoding-agnostic XML event reader.
633///
634/// Consumes bytes and streams XML [`Event`]s.
635///
636/// This reader does not manage namespace declarations and not able to resolve
637/// prefixes. If you want these features, use the [`NsReader`].
638///
639/// # Examples
640///
641/// ```
642/// use quick_xml::events::Event;
643/// use quick_xml::reader::Reader;
644///
645/// let xml = r#"<tag1 att1 = "test">
646/// <tag2><!--Test comment-->Test</tag2>
647/// <tag2>Test 2</tag2>
648/// </tag1>"#;
649/// let mut reader = Reader::from_str(xml);
650/// reader.config_mut().trim_text(true);
651///
652/// let mut count = 0;
653/// let mut txt = Vec::new();
654/// let mut buf = Vec::new();
655///
656/// // The `Reader` does not implement `Iterator` because it outputs borrowed data (`Cow`s)
657/// loop {
658/// // NOTE: this is the generic case when we don't know about the input BufRead.
659/// // when the input is a &str or a &[u8], we don't actually need to use another
660/// // buffer, we could directly call `reader.read_event()`
661/// match reader.read_event_into(&mut buf) {
662/// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
663/// // exits the loop when reaching end of file
664/// Ok(Event::Eof) => break,
665///
666/// Ok(Event::Start(e)) => {
667/// match e.name().as_ref() {
668/// b"tag1" => println!("attributes values: {:?}",
669/// e.attributes().map(|a| a.unwrap().value)
670/// .collect::<Vec<_>>()),
671/// b"tag2" => count += 1,
672/// _ => (),
673/// }
674/// }
675/// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
676///
677/// // There are several other `Event`s we do not consider here
678/// _ => (),
679/// }
680/// // if we don't keep a borrow elsewhere, we can clear the buffer to keep memory usage low
681/// buf.clear();
682/// }
683/// ```
684///
685/// [`NsReader`]: crate::reader::NsReader
686#[derive(Clone)]
687pub struct Reader<R> {
688 /// Source of data for parse
689 reader: R,
690 /// Configuration and current parse state
691 state: ReaderState,
692}
693
694/// Builder methods
695impl<R> Reader<R> {
696 /// Creates a `Reader` that reads from a given reader.
697 pub fn from_reader(reader: R) -> Self {
698 Self {
699 reader,
700 state: ReaderState::default(),
701 }
702 }
703
704 /// Returns reference to the parser configuration
705 pub const fn config(&self) -> &Config {
706 &self.state.config
707 }
708
709 /// Returns mutable reference to the parser configuration
710 pub fn config_mut(&mut self) -> &mut Config {
711 &mut self.state.config
712 }
713}
714
715/// Getters
716impl<R> Reader<R> {
717 /// Consumes `Reader` returning the underlying reader
718 ///
719 /// Can be used to compute line and column of a parsing error position
720 ///
721 /// # Examples
722 ///
723 /// ```
724 /// # use pretty_assertions::assert_eq;
725 /// use std::{str, io::Cursor};
726 /// use quick_xml::events::Event;
727 /// use quick_xml::reader::Reader;
728 ///
729 /// let xml = r#"<tag1 att1 = "test">
730 /// <tag2><!--Test comment-->Test</tag2>
731 /// <tag3>Test 2</tag3>
732 /// </tag1>"#;
733 /// let mut reader = Reader::from_reader(Cursor::new(xml.as_bytes()));
734 /// let mut buf = Vec::new();
735 ///
736 /// fn into_line_and_column(reader: Reader<Cursor<&[u8]>>) -> (usize, usize) {
737 /// // We known that size cannot exceed usize::MAX because we created parser from single &[u8]
738 /// let end_pos = reader.buffer_position() as usize;
739 /// let mut cursor = reader.into_inner();
740 /// let s = String::from_utf8(cursor.into_inner()[0..end_pos].to_owned())
741 /// .expect("can't make a string");
742 /// let mut line = 1;
743 /// let mut column = 0;
744 /// for c in s.chars() {
745 /// if c == '\n' {
746 /// line += 1;
747 /// column = 0;
748 /// } else {
749 /// column += 1;
750 /// }
751 /// }
752 /// (line, column)
753 /// }
754 ///
755 /// loop {
756 /// match reader.read_event_into(&mut buf) {
757 /// Ok(Event::Start(ref e)) => match e.name().as_ref() {
758 /// b"tag1" | b"tag2" => (),
759 /// tag => {
760 /// assert_eq!(b"tag3", tag);
761 /// assert_eq!((3, 22), into_line_and_column(reader));
762 /// break;
763 /// }
764 /// },
765 /// Ok(Event::Eof) => unreachable!(),
766 /// _ => (),
767 /// }
768 /// buf.clear();
769 /// }
770 /// ```
771 pub fn into_inner(self) -> R {
772 self.reader
773 }
774
775 /// Gets a reference to the underlying reader.
776 pub const fn get_ref(&self) -> &R {
777 &self.reader
778 }
779
780 /// Gets a mutable reference to the underlying reader.
781 ///
782 /// Avoid read from this reader because this will not update reader's position
783 /// and will lead to incorrect positions of errors. If you want to read, use
784 /// [`stream()`] instead.
785 ///
786 /// [`stream()`]: Self::stream
787 pub fn get_mut(&mut self) -> &mut R {
788 &mut self.reader
789 }
790
791 /// Gets the current byte position in the input data.
792 pub const fn buffer_position(&self) -> u64 {
793 // when internal state is InsideMarkup, we have actually read until '<',
794 // which we don't want to show
795 if let ParseState::InsideMarkup = self.state.state {
796 self.state.offset - 1
797 } else {
798 self.state.offset
799 }
800 }
801
802 /// Gets the last error byte position in the input data. If there is no errors
803 /// yet, returns `0`.
804 ///
805 /// Unlike `buffer_position` it will point to the place where it is rational
806 /// to report error to the end user. For example, all [`SyntaxError`]s are
807 /// reported when the parser sees EOF inside of some kind of markup. The
808 /// `buffer_position()` will point to the last byte of input which is not
809 /// very useful. `error_position()` will point to the start of corresponding
810 /// markup element (i. e. to the `<` character).
811 ///
812 /// This position is always `<= buffer_position()`.
813 pub const fn error_position(&self) -> u64 {
814 self.state.last_error_offset
815 }
816
817 /// Get the decoder, used to decode bytes, read by this reader, to the strings.
818 ///
819 /// If [`encoding`] feature is enabled, the used encoding may change after
820 /// parsing the XML declaration, otherwise encoding is fixed to UTF-8.
821 ///
822 /// If [`encoding`] feature is enabled and no encoding is specified in declaration,
823 /// defaults to UTF-8.
824 ///
825 /// [`encoding`]: ../index.html#encoding
826 #[inline]
827 pub const fn decoder(&self) -> Decoder {
828 self.state.decoder()
829 }
830
831 /// Get the direct access to the underlying reader, but tracks the amount of
832 /// read data and update [`Reader::buffer_position()`] accordingly.
833 ///
834 /// Note, that this method gives you access to the internal reader and read
835 /// data will not be returned in any subsequent events read by `read_event`
836 /// family of methods.
837 ///
838 /// # Example
839 ///
840 /// This example demonstrates how to read stream raw bytes from an XML document.
841 /// This could be used to implement streaming read of text, or to read raw binary
842 /// bytes embedded in an XML document. (Documents with embedded raw bytes are not
843 /// valid XML, but XML-derived file formats exist where such documents are valid).
844 ///
845 /// ```
846 /// # use pretty_assertions::assert_eq;
847 /// use std::io::{BufRead, Read};
848 /// use quick_xml::events::{BytesEnd, BytesStart, Event};
849 /// use quick_xml::reader::Reader;
850 ///
851 /// let mut reader = Reader::from_str("<tag>binary << data&></tag>");
852 /// // ^ ^ ^ ^
853 /// // 0 5 21 27
854 ///
855 /// assert_eq!(
856 /// (reader.read_event().unwrap(), reader.buffer_position()),
857 /// // 5 - end of the `<tag>`
858 /// (Event::Start(BytesStart::new("tag")), 5)
859 /// );
860 ///
861 /// // Reading directly from underlying reader will not update position
862 /// // let mut inner = reader.get_mut();
863 ///
864 /// // Reading from the stream() advances position
865 /// let mut inner = reader.stream();
866 ///
867 /// // Read binary data. We must know its size
868 /// let mut binary = [0u8; 16];
869 /// inner.read_exact(&mut binary).unwrap();
870 /// assert_eq!(&binary, b"binary << data&>");
871 /// // 21 - end of the `binary << data&>`
872 /// assert_eq!(inner.offset(), 21);
873 /// assert_eq!(reader.buffer_position(), 21);
874 ///
875 /// assert_eq!(
876 /// (reader.read_event().unwrap(), reader.buffer_position()),
877 /// // 27 - end of the `</tag>`
878 /// (Event::End(BytesEnd::new("tag")), 27)
879 /// );
880 ///
881 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
882 /// ```
883 #[inline]
884 pub fn stream(&mut self) -> BinaryStream<R> {
885 BinaryStream {
886 inner: &mut self.reader,
887 offset: &mut self.state.offset,
888 }
889 }
890}
891
/// Private sync reading methods
impl<R> Reader<R> {
    /// Reads the next event, filling the given buffer if necessary, and returns
    /// an event that borrows either from that buffer or from the input itself,
    /// depending on the type of the reader.
    ///
    /// The body is produced by the `read_event_impl!` macro, which receives the
    /// reader (`self`), the buffer, the underlying source (`self.reader`) and
    /// the name of the `read_until_close` method defined below.
    ///
    /// # Parameters
    /// - `buf`: buffer of type `B` accepted by the [`XmlSource`] implementation
    ///   of `R`; events with lifetime `'i` may borrow from it
    fn read_event_impl<'i, B>(&mut self, mut buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_event_impl!(self, buf, self.reader, read_until_close)
    }

    /// Private function to read until `>` is found. This function expects that
    /// it was called just after encountering a `<` symbol.
    ///
    /// The body is produced by the `read_until_close!` macro, which receives the
    /// reader (`self`), the buffer and the underlying source (`self.reader`).
    fn read_until_close<'i, B>(&mut self, buf: B) -> Result<Event<'i>, Error>
    where
        R: XmlSource<'i, B>,
    {
        read_until_close!(self, buf, self.reader)
    }
}
913
914////////////////////////////////////////////////////////////////////////////////////////////////////
915
/// Result of an attempt to read XML textual data from the reader.
///
/// # Parameters
/// - `'r`: lifetime of the data from which the returned text slice borrows
/// - `B`: type of the buffer that was passed to [`XmlSource::read_text`]
enum ReadTextResult<'r, B> {
    /// Start of markup (`<` character) was found in the first byte.
    /// Contains the buffer, which is handed back so that it can be reused on
    /// the next iteration cycle (needed to satisfy borrow checker requirements).
    Markup(B),
    /// Contains text block up to start of markup (`<` character).
    UpToMarkup(&'r [u8]),
    /// Contains text block up to EOF; start of markup (`<` character) was not found.
    UpToEof(&'r [u8]),
    /// IO error occurred.
    Err(io::Error),
}
929
/// Represents an input for a reader that can return borrowed data.
///
/// There are two implementors of this trait. The generic one reads data from
/// `Self`, copies some part of it into a provided buffer of type `B` and then
/// returns data that borrows from that buffer.
///
/// The other implementor is for `&[u8]`: instead of copying data it returns
/// data borrowed from `Self` itself. This implementation allows zero-copy
/// deserialization.
///
/// # Parameters
/// - `'r`: lifetime of a buffer from which events will borrow
/// - `B`: a type of a buffer that can be used to store data read from `Self` and
///   from which events can borrow
trait XmlSource<'r, B> {
    /// Removes UTF-8 BOM if it is present
    #[cfg(not(feature = "encoding"))]
    fn remove_utf8_bom(&mut self) -> io::Result<()>;

    /// Determines encoding from the start of input and removes BOM if it is present
    #[cfg(feature = "encoding")]
    fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>>;

    /// Reads input until start of markup (the `<`) is found or end of input is reached.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_text(&mut self, buf: B, position: &mut u64) -> ReadTextResult<'r, B>;

    /// Reads input until the construct recognized by `parser` (for example, a
    /// tag or a processing instruction) is finished.
    ///
    /// This method expects that the start sequence of the construct has already
    /// been read.
    ///
    /// Returns a slice of data read up to the end of the thing being parsed.
    /// The end of the thing and the returned content are determined by the used parser.
    ///
    /// If input (`Self`) is exhausted and no bytes were read, or if the specified
    /// parser could not find the ending sequence of the thing, returns `SyntaxError`.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// A `P` type parameter is used to preserve state between calls to the underlying
    /// reader which provides bytes fed into the parser.
    ///
    /// [events]: crate::events::Event
    fn read_with<P>(&mut self, parser: P, buf: B, position: &mut u64) -> Result<&'r [u8], Error>
    where
        P: Parser;

    /// Reads input until a comment, CDATA section or DOCTYPE declaration is finished.
    ///
    /// This method expects that `<` has already been read.
    ///
    /// Returns the detected [`BangType`] together with a slice of the data read
    /// up to the closing `>`, which is not included in the result.
    ///
    /// If the end of the construct is not found before the input (`Self`) is
    /// exhausted, an error is returned.
    ///
    /// # Parameters
    /// - `buf`: Buffer that could be filled from an input (`Self`) and
    ///   from which [events] could borrow their data
    /// - `position`: Will be increased by the amount of bytes consumed
    ///
    /// [events]: crate::events::Event
    fn read_bang_element(
        &mut self,
        buf: B,
        position: &mut u64,
    ) -> Result<(BangType, &'r [u8]), Error>;

    /// Consumes and discards all the whitespace until the next non-whitespace
    /// character or EOF.
    ///
    /// # Parameters
    /// - `position`: Will be increased by the amount of bytes consumed
    fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()>;

    /// Returns one byte without consuming it, so that future `read_*` calls
    /// will still include it. On EOF, returns `None`.
    fn peek_one(&mut self) -> io::Result<Option<u8>>;
}
1018
/// Possible elements started with `<!`.
///
/// The variant is selected by `BangType::new` from a single byte and is then
/// used by `BangType::parse` to locate the end of the construct.
#[derive(Debug, PartialEq)]
enum BangType {
    /// `<![CDATA[...]]>`
    CData,
    /// `<!--...-->`
    Comment,
    /// `<!DOCTYPE...>`. Contains the running balance of `<` (+1) and `>` (-1)
    /// seen so far inside the declaration; a `>` met at balance `0` closes it.
    DocType(i32),
}
1029impl BangType {
1030 #[inline(always)]
1031 const fn new(byte: Option<u8>) -> Result<Self, SyntaxError> {
1032 Ok(match byte {
1033 Some(b'[') => Self::CData,
1034 Some(b'-') => Self::Comment,
1035 Some(b'D') | Some(b'd') => Self::DocType(0),
1036 _ => return Err(SyntaxError::InvalidBangMarkup),
1037 })
1038 }
1039
1040 /// If element is finished, returns its content up to `>` symbol and
1041 /// an index of this symbol, otherwise returns `None`
1042 ///
1043 /// # Parameters
1044 /// - `buf`: buffer with data consumed on previous iterations
1045 /// - `chunk`: data read on current iteration and not yet consumed from reader
1046 #[inline(always)]
1047 fn parse<'b>(&mut self, buf: &[u8], chunk: &'b [u8]) -> Option<(&'b [u8], usize)> {
1048 match self {
1049 Self::Comment => {
1050 for i in memchr::memchr_iter(b'>', chunk) {
1051 // Need to read at least 6 symbols (`!---->`) for properly finished comment
1052 // <!----> - XML comment
1053 // 012345 - i
1054 if buf.len() + i > 4 {
1055 if chunk[..i].ends_with(b"--") {
1056 // We cannot strip last `--` from the buffer because we need it in case of
1057 // check_comments enabled option. XML standard requires that comment
1058 // will not end with `--->` sequence because this is a special case of
1059 // `--` in the comment (https://www.w3.org/TR/xml11/#sec-comments)
1060 return Some((&chunk[..i], i + 1)); // +1 for `>`
1061 }
1062 // End sequence `-|->` was splitted at |
1063 // buf --/ \-- chunk
1064 if i == 1 && buf.ends_with(b"-") && chunk[0] == b'-' {
1065 return Some((&chunk[..i], i + 1)); // +1 for `>`
1066 }
1067 // End sequence `--|>` was splitted at |
1068 // buf --/ \-- chunk
1069 if i == 0 && buf.ends_with(b"--") {
1070 return Some((&[], i + 1)); // +1 for `>`
1071 }
1072 }
1073 }
1074 }
1075 Self::CData => {
1076 for i in memchr::memchr_iter(b'>', chunk) {
1077 if chunk[..i].ends_with(b"]]") {
1078 return Some((&chunk[..i], i + 1)); // +1 for `>`
1079 }
1080 // End sequence `]|]>` was splitted at |
1081 // buf --/ \-- chunk
1082 if i == 1 && buf.ends_with(b"]") && chunk[0] == b']' {
1083 return Some((&chunk[..i], i + 1)); // +1 for `>`
1084 }
1085 // End sequence `]]|>` was splitted at |
1086 // buf --/ \-- chunk
1087 if i == 0 && buf.ends_with(b"]]") {
1088 return Some((&[], i + 1)); // +1 for `>`
1089 }
1090 }
1091 }
1092 Self::DocType(ref mut balance) => {
1093 for i in memchr::memchr2_iter(b'<', b'>', chunk) {
1094 if chunk[i] == b'<' {
1095 *balance += 1;
1096 } else {
1097 if *balance == 0 {
1098 return Some((&chunk[..i], i + 1)); // +1 for `>`
1099 }
1100 *balance -= 1;
1101 }
1102 }
1103 }
1104 }
1105 None
1106 }
1107 #[inline]
1108 const fn to_err(&self) -> SyntaxError {
1109 match self {
1110 Self::CData => SyntaxError::UnclosedCData,
1111 Self::Comment => SyntaxError::UnclosedComment,
1112 Self::DocType(_) => SyntaxError::UnclosedDoctype,
1113 }
1114 }
1115}
1116
1117////////////////////////////////////////////////////////////////////////////////////////////////////
1118
1119#[cfg(test)]
1120mod test {
1121 /// Checks the internal implementation of the various reader methods
1122 macro_rules! check {
1123 (
1124 #[$test:meta]
1125 $read_event:ident,
1126 $read_until_close:ident,
1127 // constructor of the XML source on which internal functions will be called
1128 $source:path,
1129 // constructor of the buffer to which read data will stored
1130 $buf:expr
1131 $(, $async:ident, $await:ident)?
1132 ) => {
1133 mod read_bang_element {
1134 use super::*;
1135 use crate::errors::{Error, SyntaxError};
1136 use crate::reader::BangType;
1137 use crate::utils::Bytes;
1138
/// Checks that reading CDATA content works correctly.
///
/// Every test starts with `position = 1` and feeds the input that follows
/// the `<` character; the markers under the input show which byte
/// corresponds to which position.
mod cdata {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Checks that if input begins like CDATA element, but CDATA start sequence
    /// is not finished, parsing ends with an error
    #[$test]
    #[ignore = "start CDATA sequence fully checked outside of `read_bang_element`"]
    $($async)? fn not_properly_start() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![]]>other content".as_ref();
        //                ^= 1

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 1);
    }

    /// Checks that if CDATA startup sequence was matched, but an end sequence
    /// is not found, parsing ends with an error. The whole input is consumed,
    /// so the position points past its last byte.
    #[$test]
    $($async)? fn not_closed() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![CDATA[other content".as_ref();
        //                ^= 1                 ^= 22

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedCData),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 22);
    }

    /// Checks that CDATA element without content inside parsed successfully.
    /// The returned bytes keep the leading `!` and exclude the closing `>`.
    #[$test]
    $($async)? fn empty() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![CDATA[]]>other content".as_ref();
        //                ^= 1       ^= 12

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::CData, Bytes(b"![CDATA[]]"))
        );
        assert_eq!(position, 12);
    }

    /// Checks that CDATA element with content parsed successfully.
    /// Additionally checks that sequences inside CDATA that may look like
    /// a CDATA end sequence do not interrupt CDATA parsing
    #[$test]
    $($async)? fn with_content() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"![CDATA[cdata]] ]>content]]>other content]]>".as_ref();
        //                ^= 1                        ^= 29

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::CData, Bytes(b"![CDATA[cdata]] ]>content]]"))
        );
        assert_eq!(position, 29);
    }
}
1223
/// Checks that reading XML comments works correctly. According to the [specification],
/// comment data can contain any sequence except `--`:
///
/// ```peg
/// comment = '<!--' (!'--' char)* '-->';
/// char = [#x1-#x2C]
///      / [#x2E-#xD7FF]
///      / [#xE000-#xFFFD]
///      / [#x10000-#x10FFFF]
/// ```
///
/// The presence of this limitation, however, is simply a poorly designed specification
/// (maybe for purpose of building of LL(1) XML parser) and quick-xml does not check for
/// presence of these sequences by default. These tests allow such content.
///
/// [specification]: https://www.w3.org/TR/xml11/#dt-comment
mod comment {
    use super::*;
    use pretty_assertions::assert_eq;

    /// Checks that if input begins like a comment, but the comment start
    /// sequence is not finished, parsing ends with an error
    #[$test]
    #[ignore = "start comment sequence fully checked outside of `read_bang_element`"]
    $($async)? fn not_properly_start() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!- -->other content".as_ref();
        //                ^= 1

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 1);
    }

    /// Checks that `->` is not accepted as the end of a comment: the whole
    /// input is consumed and an error is returned
    #[$test]
    $($async)? fn not_properly_end() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!->other content".as_ref();
        //                ^= 1            ^= 17

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 17);
    }

    /// Checks that a comment without any `>` after the start sequence is
    /// reported as unclosed
    #[$test]
    $($async)? fn not_closed1() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!--other content".as_ref();
        //                ^= 1            ^= 17

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 17);
    }

    /// Checks that `<!-->` is not a finished comment: the `--` of the start
    /// sequence cannot serve as the end sequence at the same time
    #[$test]
    $($async)? fn not_closed2() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!-->other content".as_ref();
        //                ^= 1             ^= 18

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 18);
    }

    /// Checks that `<!--->` is not a finished comment: the start and the end
    /// sequences cannot share a `-`
    #[$test]
    $($async)? fn not_closed3() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!--->other content".as_ref();
        //                ^= 1              ^= 19

        match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
            Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedComment),
            x => panic!(
                "Expected `Err(Syntax(_))`, but got `{:?}`",
                x
            ),
        }
        assert_eq!(position, 19);
    }

    /// Checks that the shortest possible comment (`<!---->`) is parsed
    /// successfully. The returned bytes keep the leading `!` and the trailing
    /// `--`, but exclude the closing `>`.
    #[$test]
    $($async)? fn empty() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!---->other content".as_ref();
        //                ^= 1  ^= 7

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::Comment, Bytes(b"!----"))
        );
        assert_eq!(position, 7);
    }

    /// Checks that a comment with content is parsed successfully, and that
    /// the `--->` right at the start does not end the comment prematurely
    #[$test]
    $($async)? fn with_content() {
        let buf = $buf;
        let mut position = 1;
        let mut input = b"!--->comment<--->other content".as_ref();
        //                ^= 1             ^= 18

        let (ty, bytes) = $source(&mut input)
            .read_bang_element(buf, &mut position)
            $(.$await)?
            .unwrap();
        assert_eq!(
            (ty, Bytes(bytes)),
            (BangType::Comment, Bytes(b"!--->comment<---"))
        );
        assert_eq!(position, 18);
    }
}
1366
/// Checks that reading DOCTYPE definition works correctly.
///
/// The same set of tests is run for the uppercase (`DOCTYPE`) and for the
/// lowercase (`doctype`) spelling of the keyword.
mod doctype {
    use super::*;

    /// Tests for input that starts with an uppercase `D`
    mod uppercase {
        use super::*;
        use pretty_assertions::assert_eq;

        /// Checks that input with an incomplete keyword and without a closing
        /// `>` is consumed completely and reported as an unclosed DOCTYPE
        #[$test]
        $($async)? fn not_properly_start() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!D other content".as_ref();
            //                ^= 1            ^= 17

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                x => panic!(
                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 17);
        }

        /// Checks that a keyword immediately followed by other content, with
        /// no closing `>`, is reported as an unclosed DOCTYPE
        #[$test]
        $($async)? fn without_space() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!DOCTYPEother content".as_ref();
            //                ^= 1                 ^= 22

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                x => panic!(
                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 22);
        }

        /// Checks that a DOCTYPE without any content is parsed successfully.
        /// The returned bytes keep the leading `!` and exclude the closing `>`.
        #[$test]
        $($async)? fn empty() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!DOCTYPE>other content".as_ref();
            //                ^= 1     ^= 10

            let (ty, bytes) = $source(&mut input)
                .read_bang_element(buf, &mut position)
                $(.$await)?
                .unwrap();
            assert_eq!(
                (ty, Bytes(bytes)),
                (BangType::DocType(0), Bytes(b"!DOCTYPE"))
            );
            assert_eq!(position, 10);
        }

        /// Checks that a DOCTYPE without a closing `>` is consumed completely
        /// and reported as unclosed
        #[$test]
        $($async)? fn not_closed() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!DOCTYPE other content".as_ref();
            //                ^= 1                  ^= 23

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                x => panic!(
                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 23);
        }
    }

    /// Tests for input that starts with a lowercase `d`
    mod lowercase {
        use super::*;
        use pretty_assertions::assert_eq;

        /// Checks that input with an incomplete keyword and without a closing
        /// `>` is consumed completely and reported as an unclosed DOCTYPE
        #[$test]
        $($async)? fn not_properly_start() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!d other content".as_ref();
            //                ^= 1            ^= 17

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                x => panic!(
                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 17);
        }

        /// Checks that a keyword immediately followed by other content, with
        /// no closing `>`, is reported as an unclosed DOCTYPE
        #[$test]
        $($async)? fn without_space() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!doctypeother content".as_ref();
            //                ^= 1                 ^= 22

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                x => panic!(
                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 22);
        }

        /// Checks that a DOCTYPE without any content is parsed successfully.
        /// The returned bytes keep the leading `!` and exclude the closing `>`.
        #[$test]
        $($async)? fn empty() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!doctype>other content".as_ref();
            //                ^= 1     ^= 10

            let (ty, bytes) = $source(&mut input)
                .read_bang_element(buf, &mut position)
                $(.$await)?
                .unwrap();
            assert_eq!(
                (ty, Bytes(bytes)),
                (BangType::DocType(0), Bytes(b"!doctype"))
            );
            assert_eq!(position, 10);
        }

        /// Checks that a DOCTYPE without a closing `>` is consumed completely
        /// and reported as unclosed
        #[$test]
        $($async)? fn not_closed() {
            let buf = $buf;
            let mut position = 1;
            let mut input = b"!doctype other content".as_ref();
            //                ^= 1                  ^= 23

            match $source(&mut input).read_bang_element(buf, &mut position) $(.$await)? {
                Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedDoctype),
                x => panic!(
                    "Expected `Err(Syntax(_))`, but got `{:?}`",
                    x
                ),
            }
            assert_eq!(position, 23);
        }
    }
}
1519 }
1520
1521 mod read_element {
1522 use super::*;
1523 use crate::errors::{Error, SyntaxError};
1524 use crate::parser::ElementParser;
1525 use crate::utils::Bytes;
1526 use pretty_assertions::assert_eq;
1527
/// Checks that nothing was read from empty buffer: reading a tag from an
/// empty input fails with `UnclosedTag` and leaves the position untouched
#[$test]
$($async)? fn empty() {
    let buf = $buf;
    let mut position = 1;
    let mut input = b"".as_ref();
    //                ^= 1

    match $source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? {
        Err(Error::Syntax(cause)) => assert_eq!(cause, SyntaxError::UnclosedTag),
        x => panic!(
            "Expected `Err(Syntax(_))`, but got `{:?}`",
            x
        ),
    }
    assert_eq!(position, 1);
}
1545
1546 mod open {
1547 use super::*;
1548 use pretty_assertions::assert_eq;
1549
1550 #[$test]
1551 $($async)? fn empty_tag() {
1552 let buf = $buf;
1553 let mut position = 1;
1554 let mut input = b">".as_ref();
1555 // ^= 2
1556
1557 assert_eq!(
1558 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1559 Bytes(b"")
1560 );
1561 assert_eq!(position, 2);
1562 }
1563
1564 #[$test]
1565 $($async)? fn normal() {
1566 let buf = $buf;
1567 let mut position = 1;
1568 let mut input = b"tag>".as_ref();
1569 // ^= 5
1570
1571 assert_eq!(
1572 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1573 Bytes(b"tag")
1574 );
1575 assert_eq!(position, 5);
1576 }
1577
1578 #[$test]
1579 $($async)? fn empty_ns_empty_tag() {
1580 let buf = $buf;
1581 let mut position = 1;
1582 let mut input = b":>".as_ref();
1583 // ^= 3
1584
1585 assert_eq!(
1586 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1587 Bytes(b":")
1588 );
1589 assert_eq!(position, 3);
1590 }
1591
1592 #[$test]
1593 $($async)? fn empty_ns() {
1594 let buf = $buf;
1595 let mut position = 1;
1596 let mut input = b":tag>".as_ref();
1597 // ^= 6
1598
1599 assert_eq!(
1600 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1601 Bytes(b":tag")
1602 );
1603 assert_eq!(position, 6);
1604 }
1605
1606 #[$test]
1607 $($async)? fn with_attributes() {
1608 let buf = $buf;
1609 let mut position = 1;
1610 let mut input = br#"tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1611 // ^= 39
1612
1613 assert_eq!(
1614 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1615 Bytes(br#"tag attr-1=">" attr2 = '>' 3attr"#)
1616 );
1617 assert_eq!(position, 39);
1618 }
1619 }
1620
1621 mod self_closed {
1622 use super::*;
1623 use pretty_assertions::assert_eq;
1624
1625 #[$test]
1626 $($async)? fn empty_tag() {
1627 let buf = $buf;
1628 let mut position = 1;
1629 let mut input = b"/>".as_ref();
1630 // ^= 3
1631
1632 assert_eq!(
1633 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1634 Bytes(b"/")
1635 );
1636 assert_eq!(position, 3);
1637 }
1638
1639 #[$test]
1640 $($async)? fn normal() {
1641 let buf = $buf;
1642 let mut position = 1;
1643 let mut input = b"tag/>".as_ref();
1644 // ^= 6
1645
1646 assert_eq!(
1647 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1648 Bytes(b"tag/")
1649 );
1650 assert_eq!(position, 6);
1651 }
1652
1653 #[$test]
1654 $($async)? fn empty_ns_empty_tag() {
1655 let buf = $buf;
1656 let mut position = 1;
1657 let mut input = b":/>".as_ref();
1658 // ^= 4
1659
1660 assert_eq!(
1661 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1662 Bytes(b":/")
1663 );
1664 assert_eq!(position, 4);
1665 }
1666
1667 #[$test]
1668 $($async)? fn empty_ns() {
1669 let buf = $buf;
1670 let mut position = 1;
1671 let mut input = b":tag/>".as_ref();
1672 // ^= 7
1673
1674 assert_eq!(
1675 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1676 Bytes(b":tag/")
1677 );
1678 assert_eq!(position, 7);
1679 }
1680
1681 #[$test]
1682 $($async)? fn with_attributes() {
1683 let buf = $buf;
1684 let mut position = 1;
1685 let mut input = br#"tag attr-1="/>" attr2 = '/>' 3attr/>"#.as_ref();
1686 // ^= 42
1687
1688 assert_eq!(
1689 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1690 Bytes(br#"tag attr-1="/>" attr2 = '/>' 3attr/"#)
1691 );
1692 assert_eq!(position, 42);
1693 }
1694 }
1695
1696 mod close {
1697 use super::*;
1698 use pretty_assertions::assert_eq;
1699
1700 #[$test]
1701 $($async)? fn empty_tag() {
1702 let buf = $buf;
1703 let mut position = 1;
1704 let mut input = b"/ >".as_ref();
1705 // ^= 4
1706
1707 assert_eq!(
1708 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1709 Bytes(b"/ ")
1710 );
1711 assert_eq!(position, 4);
1712 }
1713
1714 #[$test]
1715 $($async)? fn normal() {
1716 let buf = $buf;
1717 let mut position = 1;
1718 let mut input = b"/tag>".as_ref();
1719 // ^= 6
1720
1721 assert_eq!(
1722 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1723 Bytes(b"/tag")
1724 );
1725 assert_eq!(position, 6);
1726 }
1727
1728 #[$test]
1729 $($async)? fn empty_ns_empty_tag() {
1730 let buf = $buf;
1731 let mut position = 1;
1732 let mut input = b"/:>".as_ref();
1733 // ^= 4
1734
1735 assert_eq!(
1736 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1737 Bytes(b"/:")
1738 );
1739 assert_eq!(position, 4);
1740 }
1741
1742 #[$test]
1743 $($async)? fn empty_ns() {
1744 let buf = $buf;
1745 let mut position = 1;
1746 let mut input = b"/:tag>".as_ref();
1747 // ^= 7
1748
1749 assert_eq!(
1750 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1751 Bytes(b"/:tag")
1752 );
1753 assert_eq!(position, 7);
1754 }
1755
1756 #[$test]
1757 $($async)? fn with_attributes() {
1758 let buf = $buf;
1759 let mut position = 1;
1760 let mut input = br#"/tag attr-1=">" attr2 = '>' 3attr>"#.as_ref();
1761 // ^= 40
1762
1763 assert_eq!(
1764 Bytes($source(&mut input).read_with(ElementParser::default(), buf, &mut position) $(.$await)? .unwrap()),
1765 Bytes(br#"/tag attr-1=">" attr2 = '>' 3attr"#)
1766 );
1767 assert_eq!(position, 40);
1768 }
1769 }
1770 }
1771
/// Ensures that no empty `Text` events are generated: each test reads a
/// minimal document and checks the exact event that is produced for it
mod $read_event {
    use crate::events::{BytesCData, BytesDecl, BytesEnd, BytesPI, BytesStart, BytesText, Event};
    use crate::reader::Reader;
    use pretty_assertions::assert_eq;

    /// When `encoding` feature is enabled, encoding should be detected
    /// from BOM (UTF-8) and BOM should be stripped.
    ///
    /// When `encoding` feature is disabled, UTF-8 is assumed and BOM
    /// character should be stripped for consistency.
    ///
    /// Only the first of the two BOM characters is stripped; the second one
    /// is reported as text.
    #[$test]
    $($async)? fn bom_from_reader() {
        let mut reader = Reader::from_reader("\u{feff}\u{feff}".as_bytes());

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Text(BytesText::from_escaped("\u{feff}"))
        );

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Eof
        );
    }

    /// When parsing from &str, encoding is fixed (UTF-8), so
    /// - when `encoding` feature is disabled, the behavior is the
    ///   same as in the `bom_from_reader` test
    /// - when `encoding` feature is enabled, the behavior should
    ///   stay consistent, so the first BOM character is stripped
    #[$test]
    $($async)? fn bom_from_str() {
        let mut reader = Reader::from_str("\u{feff}\u{feff}");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Text(BytesText::from_escaped("\u{feff}"))
        );

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Eof
        );
    }

    /// An XML declaration is reported as a `Decl` event
    #[$test]
    $($async)? fn declaration() {
        let mut reader = Reader::from_str("<?xml ?>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Decl(BytesDecl::from_start(BytesStart::from_content("xml ", 3)))
        );
    }

    /// A DOCTYPE declaration is reported as a `DocType` event that contains
    /// only the text after the keyword
    #[$test]
    $($async)? fn doctype() {
        let mut reader = Reader::from_str("<!DOCTYPE x>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::DocType(BytesText::from_escaped("x"))
        );
    }

    /// A processing instruction is reported as a `PI` event; quotes and a
    /// `? >` sequence inside of it do not end the instruction
    #[$test]
    $($async)? fn processing_instruction() {
        let mut reader = Reader::from_str("<?xml-stylesheet '? >\" ?>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::PI(BytesPI::new("xml-stylesheet '? >\" "))
        );
    }

    /// Lone closing tags are not allowed, so testing it together with start tag
    #[$test]
    $($async)? fn start_and_end() {
        let mut reader = Reader::from_str("<tag></tag>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Start(BytesStart::new("tag"))
        );

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::End(BytesEnd::new("tag"))
        );
    }

    /// A self-closed tag is reported as an `Empty` event
    #[$test]
    $($async)? fn empty() {
        let mut reader = Reader::from_str("<tag/>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Empty(BytesStart::new("tag"))
        );
    }

    /// Character data outside of any markup is reported as a `Text` event
    #[$test]
    $($async)? fn text() {
        let mut reader = Reader::from_str("text");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Text(BytesText::from_escaped("text"))
        );
    }

    /// An empty CDATA section is reported as a `CData` event with empty content
    #[$test]
    $($async)? fn cdata() {
        let mut reader = Reader::from_str("<![CDATA[]]>");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::CData(BytesCData::new(""))
        );
    }

    /// An empty comment is reported as a `Comment` event with empty content
    #[$test]
    $($async)? fn comment() {
        let mut reader = Reader::from_str("<!---->");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Comment(BytesText::from_escaped(""))
        );
    }

    /// An empty input produces the `Eof` event right away
    #[$test]
    $($async)? fn eof() {
        let mut reader = Reader::from_str("");

        assert_eq!(
            reader.$read_event($buf) $(.$await)? .unwrap(),
            Event::Eof
        );
    }
}
1914 };
1915 }
1916
1917 // Export macros for the child modules:
1918 // - buffered_reader
1919 // - slice_reader
1920 pub(super) use check;
1921}