quick_xml/reader/
buffered_reader.rs

//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
//! the underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::name::QName;
11use crate::parser::Parser;
12use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
13use crate::utils::is_whitespace;
14
// Generates the method bodies shared by the buffered `XmlSource`
// implementations.
//
// Invoked without arguments, the generated methods are synchronous and call
// the `BufRead` methods directly on `self`. The optional argument list
// `$lf, $reader, $async, $await` changes the expansion as follows:
//
// - `$lf`     is added as a lifetime parameter to the methods that borrow `buf`;
// - `$reader` is the field of `self` through which the reader is accessed
//             (`self.$reader.fill_buf()` instead of `self.fill_buf()`);
// - `$async`  is placed in front of every generated `fn`;
// - `$await`  is appended (as `.$await`) to every `fill_buf()` and to the
//             internal `peek_one()` call.
//
// The expansion refers to a lifetime `'b` that must be declared by the
// surrounding `impl` (or supplied through `$lf`).
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Consumes a UTF-8 byte order mark if the currently buffered data
        /// starts with one.
        ///
        /// Only the bytes returned by a single successful `fill_buf()` call are
        /// inspected. Reads that fail with [`io::ErrorKind::Interrupted`] are
        /// retried; any other I/O error is returned.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        if n.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        Ok(())
                    },
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        /// Passes the currently buffered data to
        /// `crate::encoding::detect_encoding`. If it reports an encoding, the
        /// accompanying BOM length is consumed from the reader and the encoding
        /// is returned; otherwise nothing is consumed and `None` is returned.
        ///
        /// Reads that fail with [`io::ErrorKind::Interrupted`] are retried; any
        /// other I/O error is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => if let Some((enc, bom_len)) = crate::encoding::detect_encoding(n) {
                        self $(.$reader)? .consume(bom_len);
                        Ok(Some(enc))
                    } else {
                        Ok(None)
                    },
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        /// Reads bytes up to the next `<` (or to the end of input), appending
        /// them to `buf`.
        ///
        /// - If the very first available byte is `<`, only that byte is
        ///   consumed, `position` is advanced by 1 and
        ///   `ReadTextResult::Markup(buf)` is returned.
        /// - If `<` is found later, the bytes before it are appended to `buf`,
        ///   the `<` itself is consumed but not stored, and
        ///   `ReadTextResult::UpToMarkup` is returned.
        /// - If the input ends first, `ReadTextResult::UpToEof` is returned.
        ///
        /// The returned slices cover only what this call appended to `buf`.
        /// `position` is advanced by the number of bytes consumed, including
        /// when an I/O error is returned via `ReadTextResult::Err`.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            // Number of bytes consumed from the reader by this call
            let mut read = 0;
            // Everything before `start` was in `buf` before this call
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return ReadTextResult::Err(e);
                    }
                };

                match memchr::memchr(b'<', available) {
                    // Special handling is needed only on the first iteration.
                    // On next iterations we already read something and should emit Text event
                    Some(0) if read == 0 => {
                        self $(.$reader)? .consume(1);
                        *position += 1;
                        return ReadTextResult::Markup(buf);
                    }
                    Some(i) => {
                        buf.extend_from_slice(&available[..i]);

                        // +1 for `<` which is consumed but not stored
                        let used = i + 1;
                        self $(.$reader)? .consume(used);
                        read += used as u64;

                        *position += read;
                        return ReadTextResult::UpToMarkup(&buf[start..]);
                    }
                    None => {
                        buf.extend_from_slice(available);

                        let used = available.len();
                        self $(.$reader)? .consume(used);
                        read += used as u64;
                    }
                }
            }

            *position += read;
            ReadTextResult::UpToEof(&buf[start..])
        }

        /// Feeds buffered chunks to `parser` until it reports the index of the
        /// terminating byte, appending everything before that index to `buf`.
        ///
        /// The terminating byte is consumed but not stored. Returns the slice
        /// of `buf` appended by this call. If the input ends before the parser
        /// finds a terminator, `Error::Syntax(P::eof_error())` is returned.
        /// `position` is advanced by the number of bytes consumed in every
        /// case, including errors.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            // Number of bytes consumed from the reader by this call
            let mut read = 0;
            // Everything before `start` was in `buf` before this call
            let start = buf.len();
            loop {
                let available = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) if n.is_empty() => break,
                    Ok(n) => n,
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                };

                if let Some(i) = parser.feed(available) {
                    buf.extend_from_slice(&available[..i]);

                    // +1 for `>` which we do not include
                    self $(.$reader)? .consume(i + 1);
                    read += i as u64 + 1;

                    *position += read;
                    return Ok(&buf[start..]);
                }

                // The `>` symbol not yet found, continue reading
                buf.extend_from_slice(available);

                let used = available.len();
                self $(.$reader)? .consume(used);
                read += used as u64;
            }

            *position += read;
            Err(Error::Syntax(P::eof_error()))
        }

        /// Reads a construct that starts with `!`, appending its content
        /// (starting with the `!`) to `buf`.
        ///
        /// The caller must have peeked the `!` so that it is the next buffered
        /// byte. The kind of construct is chosen by `BangType::new` from the
        /// byte following the `!`, and `BangType::parse` decides where the
        /// construct ends. On success the detected `BangType` and the slice of
        /// `buf` appended by this call are returned.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            // Peeked one bang ('!') before being called, so it's guaranteed to
            // start with it.
            let start = buf.len();
            // The `!` consumed below is already counted
            let mut read = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of input before the construct was terminated: leave
                    // the loop and report the error below.
                    // NOTE(review): an earlier comment here said the position is
                    // not updated in this case, but the code after the loop does
                    // add `read` to `position` -- confirm which is intended.
                    Ok(n) if n.is_empty() => break,
                    Ok(available) => {
                        // We only parse from start because we don't want to consider
                        // whatever is in the buffer before the bang element
                        if let Some((consumed, used)) = bang_type.parse(&buf[start..], available) {
                            buf.extend_from_slice(consumed);

                            self $(.$reader)? .consume(used);
                            read += used as u64;

                            *position += read;
                            return Ok((bang_type, &buf[start..]));
                        } else {
                            buf.extend_from_slice(available);

                            let used = available.len();
                            self $(.$reader)? .consume(used);
                            read += used as u64;
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => {
                        *position += read;
                        return Err(Error::Io(e.into()));
                    }
                }
            }

            *position += read;
            Err(bang_type.to_err().into())
        }

        /// Consumes bytes for which `is_whitespace` returns `true` until a
        /// different byte or the end of input is reached, advancing `position`
        /// by the number of bytes consumed.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => {
                        // Length of the leading whitespace run in this chunk
                        let count = n.iter().position(|b| !is_whitespace(*b)).unwrap_or(n.len());
                        if count > 0 {
                            self $(.$reader)? .consume(count);
                            *position += count as u64;
                            // The run may continue in the next chunk
                            continue;
                        } else {
                            Ok(())
                        }
                    }
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }

        /// Returns the next buffered byte without consuming it, or `None` if
        /// `fill_buf()` returned an empty slice.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            loop {
                break match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(n) => Ok(n.first().copied()),
                    Err(ref e) if e.kind() == io::ErrorKind::Interrupted => continue,
                    Err(e) => Err(e),
                };
            }
        }
    };
}
230
// Make the macro visible to the parent module for use in async implementations.
// Newer rustc versions report
// > warning: the item `impl_buffered_source` is imported redundantly
// so re-export it only when the async feature is enabled.
#[cfg(feature = "async-tokio")]
pub(super) use impl_buffered_source;
237
238/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
239/// `Vec<u8>` as buffer that will be borrowed by events.
240impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
241    impl_buffered_source!();
242}
243
244////////////////////////////////////////////////////////////////////////////////////////////////////
245
246/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
247impl<R: BufRead> Reader<R> {
248    /// Reads the next `Event`.
249    ///
250    /// This is the main entry point for reading XML `Event`s.
251    ///
252    /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
253    /// internally).
254    ///
255    /// Having the possibility to control the internal buffers gives you some additional benefits
256    /// such as:
257    ///
258    /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
259    ///   you can call `buf.clear()` once you are done with processing the event (typically at the
260    ///   end of your loop).
261    /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
262    ///
263    /// # Examples
264    ///
265    /// ```
266    /// # use pretty_assertions::assert_eq;
267    /// use quick_xml::events::Event;
268    /// use quick_xml::reader::Reader;
269    ///
270    /// let xml = r#"<tag1 att1 = "test">
271    ///                 <tag2><!--Test comment-->Test</tag2>
272    ///                 <tag2>Test 2</tag2>
273    ///              </tag1>"#;
274    /// let mut reader = Reader::from_str(xml);
275    /// reader.config_mut().trim_text(true);
276    /// let mut count = 0;
277    /// let mut buf = Vec::new();
278    /// let mut txt = Vec::new();
279    /// loop {
280    ///     match reader.read_event_into(&mut buf) {
281    ///         Ok(Event::Start(_)) => count += 1,
282    ///         Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
283    ///         Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
284    ///         Ok(Event::Eof) => break,
285    ///         _ => (),
286    ///     }
287    ///     buf.clear();
288    /// }
289    /// assert_eq!(count, 3);
290    /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
291    /// ```
292    #[inline]
293    pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
294        self.read_event_impl(buf)
295    }
296
297    /// Reads until end element is found using provided buffer as intermediate
298    /// storage for events content. This function is supposed to be called after
299    /// you already read a [`Start`] event.
300    ///
301    /// Returns a span that cover content between `>` of an opening tag and `<` of
302    /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
303    /// this method was called after reading expanded [`Start`] event.
304    ///
305    /// Manages nested cases where parent and child elements have the _literally_
306    /// same name.
307    ///
308    /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
309    /// will be returned. In particularly, that error will be returned if you call
310    /// this method without consuming the corresponding [`Start`] event first.
311    ///
312    /// If your reader created from a string slice or byte array slice, it is
313    /// better to use [`read_to_end()`] method, because it will not copy bytes
314    /// into intermediate buffer.
315    ///
316    /// The provided `buf` buffer will be filled only by one event content at time.
317    /// Before reading of each event the buffer will be cleared. If you know an
318    /// appropriate size of each event, you can preallocate the buffer to reduce
319    /// number of reallocations.
320    ///
321    /// The `end` parameter should contain name of the end element _in the reader
322    /// encoding_. It is good practice to always get that parameter using
323    /// [`BytesStart::to_end()`] method.
324    ///
325    /// The correctness of the skipped events does not checked, if you disabled
326    /// the [`check_end_names`] option.
327    ///
328    /// # Namespaces
329    ///
330    /// While the `Reader` does not support namespace resolution, namespaces
331    /// does not change the algorithm for comparing names. Although the names
332    /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
333    /// same namespace, are semantically equivalent, `</b:name>` cannot close
334    /// `<a:name>`, because according to [the specification]
335    ///
336    /// > The end of every element that begins with a **start-tag** MUST be marked
337    /// > by an **end-tag** containing a name that echoes the element's type as
338    /// > given in the **start-tag**
339    ///
340    /// # Examples
341    ///
342    /// This example shows, how you can skip XML content after you read the
343    /// start event.
344    ///
345    /// ```
346    /// # use pretty_assertions::assert_eq;
347    /// use quick_xml::events::{BytesStart, Event};
348    /// use quick_xml::reader::Reader;
349    ///
350    /// let mut reader = Reader::from_str(r#"
351    ///     <outer>
352    ///         <inner>
353    ///             <inner></inner>
354    ///             <inner/>
355    ///             <outer></outer>
356    ///             <outer/>
357    ///         </inner>
358    ///     </outer>
359    /// "#);
360    /// reader.config_mut().trim_text(true);
361    /// let mut buf = Vec::new();
362    ///
363    /// let start = BytesStart::new("outer");
364    /// let end   = start.to_end().into_owned();
365    ///
366    /// // First, we read a start event...
367    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
368    ///
369    /// // ...then, we could skip all events to the corresponding end event.
370    /// // This call will correctly handle nested <outer> elements.
371    /// // Note, however, that this method does not handle namespaces.
372    /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
373    ///
374    /// // At the end we should get an Eof event, because we ate the whole XML
375    /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
376    /// ```
377    ///
378    /// [`Start`]: Event::Start
379    /// [`End`]: Event::End
380    /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
381    /// [`read_to_end()`]: Self::read_to_end
382    /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
383    /// [`check_end_names`]: crate::reader::Config::check_end_names
384    /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
385    pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
386        Ok(read_to_end!(self, end, buf, read_event_impl, {
387            buf.clear();
388        }))
389    }
390}
391
392impl Reader<BufReader<File>> {
393    /// Creates an XML reader from a file path.
394    pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
395        let file = File::open(path)?;
396        let reader = BufReader::new(file);
397        Ok(Self::from_reader(reader))
398    }
399}
400
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: just passes the byte array from the test
    /// through unchanged.
    fn identity<T>(input: T) -> T {
        input
    }

    // Instantiate the shared reader test suite for the buffered
    // implementation, using a fresh `Vec<u8>` as the event buffer.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );
}