quick_xml/reader/buffered_reader.rs
1//! This is an implementation of [`Reader`] for reading from a [`BufRead`] as
2//! underlying byte stream.
3
4use std::fs::File;
5use std::io::{self, BufRead, BufReader};
6use std::path::Path;
7
8use crate::errors::{Error, Result};
9use crate::events::Event;
10use crate::name::QName;
11use crate::parser::Parser;
12use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
13use crate::utils::is_whitespace;
14
/// Generates the bodies of the buffered source methods on top of a reader
/// that offers `fill_buf()` / `consume()`.
///
/// Invoked without arguments it emits plain synchronous methods that call
/// `self.fill_buf()` directly. With the optional argument group it emits:
///
/// - `$lf`: a lifetime that is declared as a generic parameter of the
///   methods that borrow from `buf`;
/// - `$reader`: a field of `self` on which `fill_buf()` / `consume()` are called;
/// - `$async`: a token placed before every `fn` keyword;
/// - `$await`: a token applied (as `.$await`) to every `fill_buf()` call and
///   to the internal `peek_one()` call.
///
/// The method signatures refer to the lifetime `'b` literally, so it has to
/// be in scope at the expansion site or be introduced through `$lf`.
macro_rules! impl_buffered_source {
    ($($lf:lifetime, $reader:tt, $async:ident, $await:ident)?) => {
        /// Drops a UTF-8 byte order mark from the front of the stream if the
        /// currently buffered bytes start with one.
        ///
        /// Interrupted reads are retried, any other I/O error is returned.
        #[cfg(not(feature = "encoding"))]
        #[inline]
        $($async)? fn remove_utf8_bom(&mut self) -> io::Result<()> {
            use crate::encoding::UTF8_BOM;

            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => {
                        if bytes.starts_with(UTF8_BOM) {
                            self $(.$reader)? .consume(UTF8_BOM.len());
                        }
                        return Ok(());
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }

        /// Asks `crate::encoding::detect_encoding` to recognise an encoding
        /// from the currently buffered bytes.
        ///
        /// When an encoding is recognised, the reported number of BOM bytes is
        /// consumed and the encoding is returned; otherwise nothing is
        /// consumed and `None` is returned. Interrupted reads are retried,
        /// any other I/O error is returned.
        #[cfg(feature = "encoding")]
        #[inline]
        $($async)? fn detect_encoding(&mut self) -> io::Result<Option<&'static encoding_rs::Encoding>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => {
                        return Ok(match crate::encoding::detect_encoding(bytes) {
                            Some((encoding, bom_len)) => {
                                self $(.$reader)? .consume(bom_len);
                                Some(encoding)
                            }
                            None => None,
                        });
                    }
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }

        /// Copies bytes from the reader into `buf` until a `<` byte or the
        /// end of the stream is reached.
        ///
        /// - If the very first available byte is `<`, only that byte is
        ///   consumed and `ReadTextResult::Markup` hands `buf` back.
        /// - If a `<` is found later, the bytes before it are appended to
        ///   `buf`, the `<` itself is consumed but not stored, and
        ///   `ReadTextResult::UpToMarkup` carries the appended bytes.
        /// - If the stream ends first, `ReadTextResult::UpToEof` carries
        ///   everything that was appended.
        /// - An I/O error other than `Interrupted` yields `ReadTextResult::Err`.
        ///
        /// In every case `position` is advanced by the number of bytes
        /// consumed from the reader.
        #[inline]
        $($async)? fn read_text $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> ReadTextResult<'b, &'b mut Vec<u8>> {
            // Only the bytes appended by this call are handed back to the caller
            let offset = buf.len();
            let mut consumed: u64 = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return ReadTextResult::Err(err);
                    }
                };

                let markup_start = memchr::memchr(b'<', chunk);

                // The stream starts with markup right away. This can only be
                // reported while nothing was read yet: once some text has been
                // collected it has to be returned first.
                if markup_start == Some(0) && consumed == 0 {
                    self $(.$reader)? .consume(1);
                    *position += 1;
                    return ReadTextResult::Markup(buf);
                }

                if let Some(idx) = markup_start {
                    buf.extend_from_slice(&chunk[..idx]);

                    // The `<` is consumed as well, but is not stored in `buf`
                    let used = idx + 1;
                    self $(.$reader)? .consume(used);
                    consumed += used as u64;

                    *position += consumed;
                    return ReadTextResult::UpToMarkup(&buf[offset..]);
                }

                // No `<` in this chunk: take all of it and ask for more
                buf.extend_from_slice(chunk);

                let used = chunk.len();
                self $(.$reader)? .consume(used);
                consumed += used as u64;
            }

            *position += consumed;
            ReadTextResult::UpToEof(&buf[offset..])
        }

        /// Feeds successive chunks of input to `parser` until it reports the
        /// index at which the construct ends.
        ///
        /// The bytes before that index are appended to `buf` and returned;
        /// the byte at the index is consumed from the reader but not stored.
        /// If the stream ends before the parser reports an index,
        /// `Error::Syntax(P::eof_error())` is returned. An I/O error other
        /// than `Interrupted` is returned as `Error::Io`.
        ///
        /// In every case `position` is advanced by the number of bytes
        /// consumed from the reader.
        #[inline]
        $($async)? fn read_with<$($lf,)? P: Parser>(
            &mut self,
            mut parser: P,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<&'b [u8]> {
            let offset = buf.len();
            let mut consumed: u64 = 0;
            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return Err(Error::Io(err.into()));
                    }
                };

                match parser.feed(chunk) {
                    Some(idx) => {
                        buf.extend_from_slice(&chunk[..idx]);

                        // One extra byte for the terminating `>`, which is
                        // consumed but not included in the result
                        let used = idx + 1;
                        self $(.$reader)? .consume(used);
                        consumed += used as u64;

                        *position += consumed;
                        return Ok(&buf[offset..]);
                    }
                    None => {
                        // The end was not found yet: take the whole chunk
                        // and continue reading
                        buf.extend_from_slice(chunk);

                        let used = chunk.len();
                        self $(.$reader)? .consume(used);
                        consumed += used as u64;
                    }
                }
            }

            *position += consumed;
            Err(Error::Syntax(P::eof_error()))
        }

        /// Reads a construct that starts with `!` into `buf`.
        ///
        /// The caller has already peeked the `!`; it is pushed to `buf` and
        /// consumed here. The kind of the construct is determined by
        /// `BangType::new` from the next peeked byte, and `BangType::parse`
        /// then decides where the construct ends.
        ///
        /// Returns the detected `BangType` together with the bytes appended to
        /// `buf`. If the stream ends first, the error produced by
        /// `BangType::to_err` is returned. An I/O error other than
        /// `Interrupted` is returned as `Error::Io`.
        ///
        /// Once the kind has been determined, `position` is advanced by the
        /// number of bytes consumed from the reader on every exit path.
        #[inline]
        $($async)? fn read_bang_element $(<$lf>)? (
            &mut self,
            buf: &'b mut Vec<u8>,
            position: &mut u64,
        ) -> Result<(BangType, &'b [u8])> {
            let offset = buf.len();
            // The leading `!` is already accounted for
            let mut consumed: u64 = 1;
            buf.push(b'!');
            self $(.$reader)? .consume(1);

            let mut bang_type = BangType::new(self.peek_one() $(.$await)? ?)?;

            loop {
                let chunk = match self $(.$reader)? .fill_buf() $(.$await)? {
                    // End of stream before the construct was closed:
                    // the error is reported after the loop
                    Ok(bytes) if bytes.is_empty() => break,
                    Ok(bytes) => bytes,
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => {
                        *position += consumed;
                        return Err(Error::Io(err.into()));
                    }
                };

                // Only the part of `buf` written by this call is passed to
                // the parser; whatever was in the buffer before the bang
                // element must not be considered
                match bang_type.parse(&buf[offset..], chunk) {
                    Some((content, used)) => {
                        buf.extend_from_slice(content);

                        self $(.$reader)? .consume(used);
                        consumed += used as u64;

                        *position += consumed;
                        return Ok((bang_type, &buf[offset..]));
                    }
                    None => {
                        buf.extend_from_slice(chunk);

                        let used = chunk.len();
                        self $(.$reader)? .consume(used);
                        consumed += used as u64;
                    }
                }
            }

            *position += consumed;
            Err(bang_type.to_err().into())
        }

        /// Consumes leading bytes for which `is_whitespace` returns `true`,
        /// advancing `position` by the number of bytes skipped.
        ///
        /// Stops at the first other byte or at the end of the stream.
        /// Interrupted reads are retried, any other I/O error is returned.
        #[inline]
        $($async)? fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
            loop {
                let skipped = match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => bytes.iter().take_while(|&&b| is_whitespace(b)).count(),
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                };

                // Either the stream is exhausted or the next byte is not a whitespace
                if skipped == 0 {
                    return Ok(());
                }

                self $(.$reader)? .consume(skipped);
                *position += skipped as u64;
            }
        }

        /// Returns the next available byte without consuming it, or `None`
        /// if no more bytes are available.
        ///
        /// Interrupted reads are retried, any other I/O error is returned.
        #[inline]
        $($async)? fn peek_one(&mut self) -> io::Result<Option<u8>> {
            loop {
                match self $(.$reader)? .fill_buf() $(.$await)? {
                    Ok(bytes) => return Ok(bytes.first().copied()),
                    Err(ref err) if err.kind() == io::ErrorKind::Interrupted => continue,
                    Err(err) => return Err(err),
                }
            }
        }
    };
}
230
// Make the macro visible to the async implementations.
// Newer versions of rustc report
// > warning: the item `impl_buffered_source` is imported redundantly
// so it is re-exported only when the async feature is enabled.
235#[cfg(feature = "async-tokio")]
236pub(super) use impl_buffered_source;
237
238/// Implementation of `XmlSource` for any `BufRead` reader using a user-given
239/// `Vec<u8>` as buffer that will be borrowed by events.
240impl<'b, R: BufRead> XmlSource<'b, &'b mut Vec<u8>> for R {
241 impl_buffered_source!();
242}
243
244////////////////////////////////////////////////////////////////////////////////////////////////////
245
246/// This is an implementation for reading from a [`BufRead`] as underlying byte stream.
247impl<R: BufRead> Reader<R> {
248 /// Reads the next `Event`.
249 ///
250 /// This is the main entry point for reading XML `Event`s.
251 ///
252 /// `Event`s borrow `buf` and can be converted to own their data if needed (uses `Cow`
253 /// internally).
254 ///
255 /// Having the possibility to control the internal buffers gives you some additional benefits
256 /// such as:
257 ///
258 /// - Reduce the number of allocations by reusing the same buffer. For constrained systems,
259 /// you can call `buf.clear()` once you are done with processing the event (typically at the
260 /// end of your loop).
261 /// - Reserve the buffer length if you know the file size (using `Vec::with_capacity`).
262 ///
263 /// # Examples
264 ///
265 /// ```
266 /// # use pretty_assertions::assert_eq;
267 /// use quick_xml::events::Event;
268 /// use quick_xml::reader::Reader;
269 ///
270 /// let xml = r#"<tag1 att1 = "test">
271 /// <tag2><!--Test comment-->Test</tag2>
272 /// <tag2>Test 2</tag2>
273 /// </tag1>"#;
274 /// let mut reader = Reader::from_str(xml);
275 /// reader.config_mut().trim_text(true);
276 /// let mut count = 0;
277 /// let mut buf = Vec::new();
278 /// let mut txt = Vec::new();
279 /// loop {
280 /// match reader.read_event_into(&mut buf) {
281 /// Ok(Event::Start(_)) => count += 1,
282 /// Ok(Event::Text(e)) => txt.push(e.unescape().unwrap().into_owned()),
283 /// Err(e) => panic!("Error at position {}: {:?}", reader.error_position(), e),
284 /// Ok(Event::Eof) => break,
285 /// _ => (),
286 /// }
287 /// buf.clear();
288 /// }
289 /// assert_eq!(count, 3);
290 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
291 /// ```
292 #[inline]
293 pub fn read_event_into<'b>(&mut self, buf: &'b mut Vec<u8>) -> Result<Event<'b>> {
294 self.read_event_impl(buf)
295 }
296
297 /// Reads until end element is found using provided buffer as intermediate
298 /// storage for events content. This function is supposed to be called after
299 /// you already read a [`Start`] event.
300 ///
301 /// Returns a span that cover content between `>` of an opening tag and `<` of
302 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
303 /// this method was called after reading expanded [`Start`] event.
304 ///
305 /// Manages nested cases where parent and child elements have the _literally_
306 /// same name.
307 ///
308 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
309 /// will be returned. In particularly, that error will be returned if you call
310 /// this method without consuming the corresponding [`Start`] event first.
311 ///
312 /// If your reader created from a string slice or byte array slice, it is
313 /// better to use [`read_to_end()`] method, because it will not copy bytes
314 /// into intermediate buffer.
315 ///
316 /// The provided `buf` buffer will be filled only by one event content at time.
317 /// Before reading of each event the buffer will be cleared. If you know an
318 /// appropriate size of each event, you can preallocate the buffer to reduce
319 /// number of reallocations.
320 ///
321 /// The `end` parameter should contain name of the end element _in the reader
322 /// encoding_. It is good practice to always get that parameter using
323 /// [`BytesStart::to_end()`] method.
324 ///
325 /// The correctness of the skipped events does not checked, if you disabled
326 /// the [`check_end_names`] option.
327 ///
328 /// # Namespaces
329 ///
330 /// While the `Reader` does not support namespace resolution, namespaces
331 /// does not change the algorithm for comparing names. Although the names
332 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
333 /// same namespace, are semantically equivalent, `</b:name>` cannot close
334 /// `<a:name>`, because according to [the specification]
335 ///
336 /// > The end of every element that begins with a **start-tag** MUST be marked
337 /// > by an **end-tag** containing a name that echoes the element's type as
338 /// > given in the **start-tag**
339 ///
340 /// # Examples
341 ///
342 /// This example shows, how you can skip XML content after you read the
343 /// start event.
344 ///
345 /// ```
346 /// # use pretty_assertions::assert_eq;
347 /// use quick_xml::events::{BytesStart, Event};
348 /// use quick_xml::reader::Reader;
349 ///
350 /// let mut reader = Reader::from_str(r#"
351 /// <outer>
352 /// <inner>
353 /// <inner></inner>
354 /// <inner/>
355 /// <outer></outer>
356 /// <outer/>
357 /// </inner>
358 /// </outer>
359 /// "#);
360 /// reader.config_mut().trim_text(true);
361 /// let mut buf = Vec::new();
362 ///
363 /// let start = BytesStart::new("outer");
364 /// let end = start.to_end().into_owned();
365 ///
366 /// // First, we read a start event...
367 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Start(start));
368 ///
369 /// // ...then, we could skip all events to the corresponding end event.
370 /// // This call will correctly handle nested <outer> elements.
371 /// // Note, however, that this method does not handle namespaces.
372 /// reader.read_to_end_into(end.name(), &mut buf).unwrap();
373 ///
374 /// // At the end we should get an Eof event, because we ate the whole XML
375 /// assert_eq!(reader.read_event_into(&mut buf).unwrap(), Event::Eof);
376 /// ```
377 ///
378 /// [`Start`]: Event::Start
379 /// [`End`]: Event::End
380 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
381 /// [`read_to_end()`]: Self::read_to_end
382 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
383 /// [`check_end_names`]: crate::reader::Config::check_end_names
384 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
385 pub fn read_to_end_into(&mut self, end: QName, buf: &mut Vec<u8>) -> Result<Span> {
386 Ok(read_to_end!(self, end, buf, read_event_impl, {
387 buf.clear();
388 }))
389 }
390}
391
392impl Reader<BufReader<File>> {
393 /// Creates an XML reader from a file path.
394 pub fn from_file<P: AsRef<Path>>(path: P) -> Result<Self> {
395 let file = File::open(path)?;
396 let reader = BufReader::new(file);
397 Ok(Self::from_reader(reader))
398 }
399}
400
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// The default buffer constructor: it just passes the byte array from
    /// the test through unchanged.
    fn identity<T>(value: T) -> T {
        value
    }

    // Instantiate the shared reader tests for the buffered source, with a
    // fresh `Vec` as the event buffer
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        &mut Vec::new()
    );
}