quick_xml/reader/slice_reader.rs
//! This is an implementation of [`Reader`] for reading from a `&[u8]` as the
//! underlying byte stream. This implementation does not need an intermediate
//! buffer, because the byte slice itself can be borrowed from.
4
5use std::borrow::Cow;
6use std::io;
7
8#[cfg(feature = "encoding")]
9use crate::reader::EncodingRef;
10#[cfg(feature = "encoding")]
11use encoding_rs::{Encoding, UTF_8};
12
13use crate::errors::{Error, Result};
14use crate::events::Event;
15use crate::name::QName;
16use crate::parser::Parser;
17use crate::reader::{BangType, ReadTextResult, Reader, Span, XmlSource};
18use crate::utils::is_whitespace;
19
20/// This is an implementation for reading from a `&[u8]` as underlying byte stream.
21/// This implementation supports not using an intermediate buffer as the byte slice
22/// itself can be used to borrow from.
23impl<'a> Reader<&'a [u8]> {
24 /// Creates an XML reader from a string slice.
25 #[allow(clippy::should_implement_trait)]
26 pub fn from_str(s: &'a str) -> Self {
27 // Rust strings are guaranteed to be UTF-8, so lock the encoding
28 #[cfg(feature = "encoding")]
29 {
30 let mut reader = Self::from_reader(s.as_bytes());
31 reader.state.encoding = EncodingRef::Explicit(UTF_8);
32 reader
33 }
34
35 #[cfg(not(feature = "encoding"))]
36 Self::from_reader(s.as_bytes())
37 }
38
39 /// Read an event that borrows from the input rather than a buffer.
40 ///
41 /// There is no asynchronous `read_event_async()` version of this function,
42 /// because it is not necessary -- the contents are already in memory and no IO
43 /// is needed, therefore there is no potential for blocking.
44 ///
45 /// # Examples
46 ///
47 /// ```
48 /// # use pretty_assertions::assert_eq;
49 /// use quick_xml::events::Event;
50 /// use quick_xml::reader::Reader;
51 ///
52 /// let mut reader = Reader::from_str(r#"
53 /// <tag1 att1 = "test">
54 /// <tag2><!--Test comment-->Test</tag2>
55 /// <tag2>Test 2</tag2>
56 /// </tag1>
57 /// "#);
58 /// reader.config_mut().trim_text(true);
59 ///
60 /// let mut count = 0;
61 /// let mut txt = Vec::new();
62 /// loop {
63 /// match reader.read_event().unwrap() {
64 /// Event::Start(e) => count += 1,
65 /// Event::Text(e) => txt.push(e.unescape().unwrap().into_owned()),
66 /// Event::Eof => break,
67 /// _ => (),
68 /// }
69 /// }
70 /// assert_eq!(count, 3);
71 /// assert_eq!(txt, vec!["Test".to_string(), "Test 2".to_string()]);
72 /// ```
73 #[inline]
74 pub fn read_event(&mut self) -> Result<Event<'a>> {
75 self.read_event_impl(())
76 }
77
78 /// Reads until end element is found. This function is supposed to be called
79 /// after you already read a [`Start`] event.
80 ///
81 /// Returns a span that cover content between `>` of an opening tag and `<` of
82 /// a closing tag or an empty slice, if [`expand_empty_elements`] is set and
83 /// this method was called after reading expanded [`Start`] event.
84 ///
85 /// Manages nested cases where parent and child elements have the _literally_
86 /// same name.
87 ///
88 /// If a corresponding [`End`] event is not found, an error of type [`Error::IllFormed`]
89 /// will be returned. In particularly, that error will be returned if you call
90 /// this method without consuming the corresponding [`Start`] event first.
91 ///
92 /// The `end` parameter should contain name of the end element _in the reader
93 /// encoding_. It is good practice to always get that parameter using
94 /// [`BytesStart::to_end()`] method.
95 ///
96 /// The correctness of the skipped events does not checked, if you disabled
97 /// the [`check_end_names`] option.
98 ///
99 /// There is no asynchronous `read_to_end_async()` version of this function,
100 /// because it is not necessary -- the contents are already in memory and no IO
101 /// is needed, therefore there is no potential for blocking.
102 ///
103 /// # Namespaces
104 ///
105 /// While the `Reader` does not support namespace resolution, namespaces
106 /// does not change the algorithm for comparing names. Although the names
107 /// `a:name` and `b:name` where both prefixes `a` and `b` resolves to the
108 /// same namespace, are semantically equivalent, `</b:name>` cannot close
109 /// `<a:name>`, because according to [the specification]
110 ///
111 /// > The end of every element that begins with a **start-tag** MUST be marked
112 /// > by an **end-tag** containing a name that echoes the element's type as
113 /// > given in the **start-tag**
114 ///
115 /// # Examples
116 ///
117 /// This example shows, how you can skip XML content after you read the
118 /// start event.
119 ///
120 /// ```
121 /// # use pretty_assertions::assert_eq;
122 /// use quick_xml::events::{BytesStart, Event};
123 /// use quick_xml::reader::Reader;
124 ///
125 /// let mut reader = Reader::from_str(r#"
126 /// <outer>
127 /// <inner>
128 /// <inner></inner>
129 /// <inner/>
130 /// <outer></outer>
131 /// <outer/>
132 /// </inner>
133 /// </outer>
134 /// "#);
135 /// reader.config_mut().trim_text(true);
136 ///
137 /// let start = BytesStart::new("outer");
138 /// let end = start.to_end().into_owned();
139 ///
140 /// // First, we read a start event...
141 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
142 ///
143 /// // ...then, we could skip all events to the corresponding end event.
144 /// // This call will correctly handle nested <outer> elements.
145 /// // Note, however, that this method does not handle namespaces.
146 /// reader.read_to_end(end.name()).unwrap();
147 ///
148 /// // At the end we should get an Eof event, because we ate the whole XML
149 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
150 /// ```
151 ///
152 /// [`Start`]: Event::Start
153 /// [`End`]: Event::End
154 /// [`BytesStart::to_end()`]: crate::events::BytesStart::to_end
155 /// [`expand_empty_elements`]: crate::reader::Config::expand_empty_elements
156 /// [`check_end_names`]: crate::reader::Config::check_end_names
157 /// [the specification]: https://www.w3.org/TR/xml11/#dt-etag
158 pub fn read_to_end(&mut self, end: QName) -> Result<Span> {
159 Ok(read_to_end!(self, end, (), read_event_impl, {}))
160 }
161
162 /// Reads content between start and end tags, including any markup. This
163 /// function is supposed to be called after you already read a [`Start`] event.
164 ///
165 /// Manages nested cases where parent and child elements have the _literally_
166 /// same name.
167 ///
168 /// This method does not unescape read data, instead it returns content
169 /// "as is" of the XML document. This is because it has no idea what text
170 /// it reads, and if, for example, it contains CDATA section, attempt to
171 /// unescape it content will spoil data.
172 ///
173 /// Any text will be decoded using the XML current [`decoder()`].
174 ///
175 /// Actually, this method perform the following code:
176 ///
177 /// ```ignore
178 /// let span = reader.read_to_end(end)?;
179 /// let text = reader.decoder().decode(&reader.inner_slice[span]);
180 /// ```
181 ///
182 /// # Examples
183 ///
184 /// This example shows, how you can read a HTML content from your XML document.
185 ///
186 /// ```
187 /// # use pretty_assertions::assert_eq;
188 /// # use std::borrow::Cow;
189 /// use quick_xml::events::{BytesStart, Event};
190 /// use quick_xml::reader::Reader;
191 ///
192 /// let mut reader = Reader::from_str("
193 /// <html>
194 /// <title>This is a HTML text</title>
195 /// <p>Usual XML rules does not apply inside it
196 /// <p>For example, elements not needed to be "closed"
197 /// </html>
198 /// ");
199 /// reader.config_mut().trim_text(true);
200 ///
201 /// let start = BytesStart::new("html");
202 /// let end = start.to_end().into_owned();
203 ///
204 /// // First, we read a start event...
205 /// assert_eq!(reader.read_event().unwrap(), Event::Start(start));
206 /// // ...and disable checking of end names because we expect HTML further...
207 /// reader.config_mut().check_end_names = false;
208 ///
209 /// // ...then, we could read text content until close tag.
210 /// // This call will correctly handle nested <html> elements.
211 /// let text = reader.read_text(end.name()).unwrap();
212 /// assert_eq!(text, Cow::Borrowed(r#"
213 /// <title>This is a HTML text</title>
214 /// <p>Usual XML rules does not apply inside it
215 /// <p>For example, elements not needed to be "closed"
216 /// "#));
217 /// assert!(matches!(text, Cow::Borrowed(_)));
218 ///
219 /// // Now we can enable checks again
220 /// reader.config_mut().check_end_names = true;
221 ///
222 /// // At the end we should get an Eof event, because we ate the whole XML
223 /// assert_eq!(reader.read_event().unwrap(), Event::Eof);
224 /// ```
225 ///
226 /// [`Start`]: Event::Start
227 /// [`decoder()`]: Self::decoder()
228 pub fn read_text(&mut self, end: QName) -> Result<Cow<'a, str>> {
229 // self.reader will be changed, so store original reference
230 let buffer = self.reader;
231 let span = self.read_to_end(end)?;
232
233 let len = span.end - span.start;
234 // SAFETY: `span` can only contain indexes up to usize::MAX because it
235 // was created from offsets from a single &[u8] slice
236 Ok(self.decoder().decode(&buffer[0..len as usize])?)
237 }
238}
239
240////////////////////////////////////////////////////////////////////////////////////////////////////
241
242/// Implementation of `XmlSource` for `&[u8]` reader using a `Self` as buffer
243/// that will be borrowed by events. This implementation provides a zero-copy deserialization
244impl<'a> XmlSource<'a, ()> for &'a [u8] {
245 #[cfg(not(feature = "encoding"))]
246 #[inline]
247 fn remove_utf8_bom(&mut self) -> io::Result<()> {
248 if self.starts_with(crate::encoding::UTF8_BOM) {
249 *self = &self[crate::encoding::UTF8_BOM.len()..];
250 }
251 Ok(())
252 }
253
254 #[cfg(feature = "encoding")]
255 #[inline]
256 fn detect_encoding(&mut self) -> io::Result<Option<&'static Encoding>> {
257 if let Some((enc, bom_len)) = crate::encoding::detect_encoding(self) {
258 *self = &self[bom_len..];
259 return Ok(Some(enc));
260 }
261 Ok(None)
262 }
263
264 #[inline]
265 fn read_text(&mut self, _buf: (), position: &mut u64) -> ReadTextResult<'a, ()> {
266 match memchr::memchr(b'<', self) {
267 Some(0) => {
268 *position += 1;
269 *self = &self[1..];
270 ReadTextResult::Markup(())
271 }
272 Some(i) => {
273 *position += i as u64 + 1;
274 let bytes = &self[..i];
275 *self = &self[i + 1..];
276 ReadTextResult::UpToMarkup(bytes)
277 }
278 None => {
279 *position += self.len() as u64;
280 let bytes = &self[..];
281 *self = &[];
282 ReadTextResult::UpToEof(bytes)
283 }
284 }
285 }
286
287 #[inline]
288 fn read_with<P>(&mut self, mut parser: P, _buf: (), position: &mut u64) -> Result<&'a [u8]>
289 where
290 P: Parser,
291 {
292 if let Some(i) = parser.feed(self) {
293 // +1 for `>` which we do not include
294 *position += i as u64 + 1;
295 let bytes = &self[..i];
296 *self = &self[i + 1..];
297 return Ok(bytes);
298 }
299
300 *position += self.len() as u64;
301 Err(Error::Syntax(P::eof_error()))
302 }
303
304 #[inline]
305 fn read_bang_element(&mut self, _buf: (), position: &mut u64) -> Result<(BangType, &'a [u8])> {
306 // Peeked one bang ('!') before being called, so it's guaranteed to
307 // start with it.
308 debug_assert_eq!(self[0], b'!');
309
310 let mut bang_type = BangType::new(self[1..].first().copied())?;
311
312 if let Some((bytes, i)) = bang_type.parse(&[], self) {
313 *position += i as u64;
314 *self = &self[i..];
315 return Ok((bang_type, bytes));
316 }
317
318 *position += self.len() as u64;
319 Err(bang_type.to_err().into())
320 }
321
322 #[inline]
323 fn skip_whitespace(&mut self, position: &mut u64) -> io::Result<()> {
324 let whitespaces = self
325 .iter()
326 .position(|b| !is_whitespace(*b))
327 .unwrap_or(self.len());
328 *position += whitespaces as u64;
329 *self = &self[whitespaces..];
330 Ok(())
331 }
332
333 #[inline]
334 fn peek_one(&mut self) -> io::Result<Option<u8>> {
335 Ok(self.first().copied())
336 }
337}
338
#[cfg(test)]
mod test {
    use crate::reader::test::check;
    use crate::reader::XmlSource;

    /// Default buffer constructor: simply hands the byte array from the test
    /// back unchanged.
    fn identity<T>(value: T) -> T {
        value
    }

    // Instantiates the crate's `check!` test-suite for the slice reader, with
    // `()` passed as the buffer argument.
    check!(
        #[test]
        read_event_impl,
        read_until_close,
        identity,
        ()
    );
}