xml/reader/
config.rs

1//! Contains parser configuration structure.
2use std::collections::HashMap;
3use std::io::Read;
4
5use crate::reader::EventReader;
6use crate::util::Encoding;
7
/// Default cap on the length of the string produced by expanding custom entities.
///
/// Together with the depth limit below, this defends against the "billion laughs" attack.
const DEFAULT_MAX_ENTITY_EXPANSION_LENGTH: usize = 1_000_000;
/// Default cap on how many times entities may expand into other entities.
const DEFAULT_MAX_ENTITY_EXPANSION_DEPTH: u8 = 10;
11
/// Parser configuration structure. **There are more config methods than public fields — see methods below**.
///
/// This structure contains various configuration options which affect
/// the behavior of the parser.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct ParserConfig {
    /// Whether whitespace in textual events should be removed. Default is false.
    ///
    /// When true, all standalone whitespace will be removed (this means no
    /// `Whitespace` events will be emitted), and leading and trailing whitespace
    /// from `Characters` events will be deleted. If a `Characters` event is empty
    /// after trimming, it will also be omitted from the output stream. This is
    /// possible, however, only if the `whitespace_to_characters` or
    /// `cdata_to_characters` options are set.
    ///
    /// This option does not affect CDATA events, unless the `cdata_to_characters`
    /// option is also set. In that case CDATA content will also be trimmed.
    pub trim_whitespace: bool,

    /// Whether whitespace should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content will be emitted
    /// instead of `Whitespace` events. If `trim_whitespace` is also true, these
    /// events will be trimmed to nothing and, consequently, not emitted.
    pub whitespace_to_characters: bool,

    /// Whether CDATA should be converted to characters.
    /// Default is false.
    ///
    /// If true, `Characters` events with the same content will be emitted
    /// instead of `CData` events. If `trim_whitespace` is also true, these events
    /// will be trimmed. If the corresponding CDATA contained nothing but whitespace,
    /// the event will be omitted from the stream.
    pub cdata_to_characters: bool,

    /// Whether comments should be omitted. Default is true.
    ///
    /// If true, `Comment` events will not be emitted at all.
    pub ignore_comments: bool,

    /// Whether sequential `Characters` events should be merged.
    /// Default is true.
    ///
    /// If true, multiple sequential `Characters` events will be merged into
    /// a single event, that is, their data will be concatenated.
    ///
    /// Multiple sequential `Characters` events are only possible if either
    /// `cdata_to_characters` or `ignore_comments` is set. Otherwise character
    /// events will always be separated by other events.
    pub coalesce_characters: bool,

    /// A map of extra entities recognized by the parser. Default is an empty map.
    ///
    /// By default the XML parser recognizes the entities defined in the XML spec. Sometimes,
    /// however, it is convenient to make the parser recognize additional entities which
    /// are also not available through the DTD definitions (especially given that at the moment
    /// DTD parsing is not supported).
    pub extra_entities: HashMap<String, String>,

    /// Whether the parser should ignore the end of stream. Default is false.
    ///
    /// By default the parser will either error out when it encounters a premature end of
    /// stream or complete normally if the end of stream was expected. If you want to continue
    /// reading from a stream whose input is supplied progressively, you can set this option to true.
    /// In this case the parser will allow you to invoke the `next()` method even if a supposed end
    /// of stream has happened.
    ///
    /// Note that support for this functionality is incomplete; for example, the parser will fail if
    /// the premature end of stream happens inside PCDATA. Therefore, use this option at your own risk.
    pub ignore_end_of_stream: bool,

    /// Whether non-unicode entity references get replaced with the replacement character.
    /// Default is false.
    ///
    /// When true, any decimal or hexadecimal character reference that cannot be converted from a
    /// u32 to a char using [std::char::from_u32](https://doc.rust-lang.org/std/char/fn.from_u32.html)
    /// will be converted into the unicode REPLACEMENT CHARACTER (U+FFFD).
    pub replace_unknown_entity_references: bool,

    /// Whether whitespace at the root level of the document is ignored. Default is true.
    ///
    /// By default any whitespace that is not enclosed within at least one level of elements will be
    /// ignored. Setting this value to false will cause root level whitespace events to be emitted.
    ///
    /// **There are more configuration options – see methods below**
    pub ignore_root_level_whitespace: bool,
}
99
100impl ParserConfig {
101    /// Returns a new config with default values.
102    ///
103    /// You can tweak default values using builder-like pattern:
104    ///
105    /// ```rust
106    /// use xml::reader::ParserConfig;
107    ///
108    /// let config = ParserConfig::new()
109    ///     .trim_whitespace(true)
110    ///     .ignore_comments(true)
111    ///     .coalesce_characters(false);
112    /// ```
113    #[must_use]
114    #[inline]
115    pub fn new() -> Self {
116        Self {
117            trim_whitespace: false,
118            whitespace_to_characters: false,
119            cdata_to_characters: false,
120            ignore_comments: true,
121            coalesce_characters: true,
122            extra_entities: HashMap::new(),
123            ignore_end_of_stream: false,
124            replace_unknown_entity_references: false,
125            ignore_root_level_whitespace: true,
126        }
127    }
128
129    /// Creates an XML reader with this configuration.
130    ///
131    /// This is a convenience method for configuring and creating a reader at the same time:
132    ///
133    /// ```rust
134    /// use xml::reader::ParserConfig;
135    ///
136    /// let mut source: &[u8] = b"...";
137    ///
138    /// let reader = ParserConfig::new()
139    ///     .trim_whitespace(true)
140    ///     .ignore_comments(true)
141    ///     .coalesce_characters(false)
142    ///     .create_reader(&mut source);
143    /// ```
144    ///
145    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
146    /// this configuration object.
147    #[inline]
148    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
149        EventReader::new_with_config(source, self)
150    }
151
152    /// Adds a new entity mapping and returns an updated config object.
153    ///
154    /// This is a convenience method for adding external entities mappings to the XML parser.
155    /// An example:
156    ///
157    /// ```rust
158    /// use xml::reader::ParserConfig;
159    ///
160    /// let mut source: &[u8] = b"...";
161    ///
162    /// let reader = ParserConfig::new()
163    ///     .add_entity("nbsp", " ")
164    ///     .add_entity("copy", "©")
165    ///     .add_entity("reg", "®")
166    ///     .create_reader(&mut source);
167    /// ```
168    #[must_use]
169    pub fn add_entity<S: Into<String>, T: Into<String>>(mut self, entity: S, value: T) -> Self {
170        self.extra_entities.insert(entity.into(), value.into());
171        self
172    }
173}
174
175impl Default for ParserConfig {
176    #[inline]
177    fn default() -> Self {
178        Self::new()
179    }
180}
181
// Setters for the boolean fields of `ParserConfig`, produced by the
// `gen_setters!` macro (not defined in this section). The doc examples on
// `ParserConfig::new` show them chained builder-style, e.g. `.trim_whitespace(true)`.
// NOTE(review): the `val` keyword presumably selects a setter that stores the
// argument directly in the field of the same name — confirm against the macro.
gen_setters! { ParserConfig,
    trim_whitespace: val bool,
    whitespace_to_characters: val bool,
    cdata_to_characters: val bool,
    ignore_comments: val bool,
    coalesce_characters: val bool,
    ignore_end_of_stream: val bool,
    replace_unknown_entity_references: val bool,
    ignore_root_level_whitespace: val bool
}
192
/// Backwards-compatible extension of `ParserConfig`, which will eventually be merged into the original `ParserConfig` struct.
#[derive(Clone, PartialEq, Eq, Debug)]
#[non_exhaustive]
pub struct ParserConfig2 {
    /// The wrapped base configuration.
    pub(crate) c: ParserConfig,

    /// Use this encoding as the default. Necessary for UTF-16 files without BOM.
    /// Default is `None`.
    pub override_encoding: Option<Encoding>,

    /// Allow `<?xml encoding="…">` to contain unsupported encoding names,
    /// and interpret them as Latin1 instead. This will mangle non-ASCII characters, but usually it won't fail parsing.
    /// Default is false.
    pub ignore_invalid_encoding_declarations: bool,

    /// Whether documents with multiple root elements are accepted, even though
    /// such documents are ill-formed XML. Default is true.
    pub allow_multiple_root_elements: bool,

    /// Abort if custom entities create a string longer than this.
    /// Default is 1,000,000.
    pub max_entity_expansion_length: usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!).
    /// Default is 10.
    pub max_entity_expansion_depth: u8,

    /// Maximum length of tag name or attribute name. Default is `1 << 18`.
    pub max_name_length: usize,

    /// Max number of attributes per element. Default is `1 << 16`.
    pub max_attributes: usize,

    /// Max number of bytes in each attribute. Default is `1 << 30`.
    pub max_attribute_length: usize,

    /// Maximum length of strings representing characters, comments, and processing instructions.
    /// Default is `1 << 30`.
    pub max_data_length: usize,
}
226
227impl Default for ParserConfig2 {
228    fn default() -> Self {
229        Self {
230            c: ParserConfig::default(),
231            override_encoding: None,
232            ignore_invalid_encoding_declarations: false,
233            allow_multiple_root_elements: true,
234            max_entity_expansion_length: DEFAULT_MAX_ENTITY_EXPANSION_LENGTH,
235            max_entity_expansion_depth: DEFAULT_MAX_ENTITY_EXPANSION_DEPTH,
236            max_attributes: 1 << 16,
237            max_attribute_length: 1 << 30,
238            max_data_length: 1 << 30,
239            max_name_length: 1 << 18,
240        }
241    }
242}
243
244impl ParserConfig2 {
245    /// Create extended configuration struct
246    #[inline]
247    #[must_use]
248    pub fn new() -> Self {
249        Self::default()
250    }
251
252    /// Read character encoding from `Content-Type` header.
253    /// Set this when parsing XML documents fetched over HTTP.
254    ///
255    /// `text/*` MIME types do *not* imply latin1. UTF-8 is always the default fallback.
256    #[must_use] pub fn content_type(mut self, mime_type: &str) -> Self {
257        let charset = mime_type.split_once(';')
258            .and_then(|(_, args)| args.split_once("charset"))
259            .and_then(|(_, args)| args.split_once('='));
260        if let Some((_, charset)) = charset {
261            let name = charset.trim().trim_matches('"');
262            if let Ok(enc) = name.parse() {
263                self.override_encoding = Some(enc);
264            }
265        }
266        self
267    }
268
269    /// Creates an XML reader with this configuration.
270    ///
271    /// This is a convenience method for configuring and creating a reader at the same time:
272    ///
273    /// ```rust
274    /// use xml::reader::ParserConfig;
275    ///
276    /// let mut source: &[u8] = b"...";
277    ///
278    /// let reader = ParserConfig::new()
279    ///     .trim_whitespace(true)
280    ///     .ignore_comments(true)
281    ///     .coalesce_characters(false)
282    ///     .create_reader(&mut source);
283    /// ```
284    ///
285    /// This method is exactly equivalent to calling `EventReader::new_with_config()` with
286    /// this configuration object.
287    #[inline]
288    pub fn create_reader<R: Read>(self, source: R) -> EventReader<R> {
289        EventReader::new_with_config(source, self)
290    }
291}
292
293impl From<ParserConfig> for ParserConfig2 {
294    #[inline]
295    fn from(c: ParserConfig) -> Self {
296        Self { c, ..Default::default() }
297    }
298}
299
// Setters for the `ParserConfig2`-specific fields, produced by the
// `gen_setters!` macro (not defined in this section). The `mime_parse` test
// below chains them builder-style, e.g. `.max_entity_expansion_length(1000)`.
// NOTE(review): the `val` keyword presumably selects a setter that stores the
// argument directly in the field of the same name — confirm against the macro.
gen_setters! { ParserConfig2,
    /// Set if you got one in the HTTP header
    override_encoding: val Option<Encoding>,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: val bool,
    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: val usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: val u8,
    /// Max number of attributes per element
    max_attributes: val usize,
    /// Maximum length of tag name or attribute name
    max_name_length: val usize,
    /// Max number of bytes in each attribute
    max_attribute_length: val usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: val usize,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: val bool
}
320
// Methods on `ParserConfig` named after `ParserConfig2` options, produced by
// the `gen_setters!` macro (not defined in this section).
// NOTE(review): `ParserConfig` has no fields with these names, so the `c2`
// keyword presumably converts the config into a `ParserConfig2` and applies
// the corresponding setter or method there — confirm against the macro.
gen_setters! { ParserConfig,
    /// Set if you got one in the HTTP header (see `content_type`)
    override_encoding: c2 Option<Encoding>,
    /// Allow `<?xml encoding="bogus"?>`
    ignore_invalid_encoding_declarations: c2 bool,
    /// Allows invalid documents. There should be only a single root element in XML.
    allow_multiple_root_elements: c2 bool,

    /// Abort if custom entities create a string longer than this
    max_entity_expansion_length: c2 usize,
    /// Entities can expand into other entities this many times (be careful about exponential cost!)
    max_entity_expansion_depth: c2 u8,
    /// Max number of attributes per element
    max_attributes: c2 usize,
    /// Maximum length of tag name or attribute name
    max_name_length: c2 usize,
    /// Max number of bytes in each attribute
    max_attribute_length: c2 usize,
    /// Maximum length of strings representing characters, comments, and processing instructions
    max_data_length: c2 usize,

    /// Set encoding from the MIME type. Important for HTTP compatibility.
    content_type: c2 &str
}
345
// Methods on `ParserConfig2` named after the base `ParserConfig` options,
// produced by the `gen_setters!` macro (not defined in this section).
// NOTE(review): the `delegate` keyword presumably forwards each value to the
// wrapped `ParserConfig` stored in the `c` field — confirm against the macro.
gen_setters! { ParserConfig2,
    trim_whitespace: delegate bool,
    whitespace_to_characters: delegate bool,
    cdata_to_characters: delegate bool,
    ignore_comments: delegate bool,
    coalesce_characters: delegate bool,
    ignore_end_of_stream: delegate bool,
    replace_unknown_entity_references: delegate bool,
    /// Whether or not whitespace at the root level of the document is ignored. Default is true.
    ignore_root_level_whitespace: delegate bool
}
357
/// Checks that `content_type` picks up the charset regardless of letter case,
/// spacing around `=` and quoting, whether it is called before or after other setters.
#[test]
fn mime_parse() {
    // Mixed-case charset name, `content_type` called first.
    let ascii = ParserConfig2::new()
        .content_type("text/xml;charset=Us-AScii")
        .max_entity_expansion_length(1000);
    assert_eq!(ascii.override_encoding, Some(Encoding::Ascii));

    // Quoted value with spaces around `=`, `content_type` called last.
    let utf16 = ParserConfig2::new()
        .max_entity_expansion_depth(3)
        .content_type("text/xml;charset = \"UTF-16\"");
    assert_eq!(utf16.override_encoding, Some(Encoding::Utf16));
}