yaml_rust2/
scanner.rs

1//! Home to the YAML Scanner.
2//!
3//! The scanner is the lowest-level parsing utility. It is the lexer / tokenizer, reading input a
4//! character at a time and emitting tokens that can later be interpreted by the [`crate::parser`]
5//! to check for more context and validity.
6//!
7//! Due to the grammar of YAML, the scanner has to have some context and is not error-free.
8
9#![allow(clippy::cast_possible_wrap)]
10#![allow(clippy::cast_sign_loss)]
11
12use std::{char, collections::VecDeque, error::Error, fmt};
13
14use arraydeque::ArrayDeque;
15
16use crate::char_traits::{
17    as_hex, is_alpha, is_anchor_char, is_blank, is_blank_or_breakz, is_break, is_breakz, is_digit,
18    is_flow, is_hex, is_tag_char, is_uri_char, is_z,
19};
20
/// The character encoding of the scanned input.
///
/// UTF-8 is the only encoding that is currently supported.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TEncoding {
    /// UTF-8 encoding.
    Utf8,
}
27
/// The style in which a scalar was written in the YAML document.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TScalarStyle {
    /// A plain (unquoted) scalar.
    Plain,
    /// A single-quoted scalar (`'...'`).
    SingleQuoted,
    /// A double-quoted scalar (`"..."`).
    DoubleQuoted,

    /// A literal block scalar (`|` block).
    Literal,
    /// A folded block scalar (`>` block).
    Folded,
}
43
/// A location in a YAML document.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub struct Marker {
    /// The index (in chars, not bytes) in the input.
    ///
    /// The scanner advances this by one for every `char` it consumes.
    index: usize,
    /// The line (1-indexed).
    line: usize,
    /// The column (0-indexed).
    ///
    /// The scanner starts at column 0 and resets the column to 0 after each line break.
    col: usize,
}

impl Marker {
    /// Create a marker from its raw components.
    fn new(index: usize, line: usize, col: usize) -> Self {
        Self { index, line, col }
    }

    /// Return the index (in chars) of the marker in the source.
    #[must_use]
    pub fn index(&self) -> usize {
        self.index
    }

    /// Return the line (1-indexed) of the marker in the source.
    #[must_use]
    pub fn line(&self) -> usize {
        self.line
    }

    /// Return the column (0-indexed) of the marker in the source.
    #[must_use]
    pub fn col(&self) -> usize {
        self.col
    }
}
78
79/// An error that occurred while scanning.
80#[derive(Clone, PartialEq, Debug, Eq)]
81pub struct ScanError {
82    /// The position at which the error happened in the source.
83    mark: Marker,
84    /// Human-readable details about the error.
85    info: String,
86}
87
88impl ScanError {
89    /// Create a new error from a location and an error string.
90    #[must_use]
91    pub fn new(loc: Marker, info: &str) -> ScanError {
92        ScanError {
93            mark: loc,
94            info: info.to_owned(),
95        }
96    }
97
98    /// Create a new error from a location and an error string.
99    #[must_use]
100    pub fn new_string(loc: Marker, info: String) -> ScanError {
101        ScanError { mark: loc, info }
102    }
103
104    /// Return the marker pointing to the error in the source.
105    #[must_use]
106    pub fn marker(&self) -> &Marker {
107        &self.mark
108    }
109
110    /// Return the information string describing the error that happened.
111    #[must_use]
112    pub fn info(&self) -> &str {
113        self.info.as_ref()
114    }
115}
116
117impl Error for ScanError {
118    fn description(&self) -> &str {
119        self.info.as_ref()
120    }
121
122    fn cause(&self) -> Option<&dyn Error> {
123        None
124    }
125}
126
127impl fmt::Display for ScanError {
128    // col starts from 0
129    fn fmt(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
130        write!(
131            formatter,
132            "{} at byte {} line {} column {}",
133            self.info,
134            self.mark.index,
135            self.mark.line,
136            self.mark.col + 1,
137        )
138    }
139}
140
141/// The contents of a scanner token.
142#[derive(Clone, PartialEq, Debug, Eq)]
143pub enum TokenType {
144    /// The start of the stream. Sent first, before even [`TokenType::DocumentStart`].
145    StreamStart(TEncoding),
146    /// The end of the stream, EOF.
147    StreamEnd,
148    /// A YAML version directive.
149    VersionDirective(
150        /// Major
151        u32,
152        /// Minor
153        u32,
154    ),
155    /// A YAML tag directive (e.g.: `!!str`, `!foo!bar`, ...).
156    TagDirective(
157        /// Handle
158        String,
159        /// Prefix
160        String,
161    ),
162    /// The start of a YAML document (`---`).
163    DocumentStart,
164    /// The end of a YAML document (`...`).
165    DocumentEnd,
166    /// The start of a sequence block.
167    ///
168    /// Sequence blocks are arrays starting with a `-`.
169    BlockSequenceStart,
170    /// The start of a sequence mapping.
171    ///
172    /// Sequence mappings are "dictionaries" with "key: value" entries.
173    BlockMappingStart,
174    /// End of the corresponding `BlockSequenceStart` or `BlockMappingStart`.
175    BlockEnd,
176    /// Start of an inline array (`[ a, b ]`).
177    FlowSequenceStart,
178    /// End of an inline array.
179    FlowSequenceEnd,
180    /// Start of an inline mapping (`{ a: b, c: d }`).
181    FlowMappingStart,
182    /// End of an inline mapping.
183    FlowMappingEnd,
184    /// An entry in a block sequence (c.f.: [`TokenType::BlockSequenceStart`]).
185    BlockEntry,
186    /// An entry in a flow sequence (c.f.: [`TokenType::FlowSequenceStart`]).
187    FlowEntry,
188    /// A key in a mapping.
189    Key,
190    /// A value in a mapping.
191    Value,
192    /// A reference to an anchor.
193    Alias(String),
194    /// A YAML anchor (`&`/`*`).
195    Anchor(String),
196    /// A YAML tag (starting with bangs `!`).
197    Tag(
198        /// The handle of the tag.
199        String,
200        /// The suffix of the tag.
201        String,
202    ),
203    /// A regular YAML scalar.
204    Scalar(TScalarStyle, String),
205}
206
207/// A scanner token.
208#[derive(Clone, PartialEq, Debug, Eq)]
209pub struct Token(pub Marker, pub TokenType);
210
211/// A scalar that was parsed and may correspond to a simple key.
212///
213/// Upon scanning the following yaml:
214/// ```yaml
215/// a: b
216/// ```
217/// We do not know that `a` is a key for a map until we have reached the following `:`. For this
218/// YAML, we would store `a` as a scalar token in the [`Scanner`], but not emit it yet. It would be
219/// kept inside the scanner until more context is fetched and we are able to know whether it is a
220/// plain scalar or a key.
221///
222/// For example, see the following 2 yaml documents:
223/// ```yaml
224/// ---
225/// a: b # Here, `a` is a key.
226/// ...
227/// ---
228/// a # Here, `a` is a plain scalar.
229/// ...
230/// ```
231/// An instance of [`SimpleKey`] is created in the [`Scanner`] when such ambiguity occurs.
232///
233/// In both documents, scanning `a` would lead to the creation of a [`SimpleKey`] with
234/// [`Self::possible`] set to `true`. The token for `a` would be pushed in the [`Scanner`] but not
235/// yet emitted. Instead, more context would be fetched (through [`Scanner::fetch_more_tokens`]).
236///
237/// In the first document, upon reaching the `:`, the [`SimpleKey`] would be inspected and our
238/// scalar `a` since it is a possible key, would be "turned" into a key. This is done by prepending
239/// a [`TokenType::Key`] to our scalar token in the [`Scanner`]. This way, the
240/// [`crate::parser::Parser`] would read the [`TokenType::Key`] token before the
241/// [`TokenType::Scalar`] token.
242///
243/// In the second document however, reaching the EOF would stale the [`SimpleKey`] and no
244/// [`TokenType::Key`] would be emitted by the scanner.
245#[derive(Clone, PartialEq, Debug, Eq)]
246struct SimpleKey {
247    /// Whether the token this [`SimpleKey`] refers to may still be a key.
248    ///
249    /// Sometimes, when we have more context, we notice that what we thought could be a key no
250    /// longer can be. In that case, [`Self::possible`] is set to `false`.
251    ///
252    /// For instance, let us consider the following invalid YAML:
253    /// ```yaml
254    /// key
255    ///   : value
256    /// ```
257    /// Upon reading the `\n` after `key`, the [`SimpleKey`] that was created for `key` is staled
258    /// and [`Self::possible`] set to `false`.
259    possible: bool,
260    /// Whether the token this [`SimpleKey`] refers to is required to be a key.
261    ///
262    /// With more context, we may know for sure that the token must be a key. If the YAML is
263    /// invalid, it may happen that the token be deemed not a key. In such event, an error has to
264    /// be raised. This boolean helps us know when to raise such error.
265    ///
266    /// TODO(ethiraric, 30/12/2023): Example of when this happens.
267    required: bool,
268    /// The index of the token referred to by the [`SimpleKey`].
269    ///
270    /// This is the index in the scanner, which takes into account both the tokens that have been
271    /// emitted and those about to be emitted. See [`Scanner::tokens_parsed`] and
272    /// [`Scanner::tokens`] for more details.
273    token_number: usize,
274    /// The position at which the token the [`SimpleKey`] refers to is.
275    mark: Marker,
276}
277
278impl SimpleKey {
279    /// Create a new [`SimpleKey`] at the given `Marker` and with the given flow level.
280    fn new(mark: Marker) -> SimpleKey {
281        SimpleKey {
282            possible: false,
283            required: false,
284            token_number: 0,
285            mark,
286        }
287    }
288}
289
/// One entry of the stack of indentation levels.
#[derive(Debug, Clone, Default)]
struct Indent {
    /// The former indentation level.
    indent: isize,
    /// Whether closing this indentation level generates a `BlockEnd` token.
    ///
    /// Not every indentation level starts a block. For example:
    /// ```yaml
    /// -
    ///   foo # ok
    /// -
    /// bar # ko, bar needs to be indented further than the `-`.
    /// - [
    ///  baz, # ok
    /// quux # ko, quux needs to be indented further than the '-'.
    /// ] # ko, the closing bracket needs to be indented further than the `-`.
    /// ```
    ///
    /// The level opened by a `-` only covers a single entry of the sequence. If a `BlockEnd` were
    /// emitted each time such a level is closed, there would be one `BlockEnd` per entry, whereas
    /// exactly one is needed to end the whole sequence.
    needs_block_end: bool,
}
314
/// Capacity (in characters) of the [`Scanner`] lookahead buffer.
///
/// The buffer has a fixed size so that consuming or pushing a character never has to deal with
/// reallocation. As of now, almost every lookahead is at most 4 characters long. The exceptions
/// are:
///   - Escape sequences parsing: some escape codes are 8 characters
///   - Scanning indent in scalars: this looks ahead `indent + 2` characters
///
/// This constant must be at least 8. When scanning indent in scalars, a single lookahead call is
/// used if and only if the indent is `BUFFER_LEN - 2` or less; for larger indents, the code falls
/// back to a loop of lookaheads.
const BUFFER_LEN: usize = 16;
326
327/// The YAML scanner.
328///
329/// This corresponds to the low-level interface when reading YAML. The scanner emits token as they
330/// are read (akin to a lexer), but it also holds sufficient context to be able to disambiguate
331/// some of the constructs. It has understanding of indentation and whitespace and is able to
332/// generate error messages for some invalid YAML constructs.
333///
334/// It is however not a full parser and needs [`crate::parser::Parser`] to fully detect invalid
335/// YAML documents.
336#[derive(Debug)]
337#[allow(clippy::struct_excessive_bools)]
338pub struct Scanner<T> {
339    /// The reader, providing with characters.
340    rdr: T,
341    /// The position of the cursor within the reader.
342    mark: Marker,
343    /// Buffer for tokens to be returned.
344    ///
345    /// This buffer can hold some temporary tokens that are not yet ready to be returned. For
346    /// instance, if we just read a scalar, it can be a value or a key if an implicit mapping
347    /// follows. In this case, the token stays in the `VecDeque` but cannot be returned from
348    /// [`Self::next`] until we have more context.
349    tokens: VecDeque<Token>,
350    /// Buffer for the next characters to consume.
351    buffer: ArrayDeque<char, BUFFER_LEN>,
352    /// The last error that happened.
353    error: Option<ScanError>,
354
355    /// Whether we have already emitted the `StreamStart` token.
356    stream_start_produced: bool,
357    /// Whether we have already emitted the `StreamEnd` token.
358    stream_end_produced: bool,
359    /// In some flow contexts, the value of a mapping is allowed to be adjacent to the `:`. When it
360    /// is, the index at which the `:` may be must be stored in `adjacent_value_allowed_at`.
361    adjacent_value_allowed_at: usize,
362    /// Whether a simple key could potentially start at the current position.
363    ///
364    /// Simple keys are the opposite of complex keys which are keys starting with `?`.
365    simple_key_allowed: bool,
366    /// A stack of potential simple keys.
367    ///
368    /// Refer to the documentation of [`SimpleKey`] for a more in-depth explanation of what they
369    /// are.
370    simple_keys: Vec<SimpleKey>,
371    /// The current indentation level.
372    indent: isize,
373    /// List of all block indentation levels we are in (except the current one).
374    indents: Vec<Indent>,
375    /// Level of nesting of flow sequences.
376    flow_level: u8,
377    /// The number of tokens that have been returned from the scanner.
378    ///
379    /// This excludes the tokens from [`Self::tokens`].
380    tokens_parsed: usize,
381    /// Whether a token is ready to be taken from [`Self::tokens`].
382    token_available: bool,
383    /// Whether all characters encountered since the last newline were whitespace.
384    leading_whitespace: bool,
385    /// Whether we started a flow mapping.
386    ///
387    /// This is used to detect implicit flow mapping starts such as:
388    /// ```yaml
389    /// [ : foo ] # { null: "foo" }
390    /// ```
391    flow_mapping_started: bool,
392    /// Whether we currently are in an implicit flow mapping.
393    implicit_flow_mapping: bool,
394}
395
396impl<T: Iterator<Item = char>> Iterator for Scanner<T> {
397    type Item = Token;
398    fn next(&mut self) -> Option<Token> {
399        if self.error.is_some() {
400            return None;
401        }
402        match self.next_token() {
403            Ok(Some(tok)) => {
404                debug_print!(
405                    "    \x1B[;32m\u{21B3} {:?} \x1B[;36m{:?}\x1B[;m",
406                    tok.1,
407                    tok.0
408                );
409                Some(tok)
410            }
411            Ok(tok) => tok,
412            Err(e) => {
413                self.error = Some(e);
414                None
415            }
416        }
417    }
418}
419
420/// A convenience alias for scanner functions that may fail without returning a value.
421pub type ScanResult = Result<(), ScanError>;
422
423impl<T: Iterator<Item = char>> Scanner<T> {
424    /// Creates the YAML tokenizer.
425    pub fn new(rdr: T) -> Scanner<T> {
426        Scanner {
427            rdr,
428            buffer: ArrayDeque::new(),
429            mark: Marker::new(0, 1, 0),
430            tokens: VecDeque::new(),
431            error: None,
432
433            stream_start_produced: false,
434            stream_end_produced: false,
435            adjacent_value_allowed_at: 0,
436            simple_key_allowed: true,
437            simple_keys: Vec::new(),
438            indent: -1,
439            indents: Vec::new(),
440            flow_level: 0,
441            tokens_parsed: 0,
442            token_available: false,
443            leading_whitespace: true,
444            flow_mapping_started: false,
445            implicit_flow_mapping: false,
446        }
447    }
448
449    /// Get a copy of the last error that was encountered, if any.
450    ///
451    /// This does not clear the error state and further calls to [`Self::get_error`] will return (a
452    /// clone of) the same error.
453    #[inline]
454    pub fn get_error(&self) -> Option<ScanError> {
455        self.error.clone()
456    }
457
458    /// Fill `self.buffer` with at least `count` characters.
459    ///
460    /// The characters that are extracted this way are not consumed but only placed in the buffer.
461    #[inline]
462    fn lookahead(&mut self, count: usize) {
463        if self.buffer.len() >= count {
464            return;
465        }
466        for _ in 0..(count - self.buffer.len()) {
467            self.buffer
468                .push_back(self.rdr.next().unwrap_or('\0'))
469                .unwrap();
470        }
471    }
472
473    /// Consume the next character. It is assumed the next character is a blank.
474    #[inline]
475    fn skip_blank(&mut self) {
476        self.buffer.pop_front();
477
478        self.mark.index += 1;
479        self.mark.col += 1;
480    }
481
482    /// Consume the next character. It is assumed the next character is not a blank.
483    #[inline]
484    fn skip_non_blank(&mut self) {
485        self.buffer.pop_front();
486
487        self.mark.index += 1;
488        self.mark.col += 1;
489        self.leading_whitespace = false;
490    }
491
492    /// Consume the next characters. It is assumed none of the next characters are blanks.
493    #[inline]
494    fn skip_n_non_blank(&mut self, n: usize) {
495        self.buffer.drain(0..n);
496
497        self.mark.index += n;
498        self.mark.col += n;
499        self.leading_whitespace = false;
500    }
501
502    /// Consume the next character. It is assumed the next character is a newline.
503    #[inline]
504    fn skip_nl(&mut self) {
505        self.buffer.pop_front();
506
507        self.mark.index += 1;
508        self.mark.col = 0;
509        self.mark.line += 1;
510        self.leading_whitespace = true;
511    }
512
513    /// Consume a linebreak (either CR, LF or CRLF), if any. Do nothing if there's none.
514    #[inline]
515    fn skip_linebreak(&mut self) {
516        if self.buffer[0] == '\r' && self.buffer[1] == '\n' {
517            // While technically not a blank, this does not matter as `self.leading_whitespace`
518            // will be reset by `skip_nl`.
519            self.skip_blank();
520            self.skip_nl();
521        } else if is_break(self.buffer[0]) {
522            self.skip_nl();
523        }
524    }
525
526    /// Return the next character in the buffer.
527    ///
528    /// The character is not consumed.
529    #[inline]
530    fn ch(&self) -> char {
531        self.buffer[0]
532    }
533
534    /// Look for the next character and return it.
535    ///
536    /// The character is not consumed.
537    /// Equivalent to calling [`Self::lookahead`] and [`Self::ch`].
538    #[inline]
539    fn look_ch(&mut self) -> char {
540        self.lookahead(1);
541        self.ch()
542    }
543
544    /// Read a character from the input stream, returning it directly.
545    ///
546    /// The buffer is bypassed and `self.mark` needs to be updated manually.
547    #[inline]
548    #[must_use]
549    fn raw_read_ch(&mut self) -> char {
550        self.rdr.next().unwrap_or('\0')
551    }
552
553    /// Return whether the next character is `c`.
554    #[inline]
555    fn ch_is(&self, c: char) -> bool {
556        self.buffer[0] == c
557    }
558
559    /// Return whether the [`TokenType::StreamStart`] event has been emitted.
560    #[inline]
561    pub fn stream_started(&self) -> bool {
562        self.stream_start_produced
563    }
564
565    /// Return whether the [`TokenType::StreamEnd`] event has been emitted.
566    #[inline]
567    pub fn stream_ended(&self) -> bool {
568        self.stream_end_produced
569    }
570
571    /// Get the current position in the input stream.
572    #[inline]
573    pub fn mark(&self) -> Marker {
574        self.mark
575    }
576
577    // Read and consume a line break (either `\r`, `\n` or `\r\n`).
578    //
579    // A `\n` is pushed into `s`.
580    //
581    // # Panics (in debug)
582    // If the next characters do not correspond to a line break.
583    #[inline]
584    fn read_break(&mut self, s: &mut String) {
585        let c = self.buffer[0];
586        let nc = self.buffer[1];
587        debug_assert!(is_break(c));
588        if c == '\r' && nc == '\n' {
589            self.skip_blank();
590        }
591        self.skip_nl();
592
593        s.push('\n');
594    }
595
596    /// Check whether the next characters correspond to an end of document.
597    ///
598    /// [`Self::lookahead`] must have been called before calling this function.
599    fn next_is_document_end(&self) -> bool {
600        assert!(self.buffer.len() >= 4);
601        self.buffer[0] == '.'
602            && self.buffer[1] == '.'
603            && self.buffer[2] == '.'
604            && is_blank_or_breakz(self.buffer[3])
605    }
606
607    /// Check whether the next characters correspond to a document indicator.
608    ///
609    /// [`Self::lookahead`] must have been called before calling this function.
610    #[inline]
611    fn next_is_document_indicator(&self) -> bool {
612        assert!(self.buffer.len() >= 4);
613        self.mark.col == 0
614            && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
615                || ((self.buffer[0] == '.') && (self.buffer[1] == '.') && (self.buffer[2] == '.')))
616            && is_blank_or_breakz(self.buffer[3])
617    }
618
619    /// Insert a token at the given position.
620    fn insert_token(&mut self, pos: usize, tok: Token) {
621        let old_len = self.tokens.len();
622        assert!(pos <= old_len);
623        self.tokens.insert(pos, tok);
624    }
625
626    fn allow_simple_key(&mut self) {
627        self.simple_key_allowed = true;
628    }
629
630    fn disallow_simple_key(&mut self) {
631        self.simple_key_allowed = false;
632    }
633
634    /// Fetch the next token in the stream.
635    /// # Errors
636    /// Returns `ScanError` when the scanner does not find the next expected token.
637    pub fn fetch_next_token(&mut self) -> ScanResult {
638        self.lookahead(1);
639        // eprintln!("--> fetch_next_token Cur {:?} {:?}", self.mark, self.ch());
640
641        if !self.stream_start_produced {
642            self.fetch_stream_start();
643            return Ok(());
644        }
645        self.skip_to_next_token()?;
646
647        debug_print!(
648            "  \x1B[38;5;244m\u{2192} fetch_next_token after whitespace {:?} {:?}\x1B[m",
649            self.mark,
650            self.ch()
651        );
652
653        self.stale_simple_keys()?;
654
655        let mark = self.mark;
656        self.unroll_indent(mark.col as isize);
657
658        self.lookahead(4);
659
660        if is_z(self.ch()) {
661            self.fetch_stream_end()?;
662            return Ok(());
663        }
664
665        // Is it a directive?
666        if self.mark.col == 0 && self.ch_is('%') {
667            return self.fetch_directive();
668        }
669
670        if self.mark.col == 0
671            && self.buffer[0] == '-'
672            && self.buffer[1] == '-'
673            && self.buffer[2] == '-'
674            && is_blank_or_breakz(self.buffer[3])
675        {
676            self.fetch_document_indicator(TokenType::DocumentStart)?;
677            return Ok(());
678        }
679
680        if self.mark.col == 0
681            && self.buffer[0] == '.'
682            && self.buffer[1] == '.'
683            && self.buffer[2] == '.'
684            && is_blank_or_breakz(self.buffer[3])
685        {
686            self.fetch_document_indicator(TokenType::DocumentEnd)?;
687            self.skip_ws_to_eol(SkipTabs::Yes)?;
688            if !is_breakz(self.ch()) {
689                return Err(ScanError::new(
690                    self.mark,
691                    "invalid content after document end marker",
692                ));
693            }
694            return Ok(());
695        }
696
697        if (self.mark.col as isize) < self.indent {
698            return Err(ScanError::new(self.mark, "invalid indentation"));
699        }
700
701        let c = self.buffer[0];
702        let nc = self.buffer[1];
703        match c {
704            '[' => self.fetch_flow_collection_start(TokenType::FlowSequenceStart),
705            '{' => self.fetch_flow_collection_start(TokenType::FlowMappingStart),
706            ']' => self.fetch_flow_collection_end(TokenType::FlowSequenceEnd),
707            '}' => self.fetch_flow_collection_end(TokenType::FlowMappingEnd),
708            ',' => self.fetch_flow_entry(),
709            '-' if is_blank_or_breakz(nc) => self.fetch_block_entry(),
710            '?' if is_blank_or_breakz(nc) => self.fetch_key(),
711            ':' if is_blank_or_breakz(nc)
712                || (self.flow_level > 0
713                    && (is_flow(nc) || self.mark.index == self.adjacent_value_allowed_at)) =>
714            {
715                self.fetch_value()
716            }
717            // Is it an alias?
718            '*' => self.fetch_anchor(true),
719            // Is it an anchor?
720            '&' => self.fetch_anchor(false),
721            '!' => self.fetch_tag(),
722            // Is it a literal scalar?
723            '|' if self.flow_level == 0 => self.fetch_block_scalar(true),
724            // Is it a folded scalar?
725            '>' if self.flow_level == 0 => self.fetch_block_scalar(false),
726            '\'' => self.fetch_flow_scalar(true),
727            '"' => self.fetch_flow_scalar(false),
728            // plain scalar
729            '-' if !is_blank_or_breakz(nc) => self.fetch_plain_scalar(),
730            ':' | '?' if !is_blank_or_breakz(nc) && self.flow_level == 0 => {
731                self.fetch_plain_scalar()
732            }
733            '%' | '@' | '`' => Err(ScanError::new(
734                self.mark,
735                &format!("unexpected character: `{c}'"),
736            )),
737            _ => self.fetch_plain_scalar(),
738        }
739    }
740
741    /// Return the next token in the stream.
742    /// # Errors
743    /// Returns `ScanError` when scanning fails to find an expected next token.
744    pub fn next_token(&mut self) -> Result<Option<Token>, ScanError> {
745        if self.stream_end_produced {
746            return Ok(None);
747        }
748
749        if !self.token_available {
750            self.fetch_more_tokens()?;
751        }
752        let Some(t) = self.tokens.pop_front() else {
753            return Err(ScanError::new(
754                self.mark,
755                "did not find expected next token",
756            ));
757        };
758        self.token_available = false;
759        self.tokens_parsed += 1;
760
761        if let TokenType::StreamEnd = t.1 {
762            self.stream_end_produced = true;
763        }
764        Ok(Some(t))
765    }
766
767    /// Fetch tokens from the token stream.
768    /// # Errors
769    /// Returns `ScanError` when loading fails.
770    pub fn fetch_more_tokens(&mut self) -> ScanResult {
771        let mut need_more;
772        loop {
773            if self.tokens.is_empty() {
774                need_more = true;
775            } else {
776                need_more = false;
777                // Stale potential keys that we know won't be keys.
778                self.stale_simple_keys()?;
779                // If our next token to be emitted may be a key, fetch more context.
780                for sk in &self.simple_keys {
781                    if sk.possible && sk.token_number == self.tokens_parsed {
782                        need_more = true;
783                        break;
784                    }
785                }
786            }
787
788            if !need_more {
789                break;
790            }
791            self.fetch_next_token()?;
792        }
793        self.token_available = true;
794
795        Ok(())
796    }
797
798    /// Mark simple keys that can no longer be keys as such.
799    ///
800    /// This function sets `possible` to `false` to each key that, now we have more context, we
801    /// know will not be keys.
802    ///
803    /// # Errors
804    /// This function returns an error if one of the key we would stale was required to be a key.
805    fn stale_simple_keys(&mut self) -> ScanResult {
806        for sk in &mut self.simple_keys {
807            if sk.possible
808                // If not in a flow construct, simple keys cannot span multiple lines.
809                && self.flow_level == 0
810                    && (sk.mark.line < self.mark.line || sk.mark.index + 1024 < self.mark.index)
811            {
812                if sk.required {
813                    return Err(ScanError::new(self.mark, "simple key expect ':'"));
814                }
815                sk.possible = false;
816            }
817        }
818        Ok(())
819    }
820
821    /// Skip over all whitespace and comments until the next token.
822    ///
823    /// # Errors
824    /// This function returns an error if a tabulation is encountered where there should not be
825    /// one.
826    fn skip_to_next_token(&mut self) -> ScanResult {
827        loop {
828            // TODO(chenyh) BOM
829            match self.look_ch() {
830                // Tabs may not be used as indentation.
831                // "Indentation" only exists as long as a block is started, but does not exist
832                // inside of flow-style constructs. Tabs are allowed as part of leading
833                // whitespaces outside of indentation.
834                // If a flow-style construct is in an indented block, its contents must still be
835                // indented. Also, tabs are allowed anywhere in it if it has no content.
836                '\t' if self.is_within_block()
837                    && self.leading_whitespace
838                    && (self.mark.col as isize) < self.indent =>
839                {
840                    self.skip_ws_to_eol(SkipTabs::Yes)?;
841                    // If we have content on that line with a tab, return an error.
842                    if !is_breakz(self.ch()) {
843                        return Err(ScanError::new(
844                            self.mark,
845                            "tabs disallowed within this context (block indentation)",
846                        ));
847                    }
848                }
849                '\t' | ' ' => self.skip_blank(),
850                '\n' | '\r' => {
851                    self.lookahead(2);
852                    self.skip_linebreak();
853                    if self.flow_level == 0 {
854                        self.allow_simple_key();
855                    }
856                }
857                '#' => {
858                    while !is_breakz(self.look_ch()) {
859                        self.skip_non_blank();
860                    }
861                }
862                _ => break,
863            }
864        }
865        Ok(())
866    }
867
868    /// Skip over YAML whitespace (` `, `\n`, `\r`).
869    ///
870    /// # Errors
871    /// This function returns an error if no whitespace was found.
872    fn skip_yaml_whitespace(&mut self) -> ScanResult {
873        let mut need_whitespace = true;
874        loop {
875            match self.look_ch() {
876                ' ' => {
877                    self.skip_blank();
878
879                    need_whitespace = false;
880                }
881                '\n' | '\r' => {
882                    self.lookahead(2);
883                    self.skip_linebreak();
884                    if self.flow_level == 0 {
885                        self.allow_simple_key();
886                    }
887                    need_whitespace = false;
888                }
889                '#' => {
890                    while !is_breakz(self.look_ch()) {
891                        self.skip_non_blank();
892                    }
893                }
894                _ => break,
895            }
896        }
897
898        if need_whitespace {
899            Err(ScanError::new(self.mark(), "expected whitespace"))
900        } else {
901            Ok(())
902        }
903    }
904
905    /// Skip yaml whitespace at most up to eol. Also skips comments.
906    fn skip_ws_to_eol(&mut self, skip_tabs: SkipTabs) -> Result<SkipTabs, ScanError> {
907        let mut encountered_tab = false;
908        let mut has_yaml_ws = false;
909        loop {
910            match self.look_ch() {
911                ' ' => {
912                    has_yaml_ws = true;
913                    self.skip_blank();
914                }
915                '\t' if skip_tabs != SkipTabs::No => {
916                    encountered_tab = true;
917                    self.skip_blank();
918                }
919                // YAML comments must be preceded by whitespace.
920                '#' if !encountered_tab && !has_yaml_ws => {
921                    return Err(ScanError::new(
922                        self.mark,
923                        "comments must be separated from other tokens by whitespace",
924                    ));
925                }
926                '#' => {
927                    while !is_breakz(self.look_ch()) {
928                        self.skip_non_blank();
929                    }
930                }
931                _ => break,
932            }
933        }
934
935        Ok(SkipTabs::Result(encountered_tab, has_yaml_ws))
936    }
937
938    fn fetch_stream_start(&mut self) {
939        let mark = self.mark;
940        self.indent = -1;
941        self.stream_start_produced = true;
942        self.allow_simple_key();
943        self.tokens
944            .push_back(Token(mark, TokenType::StreamStart(TEncoding::Utf8)));
945        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
946    }
947
948    fn fetch_stream_end(&mut self) -> ScanResult {
949        // force new line
950        if self.mark.col != 0 {
951            self.mark.col = 0;
952            self.mark.line += 1;
953        }
954
955        // If the stream ended, we won't have more context. We can stall all the simple keys we
956        // had. If one was required, however, that was an error and we must propagate it.
957        for sk in &mut self.simple_keys {
958            if sk.required && sk.possible {
959                return Err(ScanError::new(self.mark, "simple key expected"));
960            }
961            sk.possible = false;
962        }
963
964        self.unroll_indent(-1);
965        self.remove_simple_key()?;
966        self.disallow_simple_key();
967
968        self.tokens
969            .push_back(Token(self.mark, TokenType::StreamEnd));
970        Ok(())
971    }
972
973    fn fetch_directive(&mut self) -> ScanResult {
974        self.unroll_indent(-1);
975        self.remove_simple_key()?;
976
977        self.disallow_simple_key();
978
979        let tok = self.scan_directive()?;
980        self.tokens.push_back(tok);
981
982        Ok(())
983    }
984
985    fn scan_directive(&mut self) -> Result<Token, ScanError> {
986        let start_mark = self.mark;
987        self.skip_non_blank();
988
989        let name = self.scan_directive_name()?;
990        let tok = match name.as_ref() {
991            "YAML" => self.scan_version_directive_value(&start_mark)?,
992            "TAG" => self.scan_tag_directive_value(&start_mark)?,
993            // XXX This should be a warning instead of an error
994            _ => {
995                // skip current line
996                while !is_breakz(self.look_ch()) {
997                    self.skip_non_blank();
998                }
999                // XXX return an empty TagDirective token
1000                Token(
1001                    start_mark,
1002                    TokenType::TagDirective(String::new(), String::new()),
1003                )
1004                // return Err(ScanError::new(start_mark,
1005                //     "while scanning a directive, found unknown directive name"))
1006            }
1007        };
1008
1009        self.skip_ws_to_eol(SkipTabs::Yes)?;
1010
1011        if is_breakz(self.ch()) {
1012            self.lookahead(2);
1013            self.skip_linebreak();
1014            Ok(tok)
1015        } else {
1016            Err(ScanError::new(
1017                start_mark,
1018                "while scanning a directive, did not find expected comment or line break",
1019            ))
1020        }
1021    }
1022
1023    fn scan_version_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
1024        while is_blank(self.look_ch()) {
1025            self.skip_blank();
1026        }
1027
1028        let major = self.scan_version_directive_number(mark)?;
1029
1030        if self.ch() != '.' {
1031            return Err(ScanError::new(
1032                *mark,
1033                "while scanning a YAML directive, did not find expected digit or '.' character",
1034            ));
1035        }
1036        self.skip_non_blank();
1037
1038        let minor = self.scan_version_directive_number(mark)?;
1039
1040        Ok(Token(*mark, TokenType::VersionDirective(major, minor)))
1041    }
1042
1043    fn scan_directive_name(&mut self) -> Result<String, ScanError> {
1044        let start_mark = self.mark;
1045        let mut string = String::new();
1046        while is_alpha(self.look_ch()) {
1047            string.push(self.ch());
1048            self.skip_non_blank();
1049        }
1050
1051        if string.is_empty() {
1052            return Err(ScanError::new(
1053                start_mark,
1054                "while scanning a directive, could not find expected directive name",
1055            ));
1056        }
1057
1058        if !is_blank_or_breakz(self.ch()) {
1059            return Err(ScanError::new(
1060                start_mark,
1061                "while scanning a directive, found unexpected non-alphabetical character",
1062            ));
1063        }
1064
1065        Ok(string)
1066    }
1067
1068    fn scan_version_directive_number(&mut self, mark: &Marker) -> Result<u32, ScanError> {
1069        let mut val = 0u32;
1070        let mut length = 0usize;
1071        while let Some(digit) = self.look_ch().to_digit(10) {
1072            if length + 1 > 9 {
1073                return Err(ScanError::new(
1074                    *mark,
1075                    "while scanning a YAML directive, found extremely long version number",
1076                ));
1077            }
1078            length += 1;
1079            val = val * 10 + digit;
1080            self.skip_non_blank();
1081        }
1082
1083        if length == 0 {
1084            return Err(ScanError::new(
1085                *mark,
1086                "while scanning a YAML directive, did not find expected version number",
1087            ));
1088        }
1089
1090        Ok(val)
1091    }
1092
1093    fn scan_tag_directive_value(&mut self, mark: &Marker) -> Result<Token, ScanError> {
1094        /* Eat whitespaces. */
1095        while is_blank(self.look_ch()) {
1096            self.skip_blank();
1097        }
1098        let handle = self.scan_tag_handle(true, mark)?;
1099
1100        /* Eat whitespaces. */
1101        while is_blank(self.look_ch()) {
1102            self.skip_blank();
1103        }
1104
1105        let prefix = self.scan_tag_prefix(mark)?;
1106
1107        self.lookahead(1);
1108
1109        if is_blank_or_breakz(self.ch()) {
1110            Ok(Token(*mark, TokenType::TagDirective(handle, prefix)))
1111        } else {
1112            Err(ScanError::new(
1113                *mark,
1114                "while scanning TAG, did not find expected whitespace or line break",
1115            ))
1116        }
1117    }
1118
1119    fn fetch_tag(&mut self) -> ScanResult {
1120        self.save_simple_key();
1121        self.disallow_simple_key();
1122
1123        let tok = self.scan_tag()?;
1124        self.tokens.push_back(tok);
1125        Ok(())
1126    }
1127
1128    fn scan_tag(&mut self) -> Result<Token, ScanError> {
1129        let start_mark = self.mark;
1130        let mut handle = String::new();
1131        let mut suffix;
1132
1133        // Check if the tag is in the canonical form (verbatim).
1134        self.lookahead(2);
1135
1136        if self.buffer[1] == '<' {
1137            suffix = self.scan_verbatim_tag(&start_mark)?;
1138        } else {
1139            // The tag has either the '!suffix' or the '!handle!suffix'
1140            handle = self.scan_tag_handle(false, &start_mark)?;
1141            // Check if it is, indeed, handle.
1142            if handle.len() >= 2 && handle.starts_with('!') && handle.ends_with('!') {
1143                // A tag handle starting with "!!" is a secondary tag handle.
1144                let is_secondary_handle = handle == "!!";
1145                suffix =
1146                    self.scan_tag_shorthand_suffix(false, is_secondary_handle, "", &start_mark)?;
1147            } else {
1148                suffix = self.scan_tag_shorthand_suffix(false, false, &handle, &start_mark)?;
1149                "!".clone_into(&mut handle);
1150                // A special case: the '!' tag.  Set the handle to '' and the
1151                // suffix to '!'.
1152                if suffix.is_empty() {
1153                    handle.clear();
1154                    suffix = "!".to_owned();
1155                }
1156            }
1157        }
1158
1159        if is_blank_or_breakz(self.look_ch()) || (self.flow_level > 0 && is_flow(self.ch())) {
1160            // XXX: ex 7.2, an empty scalar can follow a secondary tag
1161            Ok(Token(start_mark, TokenType::Tag(handle, suffix)))
1162        } else {
1163            Err(ScanError::new(
1164                start_mark,
1165                "while scanning a tag, did not find expected whitespace or line break",
1166            ))
1167        }
1168    }
1169
1170    fn scan_tag_handle(&mut self, directive: bool, mark: &Marker) -> Result<String, ScanError> {
1171        let mut string = String::new();
1172        if self.look_ch() != '!' {
1173            return Err(ScanError::new(
1174                *mark,
1175                "while scanning a tag, did not find expected '!'",
1176            ));
1177        }
1178
1179        string.push(self.ch());
1180        self.skip_non_blank();
1181
1182        while is_alpha(self.look_ch()) {
1183            string.push(self.ch());
1184            self.skip_non_blank();
1185        }
1186
1187        // Check if the trailing character is '!' and copy it.
1188        if self.ch() == '!' {
1189            string.push(self.ch());
1190            self.skip_non_blank();
1191        } else if directive && string != "!" {
1192            // It's either the '!' tag or not really a tag handle.  If it's a %TAG
1193            // directive, it's an error.  If it's a tag token, it must be a part of
1194            // URI.
1195            return Err(ScanError::new(
1196                *mark,
1197                "while parsing a tag directive, did not find expected '!'",
1198            ));
1199        }
1200        Ok(string)
1201    }
1202
1203    /// Scan for a tag prefix (6.8.2.2).
1204    ///
1205    /// There are 2 kinds of tag prefixes:
1206    ///   - Local: Starts with a `!`, contains only URI chars (`!foo`)
1207    ///   - Global: Starts with a tag char, contains then URI chars (`!foo,2000:app/`)
1208    fn scan_tag_prefix(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1209        let mut string = String::new();
1210
1211        if self.look_ch() == '!' {
1212            // If we have a local tag, insert and skip `!`.
1213            string.push(self.ch());
1214            self.skip_non_blank();
1215        } else if !is_tag_char(self.ch()) {
1216            // Otherwise, check if the first global tag character is valid.
1217            return Err(ScanError::new(*start_mark, "invalid global tag character"));
1218        } else if self.ch() == '%' {
1219            // If it is valid and an escape sequence, escape it.
1220            string.push(self.scan_uri_escapes(start_mark)?);
1221        } else {
1222            // Otherwise, push the first character.
1223            string.push(self.ch());
1224            self.skip_non_blank();
1225        }
1226
1227        while is_uri_char(self.look_ch()) {
1228            if self.ch() == '%' {
1229                string.push(self.scan_uri_escapes(start_mark)?);
1230            } else {
1231                string.push(self.ch());
1232                self.skip_non_blank();
1233            }
1234        }
1235
1236        Ok(string)
1237    }
1238
1239    /// Scan for a verbatim tag.
1240    ///
1241    /// The prefixing `!<` must _not_ have been skipped.
1242    fn scan_verbatim_tag(&mut self, start_mark: &Marker) -> Result<String, ScanError> {
1243        // Eat `!<`
1244        self.skip_non_blank();
1245        self.skip_non_blank();
1246
1247        let mut string = String::new();
1248        while is_uri_char(self.look_ch()) {
1249            if self.ch() == '%' {
1250                string.push(self.scan_uri_escapes(start_mark)?);
1251            } else {
1252                string.push(self.ch());
1253                self.skip_non_blank();
1254            }
1255        }
1256
1257        if self.ch() != '>' {
1258            return Err(ScanError::new(
1259                *start_mark,
1260                "while scanning a verbatim tag, did not find the expected '>'",
1261            ));
1262        }
1263        self.skip_non_blank();
1264
1265        Ok(string)
1266    }
1267
1268    fn scan_tag_shorthand_suffix(
1269        &mut self,
1270        _directive: bool,
1271        _is_secondary: bool,
1272        head: &str,
1273        mark: &Marker,
1274    ) -> Result<String, ScanError> {
1275        let mut length = head.len();
1276        let mut string = String::new();
1277
1278        // Copy the head if needed.
1279        // Note that we don't copy the leading '!' character.
1280        if length > 1 {
1281            string.extend(head.chars().skip(1));
1282        }
1283
1284        while is_tag_char(self.look_ch()) {
1285            // Check if it is a URI-escape sequence.
1286            if self.ch() == '%' {
1287                string.push(self.scan_uri_escapes(mark)?);
1288            } else {
1289                string.push(self.ch());
1290                self.skip_non_blank();
1291            }
1292
1293            length += 1;
1294        }
1295
1296        if length == 0 {
1297            return Err(ScanError::new(
1298                *mark,
1299                "while parsing a tag, did not find expected tag URI",
1300            ));
1301        }
1302
1303        Ok(string)
1304    }
1305
1306    fn scan_uri_escapes(&mut self, mark: &Marker) -> Result<char, ScanError> {
1307        let mut width = 0usize;
1308        let mut code = 0u32;
1309        loop {
1310            self.lookahead(3);
1311
1312            if !(self.ch() == '%' && is_hex(self.buffer[1]) && is_hex(self.buffer[2])) {
1313                return Err(ScanError::new(
1314                    *mark,
1315                    "while parsing a tag, did not find URI escaped octet",
1316                ));
1317            }
1318
1319            let octet = (as_hex(self.buffer[1]) << 4) + as_hex(self.buffer[2]);
1320            if width == 0 {
1321                width = match octet {
1322                    _ if octet & 0x80 == 0x00 => 1,
1323                    _ if octet & 0xE0 == 0xC0 => 2,
1324                    _ if octet & 0xF0 == 0xE0 => 3,
1325                    _ if octet & 0xF8 == 0xF0 => 4,
1326                    _ => {
1327                        return Err(ScanError::new(
1328                            *mark,
1329                            "while parsing a tag, found an incorrect leading UTF-8 octet",
1330                        ));
1331                    }
1332                };
1333                code = octet;
1334            } else {
1335                if octet & 0xc0 != 0x80 {
1336                    return Err(ScanError::new(
1337                        *mark,
1338                        "while parsing a tag, found an incorrect trailing UTF-8 octet",
1339                    ));
1340                }
1341                code = (code << 8) + octet;
1342            }
1343
1344            self.skip_n_non_blank(3);
1345
1346            width -= 1;
1347            if width == 0 {
1348                break;
1349            }
1350        }
1351
1352        match char::from_u32(code) {
1353            Some(ch) => Ok(ch),
1354            None => Err(ScanError::new(
1355                *mark,
1356                "while parsing a tag, found an invalid UTF-8 codepoint",
1357            )),
1358        }
1359    }
1360
1361    fn fetch_anchor(&mut self, alias: bool) -> ScanResult {
1362        self.save_simple_key();
1363        self.disallow_simple_key();
1364
1365        let tok = self.scan_anchor(alias)?;
1366
1367        self.tokens.push_back(tok);
1368
1369        Ok(())
1370    }
1371
1372    fn scan_anchor(&mut self, alias: bool) -> Result<Token, ScanError> {
1373        let mut string = String::new();
1374        let start_mark = self.mark;
1375
1376        self.skip_non_blank();
1377        while is_anchor_char(self.look_ch()) {
1378            string.push(self.ch());
1379            self.skip_non_blank();
1380        }
1381
1382        if string.is_empty() {
1383            return Err(ScanError::new(start_mark, "while scanning an anchor or alias, did not find expected alphabetic or numeric character"));
1384        }
1385
1386        if alias {
1387            Ok(Token(start_mark, TokenType::Alias(string)))
1388        } else {
1389            Ok(Token(start_mark, TokenType::Anchor(string)))
1390        }
1391    }
1392
1393    fn fetch_flow_collection_start(&mut self, tok: TokenType) -> ScanResult {
1394        // The indicators '[' and '{' may start a simple key.
1395        self.save_simple_key();
1396
1397        self.roll_one_col_indent();
1398        self.increase_flow_level()?;
1399
1400        self.allow_simple_key();
1401
1402        let start_mark = self.mark;
1403        self.skip_non_blank();
1404
1405        if tok == TokenType::FlowMappingStart {
1406            self.flow_mapping_started = true;
1407        }
1408
1409        self.skip_ws_to_eol(SkipTabs::Yes)?;
1410
1411        self.tokens.push_back(Token(start_mark, tok));
1412        Ok(())
1413    }
1414
    /// Queue the token closing a flow collection and skip over its indicator.
    ///
    /// `tok` is the token to queue; it is marked with the position of the indicator. The flow
    /// level is decreased, simple keys are disallowed and the blanks (and comment, if any) that
    /// follow the indicator on the same line are skipped.
    ///
    /// # Errors
    /// Propagates any error from `remove_simple_key` or `skip_ws_to_eol`.
    fn fetch_flow_collection_end(&mut self, tok: TokenType) -> ScanResult {
        self.remove_simple_key()?;
        self.decrease_flow_level();

        self.disallow_simple_key();

        // Called with the position of the closing indicator, before it is skipped.
        self.end_implicit_mapping(self.mark);

        // The token is marked at the indicator, not at what follows the skipped blanks.
        let start_mark = self.mark;
        self.skip_non_blank();
        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // A flow collection within a flow mapping can be a key. In that case, the value may be
        // adjacent to the `:`.
        // ```yaml
        // - [ {a: b}:value ]
        // ```
        // The position recorded here is the one reached after the indicator and the blanks
        // following it have been skipped. It is only recorded if we are still inside a flow
        // collection once the level has been decreased.
        if self.flow_level > 0 {
            self.adjacent_value_allowed_at = self.mark.index;
        }

        self.tokens.push_back(Token(start_mark, tok));
        Ok(())
    }
1439
1440    /// Push the `FlowEntry` token and skip over the `,`.
1441    fn fetch_flow_entry(&mut self) -> ScanResult {
1442        self.remove_simple_key()?;
1443        self.allow_simple_key();
1444
1445        self.end_implicit_mapping(self.mark);
1446
1447        let start_mark = self.mark;
1448        self.skip_non_blank();
1449        self.skip_ws_to_eol(SkipTabs::Yes)?;
1450
1451        self.tokens
1452            .push_back(Token(start_mark, TokenType::FlowEntry));
1453        Ok(())
1454    }
1455
1456    fn increase_flow_level(&mut self) -> ScanResult {
1457        self.simple_keys.push(SimpleKey::new(Marker::new(0, 0, 0)));
1458        self.flow_level = self
1459            .flow_level
1460            .checked_add(1)
1461            .ok_or_else(|| ScanError::new(self.mark, "recursion limit exceeded"))?;
1462        Ok(())
1463    }
1464
    /// Leave one level of flow collection nesting.
    ///
    /// Does nothing if `flow_level` is not positive. Otherwise, `flow_level` is decremented and
    /// the last simple key is popped.
    ///
    /// # Panics
    /// Panics if `flow_level` is positive while `simple_keys` is empty.
    fn decrease_flow_level(&mut self) {
        if self.flow_level > 0 {
            self.flow_level -= 1;
            // Mirrors the simple key pushed by `increase_flow_level` for this level.
            self.simple_keys.pop().unwrap();
        }
    }
1471
1472    /// Push the `Block*` token(s) and skip over the `-`.
1473    ///
1474    /// Add an indentation level and push a `BlockSequenceStart` token if needed, then push a
1475    /// `BlockEntry` token.
1476    /// This function only skips over the `-` and does not fetch the entry value.
1477    fn fetch_block_entry(&mut self) -> ScanResult {
1478        if self.flow_level > 0 {
1479            // - * only allowed in block
1480            return Err(ScanError::new(
1481                self.mark,
1482                r#""-" is only valid inside a block"#,
1483            ));
1484        }
1485        // Check if we are allowed to start a new entry.
1486        if !self.simple_key_allowed {
1487            return Err(ScanError::new(
1488                self.mark,
1489                "block sequence entries are not allowed in this context",
1490            ));
1491        }
1492
1493        // ???, fixes test G9HC.
1494        if let Some(Token(mark, TokenType::Anchor(..) | TokenType::Tag(..))) = self.tokens.back() {
1495            if self.mark.col == 0 && mark.col == 0 && self.indent > -1 {
1496                return Err(ScanError::new(*mark, "invalid indentation for anchor"));
1497            }
1498        }
1499
1500        // Skip over the `-`.
1501        let mark = self.mark;
1502        self.skip_non_blank();
1503
1504        // generate BLOCK-SEQUENCE-START if indented
1505        self.roll_indent(mark.col, None, TokenType::BlockSequenceStart, mark);
1506        let found_tabs = self.skip_ws_to_eol(SkipTabs::Yes)?.found_tabs();
1507        self.lookahead(2);
1508        if found_tabs && self.buffer[0] == '-' && is_blank_or_breakz(self.buffer[1]) {
1509            return Err(ScanError::new(
1510                self.mark,
1511                "'-' must be followed by a valid YAML whitespace",
1512            ));
1513        }
1514
1515        self.skip_ws_to_eol(SkipTabs::No)?;
1516        if is_break(self.look_ch()) || is_flow(self.ch()) {
1517            self.roll_one_col_indent();
1518        }
1519
1520        self.remove_simple_key()?;
1521        self.allow_simple_key();
1522
1523        self.tokens
1524            .push_back(Token(self.mark, TokenType::BlockEntry));
1525
1526        Ok(())
1527    }
1528
1529    fn fetch_document_indicator(&mut self, t: TokenType) -> ScanResult {
1530        self.unroll_indent(-1);
1531        self.remove_simple_key()?;
1532        self.disallow_simple_key();
1533
1534        let mark = self.mark;
1535
1536        self.skip_n_non_blank(3);
1537
1538        self.tokens.push_back(Token(mark, t));
1539        Ok(())
1540    }
1541
1542    fn fetch_block_scalar(&mut self, literal: bool) -> ScanResult {
1543        self.save_simple_key();
1544        self.allow_simple_key();
1545        let tok = self.scan_block_scalar(literal)?;
1546
1547        self.tokens.push_back(tok);
1548        Ok(())
1549    }
1550
    /// Scan a block scalar, header included, and return its `Scalar` token.
    ///
    /// `literal` selects the style of the scalar: `TScalarStyle::Literal` if `true`,
    /// `TScalarStyle::Folded` otherwise. In folded style, a single line break between two lines
    /// that both start with a non-blank character is replaced with a space.
    ///
    /// The header may hold a chomping indicator (`+` to keep, `-` to strip, clip being the
    /// default) and an indentation indicator (a digit from 1 to 9), in any order. Without an
    /// indentation indicator, the indentation of the content is detected by
    /// `skip_block_scalar_first_line_indent`.
    ///
    /// The token of a scalar that reaches the end of the stream without content is marked with
    /// the position of the block scalar indicator. Otherwise, the token is marked with the
    /// position at which the content scan starts.
    ///
    /// # Errors
    /// Returns an error if:
    ///   - the indentation indicator is `0`,
    ///   - the header is followed by something other than blanks, a comment, a line break or the
    ///     end of the stream,
    ///   - the line following the header starts with a tab,
    ///   - the content starts at a column lower than the scalar indentation while still being
    ///     deeper than the current block indentation.
    #[allow(clippy::too_many_lines)]
    fn scan_block_scalar(&mut self, literal: bool) -> Result<Token, ScanError> {
        let start_mark = self.mark;
        let mut chomping = Chomping::Clip;
        // Value of the indentation indicator (0 if there is none).
        let mut increment: usize = 0;
        // Column at which the content lines must start (0 until it is known).
        let mut indent: usize = 0;
        // Whether the line being scanned starts with a blank.
        let mut trailing_blank: bool;
        // Whether the previously scanned line started with a blank.
        let mut leading_blank: bool = false;
        let style = if literal {
            TScalarStyle::Literal
        } else {
            TScalarStyle::Folded
        };

        let mut string = String::new();
        // The line break that ended the last scanned content line.
        let mut leading_break = String::new();
        // The line breaks of the empty lines that followed it.
        let mut trailing_breaks = String::new();
        // The line break that ended the header line.
        let mut chomping_break = String::new();

        // skip '|' or '>'
        self.skip_non_blank();
        self.unroll_non_block_indents();

        // The header is either a chomping indicator optionally followed by an indentation
        // indicator, or the other way around.
        if self.look_ch() == '+' || self.ch() == '-' {
            if self.ch() == '+' {
                chomping = Chomping::Keep;
            } else {
                chomping = Chomping::Strip;
            }
            self.skip_non_blank();
            if is_digit(self.look_ch()) {
                if self.ch() == '0' {
                    return Err(ScanError::new(
                        start_mark,
                        "while scanning a block scalar, found an indentation indicator equal to 0",
                    ));
                }
                increment = (self.ch() as usize) - ('0' as usize);
                self.skip_non_blank();
            }
        } else if is_digit(self.ch()) {
            if self.ch() == '0' {
                return Err(ScanError::new(
                    start_mark,
                    "while scanning a block scalar, found an indentation indicator equal to 0",
                ));
            }

            increment = (self.ch() as usize) - ('0' as usize);
            self.skip_non_blank();
            self.lookahead(1);
            if self.ch() == '+' || self.ch() == '-' {
                if self.ch() == '+' {
                    chomping = Chomping::Keep;
                } else {
                    chomping = Chomping::Strip;
                }
                self.skip_non_blank();
            }
        }

        // Blanks and a comment may follow the indicators on the header line.
        self.skip_ws_to_eol(SkipTabs::Yes)?;

        // Check if we are at the end of the line.
        if !is_breakz(self.look_ch()) {
            return Err(ScanError::new(
                start_mark,
                "while scanning a block scalar, did not find expected comment or line break",
            ));
        }

        // Consume the line break ending the header, remembering it for the chomping of a scalar
        // without content.
        if is_break(self.ch()) {
            self.lookahead(2);
            self.read_break(&mut chomping_break);
        }

        if self.look_ch() == '\t' {
            return Err(ScanError::new(
                start_mark,
                "a block scalar content cannot start with a tab",
            ));
        }

        // An explicit indentation indicator is relative to the current block indentation, when
        // there is one (`self.indent >= 0`).
        if increment > 0 {
            indent = if self.indent >= 0 {
                (self.indent + increment as isize) as usize
            } else {
                increment
            }
        }

        // Scan the leading line breaks and determine the indentation level if needed.
        if indent == 0 {
            self.skip_block_scalar_first_line_indent(&mut indent, &mut trailing_breaks);
        } else {
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // We have an end-of-stream with no content, e.g.:
        // ```yaml
        // - |+
        // ```
        if is_z(self.ch()) {
            let contents = match chomping {
                // We strip trailing linebreaks. Nothing remains.
                Chomping::Strip => String::new(),
                // There was no newline after the chomping indicator.
                _ if self.mark.line == start_mark.line() => String::new(),
                // We clip lines, and there was a newline after the chomping indicator.
                // All other breaks are ignored.
                Chomping::Clip => chomping_break,
                // We keep lines. There was a newline after the chomping indicator but nothing
                // else.
                Chomping::Keep if trailing_breaks.is_empty() => chomping_break,
                // Otherwise, the newline after chomping is ignored.
                Chomping::Keep => trailing_breaks,
            };
            return Ok(Token(start_mark, TokenType::Scalar(style, contents)));
        }

        // The first content line is less indented than the scalar requires, yet more than the
        // enclosing block: it belongs to neither.
        if self.mark.col < indent && (self.mark.col as isize) > self.indent {
            return Err(ScanError::new(
                self.mark,
                "wrongly indented line in block scalar",
            ));
        }

        // Scratch buffer handed to `scan_block_scalar_content_line` for every content line.
        let mut line_buffer = String::with_capacity(100);
        // From here on, `start_mark` (and thus the mark of the returned token) is the position
        // at which the content scan starts rather than that of the block scalar indicator.
        let start_mark = self.mark;
        // Each iteration scans one content line. The scalar ends with the first line that does
        // not start at column `indent`, or with the end of the stream.
        while self.mark.col == indent && !is_z(self.ch()) {
            // A scalar with no indentation is ended by a document end marker.
            if indent == 0 {
                self.lookahead(4);
                if self.next_is_document_end() {
                    break;
                }
            }

            // We are at the first content character of a content line.
            trailing_blank = is_blank(self.ch());
            // Fold the line break separating this line from the previous one (folded style only,
            // and only if neither line starts with a blank): it becomes a space, unless empty
            // lines were in between, in which case only their line breaks are kept.
            if !literal && !leading_break.is_empty() && !leading_blank && !trailing_blank {
                string.push_str(&trailing_breaks);
                if trailing_breaks.is_empty() {
                    string.push(' ');
                }
            } else {
                // No folding: every line break is kept as is.
                string.push_str(&leading_break);
                string.push_str(&trailing_breaks);
            }

            leading_break.clear();
            trailing_breaks.clear();

            // Remembered for the folding decision of the next line.
            leading_blank = is_blank(self.ch());

            self.scan_block_scalar_content_line(&mut string, &mut line_buffer);

            // break on EOF
            if is_z(self.ch()) {
                break;
            }

            self.lookahead(2);
            self.read_break(&mut leading_break);

            // Eat the following indentation spaces and line breaks.
            self.skip_block_scalar_indent(indent, &mut trailing_breaks);
        }

        // Chomp the tail.
        if chomping != Chomping::Strip {
            string.push_str(&leading_break);
            // If we had reached an eof but the last character wasn't an end-of-line, check if the
            // last line was indented at least as the rest of the scalar, then we need to consider
            // there is a newline.
            if is_z(self.ch()) && self.mark.col >= indent.max(1) {
                string.push('\n');
            }
        }

        // Only the keep chomping retains the line breaks of the trailing empty lines.
        if chomping == Chomping::Keep {
            string.push_str(&trailing_breaks);
        }

        Ok(Token(start_mark, TokenType::Scalar(style, string)))
    }
1736
1737    /// Retrieve the contents of the line, parsing it as a block scalar.
1738    ///
1739    /// The contents will be appended to `string`. `line_buffer` is used as a temporary buffer to
1740    /// store bytes before pushing them to `string` and thus avoiding reallocating more than
1741    /// necessary. `line_buffer` is assumed to be empty upon calling this function. It will be
1742    /// `clear`ed before the end of the function.
1743    ///
1744    /// This function assumed the first character to read is the first content character in the
1745    /// line. This function does not consume the line break character(s) after the line.
1746    fn scan_block_scalar_content_line(&mut self, string: &mut String, line_buffer: &mut String) {
1747        // Start by evaluating characters in the buffer.
1748        while !self.buffer.is_empty() && !is_breakz(self.ch()) {
1749            string.push(self.ch());
1750            // We may technically skip non-blank characters. However, the only distinction is
1751            // to determine what is leading whitespace and what is not. Here, we read the
1752            // contents of the line until either eof or a linebreak. We know we will not read
1753            // `self.leading_whitespace` until the end of the line, where it will be reset.
1754            // This allows us to call a slightly less expensive function.
1755            self.skip_blank();
1756        }
1757
1758        // All characters that were in the buffer were consumed. We need to check if more
1759        // follow.
1760        if self.buffer.is_empty() {
1761            // We will read all consecutive non-breakz characters. We push them into a
1762            // temporary buffer. The main difference with going through `self.buffer` is that
1763            // characters are appended here as their real size (1B for ascii, or up to 4 bytes for
1764            // UTF-8). We can then use the internal `line_buffer` `Vec` to push data into `string`
1765            // (using `String::push_str`).
1766            let mut c = self.raw_read_ch();
1767            while !is_breakz(c) {
1768                line_buffer.push(c);
1769                c = self.raw_read_ch();
1770            }
1771
1772            // Our last character read is stored in `c`. It is either an EOF or a break. In any
1773            // case, we need to push it back into `self.buffer` so it may be properly read
1774            // after. We must not insert it in `string`.
1775            self.buffer.push_back(c).unwrap();
1776
1777            // We need to manually update our position; we haven't called a `skip` function.
1778            self.mark.col += line_buffer.len();
1779            self.mark.index += line_buffer.len();
1780
1781            // We can now append our bytes to our `string`.
1782            string.reserve(line_buffer.len());
1783            string.push_str(line_buffer);
1784            // This clears the _contents_ without touching the _capacity_.
1785            line_buffer.clear();
1786        }
1787    }
1788
1789    /// Skip the block scalar indentation and empty lines.
1790    fn skip_block_scalar_indent(&mut self, indent: usize, breaks: &mut String) {
1791        loop {
1792            // Consume all spaces. Tabs cannot be used as indentation.
1793            if indent < BUFFER_LEN - 2 {
1794                self.lookahead(BUFFER_LEN);
1795                while self.mark.col < indent && self.ch() == ' ' {
1796                    self.skip_blank();
1797                }
1798            } else {
1799                loop {
1800                    self.lookahead(BUFFER_LEN);
1801                    while !self.buffer.is_empty() && self.mark.col < indent && self.ch() == ' ' {
1802                        self.skip_blank();
1803                    }
1804                    // If we reached our indent, we can break. We must also break if we have
1805                    // reached content or EOF; that is, the buffer is not empty and the next
1806                    // character is not a space.
1807                    if self.mark.col == indent || (!self.buffer.is_empty() && self.ch() != ' ') {
1808                        break;
1809                    }
1810                }
1811                self.lookahead(2);
1812            }
1813
1814            // If our current line is empty, skip over the break and continue looping.
1815            if is_break(self.ch()) {
1816                self.read_break(breaks);
1817            } else {
1818                // Otherwise, we have a content line. Return control.
1819                break;
1820            }
1821        }
1822    }
1823
1824    /// Determine the indentation level for a block scalar from the first line of its contents.
1825    ///
1826    /// The function skips over whitespace-only lines and sets `indent` to the the longest
1827    /// whitespace line that was encountered.
1828    fn skip_block_scalar_first_line_indent(&mut self, indent: &mut usize, breaks: &mut String) {
1829        let mut max_indent = 0;
1830        loop {
1831            // Consume all spaces. Tabs cannot be used as indentation.
1832            while self.look_ch() == ' ' {
1833                self.skip_blank();
1834            }
1835
1836            if self.mark.col > max_indent {
1837                max_indent = self.mark.col;
1838            }
1839
1840            if is_break(self.ch()) {
1841                // If our current line is empty, skip over the break and continue looping.
1842                self.lookahead(2);
1843                self.read_break(breaks);
1844            } else {
1845                // Otherwise, we have a content line. Return control.
1846                break;
1847            }
1848        }
1849
1850        // In case a yaml looks like:
1851        // ```yaml
1852        // |
1853        // foo
1854        // bar
1855        // ```
1856        // We need to set the indent to 0 and not 1. In all other cases, the indent must be at
1857        // least 1. When in the above example, `self.indent` will be set to -1.
1858        *indent = max_indent.max((self.indent + 1) as usize);
1859        if self.indent > 0 {
1860            *indent = (*indent).max(1);
1861        }
1862    }
1863
1864    fn fetch_flow_scalar(&mut self, single: bool) -> ScanResult {
1865        self.save_simple_key();
1866        self.disallow_simple_key();
1867
1868        let tok = self.scan_flow_scalar(single)?;
1869
1870        // From spec: To ensure JSON compatibility, if a key inside a flow mapping is JSON-like,
1871        // YAML allows the following value to be specified adjacent to the “:”.
1872        self.skip_to_next_token()?;
1873        self.adjacent_value_allowed_at = self.mark.index;
1874
1875        self.tokens.push_back(tok);
1876        Ok(())
1877    }
1878
1879    #[allow(clippy::too_many_lines)]
1880    fn scan_flow_scalar(&mut self, single: bool) -> Result<Token, ScanError> {
1881        let start_mark = self.mark;
1882
1883        let mut string = String::new();
1884        let mut leading_break = String::new();
1885        let mut trailing_breaks = String::new();
1886        let mut whitespaces = String::new();
1887        let mut leading_blanks;
1888
1889        /* Eat the left quote. */
1890        self.skip_non_blank();
1891
1892        loop {
1893            /* Check for a document indicator. */
1894            self.lookahead(4);
1895
1896            if self.mark.col == 0
1897                && (((self.buffer[0] == '-') && (self.buffer[1] == '-') && (self.buffer[2] == '-'))
1898                    || ((self.buffer[0] == '.')
1899                        && (self.buffer[1] == '.')
1900                        && (self.buffer[2] == '.')))
1901                && is_blank_or_breakz(self.buffer[3])
1902            {
1903                return Err(ScanError::new(
1904                    start_mark,
1905                    "while scanning a quoted scalar, found unexpected document indicator",
1906                ));
1907            }
1908
1909            if is_z(self.ch()) {
1910                return Err(ScanError::new(
1911                    start_mark,
1912                    "while scanning a quoted scalar, found unexpected end of stream",
1913                ));
1914            }
1915
1916            if (self.mark.col as isize) < self.indent {
1917                return Err(ScanError::new(
1918                    start_mark,
1919                    "invalid indentation in quoted scalar",
1920                ));
1921            }
1922
1923            leading_blanks = false;
1924            self.consume_flow_scalar_non_whitespace_chars(
1925                single,
1926                &mut string,
1927                &mut leading_blanks,
1928                &start_mark,
1929            )?;
1930
1931            match self.look_ch() {
1932                '\'' if single => break,
1933                '"' if !single => break,
1934                _ => {}
1935            }
1936
1937            // Consume blank characters.
1938            while is_blank(self.ch()) || is_break(self.ch()) {
1939                if is_blank(self.ch()) {
1940                    // Consume a space or a tab character.
1941                    if leading_blanks {
1942                        if self.ch() == '\t' && (self.mark.col as isize) < self.indent {
1943                            return Err(ScanError::new(
1944                                self.mark,
1945                                "tab cannot be used as indentation",
1946                            ));
1947                        }
1948                        self.skip_blank();
1949                    } else {
1950                        whitespaces.push(self.ch());
1951                        self.skip_blank();
1952                    }
1953                } else {
1954                    self.lookahead(2);
1955                    // Check if it is a first line break.
1956                    if leading_blanks {
1957                        self.read_break(&mut trailing_breaks);
1958                    } else {
1959                        whitespaces.clear();
1960                        self.read_break(&mut leading_break);
1961                        leading_blanks = true;
1962                    }
1963                }
1964                self.lookahead(1);
1965            }
1966
1967            // Join the whitespaces or fold line breaks.
1968            if leading_blanks {
1969                if leading_break.is_empty() {
1970                    string.push_str(&leading_break);
1971                    string.push_str(&trailing_breaks);
1972                    trailing_breaks.clear();
1973                    leading_break.clear();
1974                } else {
1975                    if trailing_breaks.is_empty() {
1976                        string.push(' ');
1977                    } else {
1978                        string.push_str(&trailing_breaks);
1979                        trailing_breaks.clear();
1980                    }
1981                    leading_break.clear();
1982                }
1983            } else {
1984                string.push_str(&whitespaces);
1985                whitespaces.clear();
1986            }
1987        } // loop
1988
1989        // Eat the right quote.
1990        self.skip_non_blank();
1991        // Ensure there is no invalid trailing content.
1992        self.skip_ws_to_eol(SkipTabs::Yes)?;
1993        match self.ch() {
1994            // These can be encountered in flow sequences or mappings.
1995            ',' | '}' | ']' if self.flow_level > 0 => {}
1996            // An end-of-line / end-of-stream is fine. No trailing content.
1997            c if is_breakz(c) => {}
1998            // ':' can be encountered if our scalar is a key.
1999            // Outside of flow contexts, keys cannot span multiple lines
2000            ':' if self.flow_level == 0 && start_mark.line == self.mark.line => {}
2001            // Inside a flow context, this is allowed.
2002            ':' if self.flow_level > 0 => {}
2003            _ => {
2004                return Err(ScanError::new(
2005                    self.mark,
2006                    "invalid trailing content after double-quoted scalar",
2007                ));
2008            }
2009        }
2010
2011        let style = if single {
2012            TScalarStyle::SingleQuoted
2013        } else {
2014            TScalarStyle::DoubleQuoted
2015        };
2016        Ok(Token(start_mark, TokenType::Scalar(style, string)))
2017    }
2018
2019    /// Consume successive non-whitespace characters from a flow scalar.
2020    ///
2021    /// This function resolves escape sequences and stops upon encountering a whitespace, the end
2022    /// of the stream or the closing character for the scalar (`'` for single quoted scalars, `"`
2023    /// for double quoted scalars).
2024    ///
2025    /// # Errors
2026    /// Return an error if an invalid escape sequence is found.
2027    fn consume_flow_scalar_non_whitespace_chars(
2028        &mut self,
2029        single: bool,
2030        string: &mut String,
2031        leading_blanks: &mut bool,
2032        start_mark: &Marker,
2033    ) -> Result<(), ScanError> {
2034        self.lookahead(2);
2035        while !is_blank_or_breakz(self.ch()) {
2036            match self.ch() {
2037                // Check for an escaped single quote.
2038                '\'' if self.buffer[1] == '\'' && single => {
2039                    string.push('\'');
2040                    self.skip_n_non_blank(2);
2041                }
2042                // Check for the right quote.
2043                '\'' if single => break,
2044                '"' if !single => break,
2045                // Check for an escaped line break.
2046                '\\' if !single && is_break(self.buffer[1]) => {
2047                    self.lookahead(3);
2048                    self.skip_non_blank();
2049                    self.skip_linebreak();
2050                    *leading_blanks = true;
2051                    break;
2052                }
2053                // Check for an escape sequence.
2054                '\\' if !single => {
2055                    string.push(self.resolve_flow_scalar_escape_sequence(start_mark)?);
2056                }
2057                c => {
2058                    string.push(c);
2059                    self.skip_non_blank();
2060                }
2061            }
2062            self.lookahead(2);
2063        }
2064        Ok(())
2065    }
2066
2067    /// Escape the sequence we encounter in a flow scalar.
2068    ///
2069    /// `self.ch()` must point to the `\` starting the escape sequence.
2070    ///
2071    /// # Errors
2072    /// Return an error if an invalid escape sequence is found.
2073    fn resolve_flow_scalar_escape_sequence(
2074        &mut self,
2075        start_mark: &Marker,
2076    ) -> Result<char, ScanError> {
2077        let mut code_length = 0usize;
2078        let mut ret = '\0';
2079
2080        match self.buffer[1] {
2081            '0' => ret = '\0',
2082            'a' => ret = '\x07',
2083            'b' => ret = '\x08',
2084            't' | '\t' => ret = '\t',
2085            'n' => ret = '\n',
2086            'v' => ret = '\x0b',
2087            'f' => ret = '\x0c',
2088            'r' => ret = '\x0d',
2089            'e' => ret = '\x1b',
2090            ' ' => ret = '\x20',
2091            '"' => ret = '"',
2092            '/' => ret = '/',
2093            '\\' => ret = '\\',
2094            // Unicode next line (#x85)
2095            'N' => ret = char::from_u32(0x85).unwrap(),
2096            // Unicode non-breaking space (#xA0)
2097            '_' => ret = char::from_u32(0xA0).unwrap(),
2098            // Unicode line separator (#x2028)
2099            'L' => ret = char::from_u32(0x2028).unwrap(),
2100            // Unicode paragraph separator (#x2029)
2101            'P' => ret = char::from_u32(0x2029).unwrap(),
2102            'x' => code_length = 2,
2103            'u' => code_length = 4,
2104            'U' => code_length = 8,
2105            _ => {
2106                return Err(ScanError::new(
2107                    *start_mark,
2108                    "while parsing a quoted scalar, found unknown escape character",
2109                ))
2110            }
2111        }
2112        self.skip_n_non_blank(2);
2113
2114        // Consume an arbitrary escape code.
2115        if code_length > 0 {
2116            self.lookahead(code_length);
2117            let mut value = 0u32;
2118            for i in 0..code_length {
2119                if !is_hex(self.buffer[i]) {
2120                    return Err(ScanError::new(
2121                        *start_mark,
2122                        "while parsing a quoted scalar, did not find expected hexadecimal number",
2123                    ));
2124                }
2125                value = (value << 4) + as_hex(self.buffer[i]);
2126            }
2127
2128            let Some(ch) = char::from_u32(value) else {
2129                return Err(ScanError::new(
2130                    *start_mark,
2131                    "while parsing a quoted scalar, found invalid Unicode character escape code",
2132                ));
2133            };
2134            ret = ch;
2135
2136            self.skip_n_non_blank(code_length);
2137        }
2138        Ok(ret)
2139    }
2140
2141    fn fetch_plain_scalar(&mut self) -> ScanResult {
2142        self.save_simple_key();
2143        self.disallow_simple_key();
2144
2145        let tok = self.scan_plain_scalar()?;
2146
2147        self.tokens.push_back(tok);
2148        Ok(())
2149    }
2150
2151    /// Scan for a plain scalar.
2152    ///
2153    /// Plain scalars are the most readable but restricted style. They may span multiple lines in
2154    /// some contexts.
2155    #[allow(clippy::too_many_lines)]
2156    fn scan_plain_scalar(&mut self) -> Result<Token, ScanError> {
2157        self.unroll_non_block_indents();
2158        let indent = self.indent + 1;
2159        let start_mark = self.mark;
2160
2161        if self.flow_level > 0 && (start_mark.col as isize) < indent {
2162            return Err(ScanError::new(
2163                start_mark,
2164                "invalid indentation in flow construct",
2165            ));
2166        }
2167
2168        let mut string = String::with_capacity(32);
2169        let mut leading_break = String::with_capacity(32);
2170        let mut trailing_breaks = String::with_capacity(32);
2171        let mut whitespaces = String::with_capacity(32);
2172
2173        loop {
2174            self.lookahead(4);
2175            if self.next_is_document_indicator() || self.ch() == '#' {
2176                break;
2177            }
2178
2179            if self.flow_level > 0 && self.ch() == '-' && is_flow(self.buffer[1]) {
2180                return Err(ScanError::new(
2181                    self.mark,
2182                    "plain scalar cannot start with '-' followed by ,[]{}",
2183                ));
2184            }
2185
2186            if !is_blank_or_breakz(self.ch()) && self.next_can_be_plain_scalar() {
2187                if self.leading_whitespace {
2188                    if leading_break.is_empty() {
2189                        string.push_str(&leading_break);
2190                        string.push_str(&trailing_breaks);
2191                        trailing_breaks.clear();
2192                        leading_break.clear();
2193                    } else {
2194                        if trailing_breaks.is_empty() {
2195                            string.push(' ');
2196                        } else {
2197                            string.push_str(&trailing_breaks);
2198                            trailing_breaks.clear();
2199                        }
2200                        leading_break.clear();
2201                    }
2202                    self.leading_whitespace = false;
2203                } else if !whitespaces.is_empty() {
2204                    string.push_str(&whitespaces);
2205                    whitespaces.clear();
2206                }
2207
2208                // We can unroll the first iteration of the loop.
2209                string.push(self.ch());
2210                self.skip_non_blank();
2211                self.lookahead(2);
2212
2213                // Add content non-blank characters to the scalar.
2214                while !is_blank_or_breakz(self.ch()) {
2215                    if !self.next_can_be_plain_scalar() {
2216                        break;
2217                    }
2218
2219                    string.push(self.ch());
2220                    self.skip_non_blank();
2221                    self.lookahead(2);
2222                }
2223            }
2224
2225            // We may reach the end of a plain scalar if:
2226            //  - We reach eof
2227            //  - We reach ": "
2228            //  - We find a flow character in a flow context
2229            if !(is_blank(self.ch()) || is_break(self.ch())) {
2230                break;
2231            }
2232
2233            // Process blank characters.
2234            while is_blank(self.look_ch()) || is_break(self.ch()) {
2235                if is_blank(self.ch()) {
2236                    if !self.leading_whitespace {
2237                        whitespaces.push(self.ch());
2238                        self.skip_blank();
2239                    } else if (self.mark.col as isize) < indent && self.ch() == '\t' {
2240                        // Tabs in an indentation columns are allowed if and only if the line is
2241                        // empty. Skip to the end of the line.
2242                        self.skip_ws_to_eol(SkipTabs::Yes)?;
2243                        if !is_breakz(self.ch()) {
2244                            return Err(ScanError::new(
2245                                start_mark,
2246                                "while scanning a plain scalar, found a tab",
2247                            ));
2248                        }
2249                    } else {
2250                        self.skip_blank();
2251                    }
2252                } else {
2253                    self.lookahead(2);
2254                    // Check if it is a first line break
2255                    if self.leading_whitespace {
2256                        self.read_break(&mut trailing_breaks);
2257                    } else {
2258                        whitespaces.clear();
2259                        self.read_break(&mut leading_break);
2260                        self.leading_whitespace = true;
2261                    }
2262                }
2263            }
2264
2265            // check indentation level
2266            if self.flow_level == 0 && (self.mark.col as isize) < indent {
2267                break;
2268            }
2269        }
2270
2271        if self.leading_whitespace {
2272            self.allow_simple_key();
2273        }
2274
2275        Ok(Token(
2276            start_mark,
2277            TokenType::Scalar(TScalarStyle::Plain, string),
2278        ))
2279    }
2280
2281    fn fetch_key(&mut self) -> ScanResult {
2282        let start_mark = self.mark;
2283        if self.flow_level == 0 {
2284            // Check if we are allowed to start a new key (not necessarily simple).
2285            if !self.simple_key_allowed {
2286                return Err(ScanError::new(
2287                    self.mark,
2288                    "mapping keys are not allowed in this context",
2289                ));
2290            }
2291            self.roll_indent(
2292                start_mark.col,
2293                None,
2294                TokenType::BlockMappingStart,
2295                start_mark,
2296            );
2297        } else {
2298            // The parser, upon receiving a `Key`, will insert a `MappingStart` event.
2299            self.flow_mapping_started = true;
2300        }
2301
2302        self.remove_simple_key()?;
2303
2304        if self.flow_level == 0 {
2305            self.allow_simple_key();
2306        } else {
2307            self.disallow_simple_key();
2308        }
2309
2310        self.skip_non_blank();
2311        self.skip_yaml_whitespace()?;
2312        if self.ch() == '\t' {
2313            return Err(ScanError::new(
2314                self.mark(),
2315                "tabs disallowed in this context",
2316            ));
2317        }
2318        self.tokens.push_back(Token(start_mark, TokenType::Key));
2319        Ok(())
2320    }
2321
2322    /// Fetch a value from a mapping (after a `:`).
2323    fn fetch_value(&mut self) -> ScanResult {
2324        let sk = self.simple_keys.last().unwrap().clone();
2325        let start_mark = self.mark;
2326        self.implicit_flow_mapping = self.flow_level > 0 && !self.flow_mapping_started;
2327
2328        // Skip over ':'.
2329        self.skip_non_blank();
2330        if self.look_ch() == '\t'
2331            && !self.skip_ws_to_eol(SkipTabs::Yes)?.has_valid_yaml_ws()
2332            && (self.ch() == '-' || is_alpha(self.ch()))
2333        {
2334            return Err(ScanError::new(
2335                self.mark,
2336                "':' must be followed by a valid YAML whitespace",
2337            ));
2338        }
2339
2340        if sk.possible {
2341            // insert simple key
2342            let tok = Token(sk.mark, TokenType::Key);
2343            self.insert_token(sk.token_number - self.tokens_parsed, tok);
2344            if self.implicit_flow_mapping {
2345                if sk.mark.line < start_mark.line {
2346                    return Err(ScanError::new(
2347                        start_mark,
2348                        "illegal placement of ':' indicator",
2349                    ));
2350                }
2351                self.insert_token(
2352                    sk.token_number - self.tokens_parsed,
2353                    Token(self.mark, TokenType::FlowMappingStart),
2354                );
2355            }
2356
2357            // Add the BLOCK-MAPPING-START token if needed.
2358            self.roll_indent(
2359                sk.mark.col,
2360                Some(sk.token_number),
2361                TokenType::BlockMappingStart,
2362                start_mark,
2363            );
2364            self.roll_one_col_indent();
2365
2366            self.simple_keys.last_mut().unwrap().possible = false;
2367            self.disallow_simple_key();
2368        } else {
2369            if self.implicit_flow_mapping {
2370                self.tokens
2371                    .push_back(Token(self.mark, TokenType::FlowMappingStart));
2372            }
2373            // The ':' indicator follows a complex key.
2374            if self.flow_level == 0 {
2375                if !self.simple_key_allowed {
2376                    return Err(ScanError::new(
2377                        start_mark,
2378                        "mapping values are not allowed in this context",
2379                    ));
2380                }
2381
2382                self.roll_indent(
2383                    start_mark.col,
2384                    None,
2385                    TokenType::BlockMappingStart,
2386                    start_mark,
2387                );
2388            }
2389            self.roll_one_col_indent();
2390
2391            if self.flow_level == 0 {
2392                self.allow_simple_key();
2393            } else {
2394                self.disallow_simple_key();
2395            }
2396        }
2397        self.tokens.push_back(Token(start_mark, TokenType::Value));
2398
2399        Ok(())
2400    }
2401
2402    /// Add an indentation level to the stack with the given block token, if needed.
2403    ///
2404    /// An indentation level is added only if:
2405    ///   - We are not in a flow-style construct (which don't have indentation per-se).
2406    ///   - The current column is further indented than the last indent we have registered.
2407    fn roll_indent(&mut self, col: usize, number: Option<usize>, tok: TokenType, mark: Marker) {
2408        if self.flow_level > 0 {
2409            return;
2410        }
2411
2412        // If the last indent was a non-block indent, remove it.
2413        // This means that we prepared an indent that we thought we wouldn't use, but realized just
2414        // now that it is a block indent.
2415        if self.indent <= col as isize {
2416            if let Some(indent) = self.indents.last() {
2417                if !indent.needs_block_end {
2418                    self.indent = indent.indent;
2419                    self.indents.pop();
2420                }
2421            }
2422        }
2423
2424        if self.indent < col as isize {
2425            self.indents.push(Indent {
2426                indent: self.indent,
2427                needs_block_end: true,
2428            });
2429            self.indent = col as isize;
2430            let tokens_parsed = self.tokens_parsed;
2431            match number {
2432                Some(n) => self.insert_token(n - tokens_parsed, Token(mark, tok)),
2433                None => self.tokens.push_back(Token(mark, tok)),
2434            }
2435        }
2436    }
2437
2438    /// Pop indentation levels from the stack as much as needed.
2439    ///
2440    /// Indentation levels are popped from the stack while they are further indented than `col`.
2441    /// If we are in a flow-style construct (which don't have indentation per-se), this function
2442    /// does nothing.
2443    fn unroll_indent(&mut self, col: isize) {
2444        if self.flow_level > 0 {
2445            return;
2446        }
2447        while self.indent > col {
2448            let indent = self.indents.pop().unwrap();
2449            self.indent = indent.indent;
2450            if indent.needs_block_end {
2451                self.tokens.push_back(Token(self.mark, TokenType::BlockEnd));
2452            }
2453        }
2454    }
2455
2456    /// Add an indentation level of 1 column that does not start a block.
2457    ///
2458    /// See the documentation of [`Indent::needs_block_end`] for more details.
2459    /// An indentation is not added if we are inside a flow level or if the last indent is already
2460    /// a non-block indent.
2461    fn roll_one_col_indent(&mut self) {
2462        if self.flow_level == 0 && self.indents.last().is_some_and(|x| x.needs_block_end) {
2463            self.indents.push(Indent {
2464                indent: self.indent,
2465                needs_block_end: false,
2466            });
2467            self.indent += 1;
2468        }
2469    }
2470
2471    /// Unroll all last indents created with [`Self::roll_one_col_indent`].
2472    fn unroll_non_block_indents(&mut self) {
2473        while let Some(indent) = self.indents.last() {
2474            if indent.needs_block_end {
2475                break;
2476            }
2477            self.indent = indent.indent;
2478            self.indents.pop();
2479        }
2480    }
2481
2482    /// Mark the next token to be inserted as a potential simple key.
2483    fn save_simple_key(&mut self) {
2484        if self.simple_key_allowed {
2485            let required = self.flow_level == 0
2486                && self.indent == (self.mark.col as isize)
2487                && self.indents.last().unwrap().needs_block_end;
2488            let mut sk = SimpleKey::new(self.mark);
2489            sk.possible = true;
2490            sk.required = required;
2491            sk.token_number = self.tokens_parsed + self.tokens.len();
2492
2493            self.simple_keys.pop();
2494            self.simple_keys.push(sk);
2495        }
2496    }
2497
2498    fn remove_simple_key(&mut self) -> ScanResult {
2499        let last = self.simple_keys.last_mut().unwrap();
2500        if last.possible && last.required {
2501            return Err(ScanError::new(self.mark, "simple key expected"));
2502        }
2503
2504        last.possible = false;
2505        Ok(())
2506    }
2507
2508    /// Check whether the next characters may be part of a plain scalar.
2509    ///
2510    /// This function assumes we are not given a blankz character.
2511    // For some reason, `#[inline]` is not enough.
2512    #[allow(clippy::inline_always)]
2513    #[inline(always)]
2514    fn next_can_be_plain_scalar(&self) -> bool {
2515        match self.ch() {
2516            // indicators can end a plain scalar, see 7.3.3. Plain Style
2517            ':' if is_blank_or_breakz(self.buffer[1])
2518                || (self.flow_level > 0 && is_flow(self.buffer[1])) =>
2519            {
2520                false
2521            }
2522            c if self.flow_level > 0 && is_flow(c) => false,
2523            _ => true,
2524        }
2525    }
2526
2527    /// Return whether the scanner is inside a block but outside of a flow sequence.
2528    fn is_within_block(&self) -> bool {
2529        !self.indents.is_empty()
2530    }
2531
2532    /// If an implicit mapping had started, end it.
2533    fn end_implicit_mapping(&mut self, mark: Marker) {
2534        if self.implicit_flow_mapping {
2535            self.implicit_flow_mapping = false;
2536            self.flow_mapping_started = false;
2537            self.tokens
2538                .push_back(Token(mark, TokenType::FlowMappingEnd));
2539        }
2540    }
2541}
2542
/// Behavior to adopt regarding treating tabs as whitespace.
///
/// Although tab is a valid yaml whitespace, it doesn't always behave the same as a space.
///
/// This enum serves two purposes: [`SkipTabs::Yes`] and [`SkipTabs::No`] describe the requested
/// behavior, while [`SkipTabs::Result`] carries what was observed while skipping. The accessors
/// on this type (`found_tabs`, `has_valid_yaml_ws`) only ever return `true` for the `Result`
/// variant.
// NOTE(review): presumably `Yes` / `No` are the inputs of `skip_ws_to_eol` and `Result` is its
// output — that function is not visible from here, confirm against its definition.
#[derive(Copy, Clone, Eq, PartialEq)]
enum SkipTabs {
    /// Skip all tabs as whitespace.
    Yes,
    /// Don't skip any tab. Return from the function when encountering one.
    No,
    /// Return value from the function.
    ///
    /// The first field is read by `found_tabs`, the second by `has_valid_yaml_ws`.
    Result(
        /// Whether tabs were encountered.
        bool,
        /// Whether at least 1 valid yaml whitespace has been encountered.
        bool,
    ),
}
2560
2561impl SkipTabs {
2562    /// Whether tabs were found while skipping whitespace.
2563    ///
2564    /// This function must be called after a call to `skip_ws_to_eol`.
2565    fn found_tabs(self) -> bool {
2566        matches!(self, SkipTabs::Result(true, _))
2567    }
2568
2569    /// Whether a valid YAML whitespace has been found in skipped-over content.
2570    ///
2571    /// This function must be called after a call to `skip_ws_to_eol`.
2572    fn has_valid_yaml_ws(self) -> bool {
2573        matches!(self, SkipTabs::Result(_, true))
2574    }
2575}
2576
/// Chomping, how final line breaks and trailing empty lines are interpreted.
///
/// This applies to block scalars (literal `|` and folded `>` styles).
///
/// See YAML spec 8.1.1.2.
// Derives are aligned with the other public enums of this module (`TEncoding`,
// `TScalarStyle`): a field-less public enum should be printable and freely copyable.
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
pub enum Chomping {
    /// The final line break and any trailing empty lines are excluded.
    Strip,
    /// The final line break is preserved, but trailing empty lines are excluded.
    Clip,
    /// The final line break and trailing empty lines are included.
    Keep,
}
2589
#[cfg(test)]
mod test {
    use super::is_anchor_char;

    /// Smoke test: the ASCII letter `x` is accepted as an anchor character.
    #[test]
    fn test_is_anchor_char() {
        let accepted = is_anchor_char('x');
        assert!(accepted);
    }
}