toml_span/
tokens.rs

1#![allow(missing_docs)]
2//! The tokenizer is publicly exposed if you wish to use it instead
3
4use crate::{value::Key, Span};
5use std::{borrow::Cow, char, str};
6
/// A single lexical token produced by the tokenizer.
///
/// Tokens borrow from the input they were read from, so cloning one is cheap
/// unless it is a [`Token::String`] whose value had to be unescaped.
#[derive(Clone, Eq, PartialEq, Debug)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line break; `\r\n` pairs are reported as a single newline.
    Newline,
    /// A comment, starting at its `#`.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII alphanumerics, `-` and `_`.
    Keylike(&'a str),
    /// A basic (`"`) or literal (`'`) string.
    String {
        /// The source text of the whole token, delimiters included.
        src: &'a str,
        /// The string's value; borrowed from the input unless it had to be
        /// rebuilt (escape sequences, folded `\r\n` line breaks).
        val: Cow<'a, str>,
        /// `true` when the string used the triple-delimiter form.
        multiline: bool,
    },
}
30
/// Errors reported by the tokenizer.
///
/// The `usize` payloads are byte offsets unless noted otherwise.
#[derive(Clone, Eq, PartialEq, Debug)]
pub enum Error {
    /// A character that is not allowed inside a string, and where it was found.
    InvalidCharInString(usize, char),
    /// A backslash was followed by a character that does not form an escape;
    /// carries that character and its offset.
    InvalidEscape(usize, char),
    /// A `\u`/`\U` escape contained a character that is not a hex digit.
    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape decoded to a value that is not a valid `char`.
    /// Carries the offset of the `u`/`U`, the number of hex digits the escape
    /// takes, and the decoded value.
    InvalidEscapeValue(usize, usize, u32),
    /// A line break inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// The input ended inside a string; carries the offset of the string's
    /// opening delimiter.
    UnterminatedString(usize),
    /// A multiline string was used as a table key; carries the range the
    /// tokenizer attributes to it.
    MultilineStringKey(usize, usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Where the unexpected token starts (the input length at end of input).
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was actually there.
        found: &'static str,
    },
}
47
/// Splits TOML source text into [`Token`]s.
///
/// The tokenizer is cheap to clone; peeking is implemented by stepping a
/// clone and throwing it away.
#[derive(Clone, Debug)]
pub struct Tokenizer<'a> {
    /// The complete source text being tokenized.
    input: &'a str,
    /// Cursor over the characters of `input` that have not been consumed yet.
    chars: CrlfFold<'a>,
}

/// Character cursor that reports a `\r\n` pair as a single `\n`.
#[derive(Clone, Debug)]
struct CrlfFold<'a> {
    /// The underlying cursor, positioned at the next unread character.
    chars: str::CharIndices<'a>,
}

/// The value of a string token while it is being read.
#[derive(Debug)]
enum MaybeString {
    /// The value is a verbatim slice of the input beginning at this byte
    /// offset, so nothing has been copied.
    NotEscaped(usize),
    /// The value differs from the source text and is accumulated here.
    Owned(String),
}
64
65impl<'a> Tokenizer<'a> {
66    pub fn new(input: &'a str) -> Tokenizer<'a> {
67        let mut t = Tokenizer {
68            input,
69            chars: CrlfFold {
70                chars: input.char_indices(),
71            },
72        };
73        // Eat utf-8 BOM
74        t.eatc('\u{feff}');
75        t
76    }
77
78    pub fn step(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
79        let (start, token) = match self.one() {
80            Some((start, '\n')) => (start, Token::Newline),
81            Some((start, ' ' | '\t')) => (start, self.whitespace_token(start)),
82            Some((start, '#')) => (start, self.comment_token(start)),
83            Some((start, '=')) => (start, Token::Equals),
84            Some((start, '.')) => (start, Token::Period),
85            Some((start, ',')) => (start, Token::Comma),
86            Some((start, ':')) => (start, Token::Colon),
87            Some((start, '+')) => (start, Token::Plus),
88            Some((start, '{')) => (start, Token::LeftBrace),
89            Some((start, '}')) => (start, Token::RightBrace),
90            Some((start, '[')) => (start, Token::LeftBracket),
91            Some((start, ']')) => (start, Token::RightBracket),
92            Some((start, '\'')) => return self.literal_string(start).map(|(s, t)| Some((s, t))),
93            Some((start, '"')) => return self.basic_string(start).map(|(s, t)| Some((s, t))),
94            Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
95            Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
96            None => return Ok(None),
97        };
98
99        let span = self.step_span(start);
100        Ok(Some((span, token)))
101    }
102
103    pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
104        self.clone().step()
105    }
106
107    pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
108        self.eat_spanned(expected).map(|s| s.is_some())
109    }
110
111    /// Eat a value, returning it's span if it was consumed.
112    pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
113        let span = match self.peek()? {
114            Some((span, ref found)) if expected == *found => span,
115            Some(_) | None => return Ok(None),
116        };
117
118        drop(self.step());
119        Ok(Some(span))
120    }
121
122    pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
123        // ignore span
124        let _ = self.expect_spanned(expected)?;
125        Ok(())
126    }
127
128    /// Expect the given token returning its span.
129    pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
130        let current = self.current();
131        match self.step()? {
132            Some((span, found)) => {
133                if expected == found {
134                    Ok(span)
135                } else {
136                    Err(Error::Wanted {
137                        at: current,
138                        expected: expected.describe(),
139                        found: found.describe(),
140                    })
141                }
142            }
143            None => Err(Error::Wanted {
144                at: self.input.len(),
145                expected: expected.describe(),
146                found: "eof",
147            }),
148        }
149    }
150
151    pub fn table_key(&mut self) -> Result<Key<'a>, Error> {
152        let current = self.current();
153        match self.step()? {
154            Some((span, Token::Keylike(k))) => Ok(Key {
155                span,
156                name: k.into(),
157            }),
158            Some((
159                span,
160                Token::String {
161                    src,
162                    val,
163                    multiline,
164                    ..
165                },
166            )) => {
167                let offset = self.substr_offset(src);
168                if multiline {
169                    return Err(Error::MultilineStringKey(offset, offset + val.len()));
170                }
171                match src.find('\n') {
172                    None => Ok(Key { span, name: val }),
173                    // This is not reachable
174                    Some(i) => Err(Error::InvalidCharInString(i, '\n')),
175                }
176            }
177            Some((_, other)) => Err(Error::Wanted {
178                at: current,
179                expected: "a table key",
180                found: other.describe(),
181            }),
182            None => Err(Error::Wanted {
183                at: self.input.len(),
184                expected: "a table key",
185                found: "eof",
186            }),
187        }
188    }
189
190    pub fn eat_whitespace(&mut self) {
191        while self.eatc(' ') || self.eatc('\t') {
192            // ...
193        }
194    }
195
196    pub fn eat_comment(&mut self) -> Result<bool, Error> {
197        if !self.eatc('#') {
198            return Ok(false);
199        }
200        drop(self.comment_token(0));
201        self.eat_newline_or_eof().map(|()| true)
202    }
203
204    pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
205        let current = self.current();
206        match self.step()? {
207            None | Some((_, Token::Newline)) => Ok(()),
208            Some((_, other)) => Err(Error::Wanted {
209                at: current,
210                expected: "newline",
211                found: other.describe(),
212            }),
213        }
214    }
215
216    pub fn skip_to_newline(&mut self) {
217        loop {
218            match self.one() {
219                Some((_, '\n')) | None => break,
220                _ => {}
221            }
222        }
223    }
224
225    fn eatc(&mut self, ch: char) -> bool {
226        match self.chars.clone().next() {
227            Some((_, ch2)) if ch == ch2 => {
228                self.one();
229                true
230            }
231            _ => false,
232        }
233    }
234
235    pub fn current(&mut self) -> usize {
236        match self.chars.clone().next() {
237            Some(i) => i.0,
238            None => self.input.len(),
239        }
240    }
241
242    fn whitespace_token(&mut self, start: usize) -> Token<'a> {
243        while self.eatc(' ') || self.eatc('\t') {
244            // ...
245        }
246        Token::Whitespace(&self.input[start..self.current()])
247    }
248
249    fn comment_token(&mut self, start: usize) -> Token<'a> {
250        while let Some((_, ch)) = self.chars.clone().next() {
251            if ch != '\t' && !('\u{20}'..='\u{10ffff}').contains(&ch) {
252                break;
253            }
254            self.one();
255        }
256        Token::Comment(&self.input[start..self.current()])
257    }
258
259    /// String spans are treated slightly differently, as we only want the
260    /// characters in the string, not the quotes, as once the user gets the
261    /// string and its span they won't know the actual begin/end which can
262    /// be needed for doing substring indices (eg reporting error messages
263    /// when parsing a string)
264    #[allow(clippy::type_complexity)]
265    fn read_string(
266        &mut self,
267        delim: char,
268        start: usize,
269        new_ch: &mut dyn FnMut(
270            &mut Tokenizer<'_>,
271            &mut MaybeString,
272            bool,
273            usize,
274            char,
275        ) -> Result<(), Error>,
276    ) -> Result<(Span, Token<'a>), Error> {
277        let mut multiline = false;
278        if self.eatc(delim) {
279            if self.eatc(delim) {
280                multiline = true;
281            } else {
282                return Ok((
283                    // Point the caret at the beginning of the quote, that looks
284                    // better than the end quote
285                    (start..start + 1).into(),
286                    Token::String {
287                        src: &self.input[start..start + 2],
288                        val: Cow::Borrowed(""),
289                        multiline: false,
290                    },
291                ));
292            }
293        }
294        let mut val = MaybeString::NotEscaped(self.current());
295        let mut n = 0;
296        loop {
297            n += 1;
298            match self.one() {
299                Some((i, '\n')) => {
300                    if multiline {
301                        if self.input.as_bytes()[i] == b'\r' {
302                            val.make_owned(&self.input[..i]);
303                        }
304                        if n == 1 {
305                            val = MaybeString::NotEscaped(self.current());
306                        } else {
307                            val.push('\n');
308                        }
309                    } else {
310                        return Err(Error::NewlineInString(i));
311                    }
312                }
313                Some((mut i, ch)) if ch == delim => {
314                    let span = if multiline {
315                        if !self.eatc(delim) {
316                            val.push(delim);
317                            continue;
318                        }
319                        if !self.eatc(delim) {
320                            val.push(delim);
321                            val.push(delim);
322                            continue;
323                        }
324                        if self.eatc(delim) {
325                            val.push(delim);
326                            i += 1;
327                        }
328                        if self.eatc(delim) {
329                            val.push(delim);
330                            i += 1;
331                        }
332
333                        // Also skip the first newline after the opening delimiters
334                        let maybe_nl = self.input.as_bytes()[start + 3];
335                        let start_off = if maybe_nl == b'\n' {
336                            4
337                        } else if maybe_nl == b'\r' {
338                            5
339                        } else {
340                            3
341                        };
342
343                        start + start_off..self.current() - 3
344                    } else {
345                        start + 1..self.current() - 1
346                    }
347                    .into();
348
349                    return Ok((
350                        span,
351                        Token::String {
352                            src: &self.input[start..self.current()],
353                            val: val.into_cow(&self.input[..i]),
354                            multiline,
355                        },
356                    ));
357                }
358                Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
359                None => return Err(Error::UnterminatedString(start)),
360            }
361        }
362    }
363
364    fn literal_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
365        self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
366            if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) {
367                val.push(ch);
368                Ok(())
369            } else {
370                Err(Error::InvalidCharInString(i, ch))
371            }
372        })
373    }
374
375    fn basic_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
376        self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
377            '\\' => {
378                val.make_owned(&me.input[..i]);
379                match me.chars.next() {
380                    Some((_, '"')) => val.push('"'),
381                    Some((_, '\\')) => val.push('\\'),
382                    Some((_, 'b')) => val.push('\u{8}'),
383                    Some((_, 'f')) => val.push('\u{c}'),
384                    Some((_, 'n')) => val.push('\n'),
385                    Some((_, 'r')) => val.push('\r'),
386                    Some((_, 't')) => val.push('\t'),
387                    Some((i, c @ ('u' | 'U'))) => {
388                        let c = if c == 'u' {
389                            me.hex::<4>(start, i)
390                        } else {
391                            me.hex::<8>(start, i)
392                        };
393                        val.push(c?);
394                    }
395                    Some((i, c @ (' ' | '\t' | '\n'))) if multi => {
396                        if c != '\n' {
397                            while let Some((_, ch)) = me.chars.clone().next() {
398                                match ch {
399                                    ' ' | '\t' => {
400                                        me.chars.next();
401                                        continue;
402                                    }
403                                    '\n' => {
404                                        me.chars.next();
405                                        break;
406                                    }
407                                    _ => return Err(Error::InvalidEscape(i, c)),
408                                }
409                            }
410                        }
411                        while let Some((_, ch)) = me.chars.clone().next() {
412                            match ch {
413                                ' ' | '\t' | '\n' => {
414                                    me.chars.next();
415                                }
416                                _ => break,
417                            }
418                        }
419                    }
420                    Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
421                    None => return Err(Error::UnterminatedString(start)),
422                }
423                Ok(())
424            }
425            ch if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) => {
426                val.push(ch);
427                Ok(())
428            }
429            _ => Err(Error::InvalidCharInString(i, ch)),
430        })
431    }
432
433    fn hex<const N: usize>(&mut self, start: usize, i: usize) -> Result<char, Error> {
434        let mut buf = [0; N];
435        for b in buf.iter_mut() {
436            match self.one() {
437                Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => *b = ch as u8,
438                Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
439                None => return Err(Error::UnterminatedString(start)),
440            }
441        }
442        let val = u32::from_str_radix(std::str::from_utf8(&buf).unwrap(), 16).unwrap();
443        match char::from_u32(val) {
444            Some(ch) => Ok(ch),
445            None => Err(Error::InvalidEscapeValue(i, N, val)),
446        }
447    }
448
449    fn keylike(&mut self, start: usize) -> Token<'a> {
450        while let Some((_, ch)) = self.peek_one() {
451            if !is_keylike(ch) {
452                break;
453            }
454            self.one();
455        }
456        Token::Keylike(&self.input[start..self.current()])
457    }
458
459    pub fn substr_offset(&self, s: &'a str) -> usize {
460        assert!(s.len() <= self.input.len());
461        let a = self.input.as_ptr() as usize;
462        let b = s.as_ptr() as usize;
463        assert!(a <= b);
464        b - a
465    }
466
467    /// Calculate the span of a single character.
468    fn step_span(&mut self, start: usize) -> Span {
469        let end = match self.peek_one() {
470            Some(t) => t.0,
471            None => self.input.len(),
472        };
473        Span { start, end }
474    }
475
476    /// Peek one char without consuming it.
477    fn peek_one(&mut self) -> Option<(usize, char)> {
478        self.chars.clone().next()
479    }
480
481    /// Take one char.
482    pub fn one(&mut self) -> Option<(usize, char)> {
483        self.chars.next()
484    }
485}
486
487impl Iterator for CrlfFold<'_> {
488    type Item = (usize, char);
489
490    fn next(&mut self) -> Option<(usize, char)> {
491        self.chars.next().map(|(i, c)| {
492            if c == '\r' {
493                let mut attempt = self.chars.clone();
494                if let Some((_, '\n')) = attempt.next() {
495                    self.chars = attempt;
496                    return (i, '\n');
497                }
498            }
499            (i, c)
500        })
501    }
502}
503
504impl MaybeString {
505    fn push(&mut self, ch: char) {
506        match *self {
507            MaybeString::NotEscaped(..) => {}
508            MaybeString::Owned(ref mut s) => s.push(ch),
509        }
510    }
511
512    fn make_owned(&mut self, input: &str) {
513        match *self {
514            MaybeString::NotEscaped(start) => {
515                *self = MaybeString::Owned(input[start..].to_owned());
516            }
517            MaybeString::Owned(..) => {}
518        }
519    }
520
521    fn into_cow(self, input: &str) -> Cow<'_, str> {
522        match self {
523            MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
524            MaybeString::Owned(s) => Cow::Owned(s),
525        }
526    }
527}
528
/// Reports whether `ch` may appear in a key-like token: an ASCII letter or
/// digit, `-` or `_`.
#[inline]
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
}
533
534impl Token<'_> {
535    pub fn describe(&self) -> &'static str {
536        match *self {
537            Token::Keylike(_) => "an identifier",
538            Token::Equals => "an equals",
539            Token::Period => "a period",
540            Token::Comment(_) => "a comment",
541            Token::Newline => "a newline",
542            Token::Whitespace(_) => "whitespace",
543            Token::Comma => "a comma",
544            Token::RightBrace => "a right brace",
545            Token::LeftBrace => "a left brace",
546            Token::RightBracket => "a right bracket",
547            Token::LeftBracket => "a left bracket",
548            Token::String { multiline, .. } => {
549                if multiline {
550                    "a multiline string"
551                } else {
552                    "a string"
553                }
554            }
555            Token::Colon => "a colon",
556            Token::Plus => "a plus",
557        }
558    }
559}