1#![allow(missing_docs)]
2use crate::{value::Key, Span};
5use std::{borrow::Cow, char, str};
6
/// A single lexical token produced by the tokenizer.
///
/// Borrowed payloads are slices of the tokenizer's input.
#[derive(Debug, PartialEq, Eq)]
pub enum Token<'a> {
    /// A run of spaces and/or tabs.
    Whitespace(&'a str),
    /// A line ending (`\n`, or `\r\n` folded into a single newline).
    Newline,
    /// The text of a comment.
    Comment(&'a str),

    /// `=`
    Equals,
    /// `.`
    Period,
    /// `,`
    Comma,
    /// `:`
    Colon,
    /// `+`
    Plus,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `[`
    LeftBracket,
    /// `]`
    RightBracket,

    /// A run of ASCII alphanumerics, `-` and `_`.
    Keylike(&'a str),
    /// A quoted string.
    String {
        /// The raw source text of the token, delimiters included.
        src: &'a str,
        /// The string's contents; borrowed from the input when possible,
        /// owned when the text had to be rebuilt.
        val: Cow<'a, str>,
        /// `true` when the string was opened with three delimiters.
        multiline: bool,
    },
}
30
/// Errors reported by the tokenizer.
///
/// `usize` payloads are byte offsets unless noted otherwise.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    /// A character that is not allowed inside a string, and where it was found.
    InvalidCharInString(usize, char),
    /// An unsupported character following a backslash, and where it was found.
    InvalidEscape(usize, char),
    /// A non-hexadecimal character inside a `\u`/`\U` escape.
    InvalidHexEscape(usize, char),
    /// A `\u`/`\U` escape whose value is not a valid `char`: position of the
    /// escape letter, number of hex digits, and the numeric value.
    InvalidEscapeValue(usize, usize, u32),
    /// A newline inside a single-line string.
    NewlineInString(usize),
    /// A character that cannot start any token.
    Unexpected(usize, char),
    /// Input ended before the string starting at this offset was closed.
    UnterminatedString(usize),
    /// A multiline string was used where a table key was required.
    MultilineStringKey(usize, usize),
    /// A specific token was required but something else was found.
    Wanted {
        /// Offset at which the expected token should have started.
        at: usize,
        /// Description of what was required.
        expected: &'static str,
        /// Description of what was actually found.
        found: &'static str,
    },
}
47
48#[derive(Clone)]
49pub struct Tokenizer<'a> {
50 input: &'a str,
51 chars: CrlfFold<'a>,
52}
53
/// Wrapper around `CharIndices`; its `Iterator` impl folds `\r\n` into `\n`.
#[derive(Clone)]
struct CrlfFold<'a> {
    /// The underlying `(byte offset, char)` iterator.
    chars: str::CharIndices<'a>,
}
58
/// Accumulator for a string value that avoids allocating until it has to.
#[derive(Debug)]
enum MaybeString {
    /// The value is still a plain slice of the input starting at this offset.
    NotEscaped(usize),
    /// The value had to be rebuilt and is stored here.
    Owned(String),
}
64
65impl<'a> Tokenizer<'a> {
66 pub fn new(input: &'a str) -> Tokenizer<'a> {
67 let mut t = Tokenizer {
68 input,
69 chars: CrlfFold {
70 chars: input.char_indices(),
71 },
72 };
73 t.eatc('\u{feff}');
75 t
76 }
77
78 pub fn step(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
79 let (start, token) = match self.one() {
80 Some((start, '\n')) => (start, Token::Newline),
81 Some((start, ' ' | '\t')) => (start, self.whitespace_token(start)),
82 Some((start, '#')) => (start, self.comment_token(start)),
83 Some((start, '=')) => (start, Token::Equals),
84 Some((start, '.')) => (start, Token::Period),
85 Some((start, ',')) => (start, Token::Comma),
86 Some((start, ':')) => (start, Token::Colon),
87 Some((start, '+')) => (start, Token::Plus),
88 Some((start, '{')) => (start, Token::LeftBrace),
89 Some((start, '}')) => (start, Token::RightBrace),
90 Some((start, '[')) => (start, Token::LeftBracket),
91 Some((start, ']')) => (start, Token::RightBracket),
92 Some((start, '\'')) => return self.literal_string(start).map(|(s, t)| Some((s, t))),
93 Some((start, '"')) => return self.basic_string(start).map(|(s, t)| Some((s, t))),
94 Some((start, ch)) if is_keylike(ch) => (start, self.keylike(start)),
95 Some((start, ch)) => return Err(Error::Unexpected(start, ch)),
96 None => return Ok(None),
97 };
98
99 let span = self.step_span(start);
100 Ok(Some((span, token)))
101 }
102
103 pub fn peek(&mut self) -> Result<Option<(Span, Token<'a>)>, Error> {
104 self.clone().step()
105 }
106
107 pub fn eat(&mut self, expected: Token<'a>) -> Result<bool, Error> {
108 self.eat_spanned(expected).map(|s| s.is_some())
109 }
110
111 pub fn eat_spanned(&mut self, expected: Token<'a>) -> Result<Option<Span>, Error> {
113 let span = match self.peek()? {
114 Some((span, ref found)) if expected == *found => span,
115 Some(_) | None => return Ok(None),
116 };
117
118 drop(self.step());
119 Ok(Some(span))
120 }
121
122 pub fn expect(&mut self, expected: Token<'a>) -> Result<(), Error> {
123 let _ = self.expect_spanned(expected)?;
125 Ok(())
126 }
127
128 pub fn expect_spanned(&mut self, expected: Token<'a>) -> Result<Span, Error> {
130 let current = self.current();
131 match self.step()? {
132 Some((span, found)) => {
133 if expected == found {
134 Ok(span)
135 } else {
136 Err(Error::Wanted {
137 at: current,
138 expected: expected.describe(),
139 found: found.describe(),
140 })
141 }
142 }
143 None => Err(Error::Wanted {
144 at: self.input.len(),
145 expected: expected.describe(),
146 found: "eof",
147 }),
148 }
149 }
150
151 pub fn table_key(&mut self) -> Result<Key<'a>, Error> {
152 let current = self.current();
153 match self.step()? {
154 Some((span, Token::Keylike(k))) => Ok(Key {
155 span,
156 name: k.into(),
157 }),
158 Some((
159 span,
160 Token::String {
161 src,
162 val,
163 multiline,
164 ..
165 },
166 )) => {
167 let offset = self.substr_offset(src);
168 if multiline {
169 return Err(Error::MultilineStringKey(offset, offset + val.len()));
170 }
171 match src.find('\n') {
172 None => Ok(Key { span, name: val }),
173 Some(i) => Err(Error::InvalidCharInString(i, '\n')),
175 }
176 }
177 Some((_, other)) => Err(Error::Wanted {
178 at: current,
179 expected: "a table key",
180 found: other.describe(),
181 }),
182 None => Err(Error::Wanted {
183 at: self.input.len(),
184 expected: "a table key",
185 found: "eof",
186 }),
187 }
188 }
189
190 pub fn eat_whitespace(&mut self) {
191 while self.eatc(' ') || self.eatc('\t') {
192 }
194 }
195
196 pub fn eat_comment(&mut self) -> Result<bool, Error> {
197 if !self.eatc('#') {
198 return Ok(false);
199 }
200 drop(self.comment_token(0));
201 self.eat_newline_or_eof().map(|()| true)
202 }
203
204 pub fn eat_newline_or_eof(&mut self) -> Result<(), Error> {
205 let current = self.current();
206 match self.step()? {
207 None | Some((_, Token::Newline)) => Ok(()),
208 Some((_, other)) => Err(Error::Wanted {
209 at: current,
210 expected: "newline",
211 found: other.describe(),
212 }),
213 }
214 }
215
216 pub fn skip_to_newline(&mut self) {
217 loop {
218 match self.one() {
219 Some((_, '\n')) | None => break,
220 _ => {}
221 }
222 }
223 }
224
225 fn eatc(&mut self, ch: char) -> bool {
226 match self.chars.clone().next() {
227 Some((_, ch2)) if ch == ch2 => {
228 self.one();
229 true
230 }
231 _ => false,
232 }
233 }
234
235 pub fn current(&mut self) -> usize {
236 match self.chars.clone().next() {
237 Some(i) => i.0,
238 None => self.input.len(),
239 }
240 }
241
242 fn whitespace_token(&mut self, start: usize) -> Token<'a> {
243 while self.eatc(' ') || self.eatc('\t') {
244 }
246 Token::Whitespace(&self.input[start..self.current()])
247 }
248
249 fn comment_token(&mut self, start: usize) -> Token<'a> {
250 while let Some((_, ch)) = self.chars.clone().next() {
251 if ch != '\t' && !('\u{20}'..='\u{10ffff}').contains(&ch) {
252 break;
253 }
254 self.one();
255 }
256 Token::Comment(&self.input[start..self.current()])
257 }
258
259 #[allow(clippy::type_complexity)]
265 fn read_string(
266 &mut self,
267 delim: char,
268 start: usize,
269 new_ch: &mut dyn FnMut(
270 &mut Tokenizer<'_>,
271 &mut MaybeString,
272 bool,
273 usize,
274 char,
275 ) -> Result<(), Error>,
276 ) -> Result<(Span, Token<'a>), Error> {
277 let mut multiline = false;
278 if self.eatc(delim) {
279 if self.eatc(delim) {
280 multiline = true;
281 } else {
282 return Ok((
283 (start..start + 1).into(),
286 Token::String {
287 src: &self.input[start..start + 2],
288 val: Cow::Borrowed(""),
289 multiline: false,
290 },
291 ));
292 }
293 }
294 let mut val = MaybeString::NotEscaped(self.current());
295 let mut n = 0;
296 loop {
297 n += 1;
298 match self.one() {
299 Some((i, '\n')) => {
300 if multiline {
301 if self.input.as_bytes()[i] == b'\r' {
302 val.make_owned(&self.input[..i]);
303 }
304 if n == 1 {
305 val = MaybeString::NotEscaped(self.current());
306 } else {
307 val.push('\n');
308 }
309 } else {
310 return Err(Error::NewlineInString(i));
311 }
312 }
313 Some((mut i, ch)) if ch == delim => {
314 let span = if multiline {
315 if !self.eatc(delim) {
316 val.push(delim);
317 continue;
318 }
319 if !self.eatc(delim) {
320 val.push(delim);
321 val.push(delim);
322 continue;
323 }
324 if self.eatc(delim) {
325 val.push(delim);
326 i += 1;
327 }
328 if self.eatc(delim) {
329 val.push(delim);
330 i += 1;
331 }
332
333 let maybe_nl = self.input.as_bytes()[start + 3];
335 let start_off = if maybe_nl == b'\n' {
336 4
337 } else if maybe_nl == b'\r' {
338 5
339 } else {
340 3
341 };
342
343 start + start_off..self.current() - 3
344 } else {
345 start + 1..self.current() - 1
346 }
347 .into();
348
349 return Ok((
350 span,
351 Token::String {
352 src: &self.input[start..self.current()],
353 val: val.into_cow(&self.input[..i]),
354 multiline,
355 },
356 ));
357 }
358 Some((i, c)) => new_ch(self, &mut val, multiline, i, c)?,
359 None => return Err(Error::UnterminatedString(start)),
360 }
361 }
362 }
363
364 fn literal_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
365 self.read_string('\'', start, &mut |_me, val, _multi, i, ch| {
366 if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) {
367 val.push(ch);
368 Ok(())
369 } else {
370 Err(Error::InvalidCharInString(i, ch))
371 }
372 })
373 }
374
375 fn basic_string(&mut self, start: usize) -> Result<(Span, Token<'a>), Error> {
376 self.read_string('"', start, &mut |me, val, multi, i, ch| match ch {
377 '\\' => {
378 val.make_owned(&me.input[..i]);
379 match me.chars.next() {
380 Some((_, '"')) => val.push('"'),
381 Some((_, '\\')) => val.push('\\'),
382 Some((_, 'b')) => val.push('\u{8}'),
383 Some((_, 'f')) => val.push('\u{c}'),
384 Some((_, 'n')) => val.push('\n'),
385 Some((_, 'r')) => val.push('\r'),
386 Some((_, 't')) => val.push('\t'),
387 Some((i, c @ ('u' | 'U'))) => {
388 let c = if c == 'u' {
389 me.hex::<4>(start, i)
390 } else {
391 me.hex::<8>(start, i)
392 };
393 val.push(c?);
394 }
395 Some((i, c @ (' ' | '\t' | '\n'))) if multi => {
396 if c != '\n' {
397 while let Some((_, ch)) = me.chars.clone().next() {
398 match ch {
399 ' ' | '\t' => {
400 me.chars.next();
401 continue;
402 }
403 '\n' => {
404 me.chars.next();
405 break;
406 }
407 _ => return Err(Error::InvalidEscape(i, c)),
408 }
409 }
410 }
411 while let Some((_, ch)) = me.chars.clone().next() {
412 match ch {
413 ' ' | '\t' | '\n' => {
414 me.chars.next();
415 }
416 _ => break,
417 }
418 }
419 }
420 Some((i, c)) => return Err(Error::InvalidEscape(i, c)),
421 None => return Err(Error::UnterminatedString(start)),
422 }
423 Ok(())
424 }
425 ch if ch == '\u{09}' || (ch != '\u{7f}' && ('\u{20}'..='\u{10ffff}').contains(&ch)) => {
426 val.push(ch);
427 Ok(())
428 }
429 _ => Err(Error::InvalidCharInString(i, ch)),
430 })
431 }
432
433 fn hex<const N: usize>(&mut self, start: usize, i: usize) -> Result<char, Error> {
434 let mut buf = [0; N];
435 for b in buf.iter_mut() {
436 match self.one() {
437 Some((_, ch)) if ch as u32 <= 0x7F && ch.is_ascii_hexdigit() => *b = ch as u8,
438 Some((i, ch)) => return Err(Error::InvalidHexEscape(i, ch)),
439 None => return Err(Error::UnterminatedString(start)),
440 }
441 }
442 let val = u32::from_str_radix(std::str::from_utf8(&buf).unwrap(), 16).unwrap();
443 match char::from_u32(val) {
444 Some(ch) => Ok(ch),
445 None => Err(Error::InvalidEscapeValue(i, N, val)),
446 }
447 }
448
449 fn keylike(&mut self, start: usize) -> Token<'a> {
450 while let Some((_, ch)) = self.peek_one() {
451 if !is_keylike(ch) {
452 break;
453 }
454 self.one();
455 }
456 Token::Keylike(&self.input[start..self.current()])
457 }
458
459 pub fn substr_offset(&self, s: &'a str) -> usize {
460 assert!(s.len() <= self.input.len());
461 let a = self.input.as_ptr() as usize;
462 let b = s.as_ptr() as usize;
463 assert!(a <= b);
464 b - a
465 }
466
467 fn step_span(&mut self, start: usize) -> Span {
469 let end = match self.peek_one() {
470 Some(t) => t.0,
471 None => self.input.len(),
472 };
473 Span { start, end }
474 }
475
476 fn peek_one(&mut self) -> Option<(usize, char)> {
478 self.chars.clone().next()
479 }
480
481 pub fn one(&mut self) -> Option<(usize, char)> {
483 self.chars.next()
484 }
485}
486
487impl Iterator for CrlfFold<'_> {
488 type Item = (usize, char);
489
490 fn next(&mut self) -> Option<(usize, char)> {
491 self.chars.next().map(|(i, c)| {
492 if c == '\r' {
493 let mut attempt = self.chars.clone();
494 if let Some((_, '\n')) = attempt.next() {
495 self.chars = attempt;
496 return (i, '\n');
497 }
498 }
499 (i, c)
500 })
501 }
502}
503
504impl MaybeString {
505 fn push(&mut self, ch: char) {
506 match *self {
507 MaybeString::NotEscaped(..) => {}
508 MaybeString::Owned(ref mut s) => s.push(ch),
509 }
510 }
511
512 fn make_owned(&mut self, input: &str) {
513 match *self {
514 MaybeString::NotEscaped(start) => {
515 *self = MaybeString::Owned(input[start..].to_owned());
516 }
517 MaybeString::Owned(..) => {}
518 }
519 }
520
521 fn into_cow(self, input: &str) -> Cow<'_, str> {
522 match self {
523 MaybeString::NotEscaped(start) => Cow::Borrowed(&input[start..]),
524 MaybeString::Owned(s) => Cow::Owned(s),
525 }
526 }
527}
528
/// Returns `true` for characters allowed in a key-like token: ASCII letters,
/// ASCII digits, `-` and `_`.
#[inline]
fn is_keylike(ch: char) -> bool {
    matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_')
}
533
534impl Token<'_> {
535 pub fn describe(&self) -> &'static str {
536 match *self {
537 Token::Keylike(_) => "an identifier",
538 Token::Equals => "an equals",
539 Token::Period => "a period",
540 Token::Comment(_) => "a comment",
541 Token::Newline => "a newline",
542 Token::Whitespace(_) => "whitespace",
543 Token::Comma => "a comma",
544 Token::RightBrace => "a right brace",
545 Token::LeftBrace => "a left brace",
546 Token::RightBracket => "a right bracket",
547 Token::LeftBracket => "a left bracket",
548 Token::String { multiline, .. } => {
549 if multiline {
550 "a multiline string"
551 } else {
552 "a string"
553 }
554 }
555 Token::Colon => "a colon",
556 Token::Plus => "a plus",
557 }
558 }
559}