jsonc_parser/
scanner.rs

1use crate::string::CharProvider;
2
3use super::common::Range;
4use super::errors::*;
5use super::tokens::Token;
6use std::str::Chars;
7
8/// Converts text into a stream of tokens.
9pub struct Scanner<'a> {
10  byte_index: usize,
11  token_start: usize,
12  char_iter: Chars<'a>,
13  // todo(dsherret): why isn't this a VecDeque?
14  char_buffer: Vec<char>,
15  current_token: Option<Token<'a>>,
16  file_text: &'a str,
17}
18
/// Initial capacity of the scanner's character buffer and the upper bound on
/// its length that is checked with a `debug_assert!` when peeking ahead.
const CHAR_BUFFER_MAX_SIZE: usize = 6;
20
21impl<'a> Scanner<'a> {
22  /// Creates a new scanner based on the provided text.
23  pub fn new(file_text: &'a str) -> Scanner<'a> {
24    let mut char_iter = file_text.chars();
25    let mut char_buffer = Vec::with_capacity(CHAR_BUFFER_MAX_SIZE);
26    let current_char = char_iter.next();
27    if let Some(current_char) = current_char {
28      char_buffer.push(current_char);
29    }
30
31    Scanner {
32      byte_index: 0,
33      token_start: 0,
34      char_iter,
35      char_buffer,
36      current_token: None,
37      file_text,
38    }
39  }
40
41  pub fn file_text(&self) -> &str {
42    self.file_text
43  }
44
45  /// Moves to and returns the next token.
46  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
47    self.skip_whitespace();
48    self.token_start = self.byte_index;
49    if let Some(current_char) = self.current_char() {
50      let token_result = match current_char {
51        '{' => {
52          self.move_next_char();
53          Ok(Token::OpenBrace)
54        }
55        '}' => {
56          self.move_next_char();
57          Ok(Token::CloseBrace)
58        }
59        '[' => {
60          self.move_next_char();
61          Ok(Token::OpenBracket)
62        }
63        ']' => {
64          self.move_next_char();
65          Ok(Token::CloseBracket)
66        }
67        ',' => {
68          self.move_next_char();
69          Ok(Token::Comma)
70        }
71        ':' => {
72          self.move_next_char();
73          Ok(Token::Colon)
74        }
75        '\'' | '"' => self.parse_string(),
76        '/' => match self.peek_char() {
77          Some('/') => Ok(self.parse_comment_line()),
78          Some('*') => self.parse_comment_block(),
79          _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
80        },
81        _ => {
82          if current_char == '-' || self.is_digit() {
83            self.parse_number()
84          } else if self.try_move_word("true") {
85            Ok(Token::Boolean(true))
86          } else if self.try_move_word("false") {
87            Ok(Token::Boolean(false))
88          } else if self.try_move_word("null") {
89            Ok(Token::Null)
90          } else {
91            self.parse_word()
92          }
93        }
94      };
95      match token_result {
96        Ok(token) => {
97          self.current_token = Some(token.clone());
98          Ok(Some(token))
99        }
100        Err(err) => Err(err),
101      }
102    } else {
103      self.current_token = None;
104      Ok(None)
105    }
106  }
107
108  /// Gets the start position of the token.
109  pub fn token_start(&self) -> usize {
110    self.token_start
111  }
112
113  /// Gets the end position of the token.
114  pub fn token_end(&self) -> usize {
115    self.byte_index
116  }
117
118  /// Gets the current token.
119  pub fn token(&self) -> Option<Token<'a>> {
120    self.current_token.as_ref().map(|x| x.to_owned())
121  }
122
123  pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
124    self.create_error_for_start(self.token_start, kind)
125  }
126
127  pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
128    self.create_error_for_start(self.byte_index, kind)
129  }
130
131  pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
132    let range = Range {
133      start,
134      end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
135        self.byte_index + c.len_utf8()
136      } else {
137        self.file_text.len()
138      },
139    };
140    self.create_error_for_range(range, kind)
141  }
142
143  pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
144    ParseError::new(range, kind, self.file_text)
145  }
146
147  fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
148    crate::string::parse_string_with_char_provider(self)
149      .map(Token::String)
150      // todo(dsherret): don't convert the error kind to a string here
151      .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
152  }
153
154  fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
155    let start_byte_index = self.byte_index;
156
157    if self.is_negative_sign() {
158      self.move_next_char();
159    }
160
161    if self.is_zero() {
162      self.move_next_char();
163    } else if self.is_one_nine() {
164      self.move_next_char();
165      while self.is_digit() {
166        self.move_next_char();
167      }
168    } else {
169      return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
170    }
171
172    if self.is_decimal_point() {
173      self.move_next_char();
174
175      if !self.is_digit() {
176        return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
177      }
178
179      while self.is_digit() {
180        self.move_next_char();
181      }
182    }
183
184    match self.current_char() {
185      Some('e') | Some('E') => {
186        match self.move_next_char() {
187          Some('-') | Some('+') => {
188            self.move_next_char();
189            if !self.is_digit() {
190              return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
191            }
192          }
193          _ => {
194            if !self.is_digit() {
195              return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
196            }
197          }
198        }
199
200        while self.is_digit() {
201          self.move_next_char();
202        }
203      }
204      _ => {}
205    }
206
207    let end_byte_index = self.byte_index;
208    Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]))
209  }
210
211  fn parse_comment_line(&mut self) -> Token<'a> {
212    self.assert_then_move_char('/');
213    #[cfg(debug_assertions)]
214    self.assert_char('/');
215
216    let start_byte_index = self.byte_index + 1;
217    while self.move_next_char().is_some() {
218      if self.is_new_line() {
219        break;
220      }
221    }
222
223    Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
224  }
225
226  fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
227    self.assert_then_move_char('/');
228    #[cfg(debug_assertions)]
229    self.assert_char('*');
230    let mut found_end = false;
231
232    let start_byte_index = self.byte_index + 1;
233    while let Some(current_char) = self.move_next_char() {
234      if current_char == '*' && self.peek_char() == Some('/') {
235        found_end = true;
236        break;
237      }
238    }
239
240    if found_end {
241      let end_byte_index = self.byte_index;
242      self.assert_then_move_char('*');
243      self.assert_then_move_char('/');
244      Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]))
245    } else {
246      Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock))
247    }
248  }
249
250  fn skip_whitespace(&mut self) {
251    while let Some(current_char) = self.current_char() {
252      if current_char.is_whitespace() {
253        self.move_next_char();
254      } else {
255        break;
256      }
257    }
258  }
259
260  fn try_move_word(&mut self, text: &str) -> bool {
261    let mut char_index = 0;
262    for c in text.chars() {
263      if let Some(current_char) = self.peek_char_offset(char_index) {
264        if current_char != c {
265          return false;
266        }
267
268        char_index += 1;
269      } else {
270        return false;
271      }
272    }
273
274    if let Some(next_char) = self.peek_char_offset(char_index) {
275      if next_char.is_alphanumeric() {
276        return false;
277      }
278    }
279
280    for _ in 0..char_index {
281      self.move_next_char();
282    }
283
284    true
285  }
286
287  fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
288    let start_byte_index = self.byte_index;
289
290    while let Some(current_char) = self.current_char() {
291      if current_char.is_whitespace() || current_char == '\r' || current_char == '\n' || current_char == ':' {
292        break;
293      }
294      if !current_char.is_alphanumeric() && current_char != '-' {
295        return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
296      }
297
298      self.move_next_char();
299    }
300
301    let end_byte_index = self.byte_index;
302
303    if end_byte_index - start_byte_index == 0 {
304      return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
305    }
306
307    Ok(Token::Word(&self.file_text[start_byte_index..end_byte_index]))
308  }
309
310  fn assert_then_move_char(&mut self, _character: char) {
311    #[cfg(debug_assertions)]
312    self.assert_char(_character);
313
314    self.move_next_char();
315  }
316
317  #[cfg(debug_assertions)]
318  fn assert_char(&mut self, character: char) {
319    let current_char = self.current_char();
320    debug_assert!(
321      current_char == Some(character),
322      "Expected {:?}, was {:?}",
323      character,
324      current_char
325    );
326  }
327
328  fn move_next_char(&mut self) -> Option<char> {
329    if let Some(&current_char) = self.char_buffer.first() {
330      // shift the entire array to the left then pop the last item
331      for i in 1..self.char_buffer.len() {
332        self.char_buffer[i - 1] = self.char_buffer[i];
333      }
334      self.char_buffer.pop();
335
336      if self.char_buffer.is_empty() {
337        if let Some(new_char) = self.char_iter.next() {
338          self.char_buffer.push(new_char);
339        }
340      }
341
342      self.byte_index += current_char.len_utf8();
343    }
344
345    self.current_char()
346  }
347
348  fn peek_char(&mut self) -> Option<char> {
349    self.peek_char_offset(1)
350  }
351
352  fn peek_char_offset(&mut self, offset: usize) -> Option<char> {
353    // fill the char buffer
354    for _ in self.char_buffer.len()..offset + 1 {
355      if let Some(next_char) = self.char_iter.next() {
356        self.char_buffer.push(next_char);
357      } else {
358        // end of string
359        return None;
360      }
361    }
362
363    // should not exceed this
364    debug_assert!(self.char_buffer.len() <= CHAR_BUFFER_MAX_SIZE);
365
366    self.char_buffer.get(offset).copied()
367  }
368
369  fn current_char(&self) -> Option<char> {
370    self.char_buffer.first().copied()
371  }
372
373  fn is_new_line(&mut self) -> bool {
374    match self.current_char() {
375      Some('\n') => true,
376      Some('\r') => self.peek_char() == Some('\n'),
377      _ => false,
378    }
379  }
380
381  fn is_digit(&self) -> bool {
382    self.is_one_nine() || self.is_zero()
383  }
384
385  fn is_zero(&self) -> bool {
386    self.current_char() == Some('0')
387  }
388
389  fn is_one_nine(&self) -> bool {
390    match self.current_char() {
391      Some(current_char) => ('1'..='9').contains(&current_char),
392      _ => false,
393    }
394  }
395
396  fn is_negative_sign(&self) -> bool {
397    self.current_char() == Some('-')
398  }
399
400  fn is_decimal_point(&self) -> bool {
401    self.current_char() == Some('.')
402  }
403}
404
405impl<'a> CharProvider<'a> for Scanner<'a> {
406  fn current_char(&mut self) -> Option<char> {
407    Scanner::current_char(self)
408  }
409
410  fn move_next_char(&mut self) -> Option<char> {
411    Scanner::move_next_char(self)
412  }
413
414  fn byte_index(&self) -> usize {
415    self.byte_index
416  }
417
418  fn text(&self) -> &'a str {
419    self.file_text
420  }
421}
422
#[cfg(test)]
mod tests {
  use std::borrow::Cow;

  use super::super::tokens::Token;
  use super::*;
  use pretty_assertions::assert_eq;

  // --- strings ---

  #[test]
  fn it_tokenizes_string() {
    let expected = vec![
      Token::String(Cow::Borrowed(r#"t"est"#)),
      Token::Comma,
      Token::String(Cow::Borrowed("\t\r\n\n  test\n other")),
      Token::Comma,
    ];
    assert_has_tokens(r#""t\"est", "\t\r\n\n\u0020 test\n other","#, expected);
  }

  #[test]
  fn it_errors_escaping_single_quote_in_double_quote() {
    assert_has_error(
      r#""t\'est""#,
      "Invalid escape in double quote string on line 1 column 3",
    );
  }

  #[test]
  fn it_tokenizes_single_quote_string() {
    let expected = vec![
      Token::String(Cow::Borrowed(r#"t'est"#)),
      Token::Comma,
      Token::String(Cow::Borrowed("a")),
      Token::Comma,
    ];
    assert_has_tokens(r#"'t\'est','a',"#, expected);
  }

  #[test]
  fn it_errors_escaping_double_quote_in_single_quote() {
    assert_has_error(
      r#"'t\"est'"#,
      "Invalid escape in single quote string on line 1 column 3",
    );
  }

  #[test]
  fn it_errors_on_invalid_utf8_char_for_issue_6() {
    assert_has_error(
      "\"\\uDF06\"",
      "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2",
    );
  }

  // --- words ---

  #[test]
  fn it_errors_for_word_starting_with_invalid_token() {
    assert_has_error(r#"{ &test }"#, "Unexpected token on line 1 column 3");
  }

  // --- numbers ---

  #[test]
  fn it_tokenizes_numbers() {
    let expected = vec![
      Token::Number("0"),
      Token::Comma,
      Token::Number("0.123"),
      Token::Comma,
      Token::Number("-198"),
      Token::Comma,
      Token::Number("0e-345"),
      Token::Comma,
      Token::Number("0.3e+025"),
      Token::Comma,
      Token::Number("1e1"),
      Token::Comma,
    ];
    assert_has_tokens("0, 0.123, -198, 0e-345, 0.3e+025, 1e1,", expected);
  }

  #[test]
  fn it_errors_invalid_exponent() {
    assert_has_error(
      r#"1ea"#,
      "Expected plus, minus, or digit in number literal on line 1 column 3",
    );
    assert_has_error(r#"1e-a"#, "Expected digit on line 1 column 4");
  }

  // --- punctuation and keywords ---

  #[test]
  fn it_tokenizes_simple_tokens() {
    let expected = vec![
      Token::OpenBrace,
      Token::CloseBrace,
      Token::OpenBracket,
      Token::CloseBracket,
      Token::Comma,
      Token::Colon,
      Token::Boolean(true),
      Token::Comma,
      Token::Boolean(false),
      Token::Comma,
      Token::Null,
      Token::Comma,
    ];
    assert_has_tokens("{}[],:true,false,null,", expected);
  }

  // --- comments ---

  #[test]
  fn it_tokenizes_comment_line() {
    let expected = vec![
      Token::CommentLine("test"),
      Token::CommentLine("t"),
      Token::CommentLine(" test"),
      Token::Comma,
    ];
    assert_has_tokens("//test\n//t\r\n// test\n,", expected);
  }

  #[test]
  fn it_tokenizes_comment_blocks() {
    let expected = vec![
      Token::CommentBlock("test\n "),
      Token::CommentBlock(" test"),
      Token::Comma,
    ];
    assert_has_tokens("/*test\n *//* test*/,", expected);
  }

  // --- helpers ---

  /// Scans all of `text` and asserts that exactly `tokens` were produced.
  /// Panics on the first scan error.
  fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
    let mut scanner = Scanner::new(text);
    let mut scanned_tokens = Vec::new();

    while let Some(token) = scanner
      .scan()
      .unwrap_or_else(|err| panic!("Error parsing: {:?}", err))
    {
      scanned_tokens.push(token);
    }

    assert_eq!(scanned_tokens, tokens);
  }

  /// Scans `text` until the first error and asserts that its display text
  /// equals `message`. Reaching the end without an error compares against
  /// an empty string.
  fn assert_has_error(text: &str, message: &str) {
    let mut scanner = Scanner::new(text);

    let error_message = loop {
      match scanner.scan() {
        Ok(Some(_)) => continue,
        Ok(None) => break String::new(),
        Err(err) => break err.to_string(),
      }
    };

    assert_eq!(error_message, message);
  }
}