1use crate::string::CharProvider;
2
3use super::common::Range;
4use super::errors::*;
5use super::tokens::Token;
6use std::str::Chars;
7
/// Converts JSON-style text (including `//` and `/* */` comments) into a
/// stream of tokens, producing one token per call to `scan`.
pub struct Scanner<'a> {
    /// Byte offset in `file_text` of the current (not yet consumed) character.
    byte_index: usize,
    /// Byte offset at which the most recent `scan` call began its token.
    token_start: usize,
    /// Source of the characters that have not yet been pulled into `char_buffer`.
    char_iter: Chars<'a>,
    /// Lookahead window; index 0 is the current character.
    char_buffer: Vec<char>,
    /// The token most recently returned by a successful `scan`, if any.
    current_token: Option<Token<'a>>,
    /// The complete text being scanned.
    file_text: &'a str,
}
18
/// Upper bound on the number of characters held in the lookahead buffer.
///
/// The deepest lookahead in this file is `try_move_word("false")`, which peeks
/// the five keyword characters plus the one character that follows them.
const CHAR_BUFFER_MAX_SIZE: usize = 6;
20
21impl<'a> Scanner<'a> {
22 pub fn new(file_text: &'a str) -> Scanner<'a> {
24 let mut char_iter = file_text.chars();
25 let mut char_buffer = Vec::with_capacity(CHAR_BUFFER_MAX_SIZE);
26 let current_char = char_iter.next();
27 if let Some(current_char) = current_char {
28 char_buffer.push(current_char);
29 }
30
31 Scanner {
32 byte_index: 0,
33 token_start: 0,
34 char_iter,
35 char_buffer,
36 current_token: None,
37 file_text,
38 }
39 }
40
41 pub fn file_text(&self) -> &str {
42 self.file_text
43 }
44
45 pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
47 self.skip_whitespace();
48 self.token_start = self.byte_index;
49 if let Some(current_char) = self.current_char() {
50 let token_result = match current_char {
51 '{' => {
52 self.move_next_char();
53 Ok(Token::OpenBrace)
54 }
55 '}' => {
56 self.move_next_char();
57 Ok(Token::CloseBrace)
58 }
59 '[' => {
60 self.move_next_char();
61 Ok(Token::OpenBracket)
62 }
63 ']' => {
64 self.move_next_char();
65 Ok(Token::CloseBracket)
66 }
67 ',' => {
68 self.move_next_char();
69 Ok(Token::Comma)
70 }
71 ':' => {
72 self.move_next_char();
73 Ok(Token::Colon)
74 }
75 '\'' | '"' => self.parse_string(),
76 '/' => match self.peek_char() {
77 Some('/') => Ok(self.parse_comment_line()),
78 Some('*') => self.parse_comment_block(),
79 _ => Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken)),
80 },
81 _ => {
82 if current_char == '-' || self.is_digit() {
83 self.parse_number()
84 } else if self.try_move_word("true") {
85 Ok(Token::Boolean(true))
86 } else if self.try_move_word("false") {
87 Ok(Token::Boolean(false))
88 } else if self.try_move_word("null") {
89 Ok(Token::Null)
90 } else {
91 self.parse_word()
92 }
93 }
94 };
95 match token_result {
96 Ok(token) => {
97 self.current_token = Some(token.clone());
98 Ok(Some(token))
99 }
100 Err(err) => Err(err),
101 }
102 } else {
103 self.current_token = None;
104 Ok(None)
105 }
106 }
107
/// Gets the byte offset at which the most recent `scan` call started its
/// token (the position reached after skipping leading whitespace).
pub fn token_start(&self) -> usize {
    self.token_start
}
112
/// Gets the scanner's current byte offset, which immediately after a
/// successful `scan` is the exclusive end of the token just returned.
pub fn token_end(&self) -> usize {
    self.byte_index
}
117
118 pub fn token(&self) -> Option<Token<'a>> {
120 self.current_token.as_ref().map(|x| x.to_owned())
121 }
122
/// Creates an error of the given kind whose range begins at the start of the
/// token currently being scanned.
pub(super) fn create_error_for_current_token(&self, kind: ParseErrorKind) -> ParseError {
    self.create_error_for_start(self.token_start, kind)
}
126
/// Creates an error of the given kind whose range begins at the current
/// character position.
pub(super) fn create_error_for_current_char(&self, kind: ParseErrorKind) -> ParseError {
    self.create_error_for_start(self.byte_index, kind)
}
130
131 pub(super) fn create_error_for_start(&self, start: usize, kind: ParseErrorKind) -> ParseError {
132 let range = Range {
133 start,
134 end: if let Some(c) = self.file_text[self.byte_index..].chars().next() {
135 self.byte_index + c.len_utf8()
136 } else {
137 self.file_text.len()
138 },
139 };
140 self.create_error_for_range(range, kind)
141 }
142
/// Creates an error of the given kind for an explicit range, attaching the
/// scanned text so the error can refer back to it.
pub(super) fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
    ParseError::new(range, kind, self.file_text)
}
146
147 fn parse_string(&mut self) -> Result<Token<'a>, ParseError> {
148 crate::string::parse_string_with_char_provider(self)
149 .map(Token::String)
150 .map_err(|err| self.create_error_for_start(err.byte_index, ParseErrorKind::String(err.kind)))
152 }
153
154 fn parse_number(&mut self) -> Result<Token<'a>, ParseError> {
155 let start_byte_index = self.byte_index;
156
157 if self.is_negative_sign() {
158 self.move_next_char();
159 }
160
161 if self.is_zero() {
162 self.move_next_char();
163 } else if self.is_one_nine() {
164 self.move_next_char();
165 while self.is_digit() {
166 self.move_next_char();
167 }
168 } else {
169 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigitFollowingNegativeSign));
170 }
171
172 if self.is_decimal_point() {
173 self.move_next_char();
174
175 if !self.is_digit() {
176 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
177 }
178
179 while self.is_digit() {
180 self.move_next_char();
181 }
182 }
183
184 match self.current_char() {
185 Some('e') | Some('E') => {
186 match self.move_next_char() {
187 Some('-') | Some('+') => {
188 self.move_next_char();
189 if !self.is_digit() {
190 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedDigit));
191 }
192 }
193 _ => {
194 if !self.is_digit() {
195 return Err(self.create_error_for_current_char(ParseErrorKind::ExpectedPlusMinusOrDigitInNumberLiteral));
196 }
197 }
198 }
199
200 while self.is_digit() {
201 self.move_next_char();
202 }
203 }
204 _ => {}
205 }
206
207 let end_byte_index = self.byte_index;
208 Ok(Token::Number(&self.file_text[start_byte_index..end_byte_index]))
209 }
210
211 fn parse_comment_line(&mut self) -> Token<'a> {
212 self.assert_then_move_char('/');
213 #[cfg(debug_assertions)]
214 self.assert_char('/');
215
216 let start_byte_index = self.byte_index + 1;
217 while self.move_next_char().is_some() {
218 if self.is_new_line() {
219 break;
220 }
221 }
222
223 Token::CommentLine(&self.file_text[start_byte_index..self.byte_index])
224 }
225
226 fn parse_comment_block(&mut self) -> Result<Token<'a>, ParseError> {
227 self.assert_then_move_char('/');
228 #[cfg(debug_assertions)]
229 self.assert_char('*');
230 let mut found_end = false;
231
232 let start_byte_index = self.byte_index + 1;
233 while let Some(current_char) = self.move_next_char() {
234 if current_char == '*' && self.peek_char() == Some('/') {
235 found_end = true;
236 break;
237 }
238 }
239
240 if found_end {
241 let end_byte_index = self.byte_index;
242 self.assert_then_move_char('*');
243 self.assert_then_move_char('/');
244 Ok(Token::CommentBlock(&self.file_text[start_byte_index..end_byte_index]))
245 } else {
246 Err(self.create_error_for_current_token(ParseErrorKind::UnterminatedCommentBlock))
247 }
248 }
249
250 fn skip_whitespace(&mut self) {
251 while let Some(current_char) = self.current_char() {
252 if current_char.is_whitespace() {
253 self.move_next_char();
254 } else {
255 break;
256 }
257 }
258 }
259
260 fn try_move_word(&mut self, text: &str) -> bool {
261 let mut char_index = 0;
262 for c in text.chars() {
263 if let Some(current_char) = self.peek_char_offset(char_index) {
264 if current_char != c {
265 return false;
266 }
267
268 char_index += 1;
269 } else {
270 return false;
271 }
272 }
273
274 if let Some(next_char) = self.peek_char_offset(char_index) {
275 if next_char.is_alphanumeric() {
276 return false;
277 }
278 }
279
280 for _ in 0..char_index {
281 self.move_next_char();
282 }
283
284 true
285 }
286
287 fn parse_word(&mut self) -> Result<Token<'a>, ParseError> {
288 let start_byte_index = self.byte_index;
289
290 while let Some(current_char) = self.current_char() {
291 if current_char.is_whitespace() || current_char == '\r' || current_char == '\n' || current_char == ':' {
292 break;
293 }
294 if !current_char.is_alphanumeric() && current_char != '-' {
295 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
296 }
297
298 self.move_next_char();
299 }
300
301 let end_byte_index = self.byte_index;
302
303 if end_byte_index - start_byte_index == 0 {
304 return Err(self.create_error_for_current_token(ParseErrorKind::UnexpectedToken));
305 }
306
307 Ok(Token::Word(&self.file_text[start_byte_index..end_byte_index]))
308 }
309
/// Advances past the current character; when debug assertions are enabled it
/// first asserts that the current character equals `_character`.
///
/// The parameter is underscore-prefixed because it is only read when debug
/// assertions are compiled in.
fn assert_then_move_char(&mut self, _character: char) {
    #[cfg(debug_assertions)]
    self.assert_char(_character);

    self.move_next_char();
}
316
/// Asserts that the current character equals `character`, without
/// consuming it. Only compiled when debug assertions are enabled.
#[cfg(debug_assertions)]
fn assert_char(&mut self, character: char) {
    let current_char = self.current_char();
    debug_assert!(
        current_char == Some(character),
        "Expected {:?}, was {:?}",
        character,
        current_char
    );
}
327
328 fn move_next_char(&mut self) -> Option<char> {
329 if let Some(¤t_char) = self.char_buffer.first() {
330 for i in 1..self.char_buffer.len() {
332 self.char_buffer[i - 1] = self.char_buffer[i];
333 }
334 self.char_buffer.pop();
335
336 if self.char_buffer.is_empty() {
337 if let Some(new_char) = self.char_iter.next() {
338 self.char_buffer.push(new_char);
339 }
340 }
341
342 self.byte_index += current_char.len_utf8();
343 }
344
345 self.current_char()
346 }
347
/// Returns the character immediately after the current one without consuming
/// anything, or `None` if the text ends first.
fn peek_char(&mut self) -> Option<char> {
    self.peek_char_offset(1)
}
351
352 fn peek_char_offset(&mut self, offset: usize) -> Option<char> {
353 for _ in self.char_buffer.len()..offset + 1 {
355 if let Some(next_char) = self.char_iter.next() {
356 self.char_buffer.push(next_char);
357 } else {
358 return None;
360 }
361 }
362
363 debug_assert!(self.char_buffer.len() <= CHAR_BUFFER_MAX_SIZE);
365
366 self.char_buffer.get(offset).copied()
367 }
368
/// Returns the current (not yet consumed) character, or `None` when the
/// lookahead buffer is empty, i.e. the end of the text has been reached.
fn current_char(&self) -> Option<char> {
    self.char_buffer.first().copied()
}
372
373 fn is_new_line(&mut self) -> bool {
374 match self.current_char() {
375 Some('\n') => true,
376 Some('\r') => self.peek_char() == Some('\n'),
377 _ => false,
378 }
379 }
380
381 fn is_digit(&self) -> bool {
382 self.is_one_nine() || self.is_zero()
383 }
384
385 fn is_zero(&self) -> bool {
386 self.current_char() == Some('0')
387 }
388
389 fn is_one_nine(&self) -> bool {
390 match self.current_char() {
391 Some(current_char) => ('1'..='9').contains(¤t_char),
392 _ => false,
393 }
394 }
395
396 fn is_negative_sign(&self) -> bool {
397 self.current_char() == Some('-')
398 }
399
400 fn is_decimal_point(&self) -> bool {
401 self.current_char() == Some('.')
402 }
403}
404
/// Lets the shared string parser read characters directly from the scanner.
///
/// The `Scanner::` paths below name the scanner's own inherent methods, so
/// each trait method simply forwards to the inherent method of the same name.
impl<'a> CharProvider<'a> for Scanner<'a> {
    /// Returns the current (not yet consumed) character.
    fn current_char(&mut self) -> Option<char> {
        Scanner::current_char(self)
    }

    /// Consumes the current character and returns the new current character.
    fn move_next_char(&mut self) -> Option<char> {
        Scanner::move_next_char(self)
    }

    /// Returns the byte offset of the current character within the text.
    fn byte_index(&self) -> usize {
        self.byte_index
    }

    /// Returns the complete text being scanned.
    fn text(&self) -> &'a str {
        self.file_text
    }
}
422
423#[cfg(test)]
424mod tests {
425 use std::borrow::Cow;
426
427 use super::super::tokens::Token;
428 use super::*;
429 use pretty_assertions::assert_eq;
430
/// Double-quoted strings are tokenized with their escape sequences decoded.
#[test]
fn it_tokenizes_string() {
    assert_has_tokens(
        r#""t\"est", "\t\r\n\n\u0020 test\n other","#,
        vec![
            Token::String(Cow::Borrowed(r#"t"est"#)),
            Token::Comma,
            // The input's `\u0020` escape decodes to a space and is followed
            // by a literal space, so two spaces precede "test". The first is
            // written as `\u{0020}` to keep that visible.
            Token::String(Cow::Borrowed("\t\r\n\n\u{0020} test\n other")),
            Token::Comma,
        ],
    );
}
443
/// Escaping a single quote inside a double-quoted string is rejected.
#[test]
fn it_errors_escaping_single_quote_in_double_quote() {
    let input = r#""t\'est""#;
    let expected_message = "Invalid escape in double quote string on line 1 column 3";

    assert_has_error(input, expected_message);
}
451
/// Single-quoted strings are tokenized, including an escaped single quote.
#[test]
fn it_tokenizes_single_quote_string() {
    let input = r#"'t\'est','a',"#;
    let expected = vec![
        Token::String(Cow::Borrowed(r#"t'est"#)),
        Token::Comma,
        Token::String(Cow::Borrowed("a")),
        Token::Comma,
    ];

    assert_has_tokens(input, expected);
}
464
/// Escaping a double quote inside a single-quoted string is rejected.
#[test]
fn it_errors_escaping_double_quote_in_single_quote() {
    let input = r#"'t\"est'"#;
    let expected_message = "Invalid escape in single quote string on line 1 column 3";

    assert_has_error(input, expected_message);
}
472
/// A word that begins with a character not allowed in words is rejected.
#[test]
fn it_errors_for_word_starting_with_invalid_token() {
    let input = r#"{ &test }"#;
    let expected_message = "Unexpected token on line 1 column 3";

    assert_has_error(input, expected_message);
}
477
/// Integers, fractions, negatives and exponents are tokenized as numbers
/// whose text is the matched slice of the input.
#[test]
fn it_tokenizes_numbers() {
    let input = "0, 0.123, -198, 0e-345, 0.3e+025, 1e1,";
    let expected = vec![
        Token::Number("0"),
        Token::Comma,
        Token::Number("0.123"),
        Token::Comma,
        Token::Number("-198"),
        Token::Comma,
        Token::Number("0e-345"),
        Token::Comma,
        Token::Number("0.3e+025"),
        Token::Comma,
        Token::Number("1e1"),
        Token::Comma,
    ];

    assert_has_tokens(input, expected);
}
498
/// An exponent marker must be followed by a digit, optionally after a sign.
#[test]
fn it_errors_invalid_exponent() {
    let cases = [
        (
            r#"1ea"#,
            "Expected plus, minus, or digit in number literal on line 1 column 3",
        ),
        (r#"1e-a"#, "Expected digit on line 1 column 4"),
    ];

    for &(input, expected_message) in &cases {
        assert_has_error(input, expected_message);
    }
}
507
/// Punctuation and the `true`/`false`/`null` keywords each yield their own token.
#[test]
fn it_tokenizes_simple_tokens() {
    let input = "{}[],:true,false,null,";
    let expected = vec![
        Token::OpenBrace,
        Token::CloseBrace,
        Token::OpenBracket,
        Token::CloseBracket,
        Token::Comma,
        Token::Colon,
        Token::Boolean(true),
        Token::Comma,
        Token::Boolean(false),
        Token::Comma,
        Token::Null,
        Token::Comma,
    ];

    assert_has_tokens(input, expected);
}
528
/// Line comments end at `\n` or `\r\n`, and their text excludes the `//`.
#[test]
fn it_tokenizes_comment_line() {
    let input = "//test\n//t\r\n// test\n,";
    let expected = vec![
        Token::CommentLine("test"),
        Token::CommentLine("t"),
        Token::CommentLine(" test"),
        Token::Comma,
    ];

    assert_has_tokens(input, expected);
}
541
/// Block comments may span lines and sit back to back; their text excludes
/// the `/*` and `*/` delimiters.
#[test]
fn it_tokenizes_comment_blocks() {
    let input = "/*test\n *//* test*/,";
    let expected = vec![
        Token::CommentBlock("test\n "),
        Token::CommentBlock(" test"),
        Token::Comma,
    ];

    assert_has_tokens(input, expected);
}
553
/// A unicode escape that does not name a valid character produces an error
/// rather than a token.
#[test]
fn it_errors_on_invalid_utf8_char_for_issue_6() {
    let input = "\"\\uDF06\"";
    let expected_message =
        "Invalid unicode escape sequence. 'DF06' is not a valid UTF8 character on line 1 column 2";

    assert_has_error(input, expected_message);
}
561
562 fn assert_has_tokens(text: &str, tokens: Vec<Token>) {
563 let mut scanner = Scanner::new(text);
564 let mut scanned_tokens = Vec::new();
565
566 loop {
567 match scanner.scan() {
568 Ok(Some(token)) => scanned_tokens.push(token),
569 Ok(None) => break,
570 Err(err) => panic!("Error parsing: {:?}", err),
571 }
572 }
573
574 assert_eq!(scanned_tokens, tokens);
575 }
576
577 fn assert_has_error(text: &str, message: &str) {
578 let mut scanner = Scanner::new(text);
579 let mut error_message = String::new();
580
581 loop {
582 match scanner.scan() {
583 Ok(Some(_)) => {}
584 Ok(None) => break,
585 Err(err) => {
586 error_message = err.to_string();
587 break;
588 }
589 }
590 }
591
592 assert_eq!(error_message, message);
593 }
594}