1use std::char;
2use std::num::ParseFloatError;
3use std::num::ParseIntError;
4
5use crate::lexer::float;
6use crate::lexer::float::ProtobufFloatParseError;
7use crate::lexer::json_number_lit::JsonNumberLit;
8use crate::lexer::loc::Loc;
9use crate::lexer::loc::FIRST_COL;
10use crate::lexer::parser_language::ParserLanguage;
11use crate::lexer::str_lit::StrLit;
12use crate::lexer::str_lit::StrLitDecodeError;
13use crate::lexer::token::Token;
14use crate::lexer::token::TokenWithLocation;
15
/// Errors reported by [`Lexer`] while scanning input.
#[derive(Debug, thiserror::Error)]
pub enum LexerError {
    /// The input does not form a valid token or literal at the current position.
    #[error("Incorrect input")]
    IncorrectInput,
    /// The input ended while more characters were required.
    #[error("Unexpected EOF")]
    UnexpectedEof,
    /// A specific character was required; the payload is the expected character.
    #[error("Expecting char: {:?}", .0)]
    ExpectChar(char),
    /// Integer literal text could not be converted to a number
    /// (produced from `std::num::ParseIntError`).
    #[error("Parse int error")]
    ParseIntError,
    /// Float literal text could not be converted to a number
    /// (produced from `std::num::ParseFloatError`).
    #[error("Parse float error")]
    ParseFloatError,
    /// The text is not a well-formed float literal
    /// (also produced from `ProtobufFloatParseError`).
    #[error("Incorrect float literal")]
    IncorrectFloatLit,
    /// A backslash escape in a JSON string is not one of the recognised escapes.
    #[error("Incorrect JSON escape")]
    IncorrectJsonEscape,
    /// The text is not a well-formed JSON number.
    #[error("Incorrect JSON number")]
    IncorrectJsonNumber,
    /// A numeric escape does not denote a valid Unicode scalar value.
    #[error("Incorrect Unicode character")]
    IncorrectUnicodeChar,
    /// A hexadecimal digit was required.
    #[error("Expecting hex digit")]
    ExpectHexDigit,
    /// An octal digit was required.
    #[error("Expecting oct digit")]
    ExpectOctDigit,
    /// A decimal digit was required.
    #[error("Expecting dec digit")]
    ExpectDecDigit,
    /// Error forwarded unchanged from string literal decoding.
    #[error(transparent)]
    StrLitDecodeError(#[from] StrLitDecodeError),
    /// An identifier was required.
    #[error("Expecting identifier")]
    ExpectedIdent,
}
49
/// Result alias used throughout the lexer: `Ok(T)` or a [`LexerError`].
pub type LexerResult<T> = Result<T, LexerError>;
51
52impl From<ParseIntError> for LexerError {
53 fn from(_: ParseIntError) -> Self {
54 LexerError::ParseIntError
55 }
56}
57
58impl From<ParseFloatError> for LexerError {
59 fn from(_: ParseFloatError) -> Self {
60 LexerError::ParseFloatError
61 }
62}
63
64impl From<ProtobufFloatParseError> for LexerError {
65 fn from(_: ProtobufFloatParseError) -> Self {
66 LexerError::IncorrectFloatLit
67 }
68}
69
/// Up to four bytes produced by decoding one element of a string literal:
/// either a single raw byte or the UTF-8 encoding of one character.
pub(crate) struct DecodedBytes {
    /// Inline storage; only the first `len` bytes are meaningful.
    buf: [u8; 4],
    /// Number of valid bytes at the start of `buf`.
    len: usize,
}

impl DecodedBytes {
    /// Wraps a single raw byte.
    fn byte(b: u8) -> DecodedBytes {
        let mut storage = [0u8; 4];
        storage[0] = b;
        DecodedBytes {
            buf: storage,
            len: 1,
        }
    }

    /// Stores the UTF-8 encoding of `value`.
    fn char(value: char) -> Self {
        let mut storage = [0u8; 4];
        let used = value.len_utf8();
        value.encode_utf8(&mut storage);
        Self {
            buf: storage,
            len: used,
        }
    }

    /// Returns the decoded bytes (between one and four of them).
    pub(crate) fn bytes(&self) -> &[u8] {
        let (valid, _unused) = self.buf.split_at(self.len);
        valid
    }
}
97
/// Character-level lexer over a borrowed input string.
///
/// The type is `Copy`, which the implementation uses for backtracking:
/// it advances a copy and writes it back only when the attempt succeeds.
#[derive(Copy, Clone)]
pub struct Lexer<'a> {
    /// Language being lexed; selects the comment syntax and which literal
    /// forms are recognised.
    language: ParserLanguage,
    /// The complete input text.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    pos: usize,
    /// Line/column location of the next unread character.
    pub loc: Loc,
}
105
/// Returns `true` if `c` may start an identifier: an ASCII letter or `_`.
///
/// The protobuf grammar defines `letter` as `"A" … "Z" | "a" … "z"`, so only
/// ASCII letters are accepted. Accepting any Unicode alphabetic character
/// would let a character start an identifier that could never continue one.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
109
110impl<'a> Lexer<'a> {
111 pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
112 Lexer {
113 language,
114 input,
115 pos: 0,
116 loc: Loc::start(),
117 }
118 }
119
120 pub fn eof(&self) -> bool {
122 self.pos == self.input.len()
123 }
124
125 fn rem_chars(&self) -> &'a str {
127 &self.input[self.pos..]
128 }
129
130 pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
131 self.lookahead_char().map_or(false, p)
132 }
133
134 fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
135 self.lookahead_char_is(|c| alphabet.contains(c))
136 }
137
138 fn next_char_opt(&mut self) -> Option<char> {
139 let rem = self.rem_chars();
140 if rem.is_empty() {
141 None
142 } else {
143 let mut char_indices = rem.char_indices();
144 let (_, c) = char_indices.next().unwrap();
145 let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
146 self.pos += c_len;
147 if c == '\n' {
148 self.loc.line += 1;
149 self.loc.col = FIRST_COL;
150 } else {
151 self.loc.col += 1;
152 }
153 Some(c)
154 }
155 }
156
157 fn next_char(&mut self) -> LexerResult<char> {
158 self.next_char_opt().ok_or(LexerError::UnexpectedEof)
159 }
160
161 fn skip_whitespaces(&mut self) {
163 self.take_while(|c| c.is_whitespace());
164 }
165
166 fn skip_c_comment(&mut self) -> LexerResult<()> {
167 if self.skip_if_lookahead_is_str("/*") {
168 let end = "*/";
169 match self.rem_chars().find(end) {
170 None => Err(LexerError::UnexpectedEof),
171 Some(len) => {
172 let new_pos = self.pos + len + end.len();
173 self.skip_to_pos(new_pos);
174 Ok(())
175 }
176 }
177 } else {
178 Ok(())
179 }
180 }
181
182 fn skip_cpp_comment(&mut self) {
183 if self.skip_if_lookahead_is_str("//") {
184 loop {
185 match self.next_char_opt() {
186 Some('\n') | None => break,
187 _ => {}
188 }
189 }
190 }
191 }
192
193 fn skip_sh_comment(&mut self) {
194 if self.skip_if_lookahead_is_str("#") {
195 loop {
196 match self.next_char_opt() {
197 Some('\n') | None => break,
198 _ => {}
199 }
200 }
201 }
202 }
203
204 fn skip_comment(&mut self) -> LexerResult<()> {
205 match self.language {
206 ParserLanguage::Proto => {
207 self.skip_c_comment()?;
208 self.skip_cpp_comment();
209 }
210 ParserLanguage::TextFormat => {
211 self.skip_sh_comment();
212 }
213 ParserLanguage::Json => {}
214 }
215 Ok(())
216 }
217
218 pub fn skip_ws(&mut self) -> LexerResult<()> {
219 loop {
220 let pos = self.pos;
221 self.skip_whitespaces();
222 self.skip_comment()?;
223 if pos == self.pos {
224 return Ok(());
226 }
227 }
228 }
229
230 pub fn take_while<F>(&mut self, f: F) -> &'a str
231 where
232 F: Fn(char) -> bool,
233 {
234 let start = self.pos;
235 while self.lookahead_char().map(&f) == Some(true) {
236 self.next_char_opt().unwrap();
237 }
238 let end = self.pos;
239 &self.input[start..end]
240 }
241
242 fn lookahead_char(&self) -> Option<char> {
243 self.clone().next_char_opt()
244 }
245
246 fn lookahead_is_str(&self, s: &str) -> bool {
247 self.rem_chars().starts_with(s)
248 }
249
250 fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
251 if self.lookahead_is_str(s) {
252 let new_pos = self.pos + s.len();
253 self.skip_to_pos(new_pos);
254 true
255 } else {
256 false
257 }
258 }
259
260 fn next_char_if<P>(&mut self, p: P) -> Option<char>
261 where
262 P: FnOnce(char) -> bool,
263 {
264 let mut clone = self.clone();
265 match clone.next_char_opt() {
266 Some(c) if p(c) => {
267 *self = clone;
268 Some(c)
269 }
270 _ => None,
271 }
272 }
273
274 pub fn next_char_if_eq(&mut self, expect: char) -> bool {
275 self.next_char_if(|c| c == expect) != None
276 }
277
278 fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
279 for c in alphabet.chars() {
280 if self.next_char_if_eq(c) {
281 return Some(c);
282 }
283 }
284 None
285 }
286
287 fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
288 if self.next_char_if_eq(expect) {
289 Ok(())
290 } else {
291 Err(LexerError::ExpectChar(expect))
292 }
293 }
294
295 fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
296 where
297 P: FnOnce(char) -> bool,
298 {
299 self.next_char_if(expect).ok_or(err)
300 }
301
302 fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
306 assert!(new_pos >= self.pos);
307 assert!(new_pos <= self.input.len());
308 let pos = self.pos;
309 while self.pos != new_pos {
310 self.next_char_opt().unwrap();
311 }
312 &self.input[pos..new_pos]
313 }
314
315 fn next_letter_opt(&mut self) -> Option<char> {
322 self.next_char_if(is_letter)
323 }
324
325 fn _next_capital_letter_opt(&mut self) -> Option<char> {
327 self.next_char_if(|c| c >= 'A' && c <= 'Z')
328 }
329
330 fn next_ident_part(&mut self) -> Option<char> {
331 self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
332 }
333
334 fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
338 if let Some(c) = self.next_letter_opt() {
339 let mut ident = String::new();
340 ident.push(c);
341 while let Some(c) = self.next_ident_part() {
342 ident.push(c);
343 }
344 Ok(Some(ident))
345 } else {
346 Ok(None)
347 }
348 }
349
350 fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
354 Ok(
355 if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
356 let s = self.take_while(|c| c.is_ascii_hexdigit());
357 Some(u64::from_str_radix(s, 16)? as u64)
358 } else {
359 None
360 },
361 )
362 }
363
364 fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
367 let mut clone = self.clone();
369
370 let pos = clone.pos;
371
372 Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
373 clone.take_while(|c| c.is_ascii_digit());
374 let value = clone.input[pos..clone.pos].parse()?;
375 *self = clone;
376 Some(value)
377 } else {
378 None
379 })
380 }
381
382 fn next_hex_digit(&mut self) -> LexerResult<u32> {
384 let mut clone = self.clone();
385 let r = match clone.next_char()? {
386 c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
387 c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
388 c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
389 _ => return Err(LexerError::ExpectHexDigit),
390 };
391 *self = clone;
392 Ok(r)
393 }
394
395 fn next_octal_digit(&mut self) -> LexerResult<u32> {
397 self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit)
398 .map(|c| c as u32 - '0' as u32)
399 }
400
401 fn next_decimal_digit(&mut self) -> LexerResult<u32> {
403 self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
404 .map(|c| c as u32 - '0' as u32)
405 }
406
407 fn next_decimal_digits(&mut self) -> LexerResult<()> {
409 self.next_decimal_digit()?;
410 self.take_while(|c| c >= '0' && c <= '9');
411 Ok(())
412 }
413
414 pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
416 assert_ne!(ParserLanguage::Json, self.language);
417
418 self.skip_ws()?;
419 if let Some(i) = self.next_hex_lit_opt()? {
420 return Ok(Some(i));
421 }
422 if let Some(i) = self.next_decimal_octal_lit_opt()? {
423 return Ok(Some(i));
424 }
425 Ok(None)
426 }
427
428 fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
432 if self.next_char_if_in("eE") != None {
433 self.next_char_if_in("+-");
434 self.next_decimal_digits()?;
435 Ok(Some(()))
436 } else {
437 Ok(None)
438 }
439 }
440
441 fn next_float_lit(&mut self) -> LexerResult<()> {
443 assert_ne!(ParserLanguage::Json, self.language);
444
445 if self.next_char_if_eq('.') {
447 self.next_decimal_digits()?;
448 self.next_exponent_opt()?;
449 } else {
450 self.next_decimal_digits()?;
451 if self.next_char_if_eq('.') {
452 self.next_decimal_digits()?;
453 self.next_exponent_opt()?;
454 } else {
455 if self.next_exponent_opt()? == None {
456 return Err(LexerError::IncorrectFloatLit);
457 }
458 }
459 }
460 Ok(())
461 }
462
463 pub(crate) fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> {
472 match self.next_char()? {
473 '\\' => {
474 match self.next_char()? {
475 '\'' => Ok(DecodedBytes::byte(b'\'')),
476 '"' => Ok(DecodedBytes::byte(b'"')),
477 '\\' => Ok(DecodedBytes::byte(b'\\')),
478 'a' => Ok(DecodedBytes::byte(b'\x07')),
479 'b' => Ok(DecodedBytes::byte(b'\x08')),
480 'f' => Ok(DecodedBytes::byte(b'\x0c')),
481 'n' => Ok(DecodedBytes::byte(b'\n')),
482 'r' => Ok(DecodedBytes::byte(b'\r')),
483 't' => Ok(DecodedBytes::byte(b'\t')),
484 'v' => Ok(DecodedBytes::byte(b'\x0b')),
485 'x' => {
486 let d1 = self.next_hex_digit()? as u8;
487 let d2 = self.next_hex_digit()? as u8;
488 Ok(DecodedBytes::byte((d1 << 4) | d2))
489 }
490 d if d >= '0' && d <= '7' => {
491 let mut r = d as u8 - b'0';
492 for _ in 0..2 {
493 match self.next_octal_digit() {
494 Err(_) => break,
495 Ok(d) => r = (r << 3) + d as u8,
496 }
497 }
498 Ok(DecodedBytes::byte(r))
499 }
500 c => Ok(DecodedBytes::char(c)),
502 }
503 }
504 '\n' | '\0' => Err(LexerError::IncorrectInput),
505 c => Ok(DecodedBytes::char(c)),
506 }
507 }
508
509 fn char_try_from(i: u32) -> LexerResult<char> {
510 char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
511 }
512
513 pub fn next_json_char_value(&mut self) -> LexerResult<char> {
514 match self.next_char()? {
515 '\\' => match self.next_char()? {
516 '"' => Ok('"'),
517 '\'' => Ok('\''),
518 '\\' => Ok('\\'),
519 '/' => Ok('/'),
520 'b' => Ok('\x08'),
521 'f' => Ok('\x0c'),
522 'n' => Ok('\n'),
523 'r' => Ok('\r'),
524 't' => Ok('\t'),
525 'u' => {
526 let mut v = 0;
527 for _ in 0..4 {
528 let digit = self.next_hex_digit()?;
529 v = v * 16 + digit;
530 }
531 Self::char_try_from(v)
532 }
533 _ => Err(LexerError::IncorrectJsonEscape),
534 },
535 c => Ok(c),
536 }
537 }
538
539 fn next_str_lit_raw(&mut self) -> LexerResult<String> {
542 let mut raw = String::new();
543
544 let mut first = true;
545 loop {
546 if !first {
547 self.skip_ws()?;
548 }
549
550 let start = self.pos;
551
552 let q = match self.next_char_if_in("'\"") {
553 Some(q) => q,
554 None if !first => break,
555 None => return Err(LexerError::IncorrectInput),
556 };
557 first = false;
558 while self.lookahead_char() != Some(q) {
559 self.next_str_lit_bytes()?;
560 }
561 self.next_char_expect_eq(q)?;
562
563 raw.push_str(&self.input[start + 1..self.pos - 1]);
564 }
565 Ok(raw)
566 }
567
568 fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
569 if self.lookahead_char_is_in("'\"") {
570 Ok(Some(self.next_str_lit_raw()?))
571 } else {
572 Ok(None)
573 }
574 }
575
576 fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
578 assert_eq!(ParserLanguage::Json, self.language);
579
580 fn is_digit(c: char) -> bool {
581 c >= '0' && c <= '9'
582 }
583
584 fn is_digit_1_9(c: char) -> bool {
585 c >= '1' && c <= '9'
586 }
587
588 if !self.lookahead_char_is_in("-0123456789") {
589 return Ok(None);
590 }
591
592 let mut s = String::new();
593 if self.next_char_if_eq('-') {
594 s.push('-');
595 }
596
597 if self.next_char_if_eq('0') {
598 s.push('0');
599 } else {
600 s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
601 while let Some(c) = self.next_char_if(is_digit) {
602 s.push(c);
603 }
604 }
605
606 if self.next_char_if_eq('.') {
607 s.push('.');
608 s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
609 while let Some(c) = self.next_char_if(is_digit) {
610 s.push(c);
611 }
612 }
613
614 if let Some(c) = self.next_char_if_in("eE") {
615 s.push(c);
616 if let Some(c) = self.next_char_if_in("+-") {
617 s.push(c);
618 }
619 s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
620 while let Some(c) = self.next_char_if(is_digit) {
621 s.push(c);
622 }
623 }
624
625 Ok(Some(JsonNumberLit(s)))
626 }
627
628 fn next_token_inner(&mut self) -> LexerResult<Token> {
629 if self.language == ParserLanguage::Json {
630 if let Some(v) = self.next_json_number_opt()? {
631 return Ok(Token::JsonNumber(v));
632 }
633 }
634
635 if let Some(ident) = self.next_ident_opt()? {
636 let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
637 Token::FloatLit(f64::NAN)
638 } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
639 Token::FloatLit(f64::INFINITY)
640 } else {
641 Token::Ident(ident.to_owned())
642 };
643 return Ok(token);
644 }
645
646 if self.language != ParserLanguage::Json {
647 let mut clone = self.clone();
648 let pos = clone.pos;
649 if let Ok(_) = clone.next_float_lit() {
650 let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
651 *self = clone;
652 return Ok(Token::FloatLit(f));
653 }
654
655 if let Some(lit) = self.next_int_lit_opt()? {
656 return Ok(Token::IntLit(lit));
657 }
658 }
659
660 if let Some(escaped) = self.next_str_lit_raw_opt()? {
661 return Ok(Token::StrLit(StrLit { escaped }));
662 }
663
664 if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
666 return Ok(Token::Symbol(c));
667 }
668
669 if let Some(ident) = self.next_ident_opt()? {
670 return Ok(Token::Ident(ident));
671 }
672
673 Err(LexerError::IncorrectInput)
674 }
675
676 pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
677 self.skip_ws()?;
678 let loc = self.loc;
679
680 Ok(if self.eof() {
681 None
682 } else {
683 let token = self.next_token_inner()?;
684 self.skip_ws()?;
687 Some(TokenWithLocation { token, loc })
688 })
689 }
690}
691
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `parse_what` on a Proto-language lexer over `input`.
    /// Panics if lexing fails or if any input is left unconsumed.
    fn lex<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<R>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let parsed = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    /// Like `lex`, for parsers returning an `Option`; additionally panics
    /// if the parser returns `None`.
    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let parsed = match parse_what(&mut lexer) {
            Ok(Some(value)) => value,
            Ok(None) => panic!("lexer returned none at {}", lexer.loc),
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    #[test]
    fn test_lexer_int_lit() {
        let value = lex_opt("10", |lexer| lexer.next_int_lit_opt());
        assert_eq!(10, value);
    }

    #[test]
    fn test_lexer_float_lit() {
        let token = lex("12.3", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(12.3), token);
    }

    #[test]
    fn test_lexer_float_lit_leading_zeros_in_exp() {
        let token = lex("1e00009", |lexer| lexer.next_token_inner());
        assert_eq!(Token::FloatLit(1_000_000_000.0), token);
    }
}