protobuf_support/lexer/
lexer_impl.rs

1use std::char;
2use std::num::ParseFloatError;
3use std::num::ParseIntError;
4
5use crate::lexer::float;
6use crate::lexer::float::ProtobufFloatParseError;
7use crate::lexer::json_number_lit::JsonNumberLit;
8use crate::lexer::loc::Loc;
9use crate::lexer::loc::FIRST_COL;
10use crate::lexer::parser_language::ParserLanguage;
11use crate::lexer::str_lit::StrLit;
12use crate::lexer::str_lit::StrLitDecodeError;
13use crate::lexer::token::Token;
14use crate::lexer::token::TokenWithLocation;
15
/// Errors reported by the lexer.
#[derive(Debug, thiserror::Error)]
pub enum LexerError {
    // TODO: something better than this
    /// Catch-all for input that does not form a valid token; also returned
    /// for a raw newline or NUL character inside a string literal.
    #[error("Incorrect input")]
    IncorrectInput,
    /// The input ended where at least one more character was required
    /// (for example inside an unterminated `/* ... */` comment).
    #[error("Unexpected EOF")]
    UnexpectedEof,
    /// The given character was required at the current position.
    #[error("Expecting char: {:?}", .0)]
    ExpectChar(char),
    /// An integer literal could not be converted to a number
    /// (converted from `std::num::ParseIntError`, whose details are dropped).
    #[error("Parse int error")]
    ParseIntError,
    /// Converted from `std::num::ParseFloatError`, whose details are dropped.
    #[error("Parse float error")]
    ParseFloatError,
    // TODO: how it is different from ParseFloatError?
    /// A float literal is malformed: either the lexer found digits with
    /// neither a fraction nor an exponent, or the protobuf float parser
    /// (`ProtobufFloatParseError`) rejected the text.
    #[error("Incorrect float literal")]
    IncorrectFloatLit,
    /// A backslash in a JSON string was followed by an unsupported character.
    #[error("Incorrect JSON escape")]
    IncorrectJsonEscape,
    /// A JSON number is missing a digit where one is required.
    #[error("Incorrect JSON number")]
    IncorrectJsonNumber,
    /// A numeric code point does not denote a valid `char`.
    #[error("Incorrect Unicode character")]
    IncorrectUnicodeChar,
    /// A hexadecimal digit was required.
    #[error("Expecting hex digit")]
    ExpectHexDigit,
    /// An octal digit was required.
    #[error("Expecting oct digit")]
    ExpectOctDigit,
    /// A decimal digit was required.
    #[error("Expecting dec digit")]
    ExpectDecDigit,
    /// Failure while decoding the contents of a string literal.
    #[error(transparent)]
    StrLitDecodeError(#[from] StrLitDecodeError),
    /// An identifier was required.
    // NOTE(review): not constructed anywhere in this file; presumably used by callers.
    #[error("Expecting identifier")]
    ExpectedIdent,
}
49
/// Result of a lexer operation; the error type is always [`LexerError`].
pub type LexerResult<T> = Result<T, LexerError>;
51
52impl From<ParseIntError> for LexerError {
53    fn from(_: ParseIntError) -> Self {
54        LexerError::ParseIntError
55    }
56}
57
58impl From<ParseFloatError> for LexerError {
59    fn from(_: ParseFloatError) -> Self {
60        LexerError::ParseFloatError
61    }
62}
63
64impl From<ProtobufFloatParseError> for LexerError {
65    fn from(_: ProtobufFloatParseError) -> Self {
66        LexerError::IncorrectFloatLit
67    }
68}
69
/// The raw bytes for a single char or escape sequence in a string literal.
///
/// Use [`DecodedBytes::bytes`] to view the decoded bytes.
pub(crate) struct DecodedBytes {
    // a single char can be up to 4-bytes when encoded in utf-8
    buf: [u8; 4],
    // number of leading bytes of `buf` that are meaningful
    len: usize,
}

impl DecodedBytes {
    /// Wrap exactly one raw byte.
    fn byte(b: u8) -> DecodedBytes {
        let mut buf = [0u8; 4];
        buf[0] = b;
        DecodedBytes { buf, len: 1 }
    }

    /// Store the UTF-8 encoding of `value`.
    fn char(value: char) -> Self {
        let mut buf = [0u8; 4];
        let encoded = value.encode_utf8(&mut buf);
        let len = encoded.len();
        DecodedBytes { buf, len }
    }

    /// The decoded bytes (between one and four of them).
    pub(crate) fn bytes(&self) -> &[u8] {
        let (used, _unused) = self.buf.split_at(self.len);
        used
    }
}
97
/// Character-level lexer over a borrowed input string.
///
/// The lexer is `Copy`: several methods look ahead by advancing a copy and
/// assign the copy back to `self` only when the lookahead succeeds.
#[derive(Copy, Clone)]
pub struct Lexer<'a> {
    /// Selects language-specific rules (comment syntax, number syntax).
    language: ParserLanguage,
    /// The complete input being lexed.
    input: &'a str,
    /// Byte offset into `input` of the next unread character.
    pos: usize,
    /// Line and column of the next unread character.
    pub loc: Loc,
}
105
/// Whether `c` may start an identifier: an ASCII letter or `_`.
///
/// The protobuf grammar defines `letter = "A" … "Z" | "a" … "z"`; the
/// underscore is accepted as well because real-world `.proto` files use it.
/// Non-ASCII alphabetic characters are deliberately rejected: they are not
/// part of the grammar and identifier continuation characters are ASCII-only.
fn is_letter(c: char) -> bool {
    c.is_ascii_alphabetic() || c == '_'
}
109
110impl<'a> Lexer<'a> {
111    pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
112        Lexer {
113            language,
114            input,
115            pos: 0,
116            loc: Loc::start(),
117        }
118    }
119
120    /// No more chars
121    pub fn eof(&self) -> bool {
122        self.pos == self.input.len()
123    }
124
125    /// Remaining chars
126    fn rem_chars(&self) -> &'a str {
127        &self.input[self.pos..]
128    }
129
130    pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
131        self.lookahead_char().map_or(false, p)
132    }
133
134    fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
135        self.lookahead_char_is(|c| alphabet.contains(c))
136    }
137
138    fn next_char_opt(&mut self) -> Option<char> {
139        let rem = self.rem_chars();
140        if rem.is_empty() {
141            None
142        } else {
143            let mut char_indices = rem.char_indices();
144            let (_, c) = char_indices.next().unwrap();
145            let c_len = char_indices.next().map(|(len, _)| len).unwrap_or(rem.len());
146            self.pos += c_len;
147            if c == '\n' {
148                self.loc.line += 1;
149                self.loc.col = FIRST_COL;
150            } else {
151                self.loc.col += 1;
152            }
153            Some(c)
154        }
155    }
156
157    fn next_char(&mut self) -> LexerResult<char> {
158        self.next_char_opt().ok_or(LexerError::UnexpectedEof)
159    }
160
161    /// Skip whitespaces
162    fn skip_whitespaces(&mut self) {
163        self.take_while(|c| c.is_whitespace());
164    }
165
166    fn skip_c_comment(&mut self) -> LexerResult<()> {
167        if self.skip_if_lookahead_is_str("/*") {
168            let end = "*/";
169            match self.rem_chars().find(end) {
170                None => Err(LexerError::UnexpectedEof),
171                Some(len) => {
172                    let new_pos = self.pos + len + end.len();
173                    self.skip_to_pos(new_pos);
174                    Ok(())
175                }
176            }
177        } else {
178            Ok(())
179        }
180    }
181
182    fn skip_cpp_comment(&mut self) {
183        if self.skip_if_lookahead_is_str("//") {
184            loop {
185                match self.next_char_opt() {
186                    Some('\n') | None => break,
187                    _ => {}
188                }
189            }
190        }
191    }
192
193    fn skip_sh_comment(&mut self) {
194        if self.skip_if_lookahead_is_str("#") {
195            loop {
196                match self.next_char_opt() {
197                    Some('\n') | None => break,
198                    _ => {}
199                }
200            }
201        }
202    }
203
204    fn skip_comment(&mut self) -> LexerResult<()> {
205        match self.language {
206            ParserLanguage::Proto => {
207                self.skip_c_comment()?;
208                self.skip_cpp_comment();
209            }
210            ParserLanguage::TextFormat => {
211                self.skip_sh_comment();
212            }
213            ParserLanguage::Json => {}
214        }
215        Ok(())
216    }
217
218    pub fn skip_ws(&mut self) -> LexerResult<()> {
219        loop {
220            let pos = self.pos;
221            self.skip_whitespaces();
222            self.skip_comment()?;
223            if pos == self.pos {
224                // Did not advance
225                return Ok(());
226            }
227        }
228    }
229
230    pub fn take_while<F>(&mut self, f: F) -> &'a str
231    where
232        F: Fn(char) -> bool,
233    {
234        let start = self.pos;
235        while self.lookahead_char().map(&f) == Some(true) {
236            self.next_char_opt().unwrap();
237        }
238        let end = self.pos;
239        &self.input[start..end]
240    }
241
242    fn lookahead_char(&self) -> Option<char> {
243        self.clone().next_char_opt()
244    }
245
246    fn lookahead_is_str(&self, s: &str) -> bool {
247        self.rem_chars().starts_with(s)
248    }
249
250    fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
251        if self.lookahead_is_str(s) {
252            let new_pos = self.pos + s.len();
253            self.skip_to_pos(new_pos);
254            true
255        } else {
256            false
257        }
258    }
259
260    fn next_char_if<P>(&mut self, p: P) -> Option<char>
261    where
262        P: FnOnce(char) -> bool,
263    {
264        let mut clone = self.clone();
265        match clone.next_char_opt() {
266            Some(c) if p(c) => {
267                *self = clone;
268                Some(c)
269            }
270            _ => None,
271        }
272    }
273
274    pub fn next_char_if_eq(&mut self, expect: char) -> bool {
275        self.next_char_if(|c| c == expect) != None
276    }
277
278    fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
279        for c in alphabet.chars() {
280            if self.next_char_if_eq(c) {
281                return Some(c);
282            }
283        }
284        None
285    }
286
287    fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
288        if self.next_char_if_eq(expect) {
289            Ok(())
290        } else {
291            Err(LexerError::ExpectChar(expect))
292        }
293    }
294
295    fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
296    where
297        P: FnOnce(char) -> bool,
298    {
299        self.next_char_if(expect).ok_or(err)
300    }
301
302    // str functions
303
304    /// properly update line and column
305    fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
306        assert!(new_pos >= self.pos);
307        assert!(new_pos <= self.input.len());
308        let pos = self.pos;
309        while self.pos != new_pos {
310            self.next_char_opt().unwrap();
311        }
312        &self.input[pos..new_pos]
313    }
314
315    // Protobuf grammar
316
317    // char functions
318
319    // letter = "A" … "Z" | "a" … "z"
320    // https://github.com/google/protobuf/issues/4565
321    fn next_letter_opt(&mut self) -> Option<char> {
322        self.next_char_if(is_letter)
323    }
324
325    // capitalLetter =  "A" … "Z"
326    fn _next_capital_letter_opt(&mut self) -> Option<char> {
327        self.next_char_if(|c| c >= 'A' && c <= 'Z')
328    }
329
330    fn next_ident_part(&mut self) -> Option<char> {
331        self.next_char_if(|c| c.is_ascii_alphanumeric() || c == '_')
332    }
333
334    // Identifiers
335
336    // ident = letter { letter | decimalDigit | "_" }
337    fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
338        if let Some(c) = self.next_letter_opt() {
339            let mut ident = String::new();
340            ident.push(c);
341            while let Some(c) = self.next_ident_part() {
342                ident.push(c);
343            }
344            Ok(Some(ident))
345        } else {
346            Ok(None)
347        }
348    }
349
350    // Integer literals
351
352    // hexLit     = "0" ( "x" | "X" ) hexDigit { hexDigit }
353    fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
354        Ok(
355            if self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X") {
356                let s = self.take_while(|c| c.is_ascii_hexdigit());
357                Some(u64::from_str_radix(s, 16)? as u64)
358            } else {
359                None
360            },
361        )
362    }
363
364    // decimalLit = ( "1" … "9" ) { decimalDigit }
365    // octalLit   = "0" { octalDigit }
366    fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
367        // do not advance on number parse error
368        let mut clone = self.clone();
369
370        let pos = clone.pos;
371
372        Ok(if clone.next_char_if(|c| c.is_ascii_digit()) != None {
373            clone.take_while(|c| c.is_ascii_digit());
374            let value = clone.input[pos..clone.pos].parse()?;
375            *self = clone;
376            Some(value)
377        } else {
378            None
379        })
380    }
381
382    // hexDigit     = "0" … "9" | "A" … "F" | "a" … "f"
383    fn next_hex_digit(&mut self) -> LexerResult<u32> {
384        let mut clone = self.clone();
385        let r = match clone.next_char()? {
386            c if c >= '0' && c <= '9' => c as u32 - b'0' as u32,
387            c if c >= 'A' && c <= 'F' => c as u32 - b'A' as u32 + 10,
388            c if c >= 'a' && c <= 'f' => c as u32 - b'a' as u32 + 10,
389            _ => return Err(LexerError::ExpectHexDigit),
390        };
391        *self = clone;
392        Ok(r)
393    }
394
395    // octalDigit   = "0" … "7"
396    fn next_octal_digit(&mut self) -> LexerResult<u32> {
397        self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectOctDigit)
398            .map(|c| c as u32 - '0' as u32)
399    }
400
401    // decimalDigit = "0" … "9"
402    fn next_decimal_digit(&mut self) -> LexerResult<u32> {
403        self.next_char_expect(|c| c >= '0' && c <= '9', LexerError::ExpectDecDigit)
404            .map(|c| c as u32 - '0' as u32)
405    }
406
407    // decimals  = decimalDigit { decimalDigit }
408    fn next_decimal_digits(&mut self) -> LexerResult<()> {
409        self.next_decimal_digit()?;
410        self.take_while(|c| c >= '0' && c <= '9');
411        Ok(())
412    }
413
414    // intLit     = decimalLit | octalLit | hexLit
415    pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
416        assert_ne!(ParserLanguage::Json, self.language);
417
418        self.skip_ws()?;
419        if let Some(i) = self.next_hex_lit_opt()? {
420            return Ok(Some(i));
421        }
422        if let Some(i) = self.next_decimal_octal_lit_opt()? {
423            return Ok(Some(i));
424        }
425        Ok(None)
426    }
427
428    // Floating-point literals
429
430    // exponent  = ( "e" | "E" ) [ "+" | "-" ] decimals
431    fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
432        if self.next_char_if_in("eE") != None {
433            self.next_char_if_in("+-");
434            self.next_decimal_digits()?;
435            Ok(Some(()))
436        } else {
437            Ok(None)
438        }
439    }
440
441    // floatLit = ( decimals "." [ decimals ] [ exponent ] | decimals exponent | "."decimals [ exponent ] ) | "inf" | "nan"
442    fn next_float_lit(&mut self) -> LexerResult<()> {
443        assert_ne!(ParserLanguage::Json, self.language);
444
445        // "inf" and "nan" are handled as part of ident
446        if self.next_char_if_eq('.') {
447            self.next_decimal_digits()?;
448            self.next_exponent_opt()?;
449        } else {
450            self.next_decimal_digits()?;
451            if self.next_char_if_eq('.') {
452                self.next_decimal_digits()?;
453                self.next_exponent_opt()?;
454            } else {
455                if self.next_exponent_opt()? == None {
456                    return Err(LexerError::IncorrectFloatLit);
457                }
458            }
459        }
460        Ok(())
461    }
462
463    // String literals
464
465    // charValue = hexEscape | octEscape | charEscape | /[^\0\n\\]/
466    // hexEscape = '\' ( "x" | "X" ) hexDigit hexDigit
467    // https://github.com/google/protobuf/issues/4560
468    // octEscape = '\' octalDigit octalDigit octalDigit
469    // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' )
470    // quote = "'" | '"'
471    pub(crate) fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> {
472        match self.next_char()? {
473            '\\' => {
474                match self.next_char()? {
475                    '\'' => Ok(DecodedBytes::byte(b'\'')),
476                    '"' => Ok(DecodedBytes::byte(b'"')),
477                    '\\' => Ok(DecodedBytes::byte(b'\\')),
478                    'a' => Ok(DecodedBytes::byte(b'\x07')),
479                    'b' => Ok(DecodedBytes::byte(b'\x08')),
480                    'f' => Ok(DecodedBytes::byte(b'\x0c')),
481                    'n' => Ok(DecodedBytes::byte(b'\n')),
482                    'r' => Ok(DecodedBytes::byte(b'\r')),
483                    't' => Ok(DecodedBytes::byte(b'\t')),
484                    'v' => Ok(DecodedBytes::byte(b'\x0b')),
485                    'x' => {
486                        let d1 = self.next_hex_digit()? as u8;
487                        let d2 = self.next_hex_digit()? as u8;
488                        Ok(DecodedBytes::byte((d1 << 4) | d2))
489                    }
490                    d if d >= '0' && d <= '7' => {
491                        let mut r = d as u8 - b'0';
492                        for _ in 0..2 {
493                            match self.next_octal_digit() {
494                                Err(_) => break,
495                                Ok(d) => r = (r << 3) + d as u8,
496                            }
497                        }
498                        Ok(DecodedBytes::byte(r))
499                    }
500                    // https://github.com/google/protobuf/issues/4562
501                    c => Ok(DecodedBytes::char(c)),
502                }
503            }
504            '\n' | '\0' => Err(LexerError::IncorrectInput),
505            c => Ok(DecodedBytes::char(c)),
506        }
507    }
508
509    fn char_try_from(i: u32) -> LexerResult<char> {
510        char::try_from(i).map_err(|_| LexerError::IncorrectUnicodeChar)
511    }
512
513    pub fn next_json_char_value(&mut self) -> LexerResult<char> {
514        match self.next_char()? {
515            '\\' => match self.next_char()? {
516                '"' => Ok('"'),
517                '\'' => Ok('\''),
518                '\\' => Ok('\\'),
519                '/' => Ok('/'),
520                'b' => Ok('\x08'),
521                'f' => Ok('\x0c'),
522                'n' => Ok('\n'),
523                'r' => Ok('\r'),
524                't' => Ok('\t'),
525                'u' => {
526                    let mut v = 0;
527                    for _ in 0..4 {
528                        let digit = self.next_hex_digit()?;
529                        v = v * 16 + digit;
530                    }
531                    Self::char_try_from(v)
532                }
533                _ => Err(LexerError::IncorrectJsonEscape),
534            },
535            c => Ok(c),
536        }
537    }
538
539    // https://github.com/google/protobuf/issues/4564
540    // strLit = ( "'" { charValue } "'" ) | ( '"' { charValue } '"' )
541    fn next_str_lit_raw(&mut self) -> LexerResult<String> {
542        let mut raw = String::new();
543
544        let mut first = true;
545        loop {
546            if !first {
547                self.skip_ws()?;
548            }
549
550            let start = self.pos;
551
552            let q = match self.next_char_if_in("'\"") {
553                Some(q) => q,
554                None if !first => break,
555                None => return Err(LexerError::IncorrectInput),
556            };
557            first = false;
558            while self.lookahead_char() != Some(q) {
559                self.next_str_lit_bytes()?;
560            }
561            self.next_char_expect_eq(q)?;
562
563            raw.push_str(&self.input[start + 1..self.pos - 1]);
564        }
565        Ok(raw)
566    }
567
568    fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
569        if self.lookahead_char_is_in("'\"") {
570            Ok(Some(self.next_str_lit_raw()?))
571        } else {
572            Ok(None)
573        }
574    }
575
576    /// Parse next token as JSON number
577    fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
578        assert_eq!(ParserLanguage::Json, self.language);
579
580        fn is_digit(c: char) -> bool {
581            c >= '0' && c <= '9'
582        }
583
584        fn is_digit_1_9(c: char) -> bool {
585            c >= '1' && c <= '9'
586        }
587
588        if !self.lookahead_char_is_in("-0123456789") {
589            return Ok(None);
590        }
591
592        let mut s = String::new();
593        if self.next_char_if_eq('-') {
594            s.push('-');
595        }
596
597        if self.next_char_if_eq('0') {
598            s.push('0');
599        } else {
600            s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
601            while let Some(c) = self.next_char_if(is_digit) {
602                s.push(c);
603            }
604        }
605
606        if self.next_char_if_eq('.') {
607            s.push('.');
608            s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
609            while let Some(c) = self.next_char_if(is_digit) {
610                s.push(c);
611            }
612        }
613
614        if let Some(c) = self.next_char_if_in("eE") {
615            s.push(c);
616            if let Some(c) = self.next_char_if_in("+-") {
617                s.push(c);
618            }
619            s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
620            while let Some(c) = self.next_char_if(is_digit) {
621                s.push(c);
622            }
623        }
624
625        Ok(Some(JsonNumberLit(s)))
626    }
627
628    fn next_token_inner(&mut self) -> LexerResult<Token> {
629        if self.language == ParserLanguage::Json {
630            if let Some(v) = self.next_json_number_opt()? {
631                return Ok(Token::JsonNumber(v));
632            }
633        }
634
635        if let Some(ident) = self.next_ident_opt()? {
636            let token = if self.language != ParserLanguage::Json && ident == float::PROTOBUF_NAN {
637                Token::FloatLit(f64::NAN)
638            } else if self.language != ParserLanguage::Json && ident == float::PROTOBUF_INF {
639                Token::FloatLit(f64::INFINITY)
640            } else {
641                Token::Ident(ident.to_owned())
642            };
643            return Ok(token);
644        }
645
646        if self.language != ParserLanguage::Json {
647            let mut clone = self.clone();
648            let pos = clone.pos;
649            if let Ok(_) = clone.next_float_lit() {
650                let f = float::parse_protobuf_float(&self.input[pos..clone.pos])?;
651                *self = clone;
652                return Ok(Token::FloatLit(f));
653            }
654
655            if let Some(lit) = self.next_int_lit_opt()? {
656                return Ok(Token::IntLit(lit));
657            }
658        }
659
660        if let Some(escaped) = self.next_str_lit_raw_opt()? {
661            return Ok(Token::StrLit(StrLit { escaped }));
662        }
663
664        // This branch must be after str lit
665        if let Some(c) = self.next_char_if(|c| c.is_ascii_punctuation()) {
666            return Ok(Token::Symbol(c));
667        }
668
669        if let Some(ident) = self.next_ident_opt()? {
670            return Ok(Token::Ident(ident));
671        }
672
673        Err(LexerError::IncorrectInput)
674    }
675
676    pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
677        self.skip_ws()?;
678        let loc = self.loc;
679
680        Ok(if self.eof() {
681            None
682        } else {
683            let token = self.next_token_inner()?;
684            // Skip whitespace here to update location
685            // to the beginning of the next token
686            self.skip_ws()?;
687            Some(TokenWithLocation { token, loc })
688        })
689    }
690}
691
#[cfg(test)]
mod test {
    use super::*;

    /// Run `parse_what` on a proto-language lexer over `input`, panicking if
    /// it fails or leaves input unconsumed.
    fn lex<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<R>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        // The failure message is built only on the failure path.
        let r = match parse_what(&mut lexer) {
            Ok(r) => r,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        r
    }

    /// Like `lex`, for parsers returning an `Option`; panics on `None` too.
    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let r = match parse_what(&mut lexer) {
            Ok(Some(r)) => r,
            Ok(None) => panic!("lexer returned none at {}", lexer.loc),
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        r
    }

    #[test]
    fn test_lexer_int_lit() {
        let msg = r#"10"#;
        let mess = lex_opt(msg, |p| p.next_int_lit_opt());
        assert_eq!(10, mess);
    }

    #[test]
    fn test_lexer_float_lit() {
        let msg = r#"12.3"#;
        let mess = lex(msg, |p| p.next_token_inner());
        assert_eq!(Token::FloatLit(12.3), mess);
    }

    #[test]
    fn test_lexer_float_lit_leading_zeros_in_exp() {
        let msg = r#"1e00009"#;
        let mess = lex(msg, |p| p.next_token_inner());
        assert_eq!(Token::FloatLit(1_000_000_000.0), mess);
    }
}
735        assert_eq!(Token::FloatLit(1_000_000_000.0), mess);
736    }
737}