// protobuf_support/lexer/tokenizer.rs

use crate::lexer::lexer_impl::Lexer;
use crate::lexer::lexer_impl::LexerError;
use crate::lexer::loc::Loc;
use crate::lexer::parser_language::ParserLanguage;
use crate::lexer::str_lit::StrLit;
use crate::lexer::str_lit::StrLitDecodeError;
use crate::lexer::token::Token;
use crate::lexer::token::TokenWithLocation;
9
/// Errors which may be produced while tokenizing input.
#[derive(Debug, thiserror::Error)]
pub enum TokenizerError {
    /// Error propagated from the underlying lexer.
    #[error(transparent)]
    LexerError(#[from] LexerError),
    /// Error decoding the contents of a string literal.
    #[error(transparent)]
    StrLitDecodeError(#[from] StrLitDecodeError),
    /// Internal invariant violated (e.g. `advance` without prior lookahead).
    #[error("Internal tokenizer error")]
    InternalError,
    // TODO: too broad
    /// Catch-all for malformed input.
    #[error("Incorrect input")]
    IncorrectInput,
    /// An identifier appeared where the grammar forbids it.
    #[error("Not allowed in this context: {0}")]
    NotAllowedInThisContext(&'static str),
    /// Input ended where a token was required.
    #[error("Unexpected end of input")]
    UnexpectedEof,
    /// Next token was expected to be a string literal.
    #[error("Expecting string literal")]
    ExpectStrLit,
    /// Next token was expected to be an integer literal.
    #[error("Expecting int literal")]
    ExpectIntLit,
    /// Next token was expected to be a float literal.
    #[error("Expecting float literal")]
    ExpectFloatLit,
    /// Next token was expected to be an identifier.
    #[error("Expecting identifier")]
    ExpectIdent,
    /// Next token was expected to be this specific identifier.
    #[error("Expecting identifier `{}`", .0)]
    ExpectNamedIdent(String),
    /// Next token was expected to be this symbol char; `.1` names the
    /// construct being parsed.
    #[error("While parsing {}, expecting char `{}`", .1, .0)]
    ExpectChar(char, &'static str),
    /// Next token was expected to be one of these symbol chars.
    #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
    ExpectAnyChar(Vec<char>),
}
40
/// Result alias for tokenizer operations.
pub type TokenizerResult<R> = Result<R, TokenizerError>;
42
/// Tokenizer over a string input with a single token of lookahead.
#[derive(Clone)]
pub struct Tokenizer<'a> {
    // Underlying lexer producing tokens from the input.
    lexer: Lexer<'a>,
    // Buffered lookahead token, if one has been fetched but not consumed.
    next_token: Option<TokenWithLocation>,
    // Location of the most recently fetched token; used by `loc()` after
    // that token has been consumed.
    last_token_loc: Option<Loc>,
}
49
50impl<'a> Tokenizer<'a> {
51    pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
52        Tokenizer {
53            lexer: Lexer::new(input, comment_style),
54            next_token: None,
55            last_token_loc: None,
56        }
57    }
58
59    pub fn loc(&self) -> Loc {
60        // After lookahead return the location of the next token
61        self.next_token
62            .as_ref()
63            .map(|t| t.loc.clone())
64            // After token consumed return the location of that token
65            .or(self.last_token_loc.clone())
66            // Otherwise return the position of lexer
67            .unwrap_or(self.lexer.loc)
68    }
69
70    pub fn lookahead_loc(&mut self) -> Loc {
71        drop(self.lookahead());
72        // TODO: does not handle EOF properly
73        self.loc()
74    }
75
76    fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
77        Ok(match self.next_token {
78            Some(ref token) => Some(&token.token),
79            None => {
80                self.next_token = self.lexer.next_token()?;
81                self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
82                match self.next_token {
83                    Some(ref token) => Some(&token.token),
84                    None => None,
85                }
86            }
87        })
88    }
89
90    pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
91        match self.lookahead()? {
92            Some(token) => Ok(token),
93            None => Err(TokenizerError::UnexpectedEof),
94        }
95    }
96
97    fn next(&mut self) -> TokenizerResult<Option<Token>> {
98        self.lookahead()?;
99        Ok(self
100            .next_token
101            .take()
102            .map(|TokenWithLocation { token, .. }| token))
103    }
104
105    pub fn next_some(&mut self) -> TokenizerResult<Token> {
106        match self.next()? {
107            Some(token) => Ok(token),
108            None => Err(TokenizerError::UnexpectedEof),
109        }
110    }
111
112    /// Can be called only after lookahead, otherwise it's error
113    pub fn advance(&mut self) -> TokenizerResult<Token> {
114        self.next_token
115            .take()
116            .map(|TokenWithLocation { token, .. }| token)
117            .ok_or(TokenizerError::InternalError)
118    }
119
120    /// No more tokens
121    pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
122        Ok(self.lookahead()?.is_none())
123    }
124
125    pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
126    where
127        P: FnOnce(&Token) -> Option<R>,
128    {
129        self.lookahead()?;
130        let v = match self.next_token {
131            Some(ref token) => match p(&token.token) {
132                Some(v) => v,
133                None => return Ok(None),
134            },
135            _ => return Ok(None),
136        };
137        self.next_token = None;
138        Ok(Some(v))
139    }
140
141    pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
142    where
143        P: FnOnce(&Token) -> Result<R, E>,
144        E: From<TokenizerError>,
145    {
146        self.lookahead()?;
147        let r = match self.next_token {
148            Some(ref token) => p(&token.token)?,
149            None => return Err(TokenizerError::UnexpectedEof.into()),
150        };
151        self.next_token = None;
152        Ok(r)
153    }
154
155    fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
156    where
157        P: FnOnce(&Token) -> bool,
158    {
159        self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
160    }
161
162    pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
163        let v = match self.lookahead()? {
164            Some(&Token::Ident(ref next)) => {
165                if idents.into_iter().find(|&i| i == next).is_some() {
166                    next.clone()
167                } else {
168                    return Ok(None);
169                }
170            }
171            _ => return Ok(None),
172        };
173        self.advance()?;
174        Ok(Some(v))
175    }
176
177    pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
178        Ok(self.next_ident_if_in(&[word])? != None)
179    }
180
181    pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
182        if self.next_ident_if_eq(word)? {
183            Ok(())
184        } else {
185            Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
186        }
187    }
188
189    pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
190        if self.clone().next_ident_if_eq(word)? {
191            // TODO: which context?
192            return Err(TokenizerError::NotAllowedInThisContext(word));
193        }
194        Ok(())
195    }
196
197    pub fn next_symbol_if_in(&mut self, symbols: &[char]) -> TokenizerResult<bool> {
198        self.next_token_if(|token| match token {
199            Token::Symbol(c) if symbols.contains(c) => true,
200            _ => false,
201        })
202        .map(|token| token.is_some())
203    }
204
205    pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
206        self.next_symbol_if_in(&[symbol])
207    }
208
209    pub fn next_symbol_expect_eq(
210        &mut self,
211        symbol: char,
212        desc: &'static str,
213    ) -> TokenizerResult<()> {
214        if self.lookahead_is_symbol(symbol)? {
215            self.advance()?;
216            Ok(())
217        } else {
218            Err(TokenizerError::ExpectChar(symbol, desc))
219        }
220    }
221
222    pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
223        for symbol in symbols {
224            if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
225                return Ok(*symbol);
226            }
227        }
228        Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
229    }
230
231    pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
232        Ok(match self.lookahead()? {
233            Some(&Token::StrLit(..)) => true,
234            _ => false,
235        })
236    }
237
238    pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
239        Ok(match self.lookahead()? {
240            Some(&Token::IntLit(..)) => true,
241            _ => false,
242        })
243    }
244
245    pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
246        Ok(match self.lookahead()? {
247            Some(&Token::JsonNumber(..)) => true,
248            _ => false,
249        })
250    }
251
252    pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
253        Ok(match self.lookahead()? {
254            Some(&Token::Symbol(c)) => Some(c),
255            _ => None,
256        })
257    }
258
259    pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
260        Ok(self.lookahead_if_symbol()? == Some(symbol))
261    }
262
263    pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
264        Ok(match self.lookahead()? {
265            Some(Token::Ident(i)) => i == ident,
266            _ => false,
267        })
268    }
269
270    pub fn next_ident(&mut self) -> TokenizerResult<String> {
271        self.next_token_check_map(|token| match token {
272            &Token::Ident(ref ident) => Ok(ident.clone()),
273            _ => Err(TokenizerError::ExpectIdent),
274        })
275    }
276
277    pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
278        self.next_token_check_map(|token| match token {
279            &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
280            _ => Err(TokenizerError::ExpectStrLit),
281        })
282    }
283
284    pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
285        self.next_token_check_map(|token| match token {
286            &Token::IntLit(v) => Ok(v),
287            _ => Err(TokenizerError::ExpectIntLit),
288        })
289    }
290
291    pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
292        self.next_token_check_map(|token| match token {
293            &Token::FloatLit(v) => Ok(v),
294            _ => Err(TokenizerError::ExpectFloatLit),
295        })
296    }
297}
298
#[cfg(test)]
mod test {

    use super::*;

    /// Run `what` against a tokenizer over `input`, asserting that parsing
    /// succeeds and that the whole input is consumed.
    fn tokenize<P, R>(input: &str, what: P) -> R
    where
        P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
    {
        let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
        // `unwrap_or_else` defers building the panic message to the failure
        // path (clippy: expect_fun_call), and includes the error itself.
        let r = what(&mut tokenizer)
            .unwrap_or_else(|e| panic!("parse failed at {}: {:?}", tokenizer.loc(), e));
        let eof = tokenizer
            .syntax_eof()
            .unwrap_or_else(|e| panic!("check eof failed at {}: {:?}", tokenizer.loc(), e));
        assert!(eof, "{}", tokenizer.loc());
        r
    }

    #[test]
    fn test_ident() {
        let msg = r#"  aabb_c  "#;
        // `next_ident` already returns an owned `String`; no extra clone needed.
        let mess = tokenize(msg, |p| p.next_ident());
        assert_eq!("aabb_c", mess);
    }

    #[test]
    fn test_str_lit() {
        let msg = r#"  "a\nb"  "#;
        let mess = tokenize(msg, |p| p.next_str_lit());
        assert_eq!(
            StrLit {
                escaped: r#"a\nb"#.to_owned()
            },
            mess
        );
    }
}