1use crate::lexer::lexer_impl::Lexer;
2use crate::lexer::lexer_impl::LexerError;
3use crate::lexer::loc::Loc;
4use crate::lexer::parser_language::ParserLanguage;
5use crate::lexer::str_lit::StrLit;
6use crate::lexer::str_lit::StrLitDecodeError;
7use crate::lexer::token::Token;
8use crate::lexer::token::TokenWithLocation;
9
/// Errors produced by [`Tokenizer`] operations.
#[derive(Debug, thiserror::Error)]
pub enum TokenizerError {
    /// Error bubbled up from the underlying lexer.
    #[error(transparent)]
    LexerError(#[from] LexerError),
    /// Failed to decode a string literal.
    #[error(transparent)]
    StrLitDecodeError(#[from] StrLitDecodeError),
    /// Invariant violation inside the tokenizer itself
    /// (e.g. `advance` called with an empty lookahead buffer).
    #[error("Internal tokenizer error")]
    InternalError,
    /// Catch-all for malformed input.
    #[error("Incorrect input")]
    IncorrectInput,
    /// The named construct is not allowed at this point of the input.
    #[error("Not allowed in this context: {0}")]
    NotAllowedInThisContext(&'static str),
    /// Input ended while more tokens were expected.
    #[error("Unexpected end of input")]
    UnexpectedEof,
    /// Expected a string literal token.
    #[error("Expecting string literal")]
    ExpectStrLit,
    /// Expected an integer literal token.
    #[error("Expecting int literal")]
    ExpectIntLit,
    /// Expected a floating-point literal token.
    #[error("Expecting float literal")]
    ExpectFloatLit,
    /// Expected any identifier token.
    #[error("Expecting identifier")]
    ExpectIdent,
    /// Expected the specific identifier carried in the payload.
    #[error("Expecting identifier `{}`", .0)]
    ExpectNamedIdent(String),
    /// Expected a specific char; payload is (char, description of the
    /// construct being parsed, used in the message).
    #[error("While parsing {}, expecting char `{}`", .1, .0)]
    ExpectChar(char, &'static str),
    /// Expected one of the listed chars.
    #[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
    ExpectAnyChar(Vec<char>),
}
40
/// Convenience alias for results of tokenizer operations.
pub type TokenizerResult<R> = Result<R, TokenizerError>;
42
/// Token stream with single-token lookahead over a [`Lexer`].
///
/// `Clone` is cheap enough to be used for speculative lookahead
/// (see `next_ident_if_eq_error`).
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// Underlying lexer producing tokens from the input string.
    lexer: Lexer<'a>,
    /// Buffered lookahead token: fetched but not yet consumed.
    next_token: Option<TokenWithLocation>,
    /// Location of the most recently fetched token; used by `loc()`
    /// when the lookahead buffer is empty.
    last_token_loc: Option<Loc>,
}
49
50impl<'a> Tokenizer<'a> {
51 pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
52 Tokenizer {
53 lexer: Lexer::new(input, comment_style),
54 next_token: None,
55 last_token_loc: None,
56 }
57 }
58
59 pub fn loc(&self) -> Loc {
60 self.next_token
62 .as_ref()
63 .map(|t| t.loc.clone())
64 .or(self.last_token_loc.clone())
66 .unwrap_or(self.lexer.loc)
68 }
69
70 pub fn lookahead_loc(&mut self) -> Loc {
71 drop(self.lookahead());
72 self.loc()
74 }
75
76 fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
77 Ok(match self.next_token {
78 Some(ref token) => Some(&token.token),
79 None => {
80 self.next_token = self.lexer.next_token()?;
81 self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
82 match self.next_token {
83 Some(ref token) => Some(&token.token),
84 None => None,
85 }
86 }
87 })
88 }
89
90 pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
91 match self.lookahead()? {
92 Some(token) => Ok(token),
93 None => Err(TokenizerError::UnexpectedEof),
94 }
95 }
96
97 fn next(&mut self) -> TokenizerResult<Option<Token>> {
98 self.lookahead()?;
99 Ok(self
100 .next_token
101 .take()
102 .map(|TokenWithLocation { token, .. }| token))
103 }
104
105 pub fn next_some(&mut self) -> TokenizerResult<Token> {
106 match self.next()? {
107 Some(token) => Ok(token),
108 None => Err(TokenizerError::UnexpectedEof),
109 }
110 }
111
112 pub fn advance(&mut self) -> TokenizerResult<Token> {
114 self.next_token
115 .take()
116 .map(|TokenWithLocation { token, .. }| token)
117 .ok_or(TokenizerError::InternalError)
118 }
119
120 pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
122 Ok(self.lookahead()?.is_none())
123 }
124
125 pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
126 where
127 P: FnOnce(&Token) -> Option<R>,
128 {
129 self.lookahead()?;
130 let v = match self.next_token {
131 Some(ref token) => match p(&token.token) {
132 Some(v) => v,
133 None => return Ok(None),
134 },
135 _ => return Ok(None),
136 };
137 self.next_token = None;
138 Ok(Some(v))
139 }
140
141 pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
142 where
143 P: FnOnce(&Token) -> Result<R, E>,
144 E: From<TokenizerError>,
145 {
146 self.lookahead()?;
147 let r = match self.next_token {
148 Some(ref token) => p(&token.token)?,
149 None => return Err(TokenizerError::UnexpectedEof.into()),
150 };
151 self.next_token = None;
152 Ok(r)
153 }
154
155 fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
156 where
157 P: FnOnce(&Token) -> bool,
158 {
159 self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
160 }
161
162 pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
163 let v = match self.lookahead()? {
164 Some(&Token::Ident(ref next)) => {
165 if idents.into_iter().find(|&i| i == next).is_some() {
166 next.clone()
167 } else {
168 return Ok(None);
169 }
170 }
171 _ => return Ok(None),
172 };
173 self.advance()?;
174 Ok(Some(v))
175 }
176
177 pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
178 Ok(self.next_ident_if_in(&[word])? != None)
179 }
180
181 pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
182 if self.next_ident_if_eq(word)? {
183 Ok(())
184 } else {
185 Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
186 }
187 }
188
189 pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
190 if self.clone().next_ident_if_eq(word)? {
191 return Err(TokenizerError::NotAllowedInThisContext(word));
193 }
194 Ok(())
195 }
196
197 pub fn next_symbol_if_in(&mut self, symbols: &[char]) -> TokenizerResult<bool> {
198 self.next_token_if(|token| match token {
199 Token::Symbol(c) if symbols.contains(c) => true,
200 _ => false,
201 })
202 .map(|token| token.is_some())
203 }
204
205 pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
206 self.next_symbol_if_in(&[symbol])
207 }
208
209 pub fn next_symbol_expect_eq(
210 &mut self,
211 symbol: char,
212 desc: &'static str,
213 ) -> TokenizerResult<()> {
214 if self.lookahead_is_symbol(symbol)? {
215 self.advance()?;
216 Ok(())
217 } else {
218 Err(TokenizerError::ExpectChar(symbol, desc))
219 }
220 }
221
222 pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
223 for symbol in symbols {
224 if let Ok(()) = self.next_symbol_expect_eq(*symbol, "ignored") {
225 return Ok(*symbol);
226 }
227 }
228 Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
229 }
230
231 pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
232 Ok(match self.lookahead()? {
233 Some(&Token::StrLit(..)) => true,
234 _ => false,
235 })
236 }
237
238 pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
239 Ok(match self.lookahead()? {
240 Some(&Token::IntLit(..)) => true,
241 _ => false,
242 })
243 }
244
245 pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
246 Ok(match self.lookahead()? {
247 Some(&Token::JsonNumber(..)) => true,
248 _ => false,
249 })
250 }
251
252 pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
253 Ok(match self.lookahead()? {
254 Some(&Token::Symbol(c)) => Some(c),
255 _ => None,
256 })
257 }
258
259 pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
260 Ok(self.lookahead_if_symbol()? == Some(symbol))
261 }
262
263 pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
264 Ok(match self.lookahead()? {
265 Some(Token::Ident(i)) => i == ident,
266 _ => false,
267 })
268 }
269
270 pub fn next_ident(&mut self) -> TokenizerResult<String> {
271 self.next_token_check_map(|token| match token {
272 &Token::Ident(ref ident) => Ok(ident.clone()),
273 _ => Err(TokenizerError::ExpectIdent),
274 })
275 }
276
277 pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
278 self.next_token_check_map(|token| match token {
279 &Token::StrLit(ref str_lit) => Ok(str_lit.clone()),
280 _ => Err(TokenizerError::ExpectStrLit),
281 })
282 }
283
284 pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
285 self.next_token_check_map(|token| match token {
286 &Token::IntLit(v) => Ok(v),
287 _ => Err(TokenizerError::ExpectIntLit),
288 })
289 }
290
291 pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
292 self.next_token_check_map(|token| match token {
293 &Token::FloatLit(v) => Ok(v),
294 _ => Err(TokenizerError::ExpectFloatLit),
295 })
296 }
297}
298
#[cfg(test)]
mod test {

    use super::*;

    /// Run `what` against a tokenizer over `input`, asserting that parsing
    /// succeeds and that the whole input is consumed.
    fn tokenize<P, R>(input: &str, what: P) -> R
    where
        P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
    {
        let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
        let parsed = what(&mut tokenizer).expect(&format!("parse failed at {}", tokenizer.loc()));
        let at_eof = tokenizer
            .syntax_eof()
            .expect(&format!("check eof failed at {}", tokenizer.loc()));
        assert!(at_eof, "{}", tokenizer.loc());
        parsed
    }

    #[test]
    fn test_ident() {
        let input = r#" aabb_c "#;
        let ident = tokenize(input, |p| p.next_ident().map(|s| s.to_owned()));
        assert_eq!("aabb_c", ident);
    }

    #[test]
    fn test_str_lit() {
        let input = r#" "a\nb" "#;
        let lit = tokenize(input, |p| p.next_str_lit());
        assert_eq!(
            StrLit {
                escaped: r#"a\nb"#.to_owned()
            },
            lit
        );
    }
}