use crate::lexer::lexer_impl::Lexer;
use crate::lexer::lexer_impl::LexerError;
use crate::lexer::loc::Loc;
use crate::lexer::parser_language::ParserLanguage;
use crate::lexer::str_lit::StrLit;
use crate::lexer::str_lit::StrLitDecodeError;
use crate::lexer::token::Token;
use crate::lexer::token::TokenWithLocation;
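/// Tokenizer-level error: wraps lexer and string-literal decoding failures,
/// and reports unexpected or missing tokens while parsing.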
#[derive(Debug, thiserror::Error)]
pub enum TokenizerError {
#[error(transparent)]
LexerError(#[from] LexerError),
#[error(transparent)]
StrLitDecodeError(#[from] StrLitDecodeError),
#[error("Internal tokenizer error")]
InternalError,
#[error("Incorrect input")]
IncorrectInput,
#[error("Not allowed in this context: {0}")]
NotAllowedInThisContext(&'static str),
#[error("Unexpected end of input")]
UnexpectedEof,
#[error("Expecting string literal")]
ExpectStrLit,
#[error("Expecting int literal")]
ExpectIntLit,
#[error("Expecting float literal")]
ExpectFloatLit,
#[error("Expecting identifier")]
ExpectIdent,
#[error("Expecting identifier `{}`", .0)]
ExpectNamedIdent(String),
#[error("While parsing {}, expecting char `{}`", .1, .0)]
ExpectChar(char, &'static str),
#[error("Expecting any char of: {}", .0.iter().map(|c| format!("`{}`", c)).collect::<Vec<_>>().join(", "))]
ExpectAnyChar(Vec<char>),
}
pub type TokenizerResult<R> = Result<R, TokenizerError>;
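/// A token stream with single-token lookahead, pulled lazily from a `Lexer`.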
#[derive(Clone)]
pub struct Tokenizer<'a> {
lexer: Lexer<'a>,
next_token: Option<TokenWithLocation>,
last_token_loc: Option<Loc>,
}
impl<'a> Tokenizer<'a> {
pub fn new(input: &'a str, comment_style: ParserLanguage) -> Tokenizer<'a> {
Tokenizer {
lexer: Lexer::new(input, comment_style),
next_token: None,
last_token_loc: None,
}
}
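    /// Current location: the buffered lookahead token's location if one is
    /// buffered, otherwise the location of the last token seen, otherwise
    /// the lexer's current position.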
pub fn loc(&self) -> Loc {
        self.next_token
            .as_ref()
            .map(|t| t.loc.clone())
            .or_else(|| self.last_token_loc.clone())
            .unwrap_or(self.lexer.loc)
}
    pub fn lookahead_loc(&mut self) -> Loc {
        // Fill the lookahead buffer if possible; an error here is deliberately
        // ignored, `loc()` still reports the best known location.
        let _ = self.lookahead();
        self.loc()
    }
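    /// Peek at the next token without consuming it, pulling one token from
    /// the lexer into the buffer if the buffer is empty.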
    fn lookahead(&mut self) -> TokenizerResult<Option<&Token>> {
        if self.next_token.is_none() {
            self.next_token = self.lexer.next_token()?;
            self.last_token_loc = self.next_token.as_ref().map(|t| t.loc.clone());
        }
        Ok(self.next_token.as_ref().map(|t| &t.token))
    }
pub fn lookahead_some(&mut self) -> TokenizerResult<&Token> {
match self.lookahead()? {
Some(token) => Ok(token),
None => Err(TokenizerError::UnexpectedEof),
}
}
fn next(&mut self) -> TokenizerResult<Option<Token>> {
self.lookahead()?;
Ok(self
.next_token
.take()
.map(|TokenWithLocation { token, .. }| token))
}
pub fn next_some(&mut self) -> TokenizerResult<Token> {
match self.next()? {
Some(token) => Ok(token),
None => Err(TokenizerError::UnexpectedEof),
}
}
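    /// Consume the buffered lookahead token. Callers must have performed a
    /// successful `lookahead()` first; an empty buffer is an internal error.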
pub fn advance(&mut self) -> TokenizerResult<Token> {
self.next_token
.take()
.map(|TokenWithLocation { token, .. }| token)
.ok_or(TokenizerError::InternalError)
}
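    /// True once the whole input has been consumed.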
pub fn syntax_eof(&mut self) -> TokenizerResult<bool> {
Ok(self.lookahead()?.is_none())
}
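    /// Consume the next token and return `p`'s result if `p` matches it;
    /// otherwise leave the token in place and return `Ok(None)`.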
    pub fn next_token_if_map<P, R>(&mut self, p: P) -> TokenizerResult<Option<R>>
    where
        P: FnOnce(&Token) -> Option<R>,
    {
        self.lookahead()?;
        let v = match &self.next_token {
            Some(token) => match p(&token.token) {
                Some(v) => v,
                None => return Ok(None),
            },
            None => return Ok(None),
        };
        self.next_token = None;
        Ok(Some(v))
    }
    pub fn next_token_check_map<P, R, E>(&mut self, p: P) -> Result<R, E>
    where
        P: FnOnce(&Token) -> Result<R, E>,
        E: From<TokenizerError>,
    {
        self.lookahead()?;
        let r = match &self.next_token {
            Some(token) => p(&token.token)?,
            None => return Err(TokenizerError::UnexpectedEof.into()),
        };
        self.next_token = None;
        Ok(r)
    }
fn next_token_if<P>(&mut self, p: P) -> TokenizerResult<Option<Token>>
where
P: FnOnce(&Token) -> bool,
{
self.next_token_if_map(|token| if p(token) { Some(token.clone()) } else { None })
}
    pub fn next_ident_if_in(&mut self, idents: &[&str]) -> TokenizerResult<Option<String>> {
        let v = match self.lookahead()? {
            Some(Token::Ident(next)) if idents.iter().any(|&i| i == next) => next.clone(),
            _ => return Ok(None),
        };
        self.advance()?;
        Ok(Some(v))
    }
    pub fn next_ident_if_eq(&mut self, word: &str) -> TokenizerResult<bool> {
        Ok(self.next_ident_if_in(&[word])?.is_some())
    }
pub fn next_ident_expect_eq(&mut self, word: &str) -> TokenizerResult<()> {
if self.next_ident_if_eq(word)? {
Ok(())
} else {
Err(TokenizerError::ExpectNamedIdent(word.to_owned()))
}
}
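    /// Fail if the next token is the identifier `word`, without consuming it.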
    pub fn next_ident_if_eq_error(&mut self, word: &'static str) -> TokenizerResult<()> {
        // Peek on a clone so the token is not consumed from `self`.
        if self.clone().next_ident_if_eq(word)? {
            return Err(TokenizerError::NotAllowedInThisContext(word));
        }
        Ok(())
    }
    pub fn next_symbol_if_eq(&mut self, symbol: char) -> TokenizerResult<bool> {
        Ok(self
            .next_token_if(|token| matches!(token, Token::Symbol(c) if *c == symbol))?
            .is_some())
    }
pub fn next_symbol_expect_eq(
&mut self,
symbol: char,
desc: &'static str,
) -> TokenizerResult<()> {
if self.lookahead_is_symbol(symbol)? {
self.advance()?;
Ok(())
} else {
Err(TokenizerError::ExpectChar(symbol, desc))
}
}
    pub fn next_symbol_expect_eq_oneof(&mut self, symbols: &[char]) -> TokenizerResult<char> {
        for &symbol in symbols {
            // The per-symbol description is never surfaced: if no symbol
            // matches we report `ExpectAnyChar` with the whole list instead.
            if self.next_symbol_expect_eq(symbol, "ignored").is_ok() {
                return Ok(symbol);
            }
        }
        Err(TokenizerError::ExpectAnyChar(symbols.to_owned()))
    }
    pub fn lookahead_is_str_lit(&mut self) -> TokenizerResult<bool> {
        Ok(matches!(self.lookahead()?, Some(Token::StrLit(..))))
    }
    pub fn lookahead_is_int_lit(&mut self) -> TokenizerResult<bool> {
        Ok(matches!(self.lookahead()?, Some(Token::IntLit(..))))
    }
    pub fn lookahead_is_json_number(&mut self) -> TokenizerResult<bool> {
        Ok(matches!(self.lookahead()?, Some(Token::JsonNumber(..))))
    }
pub fn lookahead_if_symbol(&mut self) -> TokenizerResult<Option<char>> {
Ok(match self.lookahead()? {
Some(&Token::Symbol(c)) => Some(c),
_ => None,
})
}
pub fn lookahead_is_symbol(&mut self, symbol: char) -> TokenizerResult<bool> {
Ok(self.lookahead_if_symbol()? == Some(symbol))
}
pub fn lookahead_is_ident(&mut self, ident: &str) -> TokenizerResult<bool> {
Ok(match self.lookahead()? {
Some(Token::Ident(i)) => i == ident,
_ => false,
})
}
    pub fn next_ident(&mut self) -> TokenizerResult<String> {
        self.next_token_check_map(|token| match token {
            Token::Ident(ident) => Ok(ident.clone()),
            _ => Err(TokenizerError::ExpectIdent),
        })
    }
    pub fn next_str_lit(&mut self) -> TokenizerResult<StrLit> {
        self.next_token_check_map(|token| match token {
            Token::StrLit(str_lit) => Ok(str_lit.clone()),
            _ => Err(TokenizerError::ExpectStrLit),
        })
    }
    pub fn next_int_lit(&mut self) -> TokenizerResult<u64> {
        self.next_token_check_map(|token| match token {
            Token::IntLit(v) => Ok(*v),
            _ => Err(TokenizerError::ExpectIntLit),
        })
    }
    pub fn next_float_lit(&mut self) -> TokenizerResult<f64> {
        self.next_token_check_map(|token| match token {
            Token::FloatLit(v) => Ok(*v),
            _ => Err(TokenizerError::ExpectFloatLit),
        })
    }
}
#[cfg(test)]
mod test {
use super::*;
fn tokenize<P, R>(input: &str, what: P) -> R
where
P: FnOnce(&mut Tokenizer) -> TokenizerResult<R>,
{
let mut tokenizer = Tokenizer::new(input, ParserLanguage::Proto);
        let r = what(&mut tokenizer)
            .unwrap_or_else(|e| panic!("parse failed at {}: {:?}", tokenizer.loc(), e));
        let eof = tokenizer
            .syntax_eof()
            .unwrap_or_else(|e| panic!("check eof failed at {}: {:?}", tokenizer.loc(), e));
assert!(eof, "{}", tokenizer.loc());
r
}
#[test]
fn test_ident() {
let msg = r#" aabb_c "#;
        let mess = tokenize(msg, |p| p.next_ident());
assert_eq!("aabb_c", mess);
}
#[test]
fn test_str_lit() {
let msg = r#" "a\nb" "#;
let mess = tokenize(msg, |p| p.next_str_lit());
assert_eq!(
StrLit {
escaped: r#"a\nb"#.to_owned()
},
mess
);
}
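
    // Extra usage sketches beyond the original tests. These assume the
    // proto lexer yields `Token::Symbol` for `=` / `;` and `Token::IntLit`
    // for plain decimal integers; adjust if the lexer classifies them
    // differently.
    #[test]
    fn test_symbols_and_idents() {
        tokenize("foo = bar;", |p| {
            assert_eq!("foo", p.next_ident()?);
            p.next_symbol_expect_eq('=', "test input")?;
            assert_eq!("bar", p.next_ident()?);
            p.next_symbol_expect_eq(';', "test input")?;
            Ok(())
        });
    }

    #[test]
    fn test_int_lit() {
        assert_eq!(123, tokenize(" 123 ", |p| p.next_int_lit()));
    }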
}