use std::char;
use std::num::ParseFloatError;
use std::num::ParseIntError;
use crate::lexer::float;
use crate::lexer::float::ProtobufFloatParseError;
use crate::lexer::json_number_lit::JsonNumberLit;
use crate::lexer::loc::Loc;
use crate::lexer::loc::FIRST_COL;
use crate::lexer::parser_language::ParserLanguage;
use crate::lexer::str_lit::StrLit;
use crate::lexer::str_lit::StrLitDecodeError;
use crate::lexer::token::Token;
use crate::lexer::token::TokenWithLocation;
/// Errors produced while lexing.
#[derive(Debug, thiserror::Error)]
pub enum LexerError {
/// The input at the current position cannot be lexed, e.g. no token matches
/// or a raw newline / NUL appears inside a string literal.
#[error("Incorrect input")]
IncorrectInput,
/// The input ended while more characters were required.
#[error("Unexpected EOF")]
UnexpectedEof,
/// A specific character was required but not found; carries the expected character.
#[error("Expecting char: {:?}", .0)]
ExpectChar(char),
/// An integer literal could not be converted to a number
/// (converted from `std::num::ParseIntError`, whose details are dropped).
#[error("Parse int error")]
ParseIntError,
/// A float could not be parsed
/// (converted from `std::num::ParseFloatError`, whose details are dropped).
#[error("Parse float error")]
ParseFloatError,
/// A float literal is malformed; also produced from `ProtobufFloatParseError`.
#[error("Incorrect float literal")]
IncorrectFloatLit,
/// A backslash escape in a JSON string is not recognized.
#[error("Incorrect JSON escape")]
IncorrectJsonEscape,
/// A JSON number does not follow the expected digit structure.
#[error("Incorrect JSON number")]
IncorrectJsonNumber,
/// A numeric code does not denote a valid `char`.
#[error("Incorrect Unicode character")]
IncorrectUnicodeChar,
/// A hexadecimal digit was required.
#[error("Expecting hex digit")]
ExpectHexDigit,
/// An octal digit was required.
#[error("Expecting oct digit")]
ExpectOctDigit,
/// A decimal digit was required.
#[error("Expecting dec digit")]
ExpectDecDigit,
/// Wraps a string literal decoding error; its message is forwarded unchanged.
#[error(transparent)]
StrLitDecodeError(#[from] StrLitDecodeError),
/// An identifier was required.
/// NOTE(review): not produced by the code visible in this file; presumably used by callers.
#[error("Expecting identifier")]
ExpectedIdent,
}
/// Result type used throughout the lexer; the error is always [`LexerError`].
pub type LexerResult<T> = Result<T, LexerError>;
impl From<ParseIntError> for LexerError {
fn from(_: ParseIntError) -> Self {
LexerError::ParseIntError
}
}
impl From<ParseFloatError> for LexerError {
fn from(_: ParseFloatError) -> Self {
LexerError::ParseFloatError
}
}
impl From<ProtobufFloatParseError> for LexerError {
fn from(_: ProtobufFloatParseError) -> Self {
LexerError::IncorrectFloatLit
}
}
/// Up to four bytes produced by decoding one element of a string literal:
/// either a single raw byte or the UTF-8 encoding of one `char`.
pub(crate) struct DecodedBytes {
    /// Backing storage; only the first `len` bytes are meaningful.
    buf: [u8; 4],
    /// Number of valid bytes at the start of `buf`.
    len: usize,
}

impl DecodedBytes {
    /// Wraps a single raw byte.
    fn byte(b: u8) -> DecodedBytes {
        let mut buf = [0u8; 4];
        buf[0] = b;
        DecodedBytes { buf, len: 1 }
    }

    /// Stores the UTF-8 encoding of `value`.
    fn char(value: char) -> Self {
        let mut buf = [0u8; 4];
        let len = value.len_utf8();
        value.encode_utf8(&mut buf);
        Self { buf, len }
    }

    /// Returns the valid bytes.
    pub(crate) fn bytes(&self) -> &[u8] {
        let (valid, _unused) = self.buf.split_at(self.len);
        valid
    }
}
/// Lexer state over a borrowed input string.
///
/// The type is `Copy`, and methods in this file copy it to look ahead and
/// then either commit the copy or drop it.
#[derive(Copy, Clone)]
pub struct Lexer<'a> {
// Selects language-specific behavior (comment styles, number syntax).
language: ParserLanguage,
// The complete text being lexed.
input: &'a str,
// Byte offset into `input` of the next unread character.
pos: usize,
/// Line/column location matching `pos`.
pub loc: Loc,
}
/// Returns `true` when `c` may start an identifier: an underscore or any
/// character that `char::is_alphabetic` accepts.
fn is_letter(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphabetic(),
    }
}
impl<'a> Lexer<'a> {
/// Creates a lexer positioned at the beginning of `input` that lexes
/// according to `language`.
pub fn new(input: &'a str, language: ParserLanguage) -> Lexer<'a> {
    let loc = Loc::start();
    let pos = 0;
    Lexer {
        language,
        input,
        pos,
        loc,
    }
}
/// Returns `true` once every byte of the input has been consumed.
pub fn eof(&self) -> bool {
    let total = self.input.len();
    total == self.pos
}
/// Returns the not-yet-consumed tail of the input.
fn rem_chars(&self) -> &'a str {
    let whole: &'a str = self.input;
    &whole[self.pos..]
}
/// Tests the next character against `p` without consuming it.
///
/// Returns `false` at end of input.
pub fn lookahead_char_is<P: FnOnce(char) -> bool>(&self, p: P) -> bool {
    match self.lookahead_char() {
        Some(c) => p(c),
        None => false,
    }
}
/// Returns `true` when the next character exists and occurs in `alphabet`;
/// nothing is consumed.
fn lookahead_char_is_in(&self, alphabet: &str) -> bool {
    self.lookahead_char()
        .map_or(false, |c| alphabet.contains(c))
}
/// Consumes and returns the next character, or returns `None` at end of input.
///
/// Advances `pos` by the character's UTF-8 length and updates `loc`: a
/// newline moves to the next line and resets the column to `FIRST_COL`,
/// any other character advances the column by one.
fn next_char_opt(&mut self) -> Option<char> {
    let c = self.rem_chars().chars().next()?;
    self.pos += c.len_utf8();
    match c {
        '\n' => {
            self.loc.line += 1;
            self.loc.col = FIRST_COL;
        }
        _ => self.loc.col += 1,
    }
    Some(c)
}
/// Consumes and returns the next character.
///
/// # Errors
/// `LexerError::UnexpectedEof` when the input is exhausted.
fn next_char(&mut self) -> LexerResult<char> {
    match self.next_char_opt() {
        Some(c) => Ok(c),
        None => Err(LexerError::UnexpectedEof),
    }
}
/// Consumes a (possibly empty) run of whitespace characters.
fn skip_whitespaces(&mut self) {
    let _skipped = self.take_while(|ch: char| ch.is_whitespace());
}
/// Skips one `/* ... */` comment if the input starts with `/*`; otherwise
/// does nothing.
///
/// # Errors
/// `LexerError::UnexpectedEof` when the comment is never closed.
fn skip_c_comment(&mut self) -> LexerResult<()> {
    const TERMINATOR: &str = "*/";
    if !self.skip_if_lookahead_is_str("/*") {
        return Ok(());
    }
    // Offset of the terminator relative to the current position.
    let offset = self
        .rem_chars()
        .find(TERMINATOR)
        .ok_or(LexerError::UnexpectedEof)?;
    let target = self.pos + offset + TERMINATOR.len();
    self.skip_to_pos(target);
    Ok(())
}
/// Skips one `// ...` comment, including its terminating newline, if the
/// input starts with `//`. The comment may also end at end of input.
fn skip_cpp_comment(&mut self) {
    if !self.skip_if_lookahead_is_str("//") {
        return;
    }
    while let Some(c) = self.next_char_opt() {
        if c == '\n' {
            break;
        }
    }
}
/// Skips one `# ...` comment, including its terminating newline, if the
/// input starts with `#`. The comment may also end at end of input.
fn skip_sh_comment(&mut self) {
    if !self.skip_if_lookahead_is_str("#") {
        return;
    }
    while let Some(c) = self.next_char_opt() {
        if c == '\n' {
            break;
        }
    }
}
/// Skips comments in the style of the configured language:
/// `/* */` then `//` for proto, `#` for text format, none for JSON.
///
/// # Errors
/// Propagates the error from an unterminated `/* */` comment.
fn skip_comment(&mut self) -> LexerResult<()> {
    match self.language {
        ParserLanguage::Json => Ok(()),
        ParserLanguage::TextFormat => {
            self.skip_sh_comment();
            Ok(())
        }
        ParserLanguage::Proto => {
            self.skip_c_comment()?;
            self.skip_cpp_comment();
            Ok(())
        }
    }
}
/// Skips any mix of whitespace and comments.
///
/// Repeats whitespace and comment skipping until a full round makes no
/// progress.
///
/// # Errors
/// Propagates comment-skipping errors.
pub fn skip_ws(&mut self) -> LexerResult<()> {
    loop {
        let before = self.pos;
        self.skip_whitespaces();
        self.skip_comment()?;
        if self.pos == before {
            break Ok(());
        }
    }
}
/// Consumes characters for as long as `f` accepts them and returns the
/// consumed slice of the input (possibly empty).
pub fn take_while<F>(&mut self, f: F) -> &'a str
where
    F: Fn(char) -> bool,
{
    let start = self.pos;
    loop {
        match self.lookahead_char() {
            Some(c) if f(c) => {
                // The lookahead just proved a character is available.
                self.next_char_opt().unwrap();
            }
            _ => break,
        }
    }
    &self.input[start..self.pos]
}
/// Returns the next character without consuming it, or `None` at end of input.
fn lookahead_char(&self) -> Option<char> {
    self.rem_chars().chars().next()
}
/// Returns `true` when the remaining input begins with `s`; nothing is consumed.
fn lookahead_is_str(&self, s: &str) -> bool {
    let rest = self.rem_chars();
    rest.strip_prefix(s).is_some()
}
/// Consumes `s` if the remaining input begins with it.
///
/// Returns whether `s` was consumed.
fn skip_if_lookahead_is_str(&mut self, s: &str) -> bool {
    let matched = self.lookahead_is_str(s);
    if matched {
        let target = self.pos + s.len();
        self.skip_to_pos(target);
    }
    matched
}
/// Consumes and returns the next character only if `p` accepts it.
///
/// Returns `None`, leaving the lexer untouched, at end of input or when
/// `p` rejects the character.
fn next_char_if<P>(&mut self, p: P) -> Option<char>
where
    P: FnOnce(char) -> bool,
{
    // Advance a copy first; commit it only when the predicate holds.
    let mut probe = *self;
    let c = probe.next_char_opt()?;
    if p(c) {
        *self = probe;
        Some(c)
    } else {
        None
    }
}
/// Consumes the next character if it equals `expect`; returns whether it did.
pub fn next_char_if_eq(&mut self, expect: char) -> bool {
    self.next_char_if(|c| c == expect).is_some()
}
/// Consumes the next character if it is one of the characters of
/// `alphabet` and returns it; otherwise returns `None` and consumes nothing.
fn next_char_if_in(&mut self, alphabet: &str) -> Option<char> {
    // Candidates are tried in `alphabet` order; the first match is consumed.
    alphabet
        .chars()
        .find(|&candidate| self.next_char_if_eq(candidate))
}
/// Consumes the next character, requiring it to equal `expect`.
///
/// # Errors
/// `LexerError::ExpectChar(expect)` when the next character differs or the
/// input is exhausted; nothing is consumed in that case.
fn next_char_expect_eq(&mut self, expect: char) -> LexerResult<()> {
    if !self.next_char_if_eq(expect) {
        return Err(LexerError::ExpectChar(expect));
    }
    Ok(())
}
/// Consumes and returns the next character, requiring that `expect`
/// accepts it.
///
/// # Errors
/// Returns `err` when the predicate rejects the character or the input is
/// exhausted; nothing is consumed in that case.
fn next_char_expect<P>(&mut self, expect: P, err: LexerError) -> LexerResult<char>
where
    P: FnOnce(char) -> bool,
{
    match self.next_char_if(expect) {
        Some(c) => Ok(c),
        None => Err(err),
    }
}
/// Consumes characters one at a time until the byte offset `new_pos` is
/// reached, keeping `loc` in sync, and returns the skipped slice.
///
/// # Panics
/// Panics when `new_pos` is before the current position or past the end of
/// the input.
fn skip_to_pos(&mut self, new_pos: usize) -> &'a str {
    assert!(new_pos >= self.pos);
    assert!(new_pos <= self.input.len());
    let start = self.pos;
    loop {
        if self.pos == new_pos {
            break;
        }
        self.next_char_opt().unwrap();
    }
    &self.input[start..new_pos]
}
fn next_letter_opt(&mut self) -> Option<char> {
self.next_char_if(is_letter)
}
/// Consumes and returns the next character if it is an ASCII capital
/// letter (`A` to `Z`).
fn _next_capital_letter_opt(&mut self) -> Option<char> {
    self.next_char_if(|c| c.is_ascii_uppercase())
}
/// Consumes and returns the next character if it can continue an
/// identifier: an ASCII letter, an ASCII digit or `_`.
fn next_ident_part(&mut self) -> Option<char> {
    self.next_char_if(|c| matches!(c, '_' | 'a'..='z' | 'A'..='Z' | '0'..='9'))
}
/// Reads an identifier: one letter (see `is_letter`) followed by any
/// number of identifier-part characters.
///
/// Returns `Ok(None)`, consuming nothing, when the next character cannot
/// start an identifier.
fn next_ident_opt(&mut self) -> LexerResult<Option<String>> {
    let first = match self.next_letter_opt() {
        Some(c) => c,
        None => return Ok(None),
    };
    let mut ident = String::from(first);
    while let Some(c) = self.next_ident_part() {
        ident.push(c);
    }
    Ok(Some(ident))
}
/// Reads a hexadecimal integer literal introduced by `0x` or `0X`.
///
/// Returns `Ok(None)`, consuming nothing, when the prefix is absent.
///
/// # Errors
/// `LexerError::ParseIntError` when the digits after the prefix are missing
/// or do not fit into `u64`; the prefix and digits have been consumed by then.
fn next_hex_lit_opt(&mut self) -> LexerResult<Option<u64>> {
    let has_prefix =
        self.skip_if_lookahead_is_str("0x") || self.skip_if_lookahead_is_str("0X");
    if !has_prefix {
        return Ok(None);
    }
    let digits = self.take_while(|c| c.is_ascii_hexdigit());
    let value = u64::from_str_radix(digits, 16)?;
    Ok(Some(value))
}
/// Reads a decimal or octal integer literal.
///
/// A run of ASCII digits is consumed. Following the protobuf grammar
/// (`octalLit = "0" { octalDigit }`), a literal with a leading `0` and at
/// least one more digit is interpreted in base 8 (`010` is 8); everything
/// else is interpreted in base 10.
///
/// Returns `Ok(None)`, consuming nothing, when the next character is not a
/// digit.
///
/// # Errors
/// `LexerError::ParseIntError` when the value does not fit into `u64` or an
/// octal literal contains `8` or `9`. The lexer does not advance on error.
fn next_decimal_octal_lit_opt(&mut self) -> LexerResult<Option<u64>> {
    // Work on a copy so that the lexer does not advance on a parse error.
    let mut clone = self.clone();
    let start = clone.pos;
    if clone.next_char_if(|c| c.is_ascii_digit()).is_none() {
        return Ok(None);
    }
    clone.take_while(|c| c.is_ascii_digit());
    let digits = &clone.input[start..clone.pos];
    let value = if digits.len() > 1 && digits.starts_with('0') {
        // Leading zero marks an octal literal; the zero itself carries no value.
        u64::from_str_radix(&digits[1..], 8)?
    } else {
        digits.parse()?
    };
    *self = clone;
    Ok(Some(value))
}
/// Consumes one hexadecimal digit (`0-9`, `a-f`, `A-F`) and returns its value.
///
/// # Errors
/// `LexerError::UnexpectedEof` at end of input, `LexerError::ExpectHexDigit`
/// when the next character is not a hex digit. Nothing is consumed on error.
fn next_hex_digit(&mut self) -> LexerResult<u32> {
    let mut probe = *self;
    let c = probe.next_char()?;
    let value = c.to_digit(16).ok_or(LexerError::ExpectHexDigit)?;
    *self = probe;
    Ok(value)
}
/// Consumes one octal digit (`0` to `7`) and returns its value.
///
/// # Errors
/// `LexerError::ExpectOctDigit` when the next character is not an octal
/// digit (this includes `8` and `9`) or the input is exhausted; nothing is
/// consumed in that case.
fn next_octal_digit(&mut self) -> LexerResult<u32> {
    self.next_char_expect(|c| ('0'..='7').contains(&c), LexerError::ExpectOctDigit)
        .map(|c| c as u32 - '0' as u32)
}
/// Consumes one decimal digit (`0` to `9`) and returns its value.
///
/// # Errors
/// `LexerError::ExpectDecDigit` when the next character is not a decimal
/// digit or the input is exhausted; nothing is consumed in that case.
fn next_decimal_digit(&mut self) -> LexerResult<u32> {
    let c = self.next_char_expect(|c| c.is_ascii_digit(), LexerError::ExpectDecDigit)?;
    Ok(u32::from(c) - u32::from('0'))
}
/// Consumes one or more decimal digits.
///
/// # Errors
/// `LexerError::ExpectDecDigit` when not even one digit is present.
fn next_decimal_digits(&mut self) -> LexerResult<()> {
    let _first = self.next_decimal_digit()?;
    let _rest = self.take_while(|c| c.is_ascii_digit());
    Ok(())
}
/// Skips whitespace and comments, then reads an integer literal: a
/// hexadecimal literal is tried first, then a decimal/octal one.
///
/// Returns `Ok(None)` when no integer literal starts at that point.
///
/// # Panics
/// Panics when the lexer language is JSON.
pub fn next_int_lit_opt(&mut self) -> LexerResult<Option<u64>> {
    assert_ne!(ParserLanguage::Json, self.language);
    self.skip_ws()?;
    match self.next_hex_lit_opt()? {
        Some(value) => Ok(Some(value)),
        None => self.next_decimal_octal_lit_opt(),
    }
}
/// Consumes an exponent: `e` or `E`, an optional sign, then one or more
/// decimal digits.
///
/// Returns `Ok(None)`, consuming nothing, when the next character is not
/// `e`/`E`.
///
/// # Errors
/// `LexerError::ExpectDecDigit` when the digits are missing.
fn next_exponent_opt(&mut self) -> LexerResult<Option<()>> {
    if self.next_char_if_in("eE").is_none() {
        return Ok(None);
    }
    // The sign is optional, so the result is intentionally ignored.
    self.next_char_if_in("+-");
    self.next_decimal_digits()?;
    Ok(Some(()))
}
/// Consumes the text of a float literal without converting it.
///
/// Accepted shapes: `.digits [exponent]`, `digits.digits [exponent]` and
/// `digits exponent`.
///
/// # Errors
/// `LexerError::ExpectDecDigit` when required digits are missing, and
/// `LexerError::IncorrectFloatLit` for plain digits with neither a fraction
/// nor an exponent. The lexer may have advanced when an error is returned.
///
/// # Panics
/// Panics when the lexer language is JSON.
fn next_float_lit(&mut self) -> LexerResult<()> {
    assert_ne!(ParserLanguage::Json, self.language);
    if self.next_char_if_eq('.') {
        self.next_decimal_digits()?;
        self.next_exponent_opt()?;
        return Ok(());
    }
    self.next_decimal_digits()?;
    if self.next_char_if_eq('.') {
        self.next_decimal_digits()?;
        self.next_exponent_opt()?;
        return Ok(());
    }
    // Without a fraction the exponent is what makes this a float.
    match self.next_exponent_opt()? {
        Some(()) => Ok(()),
        None => Err(LexerError::IncorrectFloatLit),
    }
}
/// Decodes one element of a string literal body: either a plain character
/// or one backslash escape.
///
/// Recognized escapes: the quote characters, backslash, `\a \b \f \n \r \t
/// \v`, `\x` followed by exactly two hex digits, and an octal escape of one
/// to three digits. Any other escaped character stands for itself.
///
/// # Errors
/// `LexerError::UnexpectedEof` at end of input, `LexerError::IncorrectInput`
/// for an unescaped newline or NUL, and hex digit errors for a malformed
/// `\x` escape.
pub(crate) fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> {
    match self.next_char()? {
        '\\' => match self.next_char()? {
            '\'' => Ok(DecodedBytes::byte(b'\'')),
            '"' => Ok(DecodedBytes::byte(b'"')),
            '\\' => Ok(DecodedBytes::byte(b'\\')),
            'a' => Ok(DecodedBytes::byte(b'\x07')),
            'b' => Ok(DecodedBytes::byte(b'\x08')),
            'f' => Ok(DecodedBytes::byte(b'\x0c')),
            'n' => Ok(DecodedBytes::byte(b'\n')),
            'r' => Ok(DecodedBytes::byte(b'\r')),
            't' => Ok(DecodedBytes::byte(b'\t')),
            'v' => Ok(DecodedBytes::byte(b'\x0b')),
            'x' => {
                let d1 = self.next_hex_digit()? as u8;
                let d2 = self.next_hex_digit()? as u8;
                Ok(DecodedBytes::byte((d1 << 4) | d2))
            }
            d if d >= '0' && d <= '7' => {
                let mut r = d as u8 - b'0';
                // Up to two more digits; stop at the first non-digit.
                for _ in 0..2 {
                    match self.next_octal_digit() {
                        Err(_) => break,
                        // Wrapping keeps debug builds from panicking on
                        // out-of-range escapes; release builds already wrap.
                        Ok(d) => r = (r << 3).wrapping_add(d as u8),
                    }
                }
                Ok(DecodedBytes::byte(r))
            }
            c => Ok(DecodedBytes::char(c)),
        },
        '\n' | '\0' => Err(LexerError::IncorrectInput),
        c => Ok(DecodedBytes::char(c)),
    }
}
/// Converts a numeric code into a `char`.
///
/// # Errors
/// `LexerError::IncorrectUnicodeChar` when `i` is not a valid `char` value.
fn char_try_from(i: u32) -> LexerResult<char> {
    match char::from_u32(i) {
        Some(c) => Ok(c),
        None => Err(LexerError::IncorrectUnicodeChar),
    }
}
pub fn next_json_char_value(&mut self) -> LexerResult<char> {
match self.next_char()? {
'\\' => match self.next_char()? {
'"' => Ok('"'),
'\'' => Ok('\''),
'\\' => Ok('\\'),
'/' => Ok('/'),
'b' => Ok('\x08'),
'f' => Ok('\x0c'),
'n' => Ok('\n'),
'r' => Ok('\r'),
't' => Ok('\t'),
'u' => {
let mut v = 0;
for _ in 0..4 {
let digit = self.next_hex_digit()?;
v = v * 16 + digit;
}
Self::char_try_from(v)
}
_ => Err(LexerError::IncorrectJsonEscape),
},
c => Ok(c),
}
}
/// Reads one or more adjacent quoted string literals and returns their
/// bodies concatenated, with escapes left undecoded.
///
/// Literals may be delimited by `'` or `"`. Whitespace and comments between
/// consecutive literals are skipped, and so is any that follows the last one.
///
/// # Errors
/// `LexerError::IncorrectInput` when the input does not start with a quote,
/// plus any error from scanning a literal body or its closing quote.
fn next_str_lit_raw(&mut self) -> LexerResult<String> {
    let mut raw = String::new();
    let mut seen_literal = false;
    loop {
        if seen_literal {
            self.skip_ws()?;
        }
        let open = self.pos;
        let quote = match self.next_char_if_in("'\"") {
            Some(q) => q,
            None => {
                if seen_literal {
                    break;
                }
                return Err(LexerError::IncorrectInput);
            }
        };
        seen_literal = true;
        // Walk the body element by element so escaped quotes do not end it.
        while self.lookahead_char() != Some(quote) {
            self.next_str_lit_bytes()?;
        }
        self.next_char_expect_eq(quote)?;
        // Both quotes are one byte wide, so the body is the span between them.
        raw.push_str(&self.input[open + 1..self.pos - 1]);
    }
    Ok(raw)
}
/// Reads a raw string literal if the next character is a quote (`'` or `"`).
///
/// Returns `Ok(None)`, consuming nothing, otherwise.
fn next_str_lit_raw_opt(&mut self) -> LexerResult<Option<String>> {
    if !self.lookahead_char_is_in("'\"") {
        return Ok(None);
    }
    self.next_str_lit_raw().map(Some)
}
fn next_json_number_opt(&mut self) -> LexerResult<Option<JsonNumberLit>> {
assert_eq!(ParserLanguage::Json, self.language);
fn is_digit(c: char) -> bool {
c >= '0' && c <= '9'
}
fn is_digit_1_9(c: char) -> bool {
c >= '1' && c <= '9'
}
if !self.lookahead_char_is_in("-0123456789") {
return Ok(None);
}
let mut s = String::new();
if self.next_char_if_eq('-') {
s.push('-');
}
if self.next_char_if_eq('0') {
s.push('0');
} else {
s.push(self.next_char_expect(is_digit_1_9, LexerError::IncorrectJsonNumber)?);
while let Some(c) = self.next_char_if(is_digit) {
s.push(c);
}
}
if self.next_char_if_eq('.') {
s.push('.');
s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
while let Some(c) = self.next_char_if(is_digit) {
s.push(c);
}
}
if let Some(c) = self.next_char_if_in("eE") {
s.push(c);
if let Some(c) = self.next_char_if_in("+-") {
s.push(c);
}
s.push(self.next_char_expect(is_digit, LexerError::IncorrectJsonNumber)?);
while let Some(c) = self.next_char_if(is_digit) {
s.push(c);
}
}
Ok(Some(JsonNumberLit(s)))
}
/// Reads one token starting at the current position.
///
/// Alternatives are tried in this order: JSON number (JSON only),
/// identifier (outside JSON the identifiers equal to `float::PROTOBUF_NAN`
/// and `float::PROTOBUF_INF` become float tokens), float literal and
/// integer literal (non-JSON only), string literal, a single ASCII
/// punctuation character, and finally an identifier once more.
///
/// # Errors
/// `LexerError::IncorrectInput` when no alternative matches, plus any error
/// raised while reading the matching alternative.
fn next_token_inner(&mut self) -> LexerResult<Token> {
    let is_json = self.language == ParserLanguage::Json;

    if is_json {
        if let Some(number) = self.next_json_number_opt()? {
            return Ok(Token::JsonNumber(number));
        }
    }

    if let Some(ident) = self.next_ident_opt()? {
        if !is_json {
            if ident == float::PROTOBUF_NAN {
                return Ok(Token::FloatLit(f64::NAN));
            }
            if ident == float::PROTOBUF_INF {
                return Ok(Token::FloatLit(f64::INFINITY));
            }
        }
        return Ok(Token::Ident(ident));
    }

    if !is_json {
        // Try a float on a copy; commit only if the literal is well-formed.
        let mut probe = *self;
        let start = probe.pos;
        if probe.next_float_lit().is_ok() {
            let value = float::parse_protobuf_float(&self.input[start..probe.pos])?;
            *self = probe;
            return Ok(Token::FloatLit(value));
        }

        if let Some(lit) = self.next_int_lit_opt()? {
            return Ok(Token::IntLit(lit));
        }
    }

    if let Some(escaped) = self.next_str_lit_raw_opt()? {
        return Ok(Token::StrLit(StrLit { escaped }));
    }

    if let Some(symbol) = self.next_char_if(|c| c.is_ascii_punctuation()) {
        return Ok(Token::Symbol(symbol));
    }

    // The integer attempt above may have skipped whitespace or comments, so
    // an identifier can start here even though the first attempt found none.
    match self.next_ident_opt()? {
        Some(ident) => Ok(Token::Ident(ident)),
        None => Err(LexerError::IncorrectInput),
    }
}
/// Reads the next token together with the location where it starts.
///
/// Whitespace and comments are skipped before and after the token. Returns
/// `Ok(None)` when only whitespace and comments remain.
///
/// # Errors
/// Propagates errors from skipping and from reading the token.
pub fn next_token(&mut self) -> LexerResult<Option<TokenWithLocation>> {
    self.skip_ws()?;
    // Captured after skipping, so it points at the token's first character.
    let loc = self.loc;
    if self.eof() {
        return Ok(None);
    }
    let token = self.next_token_inner()?;
    self.skip_ws()?;
    Ok(Some(TokenWithLocation { token, loc }))
}
}
#[cfg(test)]
mod test {
    use super::*;

    /// Runs `parse_what` on a proto-language lexer over `input`, requiring
    /// that it succeeds and consumes the whole input.
    fn lex<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<R>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let parsed = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    /// Like `lex`, for parsers returning an `Option`; `None` is a failure.
    fn lex_opt<P, R>(input: &str, parse_what: P) -> R
    where
        P: FnOnce(&mut Lexer) -> LexerResult<Option<R>>,
    {
        let mut lexer = Lexer::new(input, ParserLanguage::Proto);
        let maybe = match parse_what(&mut lexer) {
            Ok(value) => value,
            Err(e) => panic!("lexer failed at {}: {:?}", lexer.loc, e),
        };
        let parsed = match maybe {
            Some(value) => value,
            None => panic!("lexer returned none at {}", lexer.loc),
        };
        assert!(lexer.eof(), "check eof failed at {}", lexer.loc);
        parsed
    }

    #[test]
    fn test_lexer_int_lit() {
        let parsed = lex_opt("10", |p| p.next_int_lit_opt());
        assert_eq!(10, parsed);
    }

    #[test]
    fn test_lexer_float_lit() {
        let parsed = lex("12.3", |p| p.next_token_inner());
        assert_eq!(Token::FloatLit(12.3), parsed);
    }

    #[test]
    fn test_lexer_float_lit_leading_zeros_in_exp() {
        let parsed = lex("1e00009", |p| p.next_token_inner());
        assert_eq!(Token::FloatLit(1_000_000_000.0), parsed);
    }
}