use crate::error::{Error, Errors, Span};
use std::borrow::Cow;
use std::path::Path;
use std::sync::Arc;
/// Local result alias: lexing failures carry an accumulated [`Errors`] value.
type Result<T> = std::result::Result<T, Errors>;
/// A one-token-lookahead lexer over one or more concatenated source files.
///
/// All input bytes live in `buf`; `file_starts` records where each file's
/// text begins inside that buffer so positions can be reported per file.
#[derive(Clone, Debug)]
pub struct Lexer<'a> {
/// Name of each input file, indexed by `Pos::file`.
pub filenames: Vec<Arc<str>>,
/// Full text of each input file, parallel to `filenames`.
pub file_texts: Vec<Arc<str>>,
// Byte offset in `buf` at which each file's text begins.
file_starts: Vec<usize>,
// Concatenated input bytes: borrowed when lexing a single string,
// owned when built from files on disk.
buf: Cow<'a, [u8]>,
// Current cursor position; its `offset` is global into `buf`.
pos: Pos,
// One-token lookahead, kept filled by `reload`.
lookahead: Option<(Pos, Token)>,
}
/// A source position: file index plus byte offset, line, and column.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Hash, PartialOrd, Ord)]
pub struct Pos {
/// Index into `Lexer::filenames` / `file_texts` of the containing file.
pub file: usize,
/// Byte offset. Note: the lexer's internal cursor keeps a buffer-global
/// offset here, while `Lexer::pos()` returns it relative to the file start.
pub offset: usize,
/// Line number, 1-based (see `Lexer::advance_pos`).
pub line: usize,
/// Column, 0-based, counted in bytes from the start of the line.
pub col: usize,
}
impl Pos {
    /// Render this position as `"<filename> line <N>"` for diagnostics.
    pub fn pretty_print_line(&self, filenames: &[Arc<str>]) -> String {
        let filename = &filenames[self.file];
        format!("{} line {}", filename, self.line)
    }
}
/// Tokens produced by [`Lexer`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
/// `(`
LParen,
/// `)`
RParen,
/// A bare symbol: any run of characters not reserved by the lexer.
Symbol(String),
/// An integer literal: decimal or `0x`/`0X` hex, optional leading `-`,
/// with `_` separators permitted.
Int(i128),
/// `@`
At,
}
impl<'a> Lexer<'a> {
    /// Create a lexer over the single string `s`, labeled `filename` in
    /// diagnostics. The input is borrowed for the lexer's lifetime, and the
    /// first token is eagerly lexed into the lookahead slot.
    pub fn from_str(s: &'a str, filename: &'a str) -> Result<Lexer<'a>> {
        let mut l = Lexer {
            filenames: vec![filename.into()],
            file_texts: vec![s.into()],
            file_starts: vec![0],
            buf: Cow::Borrowed(s.as_bytes()),
            pos: Pos {
                file: 0,
                offset: 0,
                line: 1,
                col: 0,
            },
            lookahead: None,
        };
        l.reload()?;
        Ok(l)
    }

    /// Create a lexer over the concatenated contents of `file_paths`.
    ///
    /// Each file's text is appended to one shared buffer with a `'\n'` after
    /// it, so a token or line comment can never run across a file boundary;
    /// `file_starts[i]` is the buffer offset where file `i` begins.
    ///
    /// # Errors
    /// Returns an error if any file cannot be read.
    ///
    /// # Panics
    /// Panics if `file_paths` yields no paths.
    pub fn from_files<P>(file_paths: impl IntoIterator<Item = P>) -> Result<Lexer<'a>>
    where
        P: AsRef<Path>,
    {
        let mut filenames = Vec::<Arc<str>>::new();
        let mut file_texts = Vec::<Arc<str>>::new();
        for f in file_paths {
            let f = f.as_ref();
            filenames.push(f.display().to_string().into());
            let s = std::fs::read_to_string(f)
                .map_err(|e| Errors::from_io(e, format!("failed to read file: {}", f.display())))?;
            file_texts.push(s.into());
        }
        assert!(!filenames.is_empty());
        let mut file_starts = vec![];
        let mut buf = String::new();
        for text in &file_texts {
            file_starts.push(buf.len());
            buf.push_str(text);
            // Separator newline so files cannot merge into one token.
            buf.push('\n');
        }
        let mut l = Lexer {
            filenames,
            file_texts,
            buf: Cow::Owned(buf.into_bytes()),
            file_starts,
            pos: Pos {
                file: 0,
                offset: 0,
                line: 1,
                col: 0,
            },
            lookahead: None,
        };
        l.reload()?;
        Ok(l)
    }

    /// Current position, with `offset` made relative to the start of the
    /// current file (the internal cursor's `offset` is buffer-global).
    pub fn pos(&self) -> Pos {
        Pos {
            file: self.pos.file,
            offset: self.pos.offset - self.file_starts[self.pos.file],
            line: self.pos.line,
            col: self.pos.col,
        }
    }

    /// Consume one byte: update line/column bookkeeping and, when the global
    /// offset crosses into the next file's region, switch to that file.
    fn advance_pos(&mut self) {
        self.pos.col += 1;
        if self.buf[self.pos.offset] == b'\n' {
            self.pos.line += 1;
            self.pos.col = 0;
        }
        self.pos.offset += 1;
        if self.pos.file + 1 < self.file_starts.len() {
            let next_start = self.file_starts[self.pos.file + 1];
            if self.pos.offset >= next_start {
                // We advance one byte at a time, so we can only land exactly
                // on the next file's start offset, never beyond it.
                assert!(self.pos.offset == next_start);
                self.pos.file += 1;
                self.pos.line = 1;
            }
        }
    }

    /// Build a single-error `Errors` at `pos`, carrying the filename/text
    /// tables so the error can be rendered with source context.
    fn error(&self, pos: Pos, msg: impl Into<String>) -> Errors {
        Errors {
            errors: vec![Error::ParseError {
                msg: msg.into(),
                span: Span::new_single(pos),
            }],
            filenames: self.filenames.clone(),
            file_texts: self.file_texts.clone(),
        }
    }

    /// Lex the next token, or `Ok(None)` at end of input.
    ///
    /// Recognizes `(`, `)`, `@`, `;`-to-end-of-line comments, integer
    /// literals (decimal or `0x` hex, optional `-`, `_` separators), and
    /// symbols made of any other run of non-reserved characters.
    fn next_token(&mut self) -> Result<Option<(Pos, Token)>> {
        // Characters that may START a symbol. Digits and '-' are excluded so
        // that integer literals win; '<' and '>' are reserved at the front
        // ('>' is allowed in later positions).
        fn is_sym_first_char(c: u8) -> bool {
            match c {
                b'-' | b'0'..=b'9' | b'(' | b')' | b';' | b'<' | b'>' => false,
                c if c.is_ascii_whitespace() => false,
                _ => true,
            }
        }
        // Characters that may CONTINUE a symbol.
        fn is_sym_other_char(c: u8) -> bool {
            match c {
                b'(' | b')' | b';' | b'@' | b'<' => false,
                c if c.is_ascii_whitespace() => false,
                _ => true,
            }
        }

        // Skip whitespace and line comments (';' to end of line).
        while self.pos.offset < self.buf.len() {
            if self.buf[self.pos.offset].is_ascii_whitespace() {
                self.advance_pos();
                continue;
            }
            if self.buf[self.pos.offset] == b';' {
                while self.pos.offset < self.buf.len() && self.buf[self.pos.offset] != b'\n' {
                    self.advance_pos();
                }
                continue;
            }
            break;
        }
        if self.pos.offset == self.buf.len() {
            return Ok(None);
        }
        let char_pos = self.pos();
        match self.buf[self.pos.offset] {
            b'(' => {
                self.advance_pos();
                Ok(Some((char_pos, Token::LParen)))
            }
            b')' => {
                self.advance_pos();
                Ok(Some((char_pos, Token::RParen)))
            }
            b'@' => {
                self.advance_pos();
                Ok(Some((char_pos, Token::At)))
            }
            c if is_sym_first_char(c) => {
                let start = self.pos.offset;
                let start_pos = self.pos();
                while self.pos.offset < self.buf.len()
                    && is_sym_other_char(self.buf[self.pos.offset])
                {
                    self.advance_pos();
                }
                let end = self.pos.offset;
                let s = std::str::from_utf8(&self.buf[start..end])
                    .expect("Only ASCII characters, should be UTF-8");
                debug_assert!(!s.is_empty());
                Ok(Some((start_pos, Token::Symbol(s.to_string()))))
            }
            c @ (b'0'..=b'9' | b'-') => {
                let start_pos = self.pos();
                // Optional leading minus sign.
                let neg = if c == b'-' {
                    self.advance_pos();
                    true
                } else {
                    false
                };
                // Optional `0x` / `0X` radix prefix.
                let mut radix = 10;
                if self.buf.get(self.pos.offset).copied() == Some(b'0')
                    && (self.buf.get(self.pos.offset + 1).copied() == Some(b'x')
                        || self.buf.get(self.pos.offset + 1).copied() == Some(b'X'))
                {
                    self.advance_pos();
                    self.advance_pos();
                    radix = 16;
                }
                // Collect digit bytes, dropping `_` separators.
                let mut s = vec![];
                while self.pos.offset < self.buf.len()
                    && ((radix == 10 && self.buf[self.pos.offset].is_ascii_digit())
                        || (radix == 16 && self.buf[self.pos.offset].is_ascii_hexdigit())
                        || self.buf[self.pos.offset] == b'_')
                {
                    if self.buf[self.pos.offset] != b'_' {
                        s.push(self.buf[self.pos.offset]);
                    }
                    self.advance_pos();
                }
                let s_utf8 = std::str::from_utf8(&s[..]).unwrap();
                // Try i128 first; fall back to u128 so magnitudes up to
                // u128::MAX are accepted (reinterpreted into i128 bits).
                let num = i128::from_str_radix(s_utf8, radix)
                    .or_else(|_| u128::from_str_radix(s_utf8, radix).map(|val| val as i128))
                    .map_err(|e| self.error(start_pos, e.to_string()))?;
                let tok = if neg {
                    Token::Int(num.checked_neg().ok_or_else(|| {
                        self.error(start_pos, "integer literal cannot fit in i128")
                    })?)
                } else {
                    Token::Int(num)
                };
                Ok(Some((start_pos, tok)))
            }
            // Fix: cast the byte to `char` so the message shows the character
            // itself rather than its decimal byte value, and use the
            // file-relative `self.pos()` like every other error site.
            c => Err(self.error(
                self.pos(),
                format!("Unexpected character '{}'", c as char),
            )),
        }
    }

    /// Pop the lookahead token and refill it. Returns `Ok(None)` at EOF.
    pub fn next(&mut self) -> Result<Option<(Pos, Token)>> {
        let tok = self.lookahead.take();
        self.reload()?;
        Ok(tok)
    }

    /// Ensure the one-token lookahead is populated while input remains.
    fn reload(&mut self) -> Result<()> {
        if self.lookahead.is_none() && self.pos.offset < self.buf.len() {
            self.lookahead = self.next_token()?;
        }
        Ok(())
    }

    /// Peek at the next token (and its position) without consuming it.
    pub fn peek(&self) -> Option<&(Pos, Token)> {
        self.lookahead.as_ref()
    }

    /// True when all input has been consumed.
    pub fn eof(&self) -> bool {
        self.lookahead.is_none()
    }
}
impl Token {
    /// Whether this token is an integer literal.
    pub fn is_int(&self) -> bool {
        matches!(self, Token::Int(_))
    }

    /// Whether this token is a symbol.
    pub fn is_sym(&self) -> bool {
        matches!(self, Token::Symbol(_))
    }
}
#[cfg(test)]
mod test {
    use super::*;

    /// Lex `s` (attributed to file `file`) to completion and return the
    /// token stream without positions. Panics on any lexer error.
    fn lex(s: &str, file: &str) -> Vec<Token> {
        let mut lexer = Lexer::from_str(s, file).unwrap();
        std::iter::from_fn(|| lexer.next().unwrap().map(|(_, tok)| tok)).collect()
    }

    #[test]
    fn lexer_basic() {
        let expected = vec![
            Token::LParen,
            Token::Symbol("one".into()),
            Token::Symbol("two".into()),
            Token::Symbol("three".into()),
            Token::Int(23),
            Token::Int(-568),
            Token::RParen,
        ];
        assert_eq!(
            lex(
                ";; comment\n; another\r\n \t(one two three 23 -568 )\n",
                "lexer_basic"
            ),
            expected
        );
    }

    #[test]
    fn ends_with_sym() {
        let expected = vec![Token::Symbol("asdf".into())];
        assert_eq!(lex("asdf", "ends_with_sym"), expected);
    }

    #[test]
    fn ends_with_num() {
        assert_eq!(lex("23", "ends_with_num"), vec![Token::Int(23)]);
    }

    #[test]
    fn weird_syms() {
        let mut expected = vec![Token::LParen];
        expected.extend(
            ["+", "[]", "=>", "!!", "_test!"]
                .iter()
                .map(|s| Token::Symbol(s.to_string())),
        );
        expected.push(Token::RParen);
        assert_eq!(lex("(+ [] => !! _test!;comment\n)", "weird_syms"), expected);
    }
}