#[cfg(test)]
#[path = "lexer_test.rs"]
mod test;
use cairo_lang_filesystem::ids::FileId;
use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
use cairo_lang_syntax::node::ast::{
TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
TokenWhitespace, TriviumGreen,
};
use cairo_lang_syntax::node::db::SyntaxGroup;
use cairo_lang_syntax::node::kind::SyntaxKind;
use cairo_lang_syntax::node::Token;
use cairo_lang_utils::require;
use smol_str::SmolStr;
/// Hand-written lexer over a borrowed source string.
///
/// Used as an iterator: each call to `next` produces one `LexerTerminal` until the
/// end-of-file terminal has been returned.
pub struct Lexer<'a> {
/// Database handed to the `new_green` constructors that build trivia nodes.
db: &'a dyn SyntaxGroup,
/// The complete text being lexed.
text: &'a str,
/// Start of the span that has been scanned but not yet consumed by `consume_span`.
previous_position: TextOffset,
/// Offset of the next character that `peek` / `take` will look at.
current_position: TextOffset,
/// Set once the end-of-file terminal has been produced; checked at the top of `next`.
done: bool,
}
impl<'a> Lexer<'a> {
pub fn from_text(db: &'a dyn SyntaxGroup, _source: FileId, text: &'a str) -> Lexer<'a> {
Lexer {
db,
text,
previous_position: TextOffset::default(),
current_position: TextOffset::default(),
done: false,
}
}
pub fn position(&self) -> TextOffset {
self.current_position
}
fn peek(&self) -> Option<char> {
self.current_position.take_from(self.text).chars().next()
}
fn peek_nth(&self, n: usize) -> Option<char> {
self.current_position.take_from(self.text).chars().nth(n)
}
fn take(&mut self) -> Option<char> {
let res = self.peek()?;
self.current_position = self.current_position.add_width(TextWidth::from_char(res));
Some(res)
}
fn take_while<F>(&mut self, f: F)
where
F: Fn(char) -> bool,
{
while self.peek().map(&f).unwrap_or(false) {
self.take();
}
}
fn peek_span_text(&self) -> &'a str {
let span = TextSpan { start: self.previous_position, end: self.current_position };
span.take(self.text)
}
fn consume_span(&mut self) -> &str {
let val = self.peek_span_text();
self.previous_position = self.current_position;
val
}
fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
let mut res: Vec<TriviumGreen> = Vec::new();
while let Some(current) = self.peek() {
let trivium = match current {
' ' | '\r' | '\t' => self.match_trivium_whitespace(),
'\n' => self.match_trivium_newline(),
'/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
_ => break,
};
res.push(trivium);
if current == '\n' && !leading {
break;
}
}
res
}
fn match_trivium_whitespace(&mut self) -> TriviumGreen {
self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
}
fn match_trivium_newline(&mut self) -> TriviumGreen {
self.take();
TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
}
fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
match self.peek_nth(2) {
Some('/') => {
self.take_while(|c| c != '\n');
TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
.into()
}
Some('!') => {
self.take_while(|c| c != '\n');
TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
.into()
}
_ => {
self.take_while(|c| c != '\n');
TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
.into()
}
}
}
fn take_token_literal_number(&mut self) -> TokenKind {
let special = if self.peek() == Some('0') {
self.take();
match self.peek() {
Some('x' | 'o' | 'b') => {
match self.take() {
Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
_ => unreachable!(),
}
true
}
_ => false,
}
} else {
false
};
if !special {
self.take_while(|c| c.is_ascii_digit());
}
if self.peek() == Some('_') {
self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
}
TokenKind::LiteralNumber
}
fn take_token_short_string(&mut self) -> TokenKind {
self.take_token_string_helper('\'');
if self.peek() == Some('_') {
self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
}
TokenKind::ShortString
}
fn take_token_string(&mut self) -> TokenKind {
self.take_token_string_helper('"');
TokenKind::String
}
fn take_token_string_helper(&mut self, delimiter: char) {
self.take();
let mut escaped = false;
while let Some(token) = self.peek() {
self.take();
match token {
_ if escaped => escaped = false,
'\\' => escaped = true,
_ if token == delimiter => {
break;
}
_ => {}
};
}
}
fn take_token_identifier(&mut self) -> TokenKind {
self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
match self.peek_span_text() {
"as" => TokenKind::As,
"const" => TokenKind::Const,
"false" => TokenKind::False,
"true" => TokenKind::True,
"extern" => TokenKind::Extern,
"type" => TokenKind::Type,
"fn" => TokenKind::Function,
"trait" => TokenKind::Trait,
"impl" => TokenKind::Impl,
"of" => TokenKind::Of,
"mod" => TokenKind::Module,
"struct" => TokenKind::Struct,
"enum" => TokenKind::Enum,
"let" => TokenKind::Let,
"return" => TokenKind::Return,
"match" => TokenKind::Match,
"if" => TokenKind::If,
"loop" => TokenKind::Loop,
"continue" => TokenKind::Continue,
"break" => TokenKind::Break,
"else" => TokenKind::Else,
"while" => TokenKind::While,
"use" => TokenKind::Use,
"implicits" => TokenKind::Implicits,
"ref" => TokenKind::Ref,
"mut" => TokenKind::Mut,
"for" => TokenKind::For,
"nopanic" => TokenKind::NoPanic,
"pub" => TokenKind::Pub,
"_" => TokenKind::Underscore,
_ => TokenKind::Identifier,
}
}
fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
self.take();
kind
}
fn pick_kind(
&mut self,
second_char: char,
long_kind: TokenKind,
short_kind: TokenKind,
) -> TokenKind {
self.take();
if self.peek() == Some(second_char) {
self.take();
long_kind
} else {
short_kind
}
}
fn match_terminal(&mut self) -> LexerTerminal {
let leading_trivia = self.match_trivia(true);
let kind = if let Some(current) = self.peek() {
match current {
'0'..='9' => self.take_token_literal_number(),
'\'' => self.take_token_short_string(),
'"' => self.take_token_string(),
',' => self.take_token_of_kind(TokenKind::Comma),
';' => self.take_token_of_kind(TokenKind::Semicolon),
'?' => self.take_token_of_kind(TokenKind::QuestionMark),
'{' => self.take_token_of_kind(TokenKind::LBrace),
'}' => self.take_token_of_kind(TokenKind::RBrace),
'[' => self.take_token_of_kind(TokenKind::LBrack),
']' => self.take_token_of_kind(TokenKind::RBrack),
'(' => self.take_token_of_kind(TokenKind::LParen),
')' => self.take_token_of_kind(TokenKind::RParen),
'.' => self.pick_kind('.', TokenKind::DotDot, TokenKind::Dot),
'*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
'/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
'%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
'+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
'#' => self.take_token_of_kind(TokenKind::Hash),
'-' => {
self.take();
match self.peek() {
Some('>') => self.take_token_of_kind(TokenKind::Arrow),
Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
_ => TokenKind::Minus,
}
}
'<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
'>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
'!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
'~' => self.take_token_of_kind(TokenKind::BitNot),
'=' => {
self.take();
match self.peek() {
Some('=') => self.take_token_of_kind(TokenKind::EqEq),
Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
_ => TokenKind::Eq,
}
}
'&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
'|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
'^' => self.take_token_of_kind(TokenKind::Xor),
'@' => self.take_token_of_kind(TokenKind::At),
_ => self.take_token_of_kind(TokenKind::BadCharacters),
}
} else {
TokenKind::EndOfFile
};
let text = SmolStr::from(self.consume_span());
let trailing_trivia = self.match_trivia(false);
let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
}
}
/// One terminal produced by the lexer: a token together with the trivia around it.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct LexerTerminal {
/// The token's own text, excluding leading and trailing trivia.
pub text: SmolStr,
/// The terminal syntax kind that the lexed token maps to.
pub kind: SyntaxKind,
/// Trivia collected before the token.
pub leading_trivia: Vec<TriviumGreen>,
/// Trivia collected after the token.
pub trailing_trivia: Vec<TriviumGreen>,
}
impl LexerTerminal {
pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
+ TextWidth::from_str(&self.text)
+ self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
}
}
impl Iterator for Lexer<'_> {
    type Item = LexerTerminal;

    /// Produces the next terminal. The end-of-file terminal is returned once; after that the
    /// `done` flag is set and the guard at the top of this method applies.
    fn next(&mut self) -> Option<Self::Item> {
        require(!self.done)?;
        let terminal = self.match_terminal();
        // `done` is known to be false here, so a plain assignment is enough.
        self.done = terminal.kind == SyntaxKind::TerminalEndOfFile;
        Some(terminal)
    }
}
/// Lexer-internal token classification, converted to a `SyntaxKind` by
/// `token_kind_to_terminal_syntax_kind`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum TokenKind {
    // Words and literals.
    Identifier,
    LiteralNumber,
    ShortString,
    String,

    // Keywords.
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    Function,
    Trait,
    Impl,
    Of,
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    NoPanic,
    Pub,
    Ref,
    Mut,

    // Logical, bitwise and comparison operators.
    And,
    AndAnd,
    At,
    Or,
    OrOr,
    Xor,
    EqEq,
    Neq,
    GE,
    GT,
    LE,
    LT,
    Not,
    BitNot,

    // Arithmetic operators and their compound-assignment forms.
    Plus,
    PlusEq,
    Minus,
    MinusEq,
    Mul,
    MulEq,
    Div,
    DivEq,
    Mod,
    ModEq,

    // Punctuation.
    Colon,
    ColonColon,
    Comma,
    Dot,
    DotDot,
    Eq,
    Hash,
    Semicolon,
    QuestionMark,
    Underscore,

    // Brackets.
    LBrace,
    RBrace,
    LBrack,
    RBrack,
    LParen,
    RParen,

    // Arrows.
    Arrow,
    MatchArrow,

    // Special.
    EndOfFile,
    BadCharacters,
}
/// Maps each lexer `TokenKind` to the `SyntaxKind::Terminal*` variant of the same name.
///
/// The match is exhaustive with no wildcard arm, so adding a `TokenKind` variant without a
/// mapping is a compile error. Arms follow the declaration order of `TokenKind`.
fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
    match kind {
        // Words and literals.
        TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
        TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
        TokenKind::ShortString => SyntaxKind::TerminalShortString,
        TokenKind::String => SyntaxKind::TerminalString,

        // Keywords.
        TokenKind::As => SyntaxKind::TerminalAs,
        TokenKind::Const => SyntaxKind::TerminalConst,
        TokenKind::False => SyntaxKind::TerminalFalse,
        TokenKind::True => SyntaxKind::TerminalTrue,
        TokenKind::Extern => SyntaxKind::TerminalExtern,
        TokenKind::Type => SyntaxKind::TerminalType,
        TokenKind::Function => SyntaxKind::TerminalFunction,
        TokenKind::Trait => SyntaxKind::TerminalTrait,
        TokenKind::Impl => SyntaxKind::TerminalImpl,
        TokenKind::Of => SyntaxKind::TerminalOf,
        TokenKind::Module => SyntaxKind::TerminalModule,
        TokenKind::Struct => SyntaxKind::TerminalStruct,
        TokenKind::Enum => SyntaxKind::TerminalEnum,
        TokenKind::Let => SyntaxKind::TerminalLet,
        TokenKind::Return => SyntaxKind::TerminalReturn,
        TokenKind::Match => SyntaxKind::TerminalMatch,
        TokenKind::If => SyntaxKind::TerminalIf,
        TokenKind::While => SyntaxKind::TerminalWhile,
        TokenKind::For => SyntaxKind::TerminalFor,
        TokenKind::Loop => SyntaxKind::TerminalLoop,
        TokenKind::Continue => SyntaxKind::TerminalContinue,
        TokenKind::Break => SyntaxKind::TerminalBreak,
        TokenKind::Else => SyntaxKind::TerminalElse,
        TokenKind::Use => SyntaxKind::TerminalUse,
        TokenKind::Implicits => SyntaxKind::TerminalImplicits,
        TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
        TokenKind::Pub => SyntaxKind::TerminalPub,
        TokenKind::Ref => SyntaxKind::TerminalRef,
        TokenKind::Mut => SyntaxKind::TerminalMut,

        // Logical, bitwise and comparison operators.
        TokenKind::And => SyntaxKind::TerminalAnd,
        TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
        TokenKind::At => SyntaxKind::TerminalAt,
        TokenKind::Or => SyntaxKind::TerminalOr,
        TokenKind::OrOr => SyntaxKind::TerminalOrOr,
        TokenKind::Xor => SyntaxKind::TerminalXor,
        TokenKind::EqEq => SyntaxKind::TerminalEqEq,
        TokenKind::Neq => SyntaxKind::TerminalNeq,
        TokenKind::GE => SyntaxKind::TerminalGE,
        TokenKind::GT => SyntaxKind::TerminalGT,
        TokenKind::LE => SyntaxKind::TerminalLE,
        TokenKind::LT => SyntaxKind::TerminalLT,
        TokenKind::Not => SyntaxKind::TerminalNot,
        TokenKind::BitNot => SyntaxKind::TerminalBitNot,

        // Arithmetic operators and their compound-assignment forms.
        TokenKind::Plus => SyntaxKind::TerminalPlus,
        TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
        TokenKind::Minus => SyntaxKind::TerminalMinus,
        TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
        TokenKind::Mul => SyntaxKind::TerminalMul,
        TokenKind::MulEq => SyntaxKind::TerminalMulEq,
        TokenKind::Div => SyntaxKind::TerminalDiv,
        TokenKind::DivEq => SyntaxKind::TerminalDivEq,
        TokenKind::Mod => SyntaxKind::TerminalMod,
        TokenKind::ModEq => SyntaxKind::TerminalModEq,

        // Punctuation.
        TokenKind::Colon => SyntaxKind::TerminalColon,
        TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
        TokenKind::Comma => SyntaxKind::TerminalComma,
        TokenKind::Dot => SyntaxKind::TerminalDot,
        TokenKind::DotDot => SyntaxKind::TerminalDotDot,
        TokenKind::Eq => SyntaxKind::TerminalEq,
        TokenKind::Hash => SyntaxKind::TerminalHash,
        TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
        TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
        TokenKind::Underscore => SyntaxKind::TerminalUnderscore,

        // Brackets.
        TokenKind::LBrace => SyntaxKind::TerminalLBrace,
        TokenKind::RBrace => SyntaxKind::TerminalRBrace,
        TokenKind::LBrack => SyntaxKind::TerminalLBrack,
        TokenKind::RBrack => SyntaxKind::TerminalRBrack,
        TokenKind::LParen => SyntaxKind::TerminalLParen,
        TokenKind::RParen => SyntaxKind::TerminalRParen,

        // Arrows.
        TokenKind::Arrow => SyntaxKind::TerminalArrow,
        TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,

        // Special.
        TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
        TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
    }
}