1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
6use cairo_lang_syntax::node::Token;
7use cairo_lang_syntax::node::ast::{
8 TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
9 TokenWhitespace, TriviumGreen,
10};
11use cairo_lang_syntax::node::db::SyntaxGroup;
12use cairo_lang_syntax::node::kind::SyntaxKind;
13use cairo_lang_utils::require;
14use smol_str::SmolStr;
15
16pub struct Lexer<'a> {
17 db: &'a dyn SyntaxGroup,
18 text: &'a str,
19 previous_position: TextOffset,
20 current_position: TextOffset,
21 done: bool,
22}
23
24impl<'a> Lexer<'a> {
25 pub fn from_text(db: &'a dyn SyntaxGroup, text: &'a str) -> Lexer<'a> {
27 Lexer {
28 db,
29 text,
30 previous_position: TextOffset::START,
31 current_position: TextOffset::START,
32 done: false,
33 }
34 }
35
36 pub fn position(&self) -> TextOffset {
37 self.current_position
38 }
39
40 fn peek(&self) -> Option<char> {
42 self.current_position.take_from(self.text).chars().next()
43 }
44
45 fn peek_nth(&self, n: usize) -> Option<char> {
46 self.current_position.take_from(self.text).chars().nth(n)
47 }
48
49 fn take(&mut self) -> Option<char> {
50 let res = self.peek()?;
51 self.current_position = self.current_position.add_width(TextWidth::from_char(res));
52 Some(res)
53 }
54
55 fn take_while<F>(&mut self, f: F)
57 where
58 F: Fn(char) -> bool,
59 {
60 while self.peek().map(&f).unwrap_or(false) {
61 self.take();
62 }
63 }
64
65 fn peek_span_text(&self) -> &'a str {
66 let span = TextSpan { start: self.previous_position, end: self.current_position };
67 span.take(self.text)
68 }
69
70 fn consume_span(&mut self) -> &str {
71 let val = self.peek_span_text();
72 self.previous_position = self.current_position;
73 val
74 }
75
76 fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
78 let mut res: Vec<TriviumGreen> = Vec::new();
79 while let Some(current) = self.peek() {
80 let trivium = match current {
81 ' ' | '\r' | '\t' => self.match_trivium_whitespace(),
82 '\n' => self.match_trivium_newline(),
83 '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
84 _ => break,
85 };
86 res.push(trivium);
87 if current == '\n' && !leading {
88 break;
89 }
90 }
91 res
92 }
93
94 fn match_trivium_whitespace(&mut self) -> TriviumGreen {
96 self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
97 TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
98 }
99
100 fn match_trivium_newline(&mut self) -> TriviumGreen {
102 self.take();
103 TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
104 }
105
106 fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
108 match self.peek_nth(2) {
109 Some('/') => {
110 self.take_while(|c| c != '\n');
111 TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
112 .into()
113 }
114 Some('!') => {
115 self.take_while(|c| c != '\n');
116 TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
117 .into()
118 }
119 _ => {
120 self.take_while(|c| c != '\n');
121 TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
122 .into()
123 }
124 }
125 }
126
127 fn take_token_literal_number(&mut self) -> TokenKind {
132 let special = if self.peek() == Some('0') {
133 self.take();
134 match self.peek() {
135 Some('x' | 'o' | 'b') => {
136 match self.take() {
137 Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
138 Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
139 Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
140 _ => unreachable!(),
141 }
142 true
143 }
144 _ => false,
145 }
146 } else {
147 false
148 };
149 if !special {
151 self.take_while(|c| c.is_ascii_digit());
152 }
153
154 if self.peek() == Some('_') {
156 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
157 }
158 TokenKind::LiteralNumber
159 }
160
161 fn take_token_short_string(&mut self) -> TokenKind {
163 self.take_token_string_helper('\'');
164
165 if self.peek() == Some('_') {
167 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
168 }
169 TokenKind::ShortString
170 }
171
172 fn take_token_string(&mut self) -> TokenKind {
174 self.take_token_string_helper('"');
175 TokenKind::String
176 }
177
178 fn take_token_string_helper(&mut self, delimiter: char) {
179 self.take();
180 let mut escaped = false;
181 while let Some(token) = self.peek() {
182 self.take();
183 match token {
184 _ if escaped => escaped = false,
185 '\\' => escaped = true,
186 _ if token == delimiter => {
187 break;
188 }
189 _ => {}
190 };
191 }
192 }
193
194 fn take_token_identifier(&mut self) -> TokenKind {
196 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
198
199 match self.peek_span_text() {
200 "as" => TokenKind::As,
201 "const" => TokenKind::Const,
202 "false" => TokenKind::False,
203 "true" => TokenKind::True,
204 "extern" => TokenKind::Extern,
205 "type" => TokenKind::Type,
206 "fn" => TokenKind::Function,
207 "trait" => TokenKind::Trait,
208 "impl" => TokenKind::Impl,
209 "of" => TokenKind::Of,
210 "mod" => TokenKind::Module,
211 "struct" => TokenKind::Struct,
212 "enum" => TokenKind::Enum,
213 "let" => TokenKind::Let,
214 "return" => TokenKind::Return,
215 "match" => TokenKind::Match,
216 "if" => TokenKind::If,
217 "loop" => TokenKind::Loop,
218 "continue" => TokenKind::Continue,
219 "break" => TokenKind::Break,
220 "else" => TokenKind::Else,
221 "while" => TokenKind::While,
222 "use" => TokenKind::Use,
223 "implicits" => TokenKind::Implicits,
224 "ref" => TokenKind::Ref,
225 "mut" => TokenKind::Mut,
226 "for" => TokenKind::For,
227 "nopanic" => TokenKind::NoPanic,
228 "pub" => TokenKind::Pub,
229 "_" => TokenKind::Underscore,
230 _ => TokenKind::Identifier,
231 }
232 }
233
234 fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
236 self.take();
237 kind
238 }
239
240 fn pick_kind(
242 &mut self,
243 second_char: char,
244 long_kind: TokenKind,
245 short_kind: TokenKind,
246 ) -> TokenKind {
247 self.take();
248 if self.peek() == Some(second_char) {
249 self.take();
250 long_kind
251 } else {
252 short_kind
253 }
254 }
255
256 fn match_terminal(&mut self) -> LexerTerminal {
257 let leading_trivia = self.match_trivia(true);
258
259 let kind = if let Some(current) = self.peek() {
260 match current {
261 '0'..='9' => self.take_token_literal_number(),
262 '\'' => self.take_token_short_string(),
263 '"' => self.take_token_string(),
264 ',' => self.take_token_of_kind(TokenKind::Comma),
265 ';' => self.take_token_of_kind(TokenKind::Semicolon),
266 '?' => self.take_token_of_kind(TokenKind::QuestionMark),
267 '{' => self.take_token_of_kind(TokenKind::LBrace),
268 '}' => self.take_token_of_kind(TokenKind::RBrace),
269 '[' => self.take_token_of_kind(TokenKind::LBrack),
270 ']' => self.take_token_of_kind(TokenKind::RBrack),
271 '(' => self.take_token_of_kind(TokenKind::LParen),
272 ')' => self.take_token_of_kind(TokenKind::RParen),
273 '.' => self.pick_kind('.', TokenKind::DotDot, TokenKind::Dot),
274 '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
275 '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
276 '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
277 '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
278 '#' => self.take_token_of_kind(TokenKind::Hash),
279 '-' => {
280 self.take();
281 match self.peek() {
282 Some('>') => self.take_token_of_kind(TokenKind::Arrow),
283 Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
284 _ => TokenKind::Minus,
285 }
286 }
287 '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
288 '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
289 'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
290 ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
291 '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
292 '~' => self.take_token_of_kind(TokenKind::BitNot),
293 '=' => {
294 self.take();
295 match self.peek() {
296 Some('=') => self.take_token_of_kind(TokenKind::EqEq),
297 Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
298 _ => TokenKind::Eq,
299 }
300 }
301 '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
302 '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
303 '^' => self.take_token_of_kind(TokenKind::Xor),
304 '@' => self.take_token_of_kind(TokenKind::At),
305 _ => self.take_token_of_kind(TokenKind::BadCharacters),
306 }
307 } else {
308 TokenKind::EndOfFile
309 };
310
311 let text = SmolStr::from(self.consume_span());
312 let trailing_trivia = self.match_trivia(false);
313 let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
314
315 LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
317 }
318}
319
320#[derive(Clone, PartialEq, Eq, Debug)]
322pub struct LexerTerminal {
323 pub text: SmolStr,
324 pub kind: SyntaxKind,
326 pub leading_trivia: Vec<TriviumGreen>,
327 pub trailing_trivia: Vec<TriviumGreen>,
328}
329impl LexerTerminal {
330 pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
331 self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
332 + TextWidth::from_str(&self.text)
333 + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
334 }
335}
336
337impl Iterator for Lexer<'_> {
338 type Item = LexerTerminal;
339
340 fn next(&mut self) -> Option<Self::Item> {
343 require(!self.done)?;
344 let lexer_terminal = self.match_terminal();
345 if lexer_terminal.kind == SyntaxKind::TerminalEndOfFile {
346 self.done = true;
347 };
348 Some(lexer_terminal)
349 }
350}
351
/// Lexer-internal token classification, converted to a terminal `SyntaxKind` by
/// `token_kind_to_terminal_syntax_kind`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
enum TokenKind {
    Identifier,

    // Literals.
    LiteralNumber,
    ShortString,
    String,

    // Keywords.
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    Function,
    Trait,
    Impl,
    Of,
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    NoPanic,
    Pub,

    // Modifiers.
    Ref,
    Mut,

    // Operators.
    And,
    AndAnd,
    At,
    Or,
    OrOr,
    Xor,
    EqEq,
    Neq,
    GE,
    GT,
    LE,
    LT,
    Not,
    BitNot,
    Plus,
    PlusEq,
    Minus,
    MinusEq,
    Mul,
    MulEq,
    Div,
    DivEq,
    Mod,
    ModEq,

    // Punctuation.
    Colon,
    ColonColon,
    Comma,
    Dot,
    DotDot,
    Eq,
    Hash,
    Semicolon,
    QuestionMark,
    Underscore,
    LBrace,
    RBrace,
    LBrack,
    RBrack,
    LParen,
    RParen,
    Arrow,
    MatchArrow,

    // Meta.
    EndOfFile,
    BadCharacters,
}
443
444fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
445 match kind {
446 TokenKind::As => SyntaxKind::TerminalAs,
447 TokenKind::Const => SyntaxKind::TerminalConst,
448 TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
449 TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
450 TokenKind::ShortString => SyntaxKind::TerminalShortString,
451 TokenKind::String => SyntaxKind::TerminalString,
452 TokenKind::False => SyntaxKind::TerminalFalse,
453 TokenKind::True => SyntaxKind::TerminalTrue,
454 TokenKind::Extern => SyntaxKind::TerminalExtern,
455 TokenKind::Type => SyntaxKind::TerminalType,
456 TokenKind::Function => SyntaxKind::TerminalFunction,
457 TokenKind::Trait => SyntaxKind::TerminalTrait,
458 TokenKind::Impl => SyntaxKind::TerminalImpl,
459 TokenKind::Of => SyntaxKind::TerminalOf,
460 TokenKind::Module => SyntaxKind::TerminalModule,
461 TokenKind::Struct => SyntaxKind::TerminalStruct,
462 TokenKind::Enum => SyntaxKind::TerminalEnum,
463 TokenKind::Let => SyntaxKind::TerminalLet,
464 TokenKind::Return => SyntaxKind::TerminalReturn,
465 TokenKind::Match => SyntaxKind::TerminalMatch,
466 TokenKind::If => SyntaxKind::TerminalIf,
467 TokenKind::While => SyntaxKind::TerminalWhile,
468 TokenKind::For => SyntaxKind::TerminalFor,
469 TokenKind::Loop => SyntaxKind::TerminalLoop,
470 TokenKind::Continue => SyntaxKind::TerminalContinue,
471 TokenKind::Break => SyntaxKind::TerminalBreak,
472 TokenKind::Else => SyntaxKind::TerminalElse,
473 TokenKind::Use => SyntaxKind::TerminalUse,
474 TokenKind::Implicits => SyntaxKind::TerminalImplicits,
475 TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
476 TokenKind::Pub => SyntaxKind::TerminalPub,
477 TokenKind::And => SyntaxKind::TerminalAnd,
478 TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
479 TokenKind::At => SyntaxKind::TerminalAt,
480 TokenKind::Or => SyntaxKind::TerminalOr,
481 TokenKind::OrOr => SyntaxKind::TerminalOrOr,
482 TokenKind::Xor => SyntaxKind::TerminalXor,
483 TokenKind::EqEq => SyntaxKind::TerminalEqEq,
484 TokenKind::Neq => SyntaxKind::TerminalNeq,
485 TokenKind::GE => SyntaxKind::TerminalGE,
486 TokenKind::GT => SyntaxKind::TerminalGT,
487 TokenKind::LE => SyntaxKind::TerminalLE,
488 TokenKind::LT => SyntaxKind::TerminalLT,
489 TokenKind::Not => SyntaxKind::TerminalNot,
490 TokenKind::BitNot => SyntaxKind::TerminalBitNot,
491 TokenKind::Plus => SyntaxKind::TerminalPlus,
492 TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
493 TokenKind::Minus => SyntaxKind::TerminalMinus,
494 TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
495 TokenKind::Mul => SyntaxKind::TerminalMul,
496 TokenKind::MulEq => SyntaxKind::TerminalMulEq,
497 TokenKind::Div => SyntaxKind::TerminalDiv,
498 TokenKind::DivEq => SyntaxKind::TerminalDivEq,
499 TokenKind::Mod => SyntaxKind::TerminalMod,
500 TokenKind::ModEq => SyntaxKind::TerminalModEq,
501 TokenKind::Colon => SyntaxKind::TerminalColon,
502 TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
503 TokenKind::Comma => SyntaxKind::TerminalComma,
504 TokenKind::Dot => SyntaxKind::TerminalDot,
505 TokenKind::DotDot => SyntaxKind::TerminalDotDot,
506 TokenKind::Eq => SyntaxKind::TerminalEq,
507 TokenKind::Hash => SyntaxKind::TerminalHash,
508 TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
509 TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
510 TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
511 TokenKind::LBrace => SyntaxKind::TerminalLBrace,
512 TokenKind::RBrace => SyntaxKind::TerminalRBrace,
513 TokenKind::LBrack => SyntaxKind::TerminalLBrack,
514 TokenKind::RBrack => SyntaxKind::TerminalRBrack,
515 TokenKind::LParen => SyntaxKind::TerminalLParen,
516 TokenKind::RParen => SyntaxKind::TerminalRParen,
517 TokenKind::Ref => SyntaxKind::TerminalRef,
518 TokenKind::Mut => SyntaxKind::TerminalMut,
519 TokenKind::Arrow => SyntaxKind::TerminalArrow,
520 TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
521 TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
522 TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
523 }
524}