1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
6use cairo_lang_syntax::node::Token;
7use cairo_lang_syntax::node::ast::{
8 TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
9 TokenWhitespace, TriviumGreen,
10};
11use cairo_lang_syntax::node::db::SyntaxGroup;
12use cairo_lang_syntax::node::kind::SyntaxKind;
13use cairo_lang_utils::require;
14use smol_str::SmolStr;
15
16pub struct Lexer<'a> {
17 db: &'a dyn SyntaxGroup,
18 text: &'a str,
19 previous_position: TextOffset,
20 current_position: TextOffset,
21 done: bool,
22}
23
24impl<'a> Lexer<'a> {
25 pub fn from_text(db: &'a dyn SyntaxGroup, text: &'a str) -> Lexer<'a> {
27 Lexer {
28 db,
29 text,
30 previous_position: TextOffset::START,
31 current_position: TextOffset::START,
32 done: false,
33 }
34 }
35
36 pub fn position(&self) -> TextOffset {
37 self.current_position
38 }
39
40 fn peek(&self) -> Option<char> {
42 self.current_position.take_from(self.text).chars().next()
43 }
44
45 fn peek_nth(&self, n: usize) -> Option<char> {
46 self.current_position.take_from(self.text).chars().nth(n)
47 }
48
49 fn take(&mut self) -> Option<char> {
50 let res = self.peek()?;
51 self.current_position = self.current_position.add_width(TextWidth::from_char(res));
52 Some(res)
53 }
54
55 fn take_while<F>(&mut self, f: F)
57 where
58 F: Fn(char) -> bool,
59 {
60 while self.peek().map(&f).unwrap_or(false) {
61 self.take();
62 }
63 }
64
65 fn peek_span_text(&self) -> &'a str {
66 let span = TextSpan { start: self.previous_position, end: self.current_position };
67 span.take(self.text)
68 }
69
70 fn consume_span(&mut self) -> &str {
71 let val = self.peek_span_text();
72 self.previous_position = self.current_position;
73 val
74 }
75
76 fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
78 let mut res: Vec<TriviumGreen> = Vec::new();
79 while let Some(current) = self.peek() {
80 let trivium = match current {
81 ' ' | '\r' | '\t' => self.match_trivium_whitespace(),
82 '\n' => self.match_trivium_newline(),
83 '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
84 _ => break,
85 };
86 res.push(trivium);
87 if current == '\n' && !leading {
88 break;
89 }
90 }
91 res
92 }
93
94 fn match_trivium_whitespace(&mut self) -> TriviumGreen {
96 self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
97 TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
98 }
99
100 fn match_trivium_newline(&mut self) -> TriviumGreen {
102 self.take();
103 TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
104 }
105
106 fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
108 match self.peek_nth(2) {
109 Some('/') => {
110 self.take_while(|c| c != '\n');
111 TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
112 .into()
113 }
114 Some('!') => {
115 self.take_while(|c| c != '\n');
116 TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
117 .into()
118 }
119 _ => {
120 self.take_while(|c| c != '\n');
121 TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
122 .into()
123 }
124 }
125 }
126
127 fn take_token_literal_number(&mut self) -> TokenKind {
132 let special = if self.peek() == Some('0') {
133 self.take();
134 match self.peek() {
135 Some('x' | 'o' | 'b') => {
136 match self.take() {
137 Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
138 Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
139 Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
140 _ => unreachable!(),
141 }
142 true
143 }
144 _ => false,
145 }
146 } else {
147 false
148 };
149 if !special {
151 self.take_while(|c| c.is_ascii_digit());
152 }
153
154 if self.peek() == Some('_') {
156 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
157 }
158 TokenKind::LiteralNumber
159 }
160
161 fn take_token_short_string(&mut self) -> TokenKind {
163 self.take_token_string_helper('\'');
164
165 if self.peek() == Some('_') {
167 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
168 }
169 TokenKind::ShortString
170 }
171
172 fn take_token_string(&mut self) -> TokenKind {
174 self.take_token_string_helper('"');
175 TokenKind::String
176 }
177
178 fn take_token_string_helper(&mut self, delimiter: char) {
179 self.take();
180 let mut escaped = false;
181 while let Some(token) = self.peek() {
182 self.take();
183 match token {
184 _ if escaped => escaped = false,
185 '\\' => escaped = true,
186 _ if token == delimiter => {
187 break;
188 }
189 _ => {}
190 };
191 }
192 }
193
194 fn take_token_identifier(&mut self) -> TokenKind {
196 self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
198
199 match self.peek_span_text() {
200 "as" => TokenKind::As,
201 "const" => TokenKind::Const,
202 "false" => TokenKind::False,
203 "true" => TokenKind::True,
204 "extern" => TokenKind::Extern,
205 "type" => TokenKind::Type,
206 "fn" => TokenKind::Function,
207 "trait" => TokenKind::Trait,
208 "impl" => TokenKind::Impl,
209 "of" => TokenKind::Of,
210 "mod" => TokenKind::Module,
211 "struct" => TokenKind::Struct,
212 "enum" => TokenKind::Enum,
213 "let" => TokenKind::Let,
214 "return" => TokenKind::Return,
215 "match" => TokenKind::Match,
216 "if" => TokenKind::If,
217 "loop" => TokenKind::Loop,
218 "continue" => TokenKind::Continue,
219 "break" => TokenKind::Break,
220 "else" => TokenKind::Else,
221 "while" => TokenKind::While,
222 "use" => TokenKind::Use,
223 "implicits" => TokenKind::Implicits,
224 "ref" => TokenKind::Ref,
225 "mut" => TokenKind::Mut,
226 "for" => TokenKind::For,
227 "nopanic" => TokenKind::NoPanic,
228 "pub" => TokenKind::Pub,
229 "_" => TokenKind::Underscore,
230 _ => TokenKind::Identifier,
231 }
232 }
233
234 fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
236 self.take();
237 kind
238 }
239
240 fn pick_kind(
242 &mut self,
243 second_char: char,
244 long_kind: TokenKind,
245 short_kind: TokenKind,
246 ) -> TokenKind {
247 self.take();
248 if self.peek() == Some(second_char) {
249 self.take();
250 long_kind
251 } else {
252 short_kind
253 }
254 }
255
256 fn match_terminal(&mut self) -> LexerTerminal {
257 let leading_trivia = self.match_trivia(true);
258
259 let kind = if let Some(current) = self.peek() {
260 match current {
261 '0'..='9' => self.take_token_literal_number(),
262 '\'' => self.take_token_short_string(),
263 '"' => self.take_token_string(),
264 ',' => self.take_token_of_kind(TokenKind::Comma),
265 ';' => self.take_token_of_kind(TokenKind::Semicolon),
266 '?' => self.take_token_of_kind(TokenKind::QuestionMark),
267 '{' => self.take_token_of_kind(TokenKind::LBrace),
268 '}' => self.take_token_of_kind(TokenKind::RBrace),
269 '[' => self.take_token_of_kind(TokenKind::LBrack),
270 ']' => self.take_token_of_kind(TokenKind::RBrack),
271 '(' => self.take_token_of_kind(TokenKind::LParen),
272 ')' => self.take_token_of_kind(TokenKind::RParen),
273 '.' => {
274 self.take();
275 match self.peek() {
276 Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
277 _ => TokenKind::Dot,
278 }
279 }
280 '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
281 '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
282 '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
283 '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
284 '#' => self.take_token_of_kind(TokenKind::Hash),
285 '-' => {
286 self.take();
287 match self.peek() {
288 Some('>') => self.take_token_of_kind(TokenKind::Arrow),
289 Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
290 _ => TokenKind::Minus,
291 }
292 }
293 '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
294 '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
295 'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
296 ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
297 '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
298 '~' => self.take_token_of_kind(TokenKind::BitNot),
299 '=' => {
300 self.take();
301 match self.peek() {
302 Some('=') => self.take_token_of_kind(TokenKind::EqEq),
303 Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
304 _ => TokenKind::Eq,
305 }
306 }
307 '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
308 '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
309 '^' => self.take_token_of_kind(TokenKind::Xor),
310 '@' => self.take_token_of_kind(TokenKind::At),
311 _ => self.take_token_of_kind(TokenKind::BadCharacters),
312 }
313 } else {
314 TokenKind::EndOfFile
315 };
316
317 let text = SmolStr::from(self.consume_span());
318 let trailing_trivia = self.match_trivia(false);
319 let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
320
321 LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
323 }
324}
325
326#[derive(Clone, PartialEq, Eq, Debug)]
328pub struct LexerTerminal {
329 pub text: SmolStr,
330 pub kind: SyntaxKind,
332 pub leading_trivia: Vec<TriviumGreen>,
333 pub trailing_trivia: Vec<TriviumGreen>,
334}
335impl LexerTerminal {
336 pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
337 self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
338 + TextWidth::from_str(&self.text)
339 + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
340 }
341}
342
343impl Iterator for Lexer<'_> {
344 type Item = LexerTerminal;
345
346 fn next(&mut self) -> Option<Self::Item> {
349 require(!self.done)?;
350 let lexer_terminal = self.match_terminal();
351 if lexer_terminal.kind == SyntaxKind::TerminalEndOfFile {
352 self.done = true;
353 };
354 Some(lexer_terminal)
355 }
356}
357
/// Lexer-internal token classification. Each variant is translated to a terminal
/// [`SyntaxKind`] by `token_kind_to_terminal_syntax_kind` before leaving the lexer.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
enum TokenKind {
    Identifier,

    // Literals.
    LiteralNumber,
    ShortString,
    String,

    // Keywords.
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    Function,
    Trait,
    Impl,
    Of,
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    NoPanic,
    Pub,

    // Modifiers.
    Ref,
    Mut,

    // Operators.
    And,
    AndAnd,
    At,
    Or,
    OrOr,
    Xor,
    EqEq,
    Neq,
    GE,
    GT,
    LE,
    LT,
    Not,
    BitNot,
    Plus,
    PlusEq,
    Minus,
    MinusEq,
    Mul,
    MulEq,
    Div,
    DivEq,
    Mod,
    ModEq,

    // Punctuation and delimiters.
    Colon,
    ColonColon,
    Comma,
    Dot,
    DotDot,
    DotDotEq,
    Eq,
    Hash,
    Semicolon,
    QuestionMark,
    Underscore,
    LBrace,
    RBrace,
    LBrack,
    RBrack,
    LParen,
    RParen,
    Arrow,
    MatchArrow,

    // Meta: end of input, and characters that start no known token.
    EndOfFile,
    BadCharacters,
}
450
451fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
452 match kind {
453 TokenKind::As => SyntaxKind::TerminalAs,
454 TokenKind::Const => SyntaxKind::TerminalConst,
455 TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
456 TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
457 TokenKind::ShortString => SyntaxKind::TerminalShortString,
458 TokenKind::String => SyntaxKind::TerminalString,
459 TokenKind::False => SyntaxKind::TerminalFalse,
460 TokenKind::True => SyntaxKind::TerminalTrue,
461 TokenKind::Extern => SyntaxKind::TerminalExtern,
462 TokenKind::Type => SyntaxKind::TerminalType,
463 TokenKind::Function => SyntaxKind::TerminalFunction,
464 TokenKind::Trait => SyntaxKind::TerminalTrait,
465 TokenKind::Impl => SyntaxKind::TerminalImpl,
466 TokenKind::Of => SyntaxKind::TerminalOf,
467 TokenKind::Module => SyntaxKind::TerminalModule,
468 TokenKind::Struct => SyntaxKind::TerminalStruct,
469 TokenKind::Enum => SyntaxKind::TerminalEnum,
470 TokenKind::Let => SyntaxKind::TerminalLet,
471 TokenKind::Return => SyntaxKind::TerminalReturn,
472 TokenKind::Match => SyntaxKind::TerminalMatch,
473 TokenKind::If => SyntaxKind::TerminalIf,
474 TokenKind::While => SyntaxKind::TerminalWhile,
475 TokenKind::For => SyntaxKind::TerminalFor,
476 TokenKind::Loop => SyntaxKind::TerminalLoop,
477 TokenKind::Continue => SyntaxKind::TerminalContinue,
478 TokenKind::Break => SyntaxKind::TerminalBreak,
479 TokenKind::Else => SyntaxKind::TerminalElse,
480 TokenKind::Use => SyntaxKind::TerminalUse,
481 TokenKind::Implicits => SyntaxKind::TerminalImplicits,
482 TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
483 TokenKind::Pub => SyntaxKind::TerminalPub,
484 TokenKind::And => SyntaxKind::TerminalAnd,
485 TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
486 TokenKind::At => SyntaxKind::TerminalAt,
487 TokenKind::Or => SyntaxKind::TerminalOr,
488 TokenKind::OrOr => SyntaxKind::TerminalOrOr,
489 TokenKind::Xor => SyntaxKind::TerminalXor,
490 TokenKind::EqEq => SyntaxKind::TerminalEqEq,
491 TokenKind::Neq => SyntaxKind::TerminalNeq,
492 TokenKind::GE => SyntaxKind::TerminalGE,
493 TokenKind::GT => SyntaxKind::TerminalGT,
494 TokenKind::LE => SyntaxKind::TerminalLE,
495 TokenKind::LT => SyntaxKind::TerminalLT,
496 TokenKind::Not => SyntaxKind::TerminalNot,
497 TokenKind::BitNot => SyntaxKind::TerminalBitNot,
498 TokenKind::Plus => SyntaxKind::TerminalPlus,
499 TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
500 TokenKind::Minus => SyntaxKind::TerminalMinus,
501 TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
502 TokenKind::Mul => SyntaxKind::TerminalMul,
503 TokenKind::MulEq => SyntaxKind::TerminalMulEq,
504 TokenKind::Div => SyntaxKind::TerminalDiv,
505 TokenKind::DivEq => SyntaxKind::TerminalDivEq,
506 TokenKind::Mod => SyntaxKind::TerminalMod,
507 TokenKind::ModEq => SyntaxKind::TerminalModEq,
508 TokenKind::Colon => SyntaxKind::TerminalColon,
509 TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
510 TokenKind::Comma => SyntaxKind::TerminalComma,
511 TokenKind::Dot => SyntaxKind::TerminalDot,
512 TokenKind::DotDot => SyntaxKind::TerminalDotDot,
513 TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
514 TokenKind::Eq => SyntaxKind::TerminalEq,
515 TokenKind::Hash => SyntaxKind::TerminalHash,
516 TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
517 TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
518 TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
519 TokenKind::LBrace => SyntaxKind::TerminalLBrace,
520 TokenKind::RBrace => SyntaxKind::TerminalRBrace,
521 TokenKind::LBrack => SyntaxKind::TerminalLBrack,
522 TokenKind::RBrack => SyntaxKind::TerminalRBrack,
523 TokenKind::LParen => SyntaxKind::TerminalLParen,
524 TokenKind::RParen => SyntaxKind::TerminalRParen,
525 TokenKind::Ref => SyntaxKind::TerminalRef,
526 TokenKind::Mut => SyntaxKind::TerminalMut,
527 TokenKind::Arrow => SyntaxKind::TerminalArrow,
528 TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
529 TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
530 TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
531 }
532}