cairo_lang_parser/
lexer.rs

1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
6use cairo_lang_syntax::node::Token;
7use cairo_lang_syntax::node::ast::{
8    TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
9    TokenWhitespace, TriviumGreen,
10};
11use cairo_lang_syntax::node::db::SyntaxGroup;
12use cairo_lang_syntax::node::kind::SyntaxKind;
13use cairo_lang_utils::require;
14use smol_str::SmolStr;
15
16pub struct Lexer<'a> {
17    db: &'a dyn SyntaxGroup,
18    text: &'a str,
19    previous_position: TextOffset,
20    current_position: TextOffset,
21    done: bool,
22}
23
24impl<'a> Lexer<'a> {
25    // Ctors.
26    pub fn from_text(db: &'a dyn SyntaxGroup, text: &'a str) -> Lexer<'a> {
27        Lexer {
28            db,
29            text,
30            previous_position: TextOffset::START,
31            current_position: TextOffset::START,
32            done: false,
33        }
34    }
35
36    pub fn position(&self) -> TextOffset {
37        self.current_position
38    }
39
40    // Helpers.
41    fn peek(&self) -> Option<char> {
42        self.current_position.take_from(self.text).chars().next()
43    }
44
45    fn peek_nth(&self, n: usize) -> Option<char> {
46        self.current_position.take_from(self.text).chars().nth(n)
47    }
48
49    fn take(&mut self) -> Option<char> {
50        let res = self.peek()?;
51        self.current_position = self.current_position.add_width(TextWidth::from_char(res));
52        Some(res)
53    }
54
55    /// Takes a character while the given function returns true.
56    fn take_while<F>(&mut self, f: F)
57    where
58        F: Fn(char) -> bool,
59    {
60        while self.peek().map(&f).unwrap_or(false) {
61            self.take();
62        }
63    }
64
65    fn peek_span_text(&self) -> &'a str {
66        let span = TextSpan { start: self.previous_position, end: self.current_position };
67        span.take(self.text)
68    }
69
70    fn consume_span(&mut self) -> &str {
71        let val = self.peek_span_text();
72        self.previous_position = self.current_position;
73        val
74    }
75
76    // Trivia matchers.
77    fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
78        let mut res: Vec<TriviumGreen> = Vec::new();
79        while let Some(current) = self.peek() {
80            let trivium = match current {
81                ' ' | '\r' | '\t' => self.match_trivium_whitespace(),
82                '\n' => self.match_trivium_newline(),
83                '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
84                _ => break,
85            };
86            res.push(trivium);
87            if current == '\n' && !leading {
88                break;
89            }
90        }
91        res
92    }
93
94    /// Assumes the next character is one of [' ', '\r', '\t'].
95    fn match_trivium_whitespace(&mut self) -> TriviumGreen {
96        self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
97        TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
98    }
99
100    /// Assumes the next character '/n'.
101    fn match_trivium_newline(&mut self) -> TriviumGreen {
102        self.take();
103        TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
104    }
105
106    /// Assumes the next 2 characters are "//".
107    fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
108        match self.peek_nth(2) {
109            Some('/') => {
110                self.take_while(|c| c != '\n');
111                TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
112                    .into()
113            }
114            Some('!') => {
115                self.take_while(|c| c != '\n');
116                TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
117                    .into()
118            }
119            _ => {
120                self.take_while(|c| c != '\n');
121                TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
122                    .into()
123            }
124        }
125    }
126
127    // Token matchers.
128    // =================================================================================
129
130    /// Takes a number. May be decimal, hex, oct or bin.
131    fn take_token_literal_number(&mut self) -> TokenKind {
132        let special = if self.peek() == Some('0') {
133            self.take();
134            match self.peek() {
135                Some('x' | 'o' | 'b') => {
136                    match self.take() {
137                        Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
138                        Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
139                        Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
140                        _ => unreachable!(),
141                    }
142                    true
143                }
144                _ => false,
145            }
146        } else {
147            false
148        };
149        // Not a special case - so just reading the rest of the digits.
150        if !special {
151            self.take_while(|c| c.is_ascii_digit());
152        }
153
154        // Parse _type suffix.
155        if self.peek() == Some('_') {
156            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
157        }
158        TokenKind::LiteralNumber
159    }
160
161    /// Takes a short string.
162    fn take_token_short_string(&mut self) -> TokenKind {
163        self.take_token_string_helper('\'');
164
165        // Parse _type suffix.
166        if self.peek() == Some('_') {
167            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
168        }
169        TokenKind::ShortString
170    }
171
172    /// Takes a string.
173    fn take_token_string(&mut self) -> TokenKind {
174        self.take_token_string_helper('"');
175        TokenKind::String
176    }
177
178    fn take_token_string_helper(&mut self, delimiter: char) {
179        self.take();
180        let mut escaped = false;
181        while let Some(token) = self.peek() {
182            self.take();
183            match token {
184                _ if escaped => escaped = false,
185                '\\' => escaped = true,
186                _ if token == delimiter => {
187                    break;
188                }
189                _ => {}
190            };
191        }
192    }
193
194    /// Assumes the next character is [a-zA-Z_].
195    fn take_token_identifier(&mut self) -> TokenKind {
196        // TODO(spapini): Support or explicitly report general unicode characters.
197        self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
198
199        match self.peek_span_text() {
200            "as" => TokenKind::As,
201            "const" => TokenKind::Const,
202            "false" => TokenKind::False,
203            "true" => TokenKind::True,
204            "extern" => TokenKind::Extern,
205            "type" => TokenKind::Type,
206            "fn" => TokenKind::Function,
207            "trait" => TokenKind::Trait,
208            "impl" => TokenKind::Impl,
209            "of" => TokenKind::Of,
210            "mod" => TokenKind::Module,
211            "struct" => TokenKind::Struct,
212            "enum" => TokenKind::Enum,
213            "let" => TokenKind::Let,
214            "return" => TokenKind::Return,
215            "match" => TokenKind::Match,
216            "if" => TokenKind::If,
217            "loop" => TokenKind::Loop,
218            "continue" => TokenKind::Continue,
219            "break" => TokenKind::Break,
220            "else" => TokenKind::Else,
221            "while" => TokenKind::While,
222            "use" => TokenKind::Use,
223            "implicits" => TokenKind::Implicits,
224            "ref" => TokenKind::Ref,
225            "mut" => TokenKind::Mut,
226            "for" => TokenKind::For,
227            "nopanic" => TokenKind::NoPanic,
228            "pub" => TokenKind::Pub,
229            "_" => TokenKind::Underscore,
230            _ => TokenKind::Identifier,
231        }
232    }
233
234    /// Takes a single character and returns the given kind.
235    fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
236        self.take();
237        kind
238    }
239
240    /// If the next character is `second_char`, returns `long_kind`, otherwise returns `short_kind`.
241    fn pick_kind(
242        &mut self,
243        second_char: char,
244        long_kind: TokenKind,
245        short_kind: TokenKind,
246    ) -> TokenKind {
247        self.take();
248        if self.peek() == Some(second_char) {
249            self.take();
250            long_kind
251        } else {
252            short_kind
253        }
254    }
255
256    fn match_terminal(&mut self) -> LexerTerminal {
257        let leading_trivia = self.match_trivia(true);
258
259        let kind = if let Some(current) = self.peek() {
260            match current {
261                '0'..='9' => self.take_token_literal_number(),
262                '\'' => self.take_token_short_string(),
263                '"' => self.take_token_string(),
264                ',' => self.take_token_of_kind(TokenKind::Comma),
265                ';' => self.take_token_of_kind(TokenKind::Semicolon),
266                '?' => self.take_token_of_kind(TokenKind::QuestionMark),
267                '{' => self.take_token_of_kind(TokenKind::LBrace),
268                '}' => self.take_token_of_kind(TokenKind::RBrace),
269                '[' => self.take_token_of_kind(TokenKind::LBrack),
270                ']' => self.take_token_of_kind(TokenKind::RBrack),
271                '(' => self.take_token_of_kind(TokenKind::LParen),
272                ')' => self.take_token_of_kind(TokenKind::RParen),
273                '.' => {
274                    self.take();
275                    match self.peek() {
276                        Some('.') => self.pick_kind('=', TokenKind::DotDotEq, TokenKind::DotDot),
277                        _ => TokenKind::Dot,
278                    }
279                }
280                '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
281                '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
282                '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
283                '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
284                '#' => self.take_token_of_kind(TokenKind::Hash),
285                '-' => {
286                    self.take();
287                    match self.peek() {
288                        Some('>') => self.take_token_of_kind(TokenKind::Arrow),
289                        Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
290                        _ => TokenKind::Minus,
291                    }
292                }
293                '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
294                '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
295                'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
296                ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
297                '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
298                '~' => self.take_token_of_kind(TokenKind::BitNot),
299                '=' => {
300                    self.take();
301                    match self.peek() {
302                        Some('=') => self.take_token_of_kind(TokenKind::EqEq),
303                        Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
304                        _ => TokenKind::Eq,
305                    }
306                }
307                '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
308                '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
309                '^' => self.take_token_of_kind(TokenKind::Xor),
310                '@' => self.take_token_of_kind(TokenKind::At),
311                _ => self.take_token_of_kind(TokenKind::BadCharacters),
312            }
313        } else {
314            TokenKind::EndOfFile
315        };
316
317        let text = SmolStr::from(self.consume_span());
318        let trailing_trivia = self.match_trivia(false);
319        let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
320
321        // TODO(yuval): log(verbose) "consumed text: ..."
322        LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
323    }
324}
325
326/// Output terminal emitted by the lexer.
327#[derive(Clone, PartialEq, Eq, Debug)]
328pub struct LexerTerminal {
329    pub text: SmolStr,
330    /// The kind of the inner token of this terminal.
331    pub kind: SyntaxKind,
332    pub leading_trivia: Vec<TriviumGreen>,
333    pub trailing_trivia: Vec<TriviumGreen>,
334}
335impl LexerTerminal {
336    pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
337        self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
338            + TextWidth::from_str(&self.text)
339            + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
340    }
341}
342
343impl Iterator for Lexer<'_> {
344    type Item = LexerTerminal;
345
346    /// Returns the next token. Once there are no more tokens left, returns token EOF.
347    /// One should not call this after EOF was returned. If one does, None is returned.
348    fn next(&mut self) -> Option<Self::Item> {
349        require(!self.done)?;
350        let lexer_terminal = self.match_terminal();
351        if lexer_terminal.kind == SyntaxKind::TerminalEndOfFile {
352            self.done = true;
353        };
354        Some(lexer_terminal)
355    }
356}
357
/// Lexer-internal classification of a token, mapped one-to-one onto a terminal `SyntaxKind`
/// by `token_kind_to_terminal_syntax_kind`.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
enum TokenKind {
    /// Any word that is not a keyword and not a lone underscore.
    Identifier,

    // ---- Literals ----
    LiteralNumber,
    ShortString,
    String,

    // ---- Keywords ----
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    Function,
    Trait,
    Impl,
    Of,
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    NoPanic,
    Pub,

    // ---- Modifiers ----
    Ref,
    Mut,

    // ---- Punctuation: operators ----
    And,
    AndAnd,
    At,
    Or,
    OrOr,
    Xor,
    EqEq,
    Neq,
    GE,
    GT,
    LE,
    LT,
    Not,
    BitNot,
    Plus,
    PlusEq,
    Minus,
    MinusEq,
    Mul,
    MulEq,
    Div,
    DivEq,
    Mod,
    ModEq,

    // ---- Punctuation: separators, brackets and arrows ----
    Colon,
    ColonColon,
    Comma,
    Dot,
    DotDot,
    DotDotEq,
    Eq,
    Hash,
    Semicolon,
    QuestionMark,
    Underscore,
    LBrace,
    RBrace,
    LBrack,
    RBrack,
    LParen,
    RParen,
    Arrow,
    MatchArrow,

    // ---- Meta ----
    /// No characters were left to lex.
    EndOfFile,
    /// A character that does not start any known token.
    BadCharacters,
}
450
451fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
452    match kind {
453        TokenKind::As => SyntaxKind::TerminalAs,
454        TokenKind::Const => SyntaxKind::TerminalConst,
455        TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
456        TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
457        TokenKind::ShortString => SyntaxKind::TerminalShortString,
458        TokenKind::String => SyntaxKind::TerminalString,
459        TokenKind::False => SyntaxKind::TerminalFalse,
460        TokenKind::True => SyntaxKind::TerminalTrue,
461        TokenKind::Extern => SyntaxKind::TerminalExtern,
462        TokenKind::Type => SyntaxKind::TerminalType,
463        TokenKind::Function => SyntaxKind::TerminalFunction,
464        TokenKind::Trait => SyntaxKind::TerminalTrait,
465        TokenKind::Impl => SyntaxKind::TerminalImpl,
466        TokenKind::Of => SyntaxKind::TerminalOf,
467        TokenKind::Module => SyntaxKind::TerminalModule,
468        TokenKind::Struct => SyntaxKind::TerminalStruct,
469        TokenKind::Enum => SyntaxKind::TerminalEnum,
470        TokenKind::Let => SyntaxKind::TerminalLet,
471        TokenKind::Return => SyntaxKind::TerminalReturn,
472        TokenKind::Match => SyntaxKind::TerminalMatch,
473        TokenKind::If => SyntaxKind::TerminalIf,
474        TokenKind::While => SyntaxKind::TerminalWhile,
475        TokenKind::For => SyntaxKind::TerminalFor,
476        TokenKind::Loop => SyntaxKind::TerminalLoop,
477        TokenKind::Continue => SyntaxKind::TerminalContinue,
478        TokenKind::Break => SyntaxKind::TerminalBreak,
479        TokenKind::Else => SyntaxKind::TerminalElse,
480        TokenKind::Use => SyntaxKind::TerminalUse,
481        TokenKind::Implicits => SyntaxKind::TerminalImplicits,
482        TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
483        TokenKind::Pub => SyntaxKind::TerminalPub,
484        TokenKind::And => SyntaxKind::TerminalAnd,
485        TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
486        TokenKind::At => SyntaxKind::TerminalAt,
487        TokenKind::Or => SyntaxKind::TerminalOr,
488        TokenKind::OrOr => SyntaxKind::TerminalOrOr,
489        TokenKind::Xor => SyntaxKind::TerminalXor,
490        TokenKind::EqEq => SyntaxKind::TerminalEqEq,
491        TokenKind::Neq => SyntaxKind::TerminalNeq,
492        TokenKind::GE => SyntaxKind::TerminalGE,
493        TokenKind::GT => SyntaxKind::TerminalGT,
494        TokenKind::LE => SyntaxKind::TerminalLE,
495        TokenKind::LT => SyntaxKind::TerminalLT,
496        TokenKind::Not => SyntaxKind::TerminalNot,
497        TokenKind::BitNot => SyntaxKind::TerminalBitNot,
498        TokenKind::Plus => SyntaxKind::TerminalPlus,
499        TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
500        TokenKind::Minus => SyntaxKind::TerminalMinus,
501        TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
502        TokenKind::Mul => SyntaxKind::TerminalMul,
503        TokenKind::MulEq => SyntaxKind::TerminalMulEq,
504        TokenKind::Div => SyntaxKind::TerminalDiv,
505        TokenKind::DivEq => SyntaxKind::TerminalDivEq,
506        TokenKind::Mod => SyntaxKind::TerminalMod,
507        TokenKind::ModEq => SyntaxKind::TerminalModEq,
508        TokenKind::Colon => SyntaxKind::TerminalColon,
509        TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
510        TokenKind::Comma => SyntaxKind::TerminalComma,
511        TokenKind::Dot => SyntaxKind::TerminalDot,
512        TokenKind::DotDot => SyntaxKind::TerminalDotDot,
513        TokenKind::DotDotEq => SyntaxKind::TerminalDotDotEq,
514        TokenKind::Eq => SyntaxKind::TerminalEq,
515        TokenKind::Hash => SyntaxKind::TerminalHash,
516        TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
517        TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
518        TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
519        TokenKind::LBrace => SyntaxKind::TerminalLBrace,
520        TokenKind::RBrace => SyntaxKind::TerminalRBrace,
521        TokenKind::LBrack => SyntaxKind::TerminalLBrack,
522        TokenKind::RBrack => SyntaxKind::TerminalRBrack,
523        TokenKind::LParen => SyntaxKind::TerminalLParen,
524        TokenKind::RParen => SyntaxKind::TerminalRParen,
525        TokenKind::Ref => SyntaxKind::TerminalRef,
526        TokenKind::Mut => SyntaxKind::TerminalMut,
527        TokenKind::Arrow => SyntaxKind::TerminalArrow,
528        TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
529        TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
530        TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
531    }
532}