cairo_lang_parser/
lexer.rs

1#[cfg(test)]
2#[path = "lexer_test.rs"]
3mod test;
4
5use cairo_lang_filesystem::span::{TextOffset, TextSpan, TextWidth};
6use cairo_lang_syntax::node::Token;
7use cairo_lang_syntax::node::ast::{
8    TokenNewline, TokenSingleLineComment, TokenSingleLineDocComment, TokenSingleLineInnerComment,
9    TokenWhitespace, TriviumGreen,
10};
11use cairo_lang_syntax::node::db::SyntaxGroup;
12use cairo_lang_syntax::node::kind::SyntaxKind;
13use cairo_lang_utils::require;
14use smol_str::SmolStr;
15
/// A lexer over a source text, producing [`LexerTerminal`]s.
///
/// Terminals are obtained through the [`Iterator`] implementation.
pub struct Lexer<'a> {
    /// Syntax database handed to the `new_green` constructors of trivia tokens.
    db: &'a dyn SyntaxGroup,
    /// The full text being lexed.
    text: &'a str,
    /// Start offset of the span that has been read but not yet consumed.
    previous_position: TextOffset,
    /// Offset of the next character to be read.
    current_position: TextOffset,
    /// Set once the end-of-file terminal has been returned by the iterator.
    done: bool,
}
23
24impl<'a> Lexer<'a> {
25    // Ctors.
26    pub fn from_text(db: &'a dyn SyntaxGroup, text: &'a str) -> Lexer<'a> {
27        Lexer {
28            db,
29            text,
30            previous_position: TextOffset::START,
31            current_position: TextOffset::START,
32            done: false,
33        }
34    }
35
36    pub fn position(&self) -> TextOffset {
37        self.current_position
38    }
39
40    // Helpers.
41    fn peek(&self) -> Option<char> {
42        self.current_position.take_from(self.text).chars().next()
43    }
44
45    fn peek_nth(&self, n: usize) -> Option<char> {
46        self.current_position.take_from(self.text).chars().nth(n)
47    }
48
49    fn take(&mut self) -> Option<char> {
50        let res = self.peek()?;
51        self.current_position = self.current_position.add_width(TextWidth::from_char(res));
52        Some(res)
53    }
54
55    /// Takes a character while the given function returns true.
56    fn take_while<F>(&mut self, f: F)
57    where
58        F: Fn(char) -> bool,
59    {
60        while self.peek().map(&f).unwrap_or(false) {
61            self.take();
62        }
63    }
64
65    fn peek_span_text(&self) -> &'a str {
66        let span = TextSpan { start: self.previous_position, end: self.current_position };
67        span.take(self.text)
68    }
69
70    fn consume_span(&mut self) -> &str {
71        let val = self.peek_span_text();
72        self.previous_position = self.current_position;
73        val
74    }
75
76    // Trivia matchers.
77    fn match_trivia(&mut self, leading: bool) -> Vec<TriviumGreen> {
78        let mut res: Vec<TriviumGreen> = Vec::new();
79        while let Some(current) = self.peek() {
80            let trivium = match current {
81                ' ' | '\r' | '\t' => self.match_trivium_whitespace(),
82                '\n' => self.match_trivium_newline(),
83                '/' if self.peek_nth(1) == Some('/') => self.match_trivium_single_line_comment(),
84                _ => break,
85            };
86            res.push(trivium);
87            if current == '\n' && !leading {
88                break;
89            }
90        }
91        res
92    }
93
94    /// Assumes the next character is one of [' ', '\r', '\t'].
95    fn match_trivium_whitespace(&mut self) -> TriviumGreen {
96        self.take_while(|s| matches!(s, ' ' | '\r' | '\t'));
97        TokenWhitespace::new_green(self.db, SmolStr::from(self.consume_span())).into()
98    }
99
100    /// Assumes the next character '/n'.
101    fn match_trivium_newline(&mut self) -> TriviumGreen {
102        self.take();
103        TokenNewline::new_green(self.db, SmolStr::from(self.consume_span())).into()
104    }
105
106    /// Assumes the next 2 characters are "//".
107    fn match_trivium_single_line_comment(&mut self) -> TriviumGreen {
108        match self.peek_nth(2) {
109            Some('/') => {
110                self.take_while(|c| c != '\n');
111                TokenSingleLineDocComment::new_green(self.db, SmolStr::from(self.consume_span()))
112                    .into()
113            }
114            Some('!') => {
115                self.take_while(|c| c != '\n');
116                TokenSingleLineInnerComment::new_green(self.db, SmolStr::from(self.consume_span()))
117                    .into()
118            }
119            _ => {
120                self.take_while(|c| c != '\n');
121                TokenSingleLineComment::new_green(self.db, SmolStr::from(self.consume_span()))
122                    .into()
123            }
124        }
125    }
126
127    // Token matchers.
128    // =================================================================================
129
130    /// Takes a number. May be decimal, hex, oct or bin.
131    fn take_token_literal_number(&mut self) -> TokenKind {
132        let special = if self.peek() == Some('0') {
133            self.take();
134            match self.peek() {
135                Some('x' | 'o' | 'b') => {
136                    match self.take() {
137                        Some('x') => self.take_while(|c| c.is_ascii_hexdigit()),
138                        Some('o') => self.take_while(|c| matches!(c, '0'..='7')),
139                        Some('b') => self.take_while(|c| matches!(c, '0'..='1')),
140                        _ => unreachable!(),
141                    }
142                    true
143                }
144                _ => false,
145            }
146        } else {
147            false
148        };
149        // Not a special case - so just reading the rest of the digits.
150        if !special {
151            self.take_while(|c| c.is_ascii_digit());
152        }
153
154        // Parse _type suffix.
155        if self.peek() == Some('_') {
156            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
157        }
158        TokenKind::LiteralNumber
159    }
160
161    /// Takes a short string.
162    fn take_token_short_string(&mut self) -> TokenKind {
163        self.take_token_string_helper('\'');
164
165        // Parse _type suffix.
166        if self.peek() == Some('_') {
167            self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
168        }
169        TokenKind::ShortString
170    }
171
172    /// Takes a string.
173    fn take_token_string(&mut self) -> TokenKind {
174        self.take_token_string_helper('"');
175        TokenKind::String
176    }
177
178    fn take_token_string_helper(&mut self, delimiter: char) {
179        self.take();
180        let mut escaped = false;
181        while let Some(token) = self.peek() {
182            self.take();
183            match token {
184                _ if escaped => escaped = false,
185                '\\' => escaped = true,
186                _ if token == delimiter => {
187                    break;
188                }
189                _ => {}
190            };
191        }
192    }
193
194    /// Assumes the next character is [a-zA-Z_].
195    fn take_token_identifier(&mut self) -> TokenKind {
196        // TODO(spapini): Support or explicitly report general unicode characters.
197        self.take_while(|c| c.is_ascii_alphanumeric() || c == '_');
198
199        match self.peek_span_text() {
200            "as" => TokenKind::As,
201            "const" => TokenKind::Const,
202            "false" => TokenKind::False,
203            "true" => TokenKind::True,
204            "extern" => TokenKind::Extern,
205            "type" => TokenKind::Type,
206            "fn" => TokenKind::Function,
207            "trait" => TokenKind::Trait,
208            "impl" => TokenKind::Impl,
209            "of" => TokenKind::Of,
210            "mod" => TokenKind::Module,
211            "struct" => TokenKind::Struct,
212            "enum" => TokenKind::Enum,
213            "let" => TokenKind::Let,
214            "return" => TokenKind::Return,
215            "match" => TokenKind::Match,
216            "if" => TokenKind::If,
217            "loop" => TokenKind::Loop,
218            "continue" => TokenKind::Continue,
219            "break" => TokenKind::Break,
220            "else" => TokenKind::Else,
221            "while" => TokenKind::While,
222            "use" => TokenKind::Use,
223            "implicits" => TokenKind::Implicits,
224            "ref" => TokenKind::Ref,
225            "mut" => TokenKind::Mut,
226            "for" => TokenKind::For,
227            "nopanic" => TokenKind::NoPanic,
228            "pub" => TokenKind::Pub,
229            "_" => TokenKind::Underscore,
230            _ => TokenKind::Identifier,
231        }
232    }
233
234    /// Takes a single character and returns the given kind.
235    fn take_token_of_kind(&mut self, kind: TokenKind) -> TokenKind {
236        self.take();
237        kind
238    }
239
240    /// If the next character is `second_char`, returns `long_kind`, otherwise returns `short_kind`.
241    fn pick_kind(
242        &mut self,
243        second_char: char,
244        long_kind: TokenKind,
245        short_kind: TokenKind,
246    ) -> TokenKind {
247        self.take();
248        if self.peek() == Some(second_char) {
249            self.take();
250            long_kind
251        } else {
252            short_kind
253        }
254    }
255
256    fn match_terminal(&mut self) -> LexerTerminal {
257        let leading_trivia = self.match_trivia(true);
258
259        let kind = if let Some(current) = self.peek() {
260            match current {
261                '0'..='9' => self.take_token_literal_number(),
262                '\'' => self.take_token_short_string(),
263                '"' => self.take_token_string(),
264                ',' => self.take_token_of_kind(TokenKind::Comma),
265                ';' => self.take_token_of_kind(TokenKind::Semicolon),
266                '?' => self.take_token_of_kind(TokenKind::QuestionMark),
267                '{' => self.take_token_of_kind(TokenKind::LBrace),
268                '}' => self.take_token_of_kind(TokenKind::RBrace),
269                '[' => self.take_token_of_kind(TokenKind::LBrack),
270                ']' => self.take_token_of_kind(TokenKind::RBrack),
271                '(' => self.take_token_of_kind(TokenKind::LParen),
272                ')' => self.take_token_of_kind(TokenKind::RParen),
273                '.' => self.pick_kind('.', TokenKind::DotDot, TokenKind::Dot),
274                '*' => self.pick_kind('=', TokenKind::MulEq, TokenKind::Mul),
275                '/' => self.pick_kind('=', TokenKind::DivEq, TokenKind::Div),
276                '%' => self.pick_kind('=', TokenKind::ModEq, TokenKind::Mod),
277                '+' => self.pick_kind('=', TokenKind::PlusEq, TokenKind::Plus),
278                '#' => self.take_token_of_kind(TokenKind::Hash),
279                '-' => {
280                    self.take();
281                    match self.peek() {
282                        Some('>') => self.take_token_of_kind(TokenKind::Arrow),
283                        Some('=') => self.take_token_of_kind(TokenKind::MinusEq),
284                        _ => TokenKind::Minus,
285                    }
286                }
287                '<' => self.pick_kind('=', TokenKind::LE, TokenKind::LT),
288                '>' => self.pick_kind('=', TokenKind::GE, TokenKind::GT),
289                'a'..='z' | 'A'..='Z' | '_' => self.take_token_identifier(),
290                ':' => self.pick_kind(':', TokenKind::ColonColon, TokenKind::Colon),
291                '!' => self.pick_kind('=', TokenKind::Neq, TokenKind::Not),
292                '~' => self.take_token_of_kind(TokenKind::BitNot),
293                '=' => {
294                    self.take();
295                    match self.peek() {
296                        Some('=') => self.take_token_of_kind(TokenKind::EqEq),
297                        Some('>') => self.take_token_of_kind(TokenKind::MatchArrow),
298                        _ => TokenKind::Eq,
299                    }
300                }
301                '&' => self.pick_kind('&', TokenKind::AndAnd, TokenKind::And),
302                '|' => self.pick_kind('|', TokenKind::OrOr, TokenKind::Or),
303                '^' => self.take_token_of_kind(TokenKind::Xor),
304                '@' => self.take_token_of_kind(TokenKind::At),
305                _ => self.take_token_of_kind(TokenKind::BadCharacters),
306            }
307        } else {
308            TokenKind::EndOfFile
309        };
310
311        let text = SmolStr::from(self.consume_span());
312        let trailing_trivia = self.match_trivia(false);
313        let terminal_kind = token_kind_to_terminal_syntax_kind(kind);
314
315        // TODO(yuval): log(verbose) "consumed text: ..."
316        LexerTerminal { text, kind: terminal_kind, leading_trivia, trailing_trivia }
317    }
318}
319
/// Output terminal emitted by the lexer.
#[derive(Clone, PartialEq, Eq, Debug)]
pub struct LexerTerminal {
    /// The text of the token itself, excluding the surrounding trivia.
    pub text: SmolStr,
    /// The kind of the inner token of this terminal.
    pub kind: SyntaxKind,
    /// Trivia (whitespace, newlines, comments) matched before the token.
    pub leading_trivia: Vec<TriviumGreen>,
    /// Trivia matched after the token, ending no later than the first newline.
    pub trailing_trivia: Vec<TriviumGreen>,
}
329impl LexerTerminal {
330    pub fn width(&self, db: &dyn SyntaxGroup) -> TextWidth {
331        self.leading_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
332            + TextWidth::from_str(&self.text)
333            + self.trailing_trivia.iter().map(|t| t.0.width(db)).sum::<TextWidth>()
334    }
335}
336
337impl Iterator for Lexer<'_> {
338    type Item = LexerTerminal;
339
340    /// Returns the next token. Once there are no more tokens left, returns token EOF.
341    /// One should not call this after EOF was returned. If one does, None is returned.
342    fn next(&mut self) -> Option<Self::Item> {
343        require(!self.done)?;
344        let lexer_terminal = self.match_terminal();
345        if lexer_terminal.kind == SyntaxKind::TerminalEndOfFile {
346            self.done = true;
347        };
348        Some(lexer_terminal)
349    }
350}
351
/// Lexer-internal classification of a token.
///
/// Each variant is translated to a `SyntaxKind::Terminal*` value by
/// `token_kind_to_terminal_syntax_kind` before it leaves the lexer.
#[derive(Clone, Copy, PartialEq, Debug, Eq, Hash)]
enum TokenKind {
    /// Any identifier-shaped word that is not a keyword or a lone `_`.
    Identifier,

    // Literals.
    LiteralNumber,
    ShortString,
    String,

    // Keywords.
    As,
    Const,
    False,
    True,
    Extern,
    Type,
    Function,
    Trait,
    Impl,
    Of,
    Module,
    Struct,
    Enum,
    Let,
    Return,
    Match,
    If,
    While,
    For,
    Loop,
    Continue,
    Break,
    Else,
    Use,
    Implicits,
    NoPanic,
    Pub,

    // Modifiers.
    Ref,
    Mut,

    // Punctuation.
    And,
    AndAnd,
    At,
    Or,
    OrOr,
    Xor,
    EqEq,
    Neq,
    GE,
    GT,
    LE,
    LT,
    Not,
    BitNot,
    Plus,
    PlusEq,
    Minus,
    MinusEq,
    Mul,
    MulEq,
    Div,
    DivEq,
    Mod,
    ModEq,

    Colon,
    ColonColon,
    Comma,
    Dot,
    DotDot,
    Eq,
    Hash,
    Semicolon,
    QuestionMark,
    Underscore,
    LBrace,
    RBrace,
    LBrack,
    RBrack,
    LParen,
    RParen,
    Arrow,
    MatchArrow,

    // Meta.
    /// Produced when the input is exhausted.
    EndOfFile,
    /// Produced for a character that starts no known token.
    BadCharacters,
}
443
444fn token_kind_to_terminal_syntax_kind(kind: TokenKind) -> SyntaxKind {
445    match kind {
446        TokenKind::As => SyntaxKind::TerminalAs,
447        TokenKind::Const => SyntaxKind::TerminalConst,
448        TokenKind::Identifier => SyntaxKind::TerminalIdentifier,
449        TokenKind::LiteralNumber => SyntaxKind::TerminalLiteralNumber,
450        TokenKind::ShortString => SyntaxKind::TerminalShortString,
451        TokenKind::String => SyntaxKind::TerminalString,
452        TokenKind::False => SyntaxKind::TerminalFalse,
453        TokenKind::True => SyntaxKind::TerminalTrue,
454        TokenKind::Extern => SyntaxKind::TerminalExtern,
455        TokenKind::Type => SyntaxKind::TerminalType,
456        TokenKind::Function => SyntaxKind::TerminalFunction,
457        TokenKind::Trait => SyntaxKind::TerminalTrait,
458        TokenKind::Impl => SyntaxKind::TerminalImpl,
459        TokenKind::Of => SyntaxKind::TerminalOf,
460        TokenKind::Module => SyntaxKind::TerminalModule,
461        TokenKind::Struct => SyntaxKind::TerminalStruct,
462        TokenKind::Enum => SyntaxKind::TerminalEnum,
463        TokenKind::Let => SyntaxKind::TerminalLet,
464        TokenKind::Return => SyntaxKind::TerminalReturn,
465        TokenKind::Match => SyntaxKind::TerminalMatch,
466        TokenKind::If => SyntaxKind::TerminalIf,
467        TokenKind::While => SyntaxKind::TerminalWhile,
468        TokenKind::For => SyntaxKind::TerminalFor,
469        TokenKind::Loop => SyntaxKind::TerminalLoop,
470        TokenKind::Continue => SyntaxKind::TerminalContinue,
471        TokenKind::Break => SyntaxKind::TerminalBreak,
472        TokenKind::Else => SyntaxKind::TerminalElse,
473        TokenKind::Use => SyntaxKind::TerminalUse,
474        TokenKind::Implicits => SyntaxKind::TerminalImplicits,
475        TokenKind::NoPanic => SyntaxKind::TerminalNoPanic,
476        TokenKind::Pub => SyntaxKind::TerminalPub,
477        TokenKind::And => SyntaxKind::TerminalAnd,
478        TokenKind::AndAnd => SyntaxKind::TerminalAndAnd,
479        TokenKind::At => SyntaxKind::TerminalAt,
480        TokenKind::Or => SyntaxKind::TerminalOr,
481        TokenKind::OrOr => SyntaxKind::TerminalOrOr,
482        TokenKind::Xor => SyntaxKind::TerminalXor,
483        TokenKind::EqEq => SyntaxKind::TerminalEqEq,
484        TokenKind::Neq => SyntaxKind::TerminalNeq,
485        TokenKind::GE => SyntaxKind::TerminalGE,
486        TokenKind::GT => SyntaxKind::TerminalGT,
487        TokenKind::LE => SyntaxKind::TerminalLE,
488        TokenKind::LT => SyntaxKind::TerminalLT,
489        TokenKind::Not => SyntaxKind::TerminalNot,
490        TokenKind::BitNot => SyntaxKind::TerminalBitNot,
491        TokenKind::Plus => SyntaxKind::TerminalPlus,
492        TokenKind::PlusEq => SyntaxKind::TerminalPlusEq,
493        TokenKind::Minus => SyntaxKind::TerminalMinus,
494        TokenKind::MinusEq => SyntaxKind::TerminalMinusEq,
495        TokenKind::Mul => SyntaxKind::TerminalMul,
496        TokenKind::MulEq => SyntaxKind::TerminalMulEq,
497        TokenKind::Div => SyntaxKind::TerminalDiv,
498        TokenKind::DivEq => SyntaxKind::TerminalDivEq,
499        TokenKind::Mod => SyntaxKind::TerminalMod,
500        TokenKind::ModEq => SyntaxKind::TerminalModEq,
501        TokenKind::Colon => SyntaxKind::TerminalColon,
502        TokenKind::ColonColon => SyntaxKind::TerminalColonColon,
503        TokenKind::Comma => SyntaxKind::TerminalComma,
504        TokenKind::Dot => SyntaxKind::TerminalDot,
505        TokenKind::DotDot => SyntaxKind::TerminalDotDot,
506        TokenKind::Eq => SyntaxKind::TerminalEq,
507        TokenKind::Hash => SyntaxKind::TerminalHash,
508        TokenKind::Semicolon => SyntaxKind::TerminalSemicolon,
509        TokenKind::QuestionMark => SyntaxKind::TerminalQuestionMark,
510        TokenKind::Underscore => SyntaxKind::TerminalUnderscore,
511        TokenKind::LBrace => SyntaxKind::TerminalLBrace,
512        TokenKind::RBrace => SyntaxKind::TerminalRBrace,
513        TokenKind::LBrack => SyntaxKind::TerminalLBrack,
514        TokenKind::RBrack => SyntaxKind::TerminalRBrack,
515        TokenKind::LParen => SyntaxKind::TerminalLParen,
516        TokenKind::RParen => SyntaxKind::TerminalRParen,
517        TokenKind::Ref => SyntaxKind::TerminalRef,
518        TokenKind::Mut => SyntaxKind::TerminalMut,
519        TokenKind::Arrow => SyntaxKind::TerminalArrow,
520        TokenKind::MatchArrow => SyntaxKind::TerminalMatchArrow,
521        TokenKind::BadCharacters => SyntaxKind::TerminalBadCharacters,
522        TokenKind::EndOfFile => SyntaxKind::TerminalEndOfFile,
523    }
524}