cranelift_isle/
lexer.rs

1//! Lexer for the ISLE language.
2
3use std::borrow::Cow;
4
5use crate::error::{Error, Span};
6use crate::files::Files;
7
8type Result<T> = std::result::Result<T, Error>;
9
10/// The lexer.
11///
12/// Breaks source text up into a sequence of tokens (with source positions).
13#[derive(Clone, Debug)]
14pub struct Lexer<'src> {
15    src: &'src str,
16    pos: Pos,
17    lookahead: Option<(Pos, Token)>,
18}
19
/// A location in ISLE source: a file plus a byte offset into that file.
///
/// The derived ordering compares `file` first and `offset` second.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Hash, PartialOrd, Ord)]
pub struct Pos {
    /// Index identifying the file this position belongs to.
    ///
    /// `pretty_print_line` hands this index to a `Files` table to look up the
    /// file's name and line map.
    pub file: usize,
    /// Byte offset of this position from the start of the file.
    pub offset: usize,
}
31
32impl Pos {
33    /// Create a new `Pos`.
34    pub fn new(file: usize, offset: usize) -> Self {
35        Self { file, offset }
36    }
37
38    /// Print this source position as `file.isle line 12`.
39    pub fn pretty_print_line(&self, files: &Files) -> String {
40        format!(
41            "{} line {}",
42            files.file_name(self.file).unwrap(),
43            files.file_line_map(self.file).unwrap().line(self.offset)
44        )
45    }
46}
47
/// One lexical token of ISLE source.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    /// An opening parenthesis, `(`.
    LParen,
    /// A closing parenthesis, `)`.
    RParen,
    /// A symbol such as `Foo`; the payload is the symbol's text.
    Symbol(String),
    /// An integer literal, stored as an `i128`.
    Int(i128),
    /// The `@` sign.
    At,
}
62
63impl<'src> Lexer<'src> {
64    /// Create a new lexer for the given source contents
65    pub fn new(file: usize, src: &'src str) -> Result<Lexer<'src>> {
66        let mut l = Lexer {
67            src,
68            pos: Pos::new(file, 0),
69            lookahead: None,
70        };
71        l.reload()?;
72        Ok(l)
73    }
74
75    /// Get the lexer's current source position.
76    pub fn pos(&self) -> Pos {
77        self.pos
78    }
79
80    fn advance_pos(&mut self) {
81        self.advance_by(1)
82    }
83
84    fn advance_by(&mut self, n: usize) {
85        self.pos.offset += n;
86    }
87
88    fn error(&self, pos: Pos, msg: impl Into<String>) -> Error {
89        Error::ParseError {
90            msg: msg.into(),
91            span: Span::new_single(pos),
92        }
93    }
94
95    fn next_token(&mut self) -> Result<Option<(Pos, Token)>> {
96        fn is_sym_first_char(c: u8) -> bool {
97            match c {
98                b'-' | b'0'..=b'9' | b'(' | b')' | b';' => false,
99                c if c.is_ascii_whitespace() => false,
100                _ => true,
101            }
102        }
103        fn is_sym_other_char(c: u8) -> bool {
104            match c {
105                b'(' | b')' | b';' | b'@' => false,
106                c if c.is_ascii_whitespace() => false,
107                _ => true,
108            }
109        }
110
111        // Skip any whitespace and any comments.
112        while let Some(c) = self.peek_byte() {
113            match c {
114                b' ' | b'\t' | b'\n' | b'\r' => self.advance_pos(),
115                b';' => {
116                    while let Some(c) = self.peek_byte() {
117                        match c {
118                            b'\n' | b'\r' => break,
119                            _ => self.advance_pos(),
120                        }
121                    }
122                }
123                b'(' if self.lookahead_byte(1) == Some(b';') => {
124                    let pos = self.pos();
125                    self.advance_by(2);
126                    let mut depth = 1usize;
127                    loop {
128                        match self.peek_byte() {
129                            None => return Err(self.error(pos, "unterminated block comment")),
130                            Some(b'(') if self.lookahead_byte(1) == Some(b';') => {
131                                self.advance_by(2);
132                                depth += 1;
133                            }
134                            Some(b';') if self.lookahead_byte(1) == Some(b')') => {
135                                self.advance_by(2);
136                                depth -= 1;
137                                if depth == 0 {
138                                    break;
139                                }
140                            }
141                            Some(_) => self.advance_pos(),
142                        }
143                    }
144                }
145                _ => break,
146            }
147        }
148
149        let Some(c) = self.peek_byte() else {
150            return Ok(None);
151        };
152        let char_pos = self.pos();
153        match c {
154            b'(' => {
155                self.advance_pos();
156                Ok(Some((char_pos, Token::LParen)))
157            }
158            b')' => {
159                self.advance_pos();
160                Ok(Some((char_pos, Token::RParen)))
161            }
162            b'@' => {
163                self.advance_pos();
164                Ok(Some((char_pos, Token::At)))
165            }
166            c if is_sym_first_char(c) => {
167                let start = self.pos.offset;
168                let start_pos = self.pos();
169                while let Some(c) = self.peek_byte() {
170                    match c {
171                        c if is_sym_other_char(c) => self.advance_pos(),
172                        _ => break,
173                    }
174                }
175                let end = self.pos.offset;
176                let s = &self.src[start..end];
177                debug_assert!(!s.is_empty());
178                Ok(Some((start_pos, Token::Symbol(s.to_string()))))
179            }
180            c @ (b'0'..=b'9' | b'-') => {
181                let start_pos = self.pos();
182                let mut neg = false;
183                if c == b'-' {
184                    self.advance_pos();
185                    neg = true;
186                }
187
188                let mut radix = 10;
189
190                // Check for prefixed literals.
191                match (
192                    self.src.as_bytes().get(self.pos.offset),
193                    self.src.as_bytes().get(self.pos.offset + 1),
194                ) {
195                    (Some(b'0'), Some(b'x' | b'X')) => {
196                        self.advance_by(2);
197                        radix = 16;
198                    }
199                    (Some(b'0'), Some(b'o' | b'O')) => {
200                        self.advance_by(2);
201                        radix = 8;
202                    }
203                    (Some(b'0'), Some(b'b' | b'B')) => {
204                        self.advance_by(2);
205                        radix = 2;
206                    }
207                    _ => {}
208                }
209
210                // Find the range in the buffer for this integer literal. We'll
211                // pass this range to `i64::from_str_radix` to do the actual
212                // string-to-integer conversion.
213                let start = self.pos.offset;
214                while let Some(c) = self.peek_byte() {
215                    match c {
216                        b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_' => self.advance_pos(),
217                        _ => break,
218                    }
219                }
220                let end = self.pos.offset;
221                let s = &self.src[start..end];
222                let s = if s.contains('_') {
223                    Cow::Owned(s.replace('_', ""))
224                } else {
225                    Cow::Borrowed(s)
226                };
227
228                // Support either signed range (-2^127..2^127) or
229                // unsigned range (0..2^128).
230                let num = match u128::from_str_radix(&s, radix) {
231                    Ok(num) => num,
232                    Err(err) => return Err(self.error(start_pos, err.to_string())),
233                };
234
235                let num = match (neg, num) {
236                    (true, 0x80000000000000000000000000000000) => {
237                        return Err(self.error(start_pos, "integer literal cannot fit in i128"))
238                    }
239                    (true, _) => -(num as i128),
240                    (false, _) => num as i128,
241                };
242                let tok = Token::Int(num);
243
244                Ok(Some((start_pos, tok)))
245            }
246            c => Err(self.error(self.pos, format!("Unexpected character '{c}'"))),
247        }
248    }
249
250    /// Get the next token from this lexer's token stream, if any.
251    pub fn next(&mut self) -> Result<Option<(Pos, Token)>> {
252        let tok = self.lookahead.take();
253        self.reload()?;
254        Ok(tok)
255    }
256
257    fn reload(&mut self) -> Result<()> {
258        if self.lookahead.is_none() && self.pos.offset < self.src.len() {
259            self.lookahead = self.next_token()?;
260        }
261        Ok(())
262    }
263
264    /// Peek ahead at the next token.
265    pub fn peek(&self) -> Option<&(Pos, Token)> {
266        self.lookahead.as_ref()
267    }
268
269    /// Are we at the end of the source input?
270    pub fn eof(&self) -> bool {
271        self.lookahead.is_none()
272    }
273
274    fn peek_byte(&self) -> Option<u8> {
275        self.lookahead_byte(0)
276    }
277
278    fn lookahead_byte(&self, n: usize) -> Option<u8> {
279        self.src.as_bytes().get(self.pos.offset + n).copied()
280    }
281}
282
283impl Token {
284    /// Is this an `Int` token?
285    pub fn is_int(&self) -> bool {
286        matches!(self, Token::Int(_))
287    }
288
289    /// Is this a `Sym` token?
290    pub fn is_sym(&self) -> bool {
291        matches!(self, Token::Symbol(_))
292    }
293}
294
#[cfg(test)]
mod test {
    use super::*;

    /// Lex `src` to the end and return just the tokens, panicking on any
    /// lexer error.
    #[track_caller]
    fn lex(src: &str) -> Vec<Token> {
        let mut lexer = Lexer::new(0, src).unwrap();
        let mut tokens = Vec::new();
        while let Some((_pos, token)) = lexer.next().unwrap() {
            tokens.push(token);
        }
        tokens
    }

    /// Shorthand for building a `Token::Symbol`.
    fn sym(name: &str) -> Token {
        Token::Symbol(name.to_string())
    }

    /// Line comments, nested block comments and mixed whitespace are all
    /// skipped between ordinary tokens.
    #[test]
    fn lexer_basic() {
        let src = ";; comment\n; another\r\n   \t(one two three (; block comment ;) 23 (; nested (; block ;) comment ;) -568  )\n";
        let expected = [
            Token::LParen,
            sym("one"),
            sym("two"),
            sym("three"),
            Token::Int(23),
            Token::Int(-568),
            Token::RParen,
        ];
        assert_eq!(lex(src), expected);
    }

    /// A symbol that runs right up to the end of input is still produced.
    #[test]
    fn ends_with_sym() {
        assert_eq!(lex("asdf"), [sym("asdf")]);
    }

    /// An integer that runs right up to the end of input is still produced.
    #[test]
    fn ends_with_num() {
        assert_eq!(lex("23"), [Token::Int(23)]);
    }

    /// Punctuation-heavy symbols lex as symbols, and a `;` ends a symbol by
    /// starting a comment.
    #[test]
    fn weird_syms() {
        let expected = [
            Token::LParen,
            sym("+"),
            sym("[]"),
            sym("=>"),
            sym("!!"),
            sym("_test!"),
            Token::RParen,
        ];
        assert_eq!(lex("(+ [] => !! _test!;comment\n)"), expected);
    }

    /// Integer literals: small values, the unsigned and signed extremes, and
    /// the rejected `-2^127`.
    #[test]
    fn integers() {
        assert_eq!(
            lex("0 1 -1"),
            [Token::Int(0), Token::Int(1), Token::Int(-1)]
        );

        // u128::MAX is accepted and stored with the same bits, i.e. as -1.
        let u128_max = "340_282_366_920_938_463_463_374_607_431_768_211_455";
        assert_eq!(lex(u128_max), [Token::Int(-1)]);

        let i128_max = "170_141_183_460_469_231_731_687_303_715_884_105_727";
        assert_eq!(lex(i128_max), [Token::Int(i128::MAX)]);

        let neg_two_pow_127 = "-170_141_183_460_469_231_731_687_303_715_884_105_728";
        assert!(Lexer::new(0, neg_two_pow_127).is_err());
    }
}
370}