use std::borrow::Cow;

use crate::error::{Error, Span};
use crate::files::Files;

type Result<T> = std::result::Result<T, Error>;

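/// A lexer over the source text of a single file, keeping one token of lookahead.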
#[derive(Clone, Debug)]
pub struct Lexer<'src> {
    src: &'src str,
    pos: Pos,
    lookahead: Option<(Pos, Token)>,
}

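/// A position in a source file: the index of the file plus a byte offset into its text.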
#[derive(Clone, Copy, Debug, PartialEq, Eq, Default, Hash, PartialOrd, Ord)]
pub struct Pos {
    pub file: usize,
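    /// Byte offset into the file's source text.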
    pub offset: usize,
}

impl Pos {
    pub fn new(file: usize, offset: usize) -> Self {
        Self { file, offset }
    }

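    /// Formats this position as "<file name> line <line number>" by looking it up in `files`.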
    pub fn pretty_print_line(&self, files: &Files) -> String {
        format!(
            "{} line {}",
            files.file_name(self.file).unwrap(),
            files.file_line_map(self.file).unwrap().line(self.offset)
        )
    }
}

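/// A single token produced by the lexer.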
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Token {
    LParen,
    RParen,
    Symbol(String),
    Int(i128),
    At,
}

impl<'src> Lexer<'src> {
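    /// Creates a lexer over `src` for the file at index `file`, priming one token of lookahead.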
    pub fn new(file: usize, src: &'src str) -> Result<Lexer<'src>> {
        let mut l = Lexer {
            src,
            pos: Pos::new(file, 0),
            lookahead: None,
        };
        l.reload()?;
        Ok(l)
    }

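    /// The lexer's current byte position in the source text.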
    pub fn pos(&self) -> Pos {
        self.pos
    }

    fn advance_pos(&mut self) {
        self.advance_by(1)
    }

    fn advance_by(&mut self, n: usize) {
        self.pos.offset += n;
    }

    fn error(&self, pos: Pos, msg: impl Into<String>) -> Error {
        Error::ParseError {
            msg: msg.into(),
            span: Span::new_single(pos),
        }
    }

    fn next_token(&mut self) -> Result<Option<(Pos, Token)>> {
        fn is_sym_first_char(c: u8) -> bool {
            match c {
                b'-' | b'0'..=b'9' | b'(' | b')' | b';' => false,
                c if c.is_ascii_whitespace() => false,
                _ => true,
            }
        }
        fn is_sym_other_char(c: u8) -> bool {
            match c {
                b'(' | b')' | b';' | b'@' => false,
                c if c.is_ascii_whitespace() => false,
                _ => true,
            }
        }

        // Skip whitespace, line comments (`; ...`), and nestable block comments (`(; ... ;)`).
        while let Some(c) = self.peek_byte() {
            match c {
                b' ' | b'\t' | b'\n' | b'\r' => self.advance_pos(),
                b';' => {
                    while let Some(c) = self.peek_byte() {
                        match c {
                            b'\n' | b'\r' => break,
                            _ => self.advance_pos(),
                        }
                    }
                }
                b'(' if self.lookahead_byte(1) == Some(b';') => {
                    let pos = self.pos();
                    self.advance_by(2);
                    let mut depth = 1usize;
                    loop {
                        match self.peek_byte() {
                            None => return Err(self.error(pos, "unterminated block comment")),
                            Some(b'(') if self.lookahead_byte(1) == Some(b';') => {
                                self.advance_by(2);
                                depth += 1;
                            }
                            Some(b';') if self.lookahead_byte(1) == Some(b')') => {
                                self.advance_by(2);
                                depth -= 1;
                                if depth == 0 {
                                    break;
                                }
                            }
                            Some(_) => self.advance_pos(),
                        }
                    }
                }
                _ => break,
            }
        }

        let Some(c) = self.peek_byte() else {
            return Ok(None);
        };
        let char_pos = self.pos();
        match c {
            b'(' => {
                self.advance_pos();
                Ok(Some((char_pos, Token::LParen)))
            }
            b')' => {
                self.advance_pos();
                Ok(Some((char_pos, Token::RParen)))
            }
            b'@' => {
                self.advance_pos();
                Ok(Some((char_pos, Token::At)))
            }
            c if is_sym_first_char(c) => {
                let start = self.pos.offset;
                let start_pos = self.pos();
                while let Some(c) = self.peek_byte() {
                    match c {
                        c if is_sym_other_char(c) => self.advance_pos(),
                        _ => break,
                    }
                }
                let end = self.pos.offset;
                let s = &self.src[start..end];
                debug_assert!(!s.is_empty());
                Ok(Some((start_pos, Token::Symbol(s.to_string()))))
            }
            c @ (b'0'..=b'9' | b'-') => {
                let start_pos = self.pos();
                let mut neg = false;
                if c == b'-' {
                    self.advance_pos();
                    neg = true;
                }

                let mut radix = 10;

                // An optional `0x`/`0o`/`0b` prefix selects hexadecimal, octal, or binary.
                match (
                    self.src.as_bytes().get(self.pos.offset),
                    self.src.as_bytes().get(self.pos.offset + 1),
                ) {
                    (Some(b'0'), Some(b'x' | b'X')) => {
                        self.advance_by(2);
                        radix = 16;
                    }
                    (Some(b'0'), Some(b'o' | b'O')) => {
                        self.advance_by(2);
                        radix = 8;
                    }
                    (Some(b'0'), Some(b'b' | b'B')) => {
                        self.advance_by(2);
                        radix = 2;
                    }
                    _ => {}
                }

                // Consume digits (hex letters are accepted regardless of radix; `from_str_radix`
                // rejects any that are out of range) along with `_` separators, which are
                // stripped before parsing.
                let start = self.pos.offset;
                while let Some(c) = self.peek_byte() {
                    match c {
                        b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_' => self.advance_pos(),
                        _ => break,
                    }
                }
                let end = self.pos.offset;
                let s = &self.src[start..end];
                let s = if s.contains('_') {
                    Cow::Owned(s.replace('_', ""))
                } else {
                    Cow::Borrowed(s)
                };

                let num = match u128::from_str_radix(&s, radix) {
                    Ok(num) => num,
                    Err(err) => return Err(self.error(start_pos, err.to_string())),
                };

                // Apply the sign. A magnitude of exactly 2^127 is rejected rather than
                // negated, since negating it would overflow `i128`; other out-of-range
                // magnitudes wrap through the `as i128` cast (see the `integers` test).
                let num = match (neg, num) {
                    (true, 0x8000_0000_0000_0000_0000_0000_0000_0000) => {
                        return Err(self.error(start_pos, "integer literal cannot fit in i128"))
                    }
                    (true, _) => -(num as i128),
                    (false, _) => num as i128,
                };
                let tok = Token::Int(num);

                Ok(Some((start_pos, tok)))
            }
            c => Err(self.error(char_pos, format!("Unexpected character '{}'", c as char))),
        }
    }

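    /// Consumes and returns the next token and its position, or `None` at end of input.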
    pub fn next(&mut self) -> Result<Option<(Pos, Token)>> {
        let tok = self.lookahead.take();
        self.reload()?;
        Ok(tok)
    }

    fn reload(&mut self) -> Result<()> {
        if self.lookahead.is_none() && self.pos.offset < self.src.len() {
            self.lookahead = self.next_token()?;
        }
        Ok(())
    }

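    /// Returns the next token and its position without consuming it.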
    pub fn peek(&self) -> Option<&(Pos, Token)> {
        self.lookahead.as_ref()
    }

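    /// True when no tokens remain.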
    pub fn eof(&self) -> bool {
        self.lookahead.is_none()
    }

    fn peek_byte(&self) -> Option<u8> {
        self.lookahead_byte(0)
    }

    fn lookahead_byte(&self, n: usize) -> Option<u8> {
        self.src.as_bytes().get(self.pos.offset + n).copied()
    }
}

impl Token {
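    /// True if this token is an integer literal.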
    pub fn is_int(&self) -> bool {
        matches!(self, Token::Int(_))
    }

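    /// True if this token is a symbol.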
    pub fn is_sym(&self) -> bool {
        matches!(self, Token::Symbol(_))
    }
}

#[cfg(test)]
mod test {
    use super::*;

    #[track_caller]
    fn lex(src: &str) -> Vec<Token> {
        let mut toks = vec![];
        let mut lexer = Lexer::new(0, src).unwrap();
        while let Some((_, tok)) = lexer.next().unwrap() {
            toks.push(tok);
        }
        toks
    }

    #[test]
    fn lexer_basic() {
        assert_eq!(
            lex(";; comment\n; another\r\n \t(one two three (; block comment ;) 23 (; nested (; block ;) comment ;) -568 )\n"),
            [
                Token::LParen,
                Token::Symbol("one".to_string()),
                Token::Symbol("two".to_string()),
                Token::Symbol("three".to_string()),
                Token::Int(23),
                Token::Int(-568),
                Token::RParen
            ]
        );
    }

    #[test]
    fn ends_with_sym() {
        assert_eq!(lex("asdf"), [Token::Symbol("asdf".to_string())]);
    }

    #[test]
    fn ends_with_num() {
        assert_eq!(lex("23"), [Token::Int(23)]);
    }

    #[test]
    fn weird_syms() {
        assert_eq!(
            lex("(+ [] => !! _test!;comment\n)"),
            [
                Token::LParen,
                Token::Symbol("+".to_string()),
                Token::Symbol("[]".to_string()),
                Token::Symbol("=>".to_string()),
                Token::Symbol("!!".to_string()),
                Token::Symbol("_test!".to_string()),
                Token::RParen,
            ]
        );
    }
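
    // A sketch of extra coverage for the `@` token: it lexes on its own and also
    // terminates a symbol, so "x@y" splits into Symbol, At, Symbol.
    #[test]
    fn at_token() {
        assert_eq!(
            lex("@inst (x @ 4)"),
            [
                Token::At,
                Token::Symbol("inst".to_string()),
                Token::LParen,
                Token::Symbol("x".to_string()),
                Token::At,
                Token::Int(4),
                Token::RParen,
            ]
        );
        assert_eq!(
            lex("x@y"),
            [
                Token::Symbol("x".to_string()),
                Token::At,
                Token::Symbol("y".to_string()),
            ]
        );
    }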

    #[test]
    fn integers() {
        assert_eq!(
            lex("0 1 -1"),
            [Token::Int(0), Token::Int(1), Token::Int(-1)]
        );

        assert_eq!(
            lex("340_282_366_920_938_463_463_374_607_431_768_211_455"),
            [Token::Int(-1)]
        );

        assert_eq!(
            lex("170_141_183_460_469_231_731_687_303_715_884_105_727"),
            [Token::Int(i128::MAX)]
        );

        assert!(Lexer::new(0, "-170_141_183_460_469_231_731_687_303_715_884_105_728").is_err())
    }
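
    // A sketch of extra coverage for the radix prefixes (`0x`/`0o`/`0b`) and `_`
    // digit separators; expected values mirror `u128::from_str_radix`.
    #[test]
    fn integer_radix_prefixes() {
        assert_eq!(
            lex("0x1F 0o17 0b1010 -0x10 0xDEAD_BEEF"),
            [
                Token::Int(0x1F),
                Token::Int(0o17),
                Token::Int(0b1010),
                Token::Int(-0x10),
                Token::Int(0xDEAD_BEEF),
            ]
        );
    }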
}