surrealdb_core/syn/lexer/mod.rs

mod byte;
mod char;
pub mod compound;
mod ident;
pub mod keywords;
mod reader;
mod unicode;

#[cfg(test)]
mod test;

pub use reader::{BytesReader, CharError};

use crate::syn::{
	error::{bail, SyntaxError},
	token::{Span, Token, TokenKind},
};

/// The SurrealQL lexer.
/// Takes a slice of bytes and turns it into tokens. The lexer is designed with possibly invalid
/// UTF-8 in mind and will handle bytes which are not valid UTF-8 by producing an error.
///
/// The lexer generates tokens lazily. Whenever [`Lexer::next_token`] is called on the lexer it
/// will try to lex the next bytes in the given source as a token. The lexer always returns a
/// token, even if the source contains invalid tokens or the lexer is at the end of the source. In
/// both cases a specific type of token is returned.
///
/// Note that SurrealQL syntax cannot be lexed fully in advance. For example, record strings and
/// regexes cannot be lexed correctly without knowledge of previous tokens as they are both
/// ambiguous with other tokens.
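///
/// A minimal usage sketch (illustrative only; the source bytes and token handling are
/// assumptions, not tested output):
///
/// ```ignore
/// let mut lexer = Lexer::new(b"SELECT * FROM person");
/// loop {
///     let token = lexer.next_token();
///     if token.is_eof() {
///         break;
///     }
///     // A token only carries a `TokenKind` and a `Span`; any associated string
///     // value lives in `lexer.string` until the next token overwrites it.
/// }
/// ```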
#[non_exhaustive]
pub struct Lexer<'a> {
	/// The reader for reading the source bytes.
	pub(super) reader: BytesReader<'a>,
	/// The offset one past the last character of the previous token.
	last_offset: u32,
	/// A buffer used to build the value of tokens which can't be read straight from the source,
	/// for example strings with escape characters.
	scratch: String,

	// Below is a collection of storage for values produced by tokens.
	// For performance reasons we want to keep the tokens as small as possible.
	// As only some tokens have an additional value associated with them we don't store that value
	// in the token itself but, instead, in the lexer, ensuring a smaller size for each individual
	// token.
	//
	// This does result in some additional state to keep track of, as peeking a token while a token
	// value is still in the variables below will overwrite the previous value.
	//
	// Both numbers and actual strings are stored as string values.
	// The parser can, depending on the position in the syntax, decide to parse a number in a
	// variety of different precisions or formats. The only way to support them all is to delay
	// parsing the actual number value until the parser can decide on a format.
	pub(super) string: Option<String>,
	pub(super) error: Option<SyntaxError>,
}

impl<'a> Lexer<'a> {
	/// Create a new lexer.
	///
	/// # Panics
	/// This function will panic if the source is longer than `u32::MAX` bytes.
	pub fn new(source: &'a [u8]) -> Lexer<'a> {
		let reader = BytesReader::new(source);
		assert!(reader.len() <= u32::MAX as usize, "source code exceeded maximum size");
		Lexer {
			reader,
			last_offset: 0,
			scratch: String::new(),
			string: None,
			error: None,
		}
	}

	/// Reset the state of the lexer.
	///
	/// Doesn't change the state of the reader.
	pub fn reset(&mut self) {
		self.last_offset = 0;
		self.scratch.clear();
		self.string = None;
		self.error = None;
	}

	/// Change the source used by the lexer to a new buffer.
	///
	/// Useful for reusing buffers.
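	///
	/// A minimal sketch of reusing a lexer for a second source (the source bytes are
	/// assumptions):
	///
	/// ```ignore
	/// let lexer = Lexer::new(b"RETURN 1");
	/// // ...lex the first source to completion...
	/// let mut lexer = lexer.change_source(b"RETURN 2");
	/// // The scratch buffer allocation is carried over to the new lexer.
	/// ```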
	///
	/// # Panics
	/// This function will panic if the source is longer than `u32::MAX` bytes.
	pub fn change_source<'b>(self, source: &'b [u8]) -> Lexer<'b> {
		let reader = BytesReader::<'b>::new(source);
		assert!(reader.len() <= u32::MAX as usize, "source code exceeded maximum size");
		Lexer {
			reader,
			last_offset: 0,
			scratch: self.scratch,
			string: self.string,
			error: self.error,
		}
	}

	/// Returns the next token, driving the lexer forward.
	///
	/// If the lexer is at the end of the source it will always return the EOF token.
	pub fn next_token(&mut self) -> Token {
		let Some(byte) = self.reader.next() else {
			return self.eof_token();
		};
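		// ASCII bytes are dispatched to a dedicated path; any non-ASCII byte
		// goes through full character decoding.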
		if byte.is_ascii() {
			self.lex_ascii(byte)
		} else {
			self.lex_char(byte)
		}
	}

	/// Creates the EOF token.
	///
	/// An EOF token has kind [`TokenKind::Eof`] and a zero-length span which points just past the
	/// last character of the source.
	fn eof_token(&mut self) -> Token {
		Token {
			kind: TokenKind::Eof,
			span: Span {
				offset: self.last_offset,
				len: 0,
			},
		}
	}

	/// Stores the given error and returns an invalid token.
	fn invalid_token(&mut self, error: SyntaxError) -> Token {
		self.error = Some(error);
		self.finish_token(TokenKind::Invalid)
	}

	/// Returns the span for the current token being lexed.
	pub(crate) fn current_span(&self) -> Span {
		// We make sure that the source is no longer than u32::MAX so this can't overflow.
		let new_offset = self.reader.offset() as u32;
		let len = new_offset - self.last_offset;
		Span {
			offset: self.last_offset,
			len,
		}
	}

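	/// Returns a span covering the source from the given offset up to the current reader
	/// position.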
	pub(crate) fn span_since(&self, offset: usize) -> Span {
		let new_offset = self.reader.offset() as u32;
		let len = new_offset - offset as u32;
		Span {
			offset: offset as u32,
			len,
		}
	}

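	/// Returns the current span and advances `last_offset` to the current reader position,
	/// marking the start of the next token.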
	fn advance_span(&mut self) -> Span {
		let span = self.current_span();
		self.last_offset = self.reader.offset() as u32;
		span
	}

	/// Builds a token from a [`TokenKind`].
	///
	/// Attaches the current span to the token, advances the token start offset, and returns the
	/// token.
	fn finish_token(&mut self, kind: TokenKind) -> Token {
		Token {
			kind,
			span: self.advance_span(),
		}
	}

	/// Moves the lexer state back to before the given span.
	///
	/// # Warning
	/// Moving the lexer into a state where the next byte is within a multibyte character will
	/// result in spurious errors.
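	///
	/// A sketch of the intended backtracking pattern (the surrounding parser logic is an
	/// assumption):
	///
	/// ```ignore
	/// let token = lexer.next_token();
	/// // The parser decides the token must be re-lexed differently, so it
	/// // rewinds the lexer to the token's start:
	/// lexer.backup_before(token.span);
	/// ```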
	pub(crate) fn backup_before(&mut self, span: Span) {
		self.reader.backup(span.offset as usize);
		self.last_offset = span.offset;
	}

	/// Moves the lexer state to after the given span.
	///
	/// # Warning
	/// Moving the lexer into a state where the next byte is within a multibyte character will
	/// result in spurious errors.
	pub(crate) fn backup_after(&mut self, span: Span) {
		let offset = span.offset + span.len;
		self.reader.backup(offset as usize);
		self.last_offset = offset;
	}

	/// Checks if the next byte is the given byte; if so, consumes it and returns true.
	/// Otherwise returns false.
	///
	/// Also returns false if there is no next byte.
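	///
	/// A minimal sketch (the surrounding lexing context is an assumption):
	///
	/// ```ignore
	/// // Consume an optional leading `-` sign in front of a number.
	/// let negative = lexer.eat(b'-');
	/// ```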
	fn eat(&mut self, byte: u8) -> bool {
		if self.reader.peek() == Some(byte) {
			self.reader.next();
			true
		} else {
			false
		}
	}

	/// Checks if the closure returns true when given the next byte; if so, consumes the byte
	/// and returns true. Otherwise returns false.
	///
	/// Also returns false if there is no next byte.
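	///
	/// A minimal sketch (the surrounding lexing context is an assumption):
	///
	/// ```ignore
	/// // Consume a single ASCII digit if one follows.
	/// let got_digit = lexer.eat_when(|b| b.is_ascii_digit());
	/// ```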
	fn eat_when<F: FnOnce(u8) -> bool>(&mut self, f: F) -> bool {
		let Some(x) = self.reader.peek() else {
			return false;
		};
		if f(x) {
			self.reader.next();
			true
		} else {
			false
		}
	}

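	/// Checks that the next character is `c`, consuming it if so.
	///
	/// Returns an error if the next character is a different character or if the source is at
	/// its end.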
	fn expect(&mut self, c: char) -> Result<(), SyntaxError> {
		match self.reader.peek() {
			Some(x) => {
				let offset = self.reader.offset() as u32;
				self.reader.next();
				let char = self.reader.convert_to_char(x)?;
				if char == c {
					return Ok(());
				}
				let len = self.reader.offset() as u32 - offset;
				bail!(
					"Unexpected character `{char}`, expected `{c}`",
					@Span {
						offset,
						len
					}
				)
			}
			None => {
				bail!("Unexpected end of file, expected character `{c}`", @self.current_span())
			}
		}
	}

	/// Returns the string for a given span of the source.
	///
	/// Will panic if the given span is not valid for the source or the spanned bytes are not
	/// valid UTF-8.
	pub fn span_str(&self, span: Span) -> &'a str {
		std::str::from_utf8(self.span_bytes(span)).expect("invalid span segment for source")
	}

	/// Returns the bytes for a given span of the source.
	///
	/// Will panic if the given span is not valid for the source.
	pub fn span_bytes(&self, span: Span) -> &'a [u8] {
		self.reader.span(span)
	}

	/// Returns an error if not all bytes were consumed.
	pub fn assert_finished(&self) -> Result<(), SyntaxError> {
		if !self.reader.is_empty() {
			let offset = self.reader.offset() as u32;
			let len = self.reader.remaining().len() as u32;
			let span = Span {
				offset,
				len,
			};
			bail!("Trailing characters", @span)
		}
		Ok(())
	}
}

impl Iterator for Lexer<'_> {
	type Item = Token;

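	// The iterator yields tokens until the lexer produces the EOF token, which
	// is mapped to `None`.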
	fn next(&mut self) -> Option<Self::Item> {
		let token = self.next_token();
		if token.is_eof() {
			return None;
		}
		Some(token)
	}
}