surrealdb_core/syn/parser/
mod.rs

//! Module implementing the SurrealQL parser.
//!
//! The SurrealQL parser is a relatively simple recursive descent parser.
//! Most of the functions of the SurrealQL parser peek a token from the lexer and then decide
//! which path to take depending on which token is next.
//!
//! # Implementation Details
//!
//! There are a bunch of common patterns for which this module has some convenience functions;
//! a combined usage sketch follows the list below.
//! - Whenever only one token can be next you should use the `expected!` macro. This macro
//!     ensures that the given token type is next and if not returns a parser error.
//! - Whenever a limited set of tokens can be next it is common to match the token kind and then
//!     have a catch-all arm which calls the macro `unexpected!`. This macro will raise a parse
//!     error with information about the type of token it received and what it expected.
//! - If a single token can be optionally next use [`Parser::eat`]. This function returns a bool
//!     depending on whether the given token kind was eaten.
//! - If a closing delimiting token is expected use [`Parser::expect_closing_delimiter`]. This
//!     function will raise an error if the expected delimiter isn't the next token. This error
//!     will also point to which delimiter the parser expected to be closed.
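//!
//! A hedged sketch of how these helpers combine; `parse_pair` and `parse_value` are
//! illustrative stand-ins, not actual parser methods:
//!
//! ```ignore
//! fn parse_pair(&mut self) -> ParseResult<(Value, Value)> {
//! 	// The opening delimiter must be next, otherwise a parse error is returned.
//! 	let open = expected!(self, t!("(")).span;
//! 	let a = self.parse_value()?;
//! 	// A separating comma may optionally be present.
//! 	self.eat(t!(","));
//! 	let b = self.parse_value()?;
//! 	// On failure this error points back at `open` as the delimiter to close.
//! 	self.expect_closing_delimiter(t!(")"), open)?;
//! 	Ok((a, b))
//! }
//! ```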
//!
//! ## Far Token Peek
//!
//! Occasionally the parser needs to check further ahead than peeking allows.
//! This is done with the [`Parser::peek1`] function. This function peeks one token further than
//! peek.
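//!
//! For example, a hedged sketch of choosing between two productions by looking one token past
//! the next one:
//!
//! ```ignore
//! if self.peek().kind == TokenKind::Identifier && self.peek1().kind == t!(":") {
//! 	// an identifier followed by `:`, e.g. the start of a record-id.
//! } else {
//! 	// a plain identifier.
//! }
//! ```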
//!
//! ## WhiteSpace Tokens
//!
//! The lexer produces whitespace tokens. These are tokens which are normally ignored in most
//! places in the syntax as they have no bearing on the meaning of a statement. [`Parser::next`]
//! and [`Parser::peek`] automatically skip over any whitespace tokens. However in some places,
//! like in a record-id and when gluing tokens, these whitespace tokens are required for correct
//! parsing. In those cases the function [`Parser::next_whitespace`] and the other `_whitespace`
//! functions are used. These functions don't skip whitespace tokens. However they do not undo
//! whitespace tokens which might already have been skipped. Implementers must be careful not to
//! call a function which requires whitespace tokens when they may already have been skipped.
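//!
//! An illustrative sketch: in a record-id like `foo:1` no whitespace may separate the parts,
//! so the whitespace-sensitive functions are used:
//!
//! ```ignore
//! // `peek` would silently skip a whitespace token here; `peek_whitespace` does not.
//! let peek = self.peek_whitespace();
//! if peek.kind == TokenKind::WhiteSpace {
//! 	// Whitespace means the following token cannot be part of the record-id.
//! }
//! ```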
//!
//! ## Compound tokens and token gluing
//!
//! SurrealQL has a bunch of tokens which have complex rules for when they are allowed and the
//! value they contain. Such tokens are named compound tokens, and examples include a javascript
//! body, strand-like tokens, regex, numbers, etc.
//!
//! These tokens need to be manually requested from the lexer with the [`Lexer::lex_compound`]
//! function.
//!
//! This manual requesting of tokens leads to a problem when used in conjunction with peeking.
//! Take for instance the production `{ "foo": "bar"}`. `"foo"` is a compound token, so when
//! initially encountered the lexer only returns a `"` token and that token then needs to be
//! collected into the full strand token. However the parser needs to figure out if we are
//! parsing an object or a block, so it needs to look past the compound token to see if the next
//! token is `:`. This is where gluing comes in. Calling `Parser::glue` checks if the next token
//! could start a compound token and combines them into a single token. This can only be done in
//! places where we know that if we encounter a leading token of a compound token it will result
//! in the 'default' compound token.
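//!
//! A hedged sketch of the object-versus-block decision described above:
//!
//! ```ignore
//! // After `{`, glue so that a leading `"` becomes a full strand token, then peek past it.
//! self.glue()?;
//! if self.peek1().kind == t!(":") {
//! 	// `{ "foo": ... }`, parse an object.
//! } else {
//! 	// `{ ... }`, parse a block.
//! }
//! ```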

use self::token_buffer::TokenBuffer;
use crate::{
	sql::{self, Datetime, Duration, Strand, Uuid},
	syn::{
		error::{bail, SyntaxError},
		lexer::{compound::NumberKind, Lexer},
		token::{t, Span, Token, TokenKind},
	},
};
use bytes::BytesMut;
use reblessive::{Stack, Stk};

mod basic;
mod builtin;
mod expression;
mod function;
mod glue;
mod idiom;
mod json;
mod kind;
pub(crate) mod mac;
mod object;
mod prime;
mod stmt;
mod thing;
mod token;
mod token_buffer;

pub(crate) use mac::{enter_object_recursion, enter_query_recursion, unexpected};

use super::error::{syntax_error, RenderedError};

#[cfg(test)]
pub mod test;

/// The result returned by most parser functions.
pub type ParseResult<T> = Result<T, SyntaxError>;

/// A result of trying to parse a possibly partial query.
#[derive(Debug)]
#[non_exhaustive]
pub enum PartialResult<T> {
	/// The parser needs more data before it can produce a value.
	MoreData,
	/// Parsing the source produced no reasonable value.
	Empty {
		used: usize,
	},
	/// Parsing produced a value.
	Ok {
		value: T,
		used: usize,
	},
	/// Parsing failed with a syntax error.
	Err {
		err: SyntaxError,
		used: usize,
	},
}

#[derive(Default)]
pub enum GluedValue {
	Duration(Duration),
	Datetime(Datetime),
	Uuid(Uuid),
	Number(NumberKind),
	Strand(Strand),
	#[default]
	None,
}

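/// Settings controlling the behaviour of the [`Parser`].
///
/// A minimal sketch of overriding a single setting while keeping the rest at their defaults,
/// using struct-update syntax over the [`Default`] implementation:
///
/// ```ignore
/// let settings = ParserSettings {
/// 	legacy_strands: true,
/// 	..Default::default()
/// };
/// let parser = Parser::new_with_settings(b"SELECT * FROM person", settings);
/// ```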
#[derive(Clone, Debug)]
pub struct ParserSettings {
	/// Parse strands like the old parser, where a strand which looks like a UUID, record-id, or
	/// datetime will be parsed as the corresponding value.
	pub legacy_strands: bool,
	/// Set whether to allow record-ids which don't adhere to regular ident rules.
	/// Setting this to true will allow parsing of, for example, `foo:0bar`. This would be
	/// rejected by normal identifier rules as most identifiers can't start with a number.
	pub flexible_record_id: bool,
	/// Disallow a query from having objects nested deeper than the limit.
	/// Arrays also count towards object depth, so `[{foo: [] }]` is 3 deep.
	pub object_recursion_limit: usize,
	/// Disallow a query from being deeper than the given limit.
	/// A query recurses when a statement contains another statement within itself,
	/// for example in subqueries and in blocks like block statements and if statements.
	pub query_recursion_limit: usize,
	/// Whether record references are enabled.
	pub references_enabled: bool,
	/// Whether bearer access is enabled.
	pub bearer_access_enabled: bool,
	/// Whether the define api statement is enabled.
	pub define_api_enabled: bool,
}

impl Default for ParserSettings {
	fn default() -> Self {
		ParserSettings {
			legacy_strands: false,
			flexible_record_id: true,
			object_recursion_limit: 100,
			query_recursion_limit: 20,
			references_enabled: false,
			bearer_access_enabled: false,
			define_api_enabled: false,
		}
	}
}

/// The SurrealQL parser.
pub struct Parser<'a> {
	lexer: Lexer<'a>,
	last_span: Span,
	token_buffer: TokenBuffer<4>,
	glued_value: GluedValue,
	pub(crate) table_as_field: bool,
	settings: ParserSettings,
}

impl<'a> Parser<'a> {
	/// Create a new parser from a given source.
	pub fn new(source: &'a [u8]) -> Self {
		Parser::new_with_settings(source, ParserSettings::default())
	}

	/// Create a new parser from a given source with the given settings.
	pub fn new_with_settings(source: &'a [u8], settings: ParserSettings) -> Self {
		Parser {
			lexer: Lexer::new(source),
			last_span: Span::empty(),
			token_buffer: TokenBuffer::new(),
			glued_value: GluedValue::None,
			table_as_field: true,
			settings,
		}
	}

	/// Replace the parser's settings.
	pub fn with_settings(mut self, settings: ParserSettings) -> Self {
		self.settings = settings;
		self
	}

	/// Returns the next token and advances the parser one token forward.
	#[allow(clippy::should_implement_trait)]
	pub fn next(&mut self) -> Token {
		let res = loop {
			let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
			if res.kind != TokenKind::WhiteSpace {
				break res;
			}
		};
		self.last_span = res.span;
		res
	}

	/// Returns the next token and advances the parser one token forward.
	///
	/// This function is like `next` but also returns whitespace tokens, which are normally
	/// skipped.
	#[allow(clippy::should_implement_trait)]
	pub fn next_whitespace(&mut self) -> Token {
		let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
		self.last_span = res.span;
		res
	}

	/// Returns whether there is a token in the token buffer, meaning that a token was peeked.
	pub fn has_peek(&self) -> bool {
		!self.token_buffer.is_empty()
	}

	/// Consume the current peeked value and advance the parser one token forward.
	///
	/// Should only be called after peeking a value.
	pub fn pop_peek(&mut self) -> Token {
		let res = self.token_buffer.pop().unwrap();
		self.last_span = res.span;
		res
	}

	/// Returns the next token without consuming it.
	pub fn peek(&mut self) -> Token {
		loop {
			let Some(x) = self.token_buffer.first() else {
				let res = loop {
					let res = self.lexer.next_token();
					if res.kind != TokenKind::WhiteSpace {
						break res;
					}
				};
				self.token_buffer.push(res);
				return res;
			};
			if x.kind == TokenKind::WhiteSpace {
				self.token_buffer.pop();
				continue;
			}
			break x;
		}
	}

	/// Returns the next token without consuming it.
	///
	/// This function is like `peek` but also returns whitespace tokens, which are normally
	/// skipped. It does not undo tokens skipped in a previous normal peek.
	pub fn peek_whitespace(&mut self) -> Token {
		let Some(x) = self.token_buffer.first() else {
			let res = self.lexer.next_token();
			self.token_buffer.push(res);
			return res;
		};
		x
	}

	/// Return the token kind of the next token without consuming it.
	pub fn peek_kind(&mut self) -> TokenKind {
		self.peek().kind
	}

	/// Returns the n'th upcoming token without consuming it.
	/// `peek_token_at(0)` is equivalent to `peek`.
	pub(crate) fn peek_token_at(&mut self, at: u8) -> Token {
		for _ in self.token_buffer.len()..=at {
			let r = loop {
				let r = self.lexer.next_token();
				if r.kind != TokenKind::WhiteSpace {
					break r;
				}
			};
			self.token_buffer.push(r);
		}
		self.token_buffer.at(at).unwrap()
	}

	/// Returns the token after the next token without consuming either.
	pub fn peek1(&mut self) -> Token {
		self.peek_token_at(1)
	}

	/// Returns the n'th upcoming token without consuming it, including whitespace tokens.
	/// `peek_whitespace_token_at(0)` is equivalent to `peek_whitespace`.
	pub fn peek_whitespace_token_at(&mut self, at: u8) -> Token {
		for _ in self.token_buffer.len()..=at {
			let r = self.lexer.next_token();
			self.token_buffer.push(r);
		}
		self.token_buffer.at(at).unwrap()
	}

	/// Returns the token after the next token, including whitespace tokens.
	pub fn peek_whitespace1(&mut self) -> Token {
		self.peek_whitespace_token_at(1)
	}

	/// Returns the span of the next token if it was already peeked, otherwise returns the span
	/// of the last consumed token.
	pub fn recent_span(&mut self) -> Span {
		self.token_buffer.first().map(|x| x.span).unwrap_or(self.last_span)
	}

	/// Returns the span of the last consumed token.
	pub fn last_span(&mut self) -> Span {
		self.last_span
	}

	/// Returns an error if the parser has not yet consumed all tokens in the source.
	pub fn assert_finished(&mut self) -> ParseResult<()> {
		let p = self.peek();
		if p.kind != TokenKind::Eof {
			bail!("Unexpected token `{}`, expected no more tokens", p.kind, @p.span);
		}
		Ok(())
	}

	/// Eat the next token if it is of the given kind.
	/// Returns whether a token was eaten.
	pub fn eat(&mut self, token: TokenKind) -> bool {
		let peek = self.peek();
		if token == peek.kind {
			self.token_buffer.pop();
			self.last_span = peek.span;
			true
		} else {
			false
		}
	}

	/// Eat the next token if it is of the given kind.
	/// Returns whether a token was eaten.
	///
	/// Unlike [`Parser::eat`] this doesn't skip whitespace tokens.
	pub fn eat_whitespace(&mut self, token: TokenKind) -> bool {
		let peek = self.peek_whitespace();
		if token == peek.kind {
			self.token_buffer.pop();
			self.last_span = peek.span;
			true
		} else {
			false
		}
	}

	/// Forces the next token to be the given one.
	/// Used in token gluing to replace the current token with the glued token.
	fn prepend_token(&mut self, token: Token) {
		self.token_buffer.push_front(token);
	}

	/// Checks if the next token is of the given kind. If it isn't, returns an unclosed-delimiter
	/// error.
	fn expect_closing_delimiter(&mut self, kind: TokenKind, should_close: Span) -> ParseResult<()> {
		let peek = self.peek();
		if peek.kind != kind {
			bail!("Unexpected token `{}`, expected delimiter `{kind}`",
				peek.kind,
				@self.recent_span(),
				@should_close => "expected this delimiter to close"
			);
		}
		self.pop_peek();
		Ok(())
	}

	/// Recover the parser state to after a given span.
	pub fn backup_after(&mut self, span: Span) {
		self.token_buffer.clear();
		self.lexer.backup_after(span);
	}

	/// Parse a full query.
	///
	/// This is the primary entry point of the parser.
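	///
	/// A minimal usage sketch (hedged; error handling elided). The `reblessive` stack drives
	/// the recursive parser, mirroring how [`StatementStream`] invokes it below:
	///
	/// ```ignore
	/// let mut parser = Parser::new(b"SELECT * FROM person;");
	/// let mut stack = reblessive::Stack::new();
	/// let query = stack.enter(|ctx| parser.parse_query(ctx)).finish()?;
	/// ```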
	pub async fn parse_query(&mut self, ctx: &mut Stk) -> ParseResult<sql::Query> {
		let statements = self.parse_stmt_list(ctx).await?;
		Ok(sql::Query(statements))
	}

	/// Parse a single statement.
	pub async fn parse_statement(&mut self, ctx: &mut Stk) -> ParseResult<sql::Statement> {
		self.parse_stmt(ctx).await
	}
}

/// A struct which can parse a query statement by statement.
pub struct StatementStream {
	stack: Stack,
	settings: ParserSettings,
	col_offset: usize,
	line_offset: usize,
}

impl StatementStream {
	#[allow(clippy::new_without_default)]
	pub fn new() -> Self {
		Self::new_with_settings(ParserSettings::default())
	}

	pub fn new_with_settings(settings: ParserSettings) -> Self {
		StatementStream {
			stack: Stack::new(),
			settings,
			col_offset: 0,
			line_offset: 0,
		}
	}

	/// Updates the line and column offset after consuming bytes.
	fn accumulate_line_col(&mut self, bytes: &[u8]) {
		// The parser should have ensured that bytes is a valid utf-8 string.
		// TODO: Maybe change this to an unsafe cast once we have more confidence in the parser's
		// correctness.
		let (line_num, remaining) =
			std::str::from_utf8(bytes).unwrap().lines().enumerate().last().unwrap_or((0, ""));

		self.line_offset += line_num;
		if line_num > 0 {
			self.col_offset = 0;
		}
		self.col_offset += remaining.chars().count();
	}

	/// Parses a statement if the buffer contains sufficient data to parse one.
	///
	/// When it has done so it will remove the read bytes from the buffer and return
	/// `Ok(Some(_))`. In case of a parsing error it will return `Err(_)`; this does not consume
	/// data.
	///
	/// If the function returns `Ok(None)`, not enough data was in the buffer to fully parse a
	/// statement. The function might still consume data from the buffer, such as whitespace
	/// between statements, even when `None` is returned.
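	///
	/// A hedged usage sketch; `read_chunk` and `handle` are hypothetical stand-ins for the
	/// caller's I/O and statement handling:
	///
	/// ```ignore
	/// let mut stream = StatementStream::new();
	/// let mut buffer = BytesMut::new();
	/// while let Some(chunk) = read_chunk() {
	/// 	buffer.extend_from_slice(&chunk);
	/// 	// Drain as many complete statements as the buffer currently holds.
	/// 	while let Some(stmt) = stream.parse_partial(&mut buffer)? {
	/// 		handle(stmt);
	/// 	}
	/// }
	/// // Once the input is exhausted, parse whatever remains.
	/// while let Some(stmt) = stream.parse_complete(&mut buffer)? {
	/// 	handle(stmt);
	/// }
	/// ```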
	pub fn parse_partial(
		&mut self,
		buffer: &mut BytesMut,
	) -> Result<Option<sql::Statement>, RenderedError> {
		let mut slice = &**buffer;
		if slice.len() > u32::MAX as usize {
			// limit slice length.
			slice = &slice[..u32::MAX as usize];
		}

		let mut parser = Parser::new_with_settings(slice, self.settings.clone());

		// eat empty statements.
		while parser.eat(t!(";")) {}

		if parser.peek().span.offset != 0 && buffer.len() > u32::MAX as usize {
			// we ate some empty statements, so in order to ensure we can parse a full statement
			// of 4GB we need to recreate the parser with the already-consumed bytes removed.
			let eaten = buffer.split_to(parser.peek().span.offset as usize);
			self.accumulate_line_col(&eaten);
			slice = &**buffer;
			if slice.len() > u32::MAX as usize {
				// limit slice length.
				slice = &slice[..u32::MAX as usize];
			}
			parser = Parser::new_with_settings(slice, self.settings.clone())
		}

		// test if the buffer is now empty, which would cause the parse_statement function to fail.
		if parser.peek().is_eof() {
			return Ok(None);
		}

		let res = self.stack.enter(|ctx| parser.parse_statement(ctx)).finish();
		if parser.peek().is_eof() {
			if buffer.len() > u32::MAX as usize {
				let error = syntax_error!("Cannot parse query, statement exceeded maximum size of 4GB", @parser.last_span());
				return Err(error
					.render_on_bytes(buffer)
					.offset_location(self.line_offset, self.col_offset));
			}

			// finished on an eof token.
			// We can't know if this is an actual result, or if it would change when more data
			// is available.
			return Ok(None);
		}

		// we need a trailing semicolon.
		if !parser.eat(t!(";")) {
			let peek = parser.next();

			if parser.peek1().is_eof() {
				return Ok(None);
			}

			if let Err(e) = res {
				return Err(e
					.render_on_bytes(slice)
					.offset_location(self.line_offset, self.col_offset));
			}

			let error = syntax_error!("Unexpected token `{}` expected the query to end.", peek.kind.as_str(),
				@peek.span => "maybe forgot a semicolon after the previous statement?");
			return Err(error
				.render_on_bytes(slice)
				.offset_location(self.line_offset, self.col_offset));
		}

		// Eat possible empty statements.
		while parser.eat(t!(";")) {}

		let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
		let res = res.map(Some).map_err(|e| {
			e.render_on_bytes(&eaten).offset_location(self.line_offset, self.col_offset)
		});
		self.accumulate_line_col(&eaten);
		res
	}

	/// Parse remaining statements once the buffer is complete.
	pub fn parse_complete(
		&mut self,
		buffer: &mut BytesMut,
	) -> Result<Option<sql::Statement>, RenderedError> {
		let mut slice = &**buffer;
		if slice.len() > u32::MAX as usize {
			// limit slice length.
			slice = &slice[..u32::MAX as usize];
		}

		let mut parser = Parser::new_with_settings(slice, self.settings.clone());
		// eat empty statements.
		while parser.eat(t!(";")) {}

		if parser.peek().is_eof() {
			// There were no statements in the buffer; clear any bytes that were consumed.
			buffer.clear();
			return Ok(None);
		}

		match self.stack.enter(|ctx| parser.parse_statement(ctx)).finish() {
			Ok(x) => {
				if !parser.peek().is_eof() && !parser.eat(t!(";")) {
					let peek = parser.peek();
					let error = syntax_error!("Unexpected token `{}` expected the query to end.", peek.kind.as_str(),
						@peek.span => "maybe forgot a semicolon after the previous statement?");
					return Err(error
						.render_on_bytes(slice)
						.offset_location(self.line_offset, self.col_offset));
				}

				let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
				self.accumulate_line_col(&eaten);
				Ok(Some(x))
			}
			Err(e) => {
				Err(e.render_on_bytes(slice).offset_location(self.line_offset, self.col_offset))
			}
		}
	}
}