surrealdb_core/syn/parser/mod.rs

//! Module implementing the SurrealQL parser.
//!
//! The SurrealQL parser is a relatively simple recursive descent parser.
//! Most of the functions of the SurrealQL parser peek a token from the lexer and then decide
//! which path to take depending on which token is next.
//!
//! # Implementation Details
//!
//! There are a bunch of common patterns for which this module has some convenience functions,
//! combined in the sketch after this list.
//! - Whenever only one token can be next you should use the `expected!` macro. This macro
//!     ensures that the given token type is next and if not returns a parser error.
//! - Whenever a limited set of tokens can be next it is common to match the token kind and then
//!     have a catch-all arm which calls the macro `unexpected!`. This macro will raise a parse
//!     error with information about the type of token it received and what it expected.
//! - If a single token can optionally be next, use [`Parser::eat`]. This function returns a bool
//!     indicating whether the given token kind was eaten.
//! - If a closing delimiter token is expected, use `Parser::expect_closing_delimiter`. This
//!     function will raise an error if the expected delimiter isn't the next token. This error
//!     will also point to the opening delimiter which the parser expected to be closed.
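//!
//! As an illustrative sketch, a hypothetical production combining these helpers could look as
//! follows (simplified; `parse_pair` is not an actual function from this module):
//!
//! ```ignore
//! fn parse_pair(&mut self) -> ParseResult<()> {
//!     // only `(` can start this production, so use `expected!`.
//!     let open = expected!(self, t!("(")).span;
//!     // a limited set of tokens can be next: match the kind and let the
//!     // catch-all arm raise the error with `unexpected!`.
//!     let next = self.next();
//!     match next.kind {
//!         t!("true") | t!("false") => {}
//!         _ => unexpected!(self, next, "a boolean"),
//!     }
//!     // a separating token may optionally follow.
//!     self.eat(t!(","));
//!     // require the matching `)`, pointing the error at the opening `(`.
//!     self.expect_closing_delimiter(t!(")"), open)?;
//!     Ok(())
//! }
//! ```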
//!
//! ## Far Token Peek
//!
//! Occasionally the parser needs to check further ahead than a single peek allows.
//! This is done with the [`Parser::peek1`] function. This function peeks one token further
//! than peek.
//!
//! ## WhiteSpace Tokens
//!
//! The lexer produces whitespace tokens. These are tokens which are normally ignored in most
//! places in the syntax, as they have no bearing on the meaning of a statement. [`Parser::next`]
//! and [`Parser::peek`] automatically skip over any whitespace tokens. However in some places,
//! like in a record-id and when gluing tokens, these whitespace tokens are required for correct
//! parsing. In those cases the function [`Parser::next_whitespace`] and others suffixed with
//! `_whitespace` are used. These functions don't skip whitespace tokens. However, these
//! functions do not undo whitespace tokens which might already have been skipped. Implementers
//! must be careful not to call a function which requires whitespace tokens when they may
//! already have been skipped; a short sketch follows below.
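//!
//! A hypothetical sketch of such a whitespace-sensitive spot (the exact record-id rules live in
//! the `thing` submodule):
//!
//! ```ignore
//! // `person:tobie` must parse as a single record-id, and whitespace before
//! // the `:` changes how the input parses, so the whitespace-preserving peek
//! // is required here.
//! if self.peek_whitespace().kind == t!(":") {
//!     // parse the record-id part which directly follows the identifier.
//! }
//! ```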
//!
//! ## Compound tokens and token gluing
//!
//! SurrealQL has a bunch of tokens which have complex rules for when they are allowed and for
//! the value they contain. Such tokens are called compound tokens, and examples include a
//! javascript body, strand-like tokens, regexes, numbers, etc.
//!
//! These tokens need to be manually requested from the lexer with the [`Lexer::lex_compound`]
//! function.
//!
//! This manual requesting of tokens leads to a problem when used in conjunction with peeking.
//! Take for instance the production `{ "foo": "bar"}`. `"foo"` is a compound token, so when it
//! is initially encountered the lexer only returns a `"` token, which then needs to be collected
//! into the full strand token. However the parser needs to figure out whether we are parsing an
//! object or a block, so it needs to look past the compound token to see if the next token is
//! `:`. This is where gluing comes in. Calling `Parser::glue` checks if the next token could
//! start a compound token and, if so, combines the pieces into a single token. This can only be
//! done in places where we know that, if we encounter the leading token of a compound token, it
//! will result in the 'default' compound token.
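//!
//! An illustrative sketch of that disambiguation (simplified; not the exact code from the
//! `prime` or `object` submodules):
//!
//! ```ignore
//! // after `{`, glue first so a leading `"` becomes a full strand token,
//! // then look past it: a `:` means we are parsing an object.
//! self.glue()?;
//! if self.peek1().kind == t!(":") {
//!     // `{ "foo": .. }` is an object.
//! } else {
//!     // otherwise `{ .. }` is a block.
//! }
//! ```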

use self::token_buffer::TokenBuffer;
use crate::{
	sql::{self, Datetime, Duration, Strand, Uuid},
	syn::{
		error::{bail, SyntaxError},
		lexer::{compound::NumberKind, Lexer},
		token::{t, Span, Token, TokenKind},
	},
};
use bytes::BytesMut;
use reblessive::{Stack, Stk};

mod basic;
mod builtin;
mod expression;
mod function;
mod glue;
mod idiom;
mod json;
mod kind;
pub(crate) mod mac;
mod object;
mod prime;
mod stmt;
mod thing;
mod token;
mod token_buffer;

pub(crate) use mac::{enter_object_recursion, enter_query_recursion, unexpected};

use super::error::{syntax_error, RenderedError};

#[cfg(test)]
pub mod test;

/// The result returned by most parser functions.
pub type ParseResult<T> = Result<T, SyntaxError>;

/// A result of trying to parse a possibly partial query.
#[derive(Debug)]
#[non_exhaustive]
pub enum PartialResult<T> {
	/// The parser needs more data before it can produce a meaningful result.
	MoreData,
	/// Parsing the source produced no reasonable value.
	Empty {
		used: usize,
	},
	/// Parsing succeeded; `used` indicates how much of the input was consumed.
	Ok {
		value: T,
		used: usize,
	},
	/// Parsing failed; `used` indicates how much of the input was consumed.
	Err {
		err: SyntaxError,
		used: usize,
	},
}

/// The stored value of a token which was glued into a compound token, kept on the parser until
/// the glued token is consumed.
#[derive(Default)]
pub enum GluedValue {
	Duration(Duration),
	Datetime(Datetime),
	Uuid(Uuid),
	Number(NumberKind),
	Strand(Strand),
	#[default]
	None,
}

#[derive(Clone, Debug)]
pub struct ParserSettings {
	/// Parse strands like the old parser, where a strand which looks like a UUID, record-id, or
	/// datetime will be parsed as that type instead of as a strand.
	pub legacy_strands: bool,
	/// Set whether to allow record-ids which don't adhere to regular ident rules.
	/// Setting this to true will allow parsing of, for example, `foo:0bar`. This would be rejected
	/// by normal identifier rules as most identifiers can't start with a number.
	pub flexible_record_id: bool,
	/// Disallow a query from having objects nested deeper than the given limit.
	/// Arrays also count towards objects. So `[{foo: [] }]` would be 3 deep.
	pub object_recursion_limit: usize,
	/// Disallow a query from recursing deeper than the given limit.
	/// A query recurses when a statement contains another statement within itself.
	/// Examples are subqueries and blocks, like block statements and if statements and such.
	pub query_recursion_limit: usize,
	/// Whether record references are enabled.
	pub references_enabled: bool,
	/// Whether bearer access is enabled
	pub bearer_access_enabled: bool,
}

impl Default for ParserSettings {
	fn default() -> Self {
		ParserSettings {
			legacy_strands: false,
			flexible_record_id: true,
			object_recursion_limit: 100,
			query_recursion_limit: 20,
			references_enabled: false,
			bearer_access_enabled: false,
		}
	}
}

/// The SurrealQL parser.
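///
/// A construction sketch (the query text and settings values are illustrative):
///
/// ```ignore
/// let settings = ParserSettings {
///     object_recursion_limit: 32,
///     ..Default::default()
/// };
/// let mut parser = Parser::new_with_settings(b"RETURN 1;", settings);
/// ```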
pub struct Parser<'a> {
	/// The lexer producing tokens from the source.
	lexer: Lexer<'a>,
	/// The span of the last token the parser consumed.
	last_span: Span,
	/// A buffer of peeked but not yet consumed tokens.
	token_buffer: TokenBuffer<4>,
	/// The value of the most recently glued compound token.
	glued_value: GluedValue,
	pub(crate) table_as_field: bool,
	settings: ParserSettings,
}

impl<'a> Parser<'a> {
	/// Create a new parser from a given source.
	pub fn new(source: &'a [u8]) -> Self {
		Parser::new_with_settings(source, ParserSettings::default())
	}

	/// Create a new parser from a given source, with the given settings.
	pub fn new_with_settings(source: &'a [u8], settings: ParserSettings) -> Self {
		Parser {
			lexer: Lexer::new(source),
			last_span: Span::empty(),
			token_buffer: TokenBuffer::new(),
			glued_value: GluedValue::None,
			table_as_field: true,
			settings,
		}
	}

	pub fn with_settings(mut self, settings: ParserSettings) -> Self {
		self.settings = settings;
		self
	}

	/// Returns the next token and advances the parser one token forward.
	#[allow(clippy::should_implement_trait)]
	pub fn next(&mut self) -> Token {
		let res = loop {
			let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
			if res.kind != TokenKind::WhiteSpace {
				break res;
			}
		};
		self.last_span = res.span;
		res
	}

	/// Returns the next token and advances the parser one token forward.
	///
	/// This function is like next but returns whitespace tokens which are normally skipped.
	#[allow(clippy::should_implement_trait)]
	pub fn next_whitespace(&mut self) -> Token {
		let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
		self.last_span = res.span;
		res
	}

	/// Returns whether there is a token in the token buffer, meaning that a token was peeked.
	pub fn has_peek(&self) -> bool {
		!self.token_buffer.is_empty()
	}

	/// Consume the current peeked value and advance the parser one token forward.
	///
	/// Should only be called after peeking a value.
	pub fn pop_peek(&mut self) -> Token {
		let res = self.token_buffer.pop().unwrap();
		self.last_span = res.span;
		res
	}

	/// Returns the next token without consuming it.
	pub fn peek(&mut self) -> Token {
		loop {
			let Some(x) = self.token_buffer.first() else {
				let res = loop {
					let res = self.lexer.next_token();
					if res.kind != TokenKind::WhiteSpace {
						break res;
					}
				};
				self.token_buffer.push(res);
				return res;
			};
			if x.kind == TokenKind::WhiteSpace {
				self.token_buffer.pop();
				continue;
			}
			break x;
		}
	}

	/// Returns the next token without consuming it.
	///
	/// This function is like peek but returns whitespace tokens which are normally skipped.
	/// It does not undo tokens skipped in a previous normal peek.
	pub fn peek_whitespace(&mut self) -> Token {
		let Some(x) = self.token_buffer.first() else {
			let res = self.lexer.next_token();
			self.token_buffer.push(res);
			return res;
		};
		x
	}

	/// Return the token kind of the next token without consuming it.
	pub fn peek_kind(&mut self) -> TokenKind {
		self.peek().kind
	}

	/// Returns the next n'th token without consuming it.
	/// `peek_token_at(0)` is equivalent to `peek`.
	pub(crate) fn peek_token_at(&mut self, at: u8) -> Token {
		for _ in self.token_buffer.len()..=at {
			let r = loop {
				let r = self.lexer.next_token();
				if r.kind != TokenKind::WhiteSpace {
					break r;
				}
			};
			self.token_buffer.push(r);
		}
		self.token_buffer.at(at).unwrap()
	}

	/// Returns the token after the next token without consuming either.
	pub fn peek1(&mut self) -> Token {
		self.peek_token_at(1)
	}

	/// Returns the next n'th token without consuming it, including whitespace tokens.
	/// `peek_whitespace_token_at(0)` is equivalent to `peek_whitespace`.
	pub fn peek_whitespace_token_at(&mut self, at: u8) -> Token {
		for _ in self.token_buffer.len()..=at {
			let r = self.lexer.next_token();
			self.token_buffer.push(r);
		}
		self.token_buffer.at(at).unwrap()
	}

	/// Returns the token after the next token without consuming either, including whitespace
	/// tokens.
	pub fn peek_whitespace1(&mut self) -> Token {
		self.peek_whitespace_token_at(1)
	}

	/// Returns the span of the next token if it was already peeked, otherwise returns the span
	/// of the last consumed token.
	pub fn recent_span(&mut self) -> Span {
		self.token_buffer.first().map(|x| x.span).unwrap_or(self.last_span)
	}

	/// Returns the span of the last consumed token.
	pub fn last_span(&mut self) -> Span {
		self.last_span
	}

	/// Asserts that the parser has consumed the entire input, returning an error otherwise.
	pub fn assert_finished(&mut self) -> ParseResult<()> {
		let p = self.peek();
		if p.kind != TokenKind::Eof {
			bail!("Unexpected token `{}`, expected no more tokens", p.kind, @p.span);
		}
		Ok(())
	}

	/// Eat the next token if it is of the given kind.
	/// Returns whether a token was eaten.
	pub fn eat(&mut self, token: TokenKind) -> bool {
		let peek = self.peek();
		if token == peek.kind {
			self.token_buffer.pop();
			self.last_span = peek.span;
			true
		} else {
			false
		}
	}

	/// Eat the next token if it is of the given kind.
	/// Returns whether a token was eaten.
	///
	/// Unlike [`Parser::eat`] this doesn't skip whitespace tokens
	pub fn eat_whitespace(&mut self, token: TokenKind) -> bool {
		let peek = self.peek_whitespace();
		if token == peek.kind {
			self.token_buffer.pop();
			self.last_span = peek.span;
			true
		} else {
			false
		}
	}

	/// Forces the next token to be the given one.
	/// Used in token gluing to replace the current token with the glued token.
	fn prepend_token(&mut self, token: Token) {
		self.token_buffer.push_front(token);
	}

	/// Checks if the next token is of the given kind. If it isn't, it returns an
	/// UnclosedDelimiter error.
	fn expect_closing_delimiter(&mut self, kind: TokenKind, should_close: Span) -> ParseResult<()> {
		let peek = self.peek();
		if peek.kind != kind {
			bail!("Unexpected token `{}`, expected delimiter `{kind}`",
				peek.kind,
				@self.recent_span(),
				@should_close => "expected this delimiter to close"
			);
		}
		self.pop_peek();
		Ok(())
	}

	/// Recover the parser state to after a given span.
	pub fn backup_after(&mut self, span: Span) {
		self.token_buffer.clear();
		self.lexer.backup_after(span);
	}

	/// Parse a full query.
	///
	/// This is the primary entry point of the parser.
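	///
	/// A usage sketch (the query text is illustrative; the parser is driven on a `reblessive`
	/// stack, mirroring how [`StatementStream`] drives it below):
	///
	/// ```ignore
	/// let mut parser = Parser::new(b"SELECT * FROM person;");
	/// let mut stack = Stack::new();
	/// let query = stack.enter(|ctx| parser.parse_query(ctx)).finish()?;
	/// ```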
	pub async fn parse_query(&mut self, ctx: &mut Stk) -> ParseResult<sql::Query> {
		let statements = self.parse_stmt_list(ctx).await?;
		Ok(sql::Query(statements))
	}

	/// Parse a single statement.
	pub async fn parse_statement(&mut self, ctx: &mut Stk) -> ParseResult<sql::Statement> {
		self.parse_stmt(ctx).await
	}
}

/// A struct which can parse a query statement by statement.
pub struct StatementStream {
	stack: Stack,
	settings: ParserSettings,
	col_offset: usize,
	line_offset: usize,
}

impl StatementStream {
	#[allow(clippy::new_without_default)]
	pub fn new() -> Self {
		Self::new_with_settings(ParserSettings::default())
	}

	pub fn new_with_settings(settings: ParserSettings) -> Self {
		StatementStream {
			stack: Stack::new(),
			settings,
			col_offset: 0,
			line_offset: 0,
		}
	}

	/// Updates the line and column offset after consuming bytes.
	fn accumulate_line_col(&mut self, bytes: &[u8]) {
		// The parser should have ensured that bytes is a valid utf-8 string.
		// TODO: Maybe change this to an unsafe cast once we have more confidence in the parser's
		// correctness.
		let (line_num, remaining) =
			std::str::from_utf8(bytes).unwrap().lines().enumerate().last().unwrap_or((0, ""));

		self.line_offset += line_num;
		if line_num > 0 {
			self.col_offset = 0;
		}
		self.col_offset += remaining.chars().count();
	}

	/// Parses a statement if the buffer contains sufficient data to parse a statement.
	///
	/// When it does so, it will remove the read bytes from the buffer and return `Ok(Some(_))`.
	/// In case of a parsing error it will return `Err(_)`; this will not consume data.
	///
	/// If the function returns `Ok(None)`, not enough data was in the buffer to fully parse a
	/// statement. The function might still consume data from the buffer, like whitespace between
	/// statements, even when `None` is returned.
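	///
	/// A hypothetical driver loop (`read_more` and `execute` are illustrative stand-ins for
	/// filling the buffer from a source and for handling a parsed statement):
	///
	/// ```ignore
	/// loop {
	///     match stream.parse_partial(&mut buffer) {
	///         Ok(Some(stmt)) => execute(stmt),
	///         Ok(None) => {
	///             // not enough data for a full statement yet, read more bytes.
	///             if read_more(&mut buffer)? == 0 {
	///                 // the input is finished, parse whatever remains.
	///                 while let Some(stmt) = stream.parse_complete(&mut buffer)? {
	///                     execute(stmt);
	///                 }
	///                 break;
	///             }
	///         }
	///         Err(e) => return Err(e.into()),
	///     }
	/// }
	/// ```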
	pub fn parse_partial(
		&mut self,
		buffer: &mut BytesMut,
	) -> Result<Option<sql::Statement>, RenderedError> {
		let mut slice = &**buffer;
		if slice.len() > u32::MAX as usize {
			// limit slice length.
			slice = &slice[..u32::MAX as usize];
		}

		let mut parser = Parser::new_with_settings(slice, self.settings.clone());

		// eat empty statements.
		while parser.eat(t!(";")) {}

		if parser.peek().span.offset != 0 && buffer.len() > u32::MAX as usize {
			// we ate some empty statements, so to ensure we can still parse a full statement of
			// up to 4GiB we need to recreate the parser with the already consumed bytes removed.
			let eaten = buffer.split_to(parser.peek().span.offset as usize);
			self.accumulate_line_col(&eaten);
			slice = &**buffer;
			if slice.len() > u32::MAX as usize {
				// limit slice length.
				slice = &slice[..u32::MAX as usize];
			}
			parser = Parser::new_with_settings(slice, self.settings.clone())
		}

		// test if the buffer is now empty, which would cause the parse_statement function to fail.
		if parser.peek().is_eof() {
			return Ok(None);
		}

		let res = self.stack.enter(|ctx| parser.parse_statement(ctx)).finish();
		if parser.peek().is_eof() {
			if buffer.len() > u32::MAX as usize {
				let error = syntax_error!("Cannot parse query, statement exceeded maximum size of 4GB", @parser.last_span());
				return Err(error
					.render_on_bytes(buffer)
					.offset_location(self.line_offset, self.col_offset));
			}

			// finished on an eof token.
			// We can't know if this is an actual result, or if it would change when more data
			// is available.
			return Ok(None);
		}

		// we need a trailing semicolon.
		if !parser.eat(t!(";")) {
			let peek = parser.next();

			if parser.peek1().is_eof() {
				return Ok(None);
			}

			if let Err(e) = res {
				return Err(e
					.render_on_bytes(slice)
					.offset_location(self.line_offset, self.col_offset));
			}

			let error = syntax_error!("Unexpected token `{}`, expected the query to end.", peek.kind.as_str(),
				@peek.span => "maybe you forgot a semicolon after the previous statement?");
			return Err(error
				.render_on_bytes(slice)
				.offset_location(self.line_offset, self.col_offset));
		}

		// Eat possible empty statements.
		while parser.eat(t!(";")) {}

		let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
		let res = res.map(Some).map_err(|e| {
			e.render_on_bytes(&eaten).offset_location(self.line_offset, self.col_offset)
		});
		self.accumulate_line_col(&eaten);
		res
	}

	/// Parse remaining statements once the buffer is complete and no more data will be added.
	pub fn parse_complete(
		&mut self,
		buffer: &mut BytesMut,
	) -> Result<Option<sql::Statement>, RenderedError> {
		let mut slice = &**buffer;
		if slice.len() > u32::MAX as usize {
			// limit slice length.
			slice = &slice[..u32::MAX as usize];
		}

		let mut parser = Parser::new_with_settings(slice, self.settings.clone());
		// eat empty statements.
		while parser.eat(t!(";")) {}

		if parser.peek().is_eof() {
			// There were no statements in the buffer, so clear any bytes which were consumed.
			buffer.clear();
			return Ok(None);
		}

		match self.stack.enter(|ctx| parser.parse_statement(ctx)).finish() {
			Ok(x) => {
				if !parser.peek().is_eof() && !parser.eat(t!(";")) {
					let peek = parser.peek();
					let error = syntax_error!("Unexpected token `{}`, expected the query to end.", peek.kind.as_str(),
						@peek.span => "maybe you forgot a semicolon after the previous statement?");
					return Err(error
						.render_on_bytes(slice)
						.offset_location(self.line_offset, self.col_offset));
				}

				let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
				self.accumulate_line_col(&eaten);
				Ok(Some(x))
			}
			Err(e) => {
				Err(e.render_on_bytes(slice).offset_location(self.line_offset, self.col_offset))
			}
		}
	}
}