surrealdb_core/syn/parser/mod.rs
//! Module implementing the SurrealQL parser.
//!
//! The SurrealQL parser is a relatively simple recursive descent parser.
//! Most of the functions of the SurrealQL parser peek a token from the lexer and then decide to
//! take a path depending on which token is next.
//!
//! # Implementation Details
//!
//! There are a bunch of common patterns for which this module has some convenience functions.
//! - Whenever only one token can be next you should use the `expected!` macro. This macro
//! ensures that the given token type is next and if not returns a parser error.
//! - Whenever a limited set of tokens can be next it is common to match the token kind and then
//! have a catch-all arm which calls the macro `unexpected!`. This macro will raise a parse
//! error with information about the type of token it received and what it expected.
//! - If a single token can optionally be next use [`Parser::eat`]. This function returns a bool
//! indicating whether the given token kind was eaten.
//! - If a closing delimiting token is expected use [`Parser::expect_closing_delimiter`]. This
//! function will raise an error if the expected delimiter isn't the next token. This error will
//! also point to which delimiter the parser expected to be closed.
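//!
//! A hypothetical sketch combining these helpers (the exact `expected!` invocation form and
//! the parsed production are illustrative):
//!
//! ```ignore
//! fn parse_parenthesized(&mut self) -> ParseResult<()> {
//!     // Exactly one token kind is valid here, so use `expected!`.
//!     let open = expected!(self, t!("(")).span;
//!     // A `-` may optionally be next; `eat` returns whether it was consumed.
//!     let _negated = self.eat(t!("-"));
//!     /* ... parse the inner value ... */
//!     // Require the closing delimiter, pointing back at the opening one on error.
//!     self.expect_closing_delimiter(t!(")"), open)?;
//!     Ok(())
//! }
//! ```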
//!
//! ## Far Token Peek
//!
//! Occasionally the parser needs to check further ahead than peeking allows.
//! This is done with the [`Parser::peek1`] function, which peeks one token further than
//! [`Parser::peek`].
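//!
//! For example (illustrative), deciding how to parse based on the token after the next one:
//!
//! ```ignore
//! match parser.peek1().kind {
//!     t!(":") => { /* `foo:` starts a record id */ }
//!     t!("(") => { /* `foo(` starts a function call */ }
//!     _ => { /* a plain identifier */ }
//! }
//! ```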
//!
//! ## Whitespace Tokens
//!
//! The lexer produces whitespace tokens. These are tokens which are normally ignored in most
//! places in the syntax, as they have no bearing on the meaning of a statement. [`Parser::next`]
//! and [`Parser::peek`] automatically skip over any whitespace tokens. However in some places,
//! like in a record id and when gluing tokens, these whitespace tokens are required for correct
//! parsing. In those cases the function [`Parser::next_whitespace`] and others suffixed with
//! `_whitespace` are used. These functions don't skip whitespace tokens, but they also do not
//! undo whitespace tokens which might already have been skipped. Implementers must be careful
//! not to call a function which requires whitespace tokens when they may already have been
//! skipped.
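//!
//! A sketch of why this matters when parsing a record id (the check itself is illustrative):
//!
//! ```ignore
//! // In `foo:bar` the parts must be adjacent; whitespace would change the meaning,
//! // so peek without skipping it.
//! if parser.peek_whitespace().kind == TokenKind::WhiteSpace {
//!     /* not part of the record id */
//! }
//! ```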
//!
//! ## Compound Tokens and Token Gluing
//!
//! SurrealQL has a bunch of tokens which have complex rules for when they are allowed and the
//! value they contain. Such tokens are named compound tokens; examples include a JavaScript
//! body, strand-like tokens, regexes, and numbers.
//!
//! These tokens need to be manually requested from the lexer with the [`Lexer::lex_compound`]
//! function.
//!
//! Manually requesting tokens like this leads to a problem when used in conjunction with
//! peeking. Take for instance the production `{ "foo": "bar" }`. `"foo"` is a compound token, so
//! when it is initially encountered the lexer only returns a `"` token, which then needs to be
//! collected into the full strand token. However the parser needs to figure out whether we are
//! parsing an object or a block, so it needs to look past the compound token to see if the next
//! token is `:`. This is where gluing comes in. Calling [`Parser::glue`] checks if the next
//! token could start a compound token and, if so, combines the pieces into a single token. This
//! can only be done in places where we know that encountering the leading token of a compound
//! token will result in the 'default' compound token.
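//!
//! A sketch of the object-versus-block decision described above (`parse_object` and
//! `parse_block` stand in for the real parse functions):
//!
//! ```ignore
//! // Combine a possible leading `"` into a full strand token first...
//! parser.glue()?;
//! // ...so that peeking past it can see whether a `:` follows.
//! if parser.peek1().kind == t!(":") {
//!     /* parse_object */
//! } else {
//!     /* parse_block */
//! }
//! ```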
use self::token_buffer::TokenBuffer;
use crate::{
sql::{self, Datetime, Duration, Strand, Uuid},
syn::{
error::{bail, SyntaxError},
lexer::{compound::NumberKind, Lexer},
token::{t, Span, Token, TokenKind},
},
};
use bytes::BytesMut;
use reblessive::{Stack, Stk};
mod basic;
mod builtin;
mod expression;
mod function;
mod glue;
mod idiom;
mod json;
mod kind;
pub(crate) mod mac;
mod object;
mod prime;
mod stmt;
mod thing;
mod token;
mod token_buffer;
pub(crate) use mac::{enter_object_recursion, enter_query_recursion, unexpected};
use super::error::{syntax_error, RenderedError};
#[cfg(test)]
pub mod test;
/// The result returned by most parser functions.
pub type ParseResult<T> = Result<T, SyntaxError>;
/// A result of trying to parse a possibly partial query.
#[derive(Debug)]
#[non_exhaustive]
pub enum PartialResult<T> {
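/// More data is needed before the parser can produce a result.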
MoreData,
/// Parsing the source produced no reasonable value.
Empty {
used: usize,
},
Ok {
value: T,
used: usize,
},
Err {
err: SyntaxError,
used: usize,
},
}
#[derive(Default)]
pub enum GluedValue {
Duration(Duration),
Datetime(Datetime),
Uuid(Uuid),
Number(NumberKind),
Strand(Strand),
#[default]
None,
}
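/// Settings controlling parser behavior.
///
/// Individual settings can be overridden while keeping the rest at their defaults
/// (a sketch; `source` is any `&[u8]`):
///
/// ```ignore
/// let settings = ParserSettings {
///     legacy_strands: true,
///     ..Default::default()
/// };
/// let parser = Parser::new_with_settings(source, settings);
/// ```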
#[derive(Clone, Debug)]
pub struct ParserSettings {
/// Parse strands like the old parser, where a strand which looks like a UUID, record id, or
/// datetime will be parsed as that type of value instead of as a strand.
pub legacy_strands: bool,
/// Set whether to allow record ids which don't adhere to regular ident rules.
/// Setting this to true will allow parsing of, for example, `foo:0bar`. This would be rejected
/// by normal identifier rules as most identifiers can't start with a number.
pub flexible_record_id: bool,
/// Disallow a query from having objects nested deeper than the given limit.
/// Arrays also count towards this limit, so `[{foo: [] }]` would be 3 deep.
pub object_recursion_limit: usize,
/// Disallow a query from recursing deeper than the given limit.
/// A query recurses when a statement contains another statement within itself,
/// for example subqueries and blocks such as block statements and if statements.
pub query_recursion_limit: usize,
/// Whether record references are enabled.
pub references_enabled: bool,
/// Whether bearer access is enabled
pub bearer_access_enabled: bool,
}
impl Default for ParserSettings {
fn default() -> Self {
ParserSettings {
legacy_strands: false,
flexible_record_id: true,
object_recursion_limit: 100,
query_recursion_limit: 20,
references_enabled: false,
bearer_access_enabled: false,
}
}
}
/// The SurrealQL parser.
pub struct Parser<'a> {
lexer: Lexer<'a>,
last_span: Span,
token_buffer: TokenBuffer<4>,
glued_value: GluedValue,
pub(crate) table_as_field: bool,
settings: ParserSettings,
}
impl<'a> Parser<'a> {
/// Create a new parser from a given source.
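///
/// A minimal usage sketch (the `Stack`/`enter` pattern mirrors how [`StatementStream`]
/// drives the parser; the query text is illustrative):
///
/// ```ignore
/// use reblessive::Stack;
///
/// let mut parser = Parser::new(b"SELECT * FROM person;");
/// let mut stack = Stack::new();
/// let query = stack.enter(|ctx| parser.parse_query(ctx)).finish()?;
/// ```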
pub fn new(source: &'a [u8]) -> Self {
Parser::new_with_settings(source, ParserSettings::default())
}
/// Create a new parser from a given source, with the given parser settings.
pub fn new_with_settings(source: &'a [u8], settings: ParserSettings) -> Self {
Parser {
lexer: Lexer::new(source),
last_span: Span::empty(),
token_buffer: TokenBuffer::new(),
glued_value: GluedValue::None,
table_as_field: true,
settings,
}
}
pub fn with_settings(mut self, settings: ParserSettings) -> Self {
self.settings = settings;
self
}
/// Returns the next token and advances the parser one token forward.
#[allow(clippy::should_implement_trait)]
pub fn next(&mut self) -> Token {
let res = loop {
let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
if res.kind != TokenKind::WhiteSpace {
break res;
}
};
self.last_span = res.span;
res
}
/// Returns the next token and advances the parser one token forward.
///
/// This function is like `next` but also returns whitespace tokens, which are normally skipped.
#[allow(clippy::should_implement_trait)]
pub fn next_whitespace(&mut self) -> Token {
let res = self.token_buffer.pop().unwrap_or_else(|| self.lexer.next_token());
self.last_span = res.span;
res
}
/// Returns whether there is a token in the token buffer, meaning that a token was peeked.
pub fn has_peek(&self) -> bool {
!self.token_buffer.is_empty()
}
/// Consume the current peeked value and advance the parser one token forward.
///
/// Should only be called after peeking a value.
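///
/// A typical pattern (sketch):
///
/// ```ignore
/// match parser.peek().kind {
///     t!("RETURN") => {
///         // Safe: we just peeked, so the buffer is non-empty.
///         parser.pop_peek();
///         /* ... */
///     }
///     _ => { /* ... */ }
/// }
/// ```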
pub fn pop_peek(&mut self) -> Token {
let res = self.token_buffer.pop().unwrap();
self.last_span = res.span;
res
}
/// Returns the next token without consuming it.
pub fn peek(&mut self) -> Token {
loop {
let Some(x) = self.token_buffer.first() else {
let res = loop {
let res = self.lexer.next_token();
if res.kind != TokenKind::WhiteSpace {
break res;
}
};
self.token_buffer.push(res);
return res;
};
if x.kind == TokenKind::WhiteSpace {
self.token_buffer.pop();
continue;
}
break x;
}
}
/// Returns the next token without consuming it.
///
/// This function is like `peek` but also returns whitespace tokens, which are normally skipped.
/// It does not undo tokens skipped in a previous normal peek.
pub fn peek_whitespace(&mut self) -> Token {
let Some(x) = self.token_buffer.first() else {
let res = self.lexer.next_token();
self.token_buffer.push(res);
return res;
};
x
}
/// Return the token kind of the next token without consuming it.
pub fn peek_kind(&mut self) -> TokenKind {
self.peek().kind
}
/// Returns the next n'th token without consuming it.
/// `peek_token_at(0)` is equivalent to `peek`.
pub(crate) fn peek_token_at(&mut self, at: u8) -> Token {
for _ in self.token_buffer.len()..=at {
let r = loop {
let r = self.lexer.next_token();
if r.kind != TokenKind::WhiteSpace {
break r;
}
};
self.token_buffer.push(r);
}
self.token_buffer.at(at).unwrap()
}
pub fn peek1(&mut self) -> Token {
self.peek_token_at(1)
}
/// Returns the next n'th token without consuming it, including whitespace tokens.
/// `peek_whitespace_token_at(0)` is equivalent to `peek_whitespace`.
pub fn peek_whitespace_token_at(&mut self, at: u8) -> Token {
for _ in self.token_buffer.len()..=at {
let r = self.lexer.next_token();
self.token_buffer.push(r);
}
self.token_buffer.at(at).unwrap()
}
pub fn peek_whitespace1(&mut self) -> Token {
self.peek_whitespace_token_at(1)
}
/// Returns the span of the next token if it was already peeked, otherwise returns the span of
/// the last consumed token.
pub fn recent_span(&mut self) -> Span {
self.token_buffer.first().map(|x| x.span).unwrap_or(self.last_span)
}
/// Returns the span of the last consumed token.
pub fn last_span(&mut self) -> Span {
self.last_span
}
pub fn assert_finished(&mut self) -> ParseResult<()> {
let p = self.peek();
if p.kind != TokenKind::Eof {
bail!("Unexpected token `{}`, expected no more tokens", p.kind, @p.span);
}
Ok(())
}
/// Eat the next token if it is of the given kind.
/// Returns whether a token was eaten.
pub fn eat(&mut self, token: TokenKind) -> bool {
let peek = self.peek();
if token == peek.kind {
self.token_buffer.pop();
self.last_span = peek.span;
true
} else {
false
}
}
/// Eat the next token if it is of the given kind.
/// Returns whether a token was eaten.
///
/// Unlike [`Parser::eat`] this doesn't skip whitespace tokens.
pub fn eat_whitespace(&mut self, token: TokenKind) -> bool {
let peek = self.peek_whitespace();
if token == peek.kind {
self.token_buffer.pop();
self.last_span = peek.span;
true
} else {
false
}
}
/// Forces the next token to be the given one.
/// Used in token gluing to replace the current one with the glued token.
fn prepend_token(&mut self, token: Token) {
self.token_buffer.push_front(token);
}
/// Checks if the next token is of the given kind and consumes it. If it isn't, this returns an
/// unclosed-delimiter error.
fn expect_closing_delimiter(&mut self, kind: TokenKind, should_close: Span) -> ParseResult<()> {
let peek = self.peek();
if peek.kind != kind {
bail!("Unexpected token `{}` expected delimiter `{kind}`",
peek.kind,
@self.recent_span(),
@should_close => "expected this delimiter to close"
);
}
self.pop_peek();
Ok(())
}
/// Recover the parser state to after a given span.
pub fn backup_after(&mut self, span: Span) {
self.token_buffer.clear();
self.lexer.backup_after(span);
}
/// Parse a full query.
///
/// This is the primary entry point of the parser.
pub async fn parse_query(&mut self, ctx: &mut Stk) -> ParseResult<sql::Query> {
let statements = self.parse_stmt_list(ctx).await?;
Ok(sql::Query(statements))
}
/// Parse a single statement.
pub async fn parse_statement(&mut self, ctx: &mut Stk) -> ParseResult<sql::Statement> {
self.parse_stmt(ctx).await
}
}
/// A struct which can parse queries statement by statement.
pub struct StatementStream {
stack: Stack,
settings: ParserSettings,
col_offset: usize,
line_offset: usize,
}
impl StatementStream {
#[allow(clippy::new_without_default)]
pub fn new() -> Self {
Self::new_with_settings(ParserSettings::default())
}
pub fn new_with_settings(settings: ParserSettings) -> Self {
StatementStream {
stack: Stack::new(),
settings,
col_offset: 0,
line_offset: 0,
}
}
/// Updates the line and column offsets after consuming bytes.
fn accumulate_line_col(&mut self, bytes: &[u8]) {
// The parser should have ensured that bytes is a valid utf-8 string.
// TODO: Maybe change this to an unsafe cast once we have more confidence in the parser's
// correctness.
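// E.g. consuming b"a;\nb" advances `line_offset` by 1 and leaves `col_offset` at 1.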
let (line_num, remaining) =
std::str::from_utf8(bytes).unwrap().lines().enumerate().last().unwrap_or((0, ""));
self.line_offset += line_num;
if line_num > 0 {
self.col_offset = 0;
}
self.col_offset += remaining.chars().count();
}
/// Parses a statement if the buffer contains sufficient data to parse one.
///
/// When it does so, it removes the read bytes from the buffer and returns `Ok(Some(_))`. In
/// case of a parsing error it returns `Err(_)`; this does not consume data.
///
/// If the function returns `Ok(None)`, not enough data was in the buffer to fully parse a
/// statement. The function might still consume data from the buffer, like whitespace between
/// statements, when `None` is returned.
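///
/// A usage sketch (buffer contents illustrative):
///
/// ```ignore
/// let mut stream = StatementStream::new();
/// let mut buffer = BytesMut::from(&b"SELECT * FROM person; SELECT"[..]);
/// // The first call yields the complete first statement...
/// assert!(stream.parse_partial(&mut buffer)?.is_some());
/// // ...the second returns `None`: the trailing `SELECT` might grow with more data.
/// assert!(stream.parse_partial(&mut buffer)?.is_none());
/// ```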
pub fn parse_partial(
&mut self,
buffer: &mut BytesMut,
) -> Result<Option<sql::Statement>, RenderedError> {
let mut slice = &**buffer;
if slice.len() > u32::MAX as usize {
// limit slice length.
slice = &slice[..u32::MAX as usize];
}
let mut parser = Parser::new_with_settings(slice, self.settings.clone());
// eat empty statements.
while parser.eat(t!(";")) {}
if parser.peek().span.offset != 0 && buffer.len() > u32::MAX as usize {
// we ate some empty statements, so in order to ensure we can parse a full statement
// of up to 4 GiB we need to recreate the parser with the consumed bytes removed.
let eaten = buffer.split_to(parser.peek().span.offset as usize);
self.accumulate_line_col(&eaten);
slice = &**buffer;
if slice.len() > u32::MAX as usize {
// limit slice length.
slice = &slice[..u32::MAX as usize];
}
parser = Parser::new_with_settings(slice, self.settings.clone())
}
// test if the buffer is now empty, which would cause the parse_statement function to fail.
if parser.peek().is_eof() {
return Ok(None);
}
let res = self.stack.enter(|ctx| parser.parse_statement(ctx)).finish();
if parser.peek().is_eof() {
if buffer.len() > u32::MAX as usize {
let error = syntax_error!("Cannot parse query, statement exceeded maximum size of 4GB", @parser.last_span());
return Err(error
.render_on_bytes(buffer)
.offset_location(self.line_offset, self.col_offset));
}
// finished on an eof token.
// We can't know if this is an actual result, or if it would change when more data
// is available.
return Ok(None);
}
// we need a trailing semicolon.
if !parser.eat(t!(";")) {
let peek = parser.next();
if parser.peek1().is_eof() {
return Ok(None);
}
if let Err(e) = res {
return Err(e
.render_on_bytes(slice)
.offset_location(self.line_offset, self.col_offset));
}
let error = syntax_error!("Unexpected token `{}` expected the query to end.",peek.kind.as_str(),
@peek.span => "maybe forgot a semicolon after the previous statement?");
return Err(error
.render_on_bytes(slice)
.offset_location(self.line_offset, self.col_offset));
}
// Eat possible empty statements.
while parser.eat(t!(";")) {}
let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
let res = res.map(Some).map_err(|e| {
e.render_on_bytes(&eaten).offset_location(self.line_offset, self.col_offset)
});
self.accumulate_line_col(&eaten);
res
}
/// Parse remaining statements once the buffer is complete.
pub fn parse_complete(
&mut self,
buffer: &mut BytesMut,
) -> Result<Option<sql::Statement>, RenderedError> {
let mut slice = &**buffer;
if slice.len() > u32::MAX as usize {
// limit slice length.
slice = &slice[..u32::MAX as usize];
}
let mut parser = Parser::new_with_settings(slice, self.settings.clone());
// eat empty statements.
while parser.eat(t!(";")) {}
if parser.peek().is_eof() {
// There were no statements in the buffer; clear any bytes that may have been consumed.
buffer.clear();
return Ok(None);
}
match self.stack.enter(|ctx| parser.parse_statement(ctx)).finish() {
Ok(x) => {
if !parser.peek().is_eof() && !parser.eat(t!(";")) {
let peek = parser.peek();
let error = syntax_error!("Unexpected token `{}` expected the query to end.",peek.kind.as_str(),
@peek.span => "maybe forgot a semicolon after the previous statement?");
return Err(error
.render_on_bytes(slice)
.offset_location(self.line_offset, self.col_offset));
}
let eaten = buffer.split_to(parser.last_span().after_offset() as usize);
self.accumulate_line_col(&eaten);
Ok(Some(x))
}
Err(e) => {
Err(e.render_on_bytes(slice).offset_location(self.line_offset, self.col_offset))
}
}
}
}