surrealdb_core/syn/lexer/compound/
strand.rs

1use std::ops::RangeInclusive;
2use std::{char, mem};
3
4use crate::syn::{
5	error::{bail, syntax_error, SyntaxError},
6	lexer::{unicode::chars, Lexer},
7	token::{t, Token},
8};
9
10pub fn strand(lexer: &mut Lexer, start: Token) -> Result<String, SyntaxError> {
11	let is_double = match start.kind {
12		t!("\"") => true,
13		t!("'") => false,
14		_ => panic!("Invalid start of strand compound token"),
15	};
16
17	loop {
18		let Some(x) = lexer.reader.next() else {
19			lexer.scratch.clear();
20			let err = syntax_error!("Unexpected end of file, expected strand to end",@lexer.current_span());
21			return Err(err.with_data_pending());
22		};
23
24		if x.is_ascii() {
25			match x {
26				b'\'' if !is_double => {
27					let res = mem::take(&mut lexer.scratch);
28					return Ok(res);
29				}
30				b'"' if is_double => {
31					let res = mem::take(&mut lexer.scratch);
32					return Ok(res);
33				}
34				b'\0' => {
35					bail!("Invalid null byte in source, null bytes are not valid SurrealQL characters",@lexer.current_span());
36				}
37				b'\\' => {
38					// Handle escape sequences.
39					let Some(next) = lexer.reader.next() else {
40						lexer.scratch.clear();
41						let err = syntax_error!("Unexpected end of file, expected strand to end",@lexer.current_span());
42						return Err(err.with_data_pending());
43					};
44					match next {
45						b'\\' => {
46							lexer.scratch.push('\\');
47						}
48						b'\'' if !is_double => {
49							lexer.scratch.push('\'');
50						}
51						b'\"' if is_double => {
52							lexer.scratch.push('\"');
53						}
54						b'/' => {
55							lexer.scratch.push('/');
56						}
57						b'b' => {
58							lexer.scratch.push(chars::BS);
59						}
60						b'f' => {
61							lexer.scratch.push(chars::FF);
62						}
63						b'n' => {
64							lexer.scratch.push(chars::LF);
65						}
66						b'r' => {
67							lexer.scratch.push(chars::CR);
68						}
69						b't' => {
70							lexer.scratch.push(chars::TAB);
71						}
72						b'u' => {
73							let c = lex_unicode_sequence(lexer)?;
74							lexer.scratch.push(c);
75						}
76						x => match lexer.reader.convert_to_char(x) {
77							Ok(char) => {
78								let valid_escape = if is_double {
79									'"'
80								} else {
81									'\''
82								};
83								bail!("Invalid escape character `{char}`, valid characters are `\\`, `{valid_escape}`, `/`, `b`, `f`, `n`, `r`, `t`, or `u` ", @lexer.current_span());
84							}
85							Err(e) => {
86								lexer.scratch.clear();
87								return Err(e.into());
88							}
89						},
90					}
91				}
92				x => lexer.scratch.push(x as char),
93			}
94		} else {
95			match lexer.reader.complete_char(x) {
96				Ok(x) => lexer.scratch.push(x),
97				Err(e) => {
98					lexer.scratch.clear();
99					return Err(e.into());
100				}
101			}
102		}
103	}
104}
105
106const LEADING_SURROGATES: RangeInclusive<u16> = 0xD800..=0xDBFF;
107const TRAILING_SURROGATES: RangeInclusive<u16> = 0xDC00..=0xDFFF;
108
109fn lex_unicode_sequence(lexer: &mut Lexer) -> Result<char, SyntaxError> {
110	if let Some(b'{') = lexer.reader.peek() {
111		lexer.reader.next();
112		return lex_bracket_unicode_sequence(lexer);
113	}
114
115	let leading = lex_bare_unicode_sequence(lexer)?;
116	if LEADING_SURROGATES.contains(&leading) {
117		if !(lexer.reader.next() == Some(b'\\') && lexer.reader.next() == Some(b'u')) {
118			bail!("Unicode escape sequence encoding a leading surrogate needs to be followed by a trailing surrogate", @lexer.current_span());
119		}
120		let trailing = lex_bare_unicode_sequence(lexer)?;
121		// Compute the codepoint.
122		// https://datacadamia.com/data/type/text/surrogate#from_surrogate_to_character_code
123		let codepoint = 0x10000
124			+ ((leading as u32 - *LEADING_SURROGATES.start() as u32) << 10)
125			+ trailing as u32
126			- *TRAILING_SURROGATES.start() as u32;
127
128		return char::from_u32(codepoint).ok_or_else(|| {
129			syntax_error!("Unicode escape sequences encode invalid character codepoint", @lexer.current_span())
130		});
131	}
132
133	char::from_u32(leading as u32)
134		.ok_or_else(|| syntax_error!("Unicode escape sequences encode invalid character codepoint", @lexer.current_span()))
135}
136
137fn lex_bracket_unicode_sequence(lexer: &mut Lexer) -> Result<char, SyntaxError> {
138	let mut accum = 0u32;
139	for _ in 0..6 {
140		let c = lexer.reader.peek().ok_or_else(
141			|| syntax_error!("Unexpected end of file, expected strand to end", @lexer.current_span()),
142		)?;
143
144		match c {
145			b'a'..=b'f' => {
146				accum <<= 4;
147				accum += (c - b'a') as u32 + 10;
148			}
149			b'A'..=b'F' => {
150				accum <<= 4;
151				accum += (c - b'A') as u32 + 10;
152			}
153			b'0'..=b'9' => {
154				accum <<= 4;
155				accum += (c - b'0') as u32;
156			}
157			_ => break,
158		}
159		lexer.reader.next();
160	}
161
162	let Some(b'}') = lexer.reader.next() else {
163		bail!("Missing end brace `}}` of unicode escape sequence", @lexer.current_span())
164	};
165
166	char::from_u32(accum)
167		.ok_or_else(|| syntax_error!("Unicode escape sequences encode invalid character codepoint", @lexer.current_span()))
168}
169
170fn lex_bare_unicode_sequence(lexer: &mut Lexer) -> Result<u16, SyntaxError> {
171	let mut accum: u16 = 0;
172	for _ in 0..4 {
173		let Some(c) = lexer.reader.next() else {
174			bail!("Missing characters after unicode escape sequence", @lexer.current_span());
175		};
176
177		accum <<= 4;
178		match c {
179			b'a'..=b'f' => {
180				accum += (c - b'a') as u16 + 10;
181			}
182			b'A'..=b'F' => {
183				accum += (c - b'A') as u16 + 10;
184			}
185			b'0'..=b'9' => {
186				accum += (c - b'0') as u16;
187			}
188			_ => {
189				bail!("Invalid character `{}` in unicode escape sequence, must be a hex digit.",c as char, @lexer.current_span());
190			}
191		}
192	}
193	Ok(accum)
194}