surrealdb_core/syn/lexer/compound/
strand.rs1use std::ops::RangeInclusive;
2use std::{char, mem};
3
4use crate::syn::{
5 error::{bail, syntax_error, SyntaxError},
6 lexer::{unicode::chars, Lexer},
7 token::{t, Token},
8};
9
10pub fn strand(lexer: &mut Lexer, start: Token) -> Result<String, SyntaxError> {
11 let is_double = match start.kind {
12 t!("\"") => true,
13 t!("'") => false,
14 _ => panic!("Invalid start of strand compound token"),
15 };
16
17 loop {
18 let Some(x) = lexer.reader.next() else {
19 lexer.scratch.clear();
20 let err = syntax_error!("Unexpected end of file, expected strand to end",@lexer.current_span());
21 return Err(err.with_data_pending());
22 };
23
24 if x.is_ascii() {
25 match x {
26 b'\'' if !is_double => {
27 let res = mem::take(&mut lexer.scratch);
28 return Ok(res);
29 }
30 b'"' if is_double => {
31 let res = mem::take(&mut lexer.scratch);
32 return Ok(res);
33 }
34 b'\0' => {
35 bail!("Invalid null byte in source, null bytes are not valid SurrealQL characters",@lexer.current_span());
36 }
37 b'\\' => {
38 let Some(next) = lexer.reader.next() else {
40 lexer.scratch.clear();
41 let err = syntax_error!("Unexpected end of file, expected strand to end",@lexer.current_span());
42 return Err(err.with_data_pending());
43 };
44 match next {
45 b'\\' => {
46 lexer.scratch.push('\\');
47 }
48 b'\'' if !is_double => {
49 lexer.scratch.push('\'');
50 }
51 b'\"' if is_double => {
52 lexer.scratch.push('\"');
53 }
54 b'/' => {
55 lexer.scratch.push('/');
56 }
57 b'b' => {
58 lexer.scratch.push(chars::BS);
59 }
60 b'f' => {
61 lexer.scratch.push(chars::FF);
62 }
63 b'n' => {
64 lexer.scratch.push(chars::LF);
65 }
66 b'r' => {
67 lexer.scratch.push(chars::CR);
68 }
69 b't' => {
70 lexer.scratch.push(chars::TAB);
71 }
72 b'u' => {
73 let c = lex_unicode_sequence(lexer)?;
74 lexer.scratch.push(c);
75 }
76 x => match lexer.reader.convert_to_char(x) {
77 Ok(char) => {
78 let valid_escape = if is_double {
79 '"'
80 } else {
81 '\''
82 };
83 bail!("Invalid escape character `{char}`, valid characters are `\\`, `{valid_escape}`, `/`, `b`, `f`, `n`, `r`, `t`, or `u` ", @lexer.current_span());
84 }
85 Err(e) => {
86 lexer.scratch.clear();
87 return Err(e.into());
88 }
89 },
90 }
91 }
92 x => lexer.scratch.push(x as char),
93 }
94 } else {
95 match lexer.reader.complete_char(x) {
96 Ok(x) => lexer.scratch.push(x),
97 Err(e) => {
98 lexer.scratch.clear();
99 return Err(e.into());
100 }
101 }
102 }
103 }
104}
105
106const LEADING_SURROGATES: RangeInclusive<u16> = 0xD800..=0xDBFF;
107const TRAILING_SURROGATES: RangeInclusive<u16> = 0xDC00..=0xDFFF;
108
109fn lex_unicode_sequence(lexer: &mut Lexer) -> Result<char, SyntaxError> {
110 if let Some(b'{') = lexer.reader.peek() {
111 lexer.reader.next();
112 return lex_bracket_unicode_sequence(lexer);
113 }
114
115 let leading = lex_bare_unicode_sequence(lexer)?;
116 if LEADING_SURROGATES.contains(&leading) {
117 if !(lexer.reader.next() == Some(b'\\') && lexer.reader.next() == Some(b'u')) {
118 bail!("Unicode escape sequence encoding a leading surrogate needs to be followed by a trailing surrogate", @lexer.current_span());
119 }
120 let trailing = lex_bare_unicode_sequence(lexer)?;
121 let codepoint = 0x10000
124 + ((leading as u32 - *LEADING_SURROGATES.start() as u32) << 10)
125 + trailing as u32
126 - *TRAILING_SURROGATES.start() as u32;
127
128 return char::from_u32(codepoint).ok_or_else(|| {
129 syntax_error!("Unicode escape sequences encode invalid character codepoint", @lexer.current_span())
130 });
131 }
132
133 char::from_u32(leading as u32)
134 .ok_or_else(|| syntax_error!("Unicode escape sequences encode invalid character codepoint", @lexer.current_span()))
135}
136
137fn lex_bracket_unicode_sequence(lexer: &mut Lexer) -> Result<char, SyntaxError> {
138 let mut accum = 0u32;
139 for _ in 0..6 {
140 let c = lexer.reader.peek().ok_or_else(
141 || syntax_error!("Unexpected end of file, expected strand to end", @lexer.current_span()),
142 )?;
143
144 match c {
145 b'a'..=b'f' => {
146 accum <<= 4;
147 accum += (c - b'a') as u32 + 10;
148 }
149 b'A'..=b'F' => {
150 accum <<= 4;
151 accum += (c - b'A') as u32 + 10;
152 }
153 b'0'..=b'9' => {
154 accum <<= 4;
155 accum += (c - b'0') as u32;
156 }
157 _ => break,
158 }
159 lexer.reader.next();
160 }
161
162 let Some(b'}') = lexer.reader.next() else {
163 bail!("Missing end brace `}}` of unicode escape sequence", @lexer.current_span())
164 };
165
166 char::from_u32(accum)
167 .ok_or_else(|| syntax_error!("Unicode escape sequences encode invalid character codepoint", @lexer.current_span()))
168}
169
170fn lex_bare_unicode_sequence(lexer: &mut Lexer) -> Result<u16, SyntaxError> {
171 let mut accum: u16 = 0;
172 for _ in 0..4 {
173 let Some(c) = lexer.reader.next() else {
174 bail!("Missing characters after unicode escape sequence", @lexer.current_span());
175 };
176
177 accum <<= 4;
178 match c {
179 b'a'..=b'f' => {
180 accum += (c - b'a') as u16 + 10;
181 }
182 b'A'..=b'F' => {
183 accum += (c - b'A') as u16 + 10;
184 }
185 b'0'..=b'9' => {
186 accum += (c - b'0') as u16;
187 }
188 _ => {
189 bail!("Invalid character `{}` in unicode escape sequence, must be a hex digit.",c as char, @lexer.current_span());
190 }
191 }
192 }
193 Ok(accum)
194}