surrealdb_core/syn/lexer/
reader.rs

1use thiserror::Error;
2
3use crate::syn::{error::SyntaxError, token::Span};
4
/// Errors produced while decoding a character from raw bytes.
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum CharError {
	/// The input ended before every byte of a multi-byte character was read.
	#[error("found eof inside multi byte character")]
	Eof,
	/// The bytes do not form a valid UTF-8 encoded character.
	#[error("string is not valid utf-8")]
	Unicode,
}
13
14impl From<CharError> for SyntaxError {
15	fn from(value: CharError) -> Self {
16		let e = SyntaxError::new("Invalid, non valid UTF-8 bytes, in source");
17		if let CharError::Eof = value {
18			e.with_data_pending()
19		} else {
20			e
21		}
22	}
23}
24
/// A cursor over a borrowed byte slice.
///
/// `data` holds the whole input and `current` is the index of the next byte
/// that will be read.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct BytesReader<'a> {
	/// The complete input the reader walks over.
	data: &'a [u8],
	/// Index into `data` of the next unread byte.
	current: usize,
}
31
32impl<'a> BytesReader<'a> {
33	pub fn new(slice: &'a [u8]) -> Self {
34		BytesReader {
35			data: slice,
36			current: 0,
37		}
38	}
39
40	#[inline]
41	pub fn remaining(&self) -> &'a [u8] {
42		&self.data[self.current..]
43	}
44
45	#[inline]
46	pub fn len(&self) -> usize {
47		self.remaining().len()
48	}
49
50	#[inline]
51	pub fn offset(&self) -> usize {
52		self.current
53	}
54
55	#[inline]
56	pub fn backup(&mut self, offset: usize) {
57		assert!(offset <= self.offset());
58		self.current = offset;
59	}
60
61	#[inline]
62	pub fn is_empty(&self) -> bool {
63		self.remaining().is_empty()
64	}
65
66	#[inline]
67	pub fn peek(&self) -> Option<u8> {
68		self.remaining().first().copied()
69	}
70
71	#[inline]
72	pub fn peek1(&self) -> Option<u8> {
73		self.remaining().get(1).copied()
74	}
75
76	#[inline]
77	pub fn span(&self, span: Span) -> &'a [u8] {
78		&self.data[(span.offset as usize)..(span.offset as usize + span.len as usize)]
79	}
80
81	#[inline]
82	pub fn next_continue_byte(&mut self) -> Result<u8, CharError> {
83		const CONTINUE_BYTE_PREFIX_MASK: u8 = 0b1100_0000;
84		const CONTINUE_BYTE_MASK: u8 = 0b0011_1111;
85
86		let byte = self.next().ok_or(CharError::Eof)?;
87		if byte & CONTINUE_BYTE_PREFIX_MASK != 0b1000_0000 {
88			return Err(CharError::Unicode);
89		}
90
91		Ok(byte & CONTINUE_BYTE_MASK)
92	}
93
94	pub fn convert_to_char(&mut self, start: u8) -> Result<char, CharError> {
95		if start.is_ascii() {
96			return Ok(start as char);
97		}
98		self.complete_char(start)
99	}
100
101	pub fn complete_char(&mut self, start: u8) -> Result<char, CharError> {
102		match start & 0b1111_1000 {
103			0b1100_0000 | 0b1101_0000 | 0b1100_1000 | 0b1101_1000 => {
104				let mut val = (start & 0b0001_1111) as u32;
105				val <<= 6;
106				let next = self.next_continue_byte()?;
107				val |= next as u32;
108				char::from_u32(val).ok_or(CharError::Unicode)
109			}
110			0b1110_0000 | 0b1110_1000 => {
111				let mut val = (start & 0b0000_1111) as u32;
112				val <<= 6;
113				let next = self.next_continue_byte()?;
114				val |= next as u32;
115				val <<= 6;
116				let next = self.next_continue_byte()?;
117				val |= next as u32;
118				char::from_u32(val).ok_or(CharError::Unicode)
119			}
120			0b1111_0000 => {
121				let mut val = (start & 0b0000_0111) as u32;
122				val <<= 6;
123				let next = self.next_continue_byte()?;
124				val |= next as u32;
125				val <<= 6;
126				let next = self.next_continue_byte()?;
127				val |= next as u32;
128				val <<= 6;
129				let next = self.next_continue_byte()?;
130				val |= next as u32;
131				char::from_u32(val).ok_or(CharError::Unicode)
132			}
133			_ => Err(CharError::Unicode),
134		}
135	}
136}
137
138impl Iterator for BytesReader<'_> {
139	type Item = u8;
140
141	#[inline]
142	fn next(&mut self) -> Option<Self::Item> {
143		let res = self.peek()?;
144		self.current += 1;
145		Some(res)
146	}
147	fn size_hint(&self) -> (usize, Option<usize>) {
148		let len = self.len();
149		(len, Some(len))
150	}
151}
152
153impl ExactSizeIterator for BytesReader<'_> {
154	fn len(&self) -> usize {
155		self.len()
156	}
157}