surrealdb_core/syn/lexer/
reader.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
use thiserror::Error;

use crate::syn::{error::SyntaxError, token::Span};

/// Errors raised while decoding a multi-byte UTF-8 character from raw bytes.
#[derive(Error, Debug)]
#[non_exhaustive]
pub enum CharError {
	/// The input ended where a continuation byte of the character was expected.
	#[error("found eof inside multi byte character")]
	Eof,
	/// The bytes read do not form a valid UTF-8 encoded character.
	#[error("string is not valid utf-8")]
	Unicode,
}

impl From<CharError> for SyntaxError {
	fn from(value: CharError) -> Self {
		let e = SyntaxError::new("Invalid, non valid UTF-8 bytes, in source");
		if let CharError::Eof = value {
			e.with_data_pending()
		} else {
			e
		}
	}
}

/// A cursor over a borrowed byte slice.
///
/// The reader never copies the data; it only tracks how far into the slice it has read.
#[derive(Clone, Debug)]
#[non_exhaustive]
pub struct BytesReader<'a> {
	/// The complete input, including the bytes which were already consumed.
	data: &'a [u8],
	/// Index into `data` of the next byte to be read.
	current: usize,
}

impl<'a> BytesReader<'a> {
	/// Creates a reader positioned at the first byte of `slice`.
	pub fn new(slice: &'a [u8]) -> Self {
		BytesReader {
			data: slice,
			current: 0,
		}
	}

	/// Returns the bytes which have not been consumed yet.
	#[inline]
	pub fn remaining(&self) -> &'a [u8] {
		&self.data[self.current..]
	}

	/// Returns the number of bytes which have not been consumed yet.
	#[inline]
	pub fn len(&self) -> usize {
		self.remaining().len()
	}

	/// Returns the current read position as a byte offset from the start of the data.
	#[inline]
	pub fn offset(&self) -> usize {
		self.current
	}

	/// Moves the read position back to `offset`.
	///
	/// # Panics
	///
	/// Panics if `offset` lies past the current read position; this method can only
	/// rewind the reader, never advance it.
	#[inline]
	pub fn backup(&mut self, offset: usize) {
		assert!(offset <= self.offset());
		self.current = offset;
	}

	/// Returns `true` if every byte has been consumed.
	#[inline]
	pub fn is_empty(&self) -> bool {
		self.remaining().is_empty()
	}

	/// Returns the next byte without consuming it, or `None` at the end of the data.
	#[inline]
	pub fn peek(&self) -> Option<u8> {
		self.remaining().first().copied()
	}

	/// Returns the byte after the next one without consuming anything, or `None` if fewer
	/// than two bytes remain.
	#[inline]
	pub fn peek1(&self) -> Option<u8> {
		self.remaining().get(1).copied()
	}

	/// Returns the bytes of the underlying data covered by `span`.
	///
	/// The span is resolved against the whole input, not against the remaining bytes.
	///
	/// # Panics
	///
	/// Panics if the span reaches past the end of the underlying data.
	#[inline]
	pub fn span(&self, span: Span) -> &'a [u8] {
		&self.data[(span.offset as usize)..(span.offset as usize + span.len as usize)]
	}

	/// Consumes one UTF-8 continuation byte and returns its six payload bits.
	///
	/// # Errors
	///
	/// Returns [`CharError::Eof`] if no bytes remain and [`CharError::Unicode`] if the
	/// byte read does not carry the `10xx_xxxx` continuation prefix. In the latter case
	/// the offending byte has already been consumed.
	#[inline]
	pub fn next_continue_byte(&mut self) -> Result<u8, CharError> {
		const CONTINUE_BYTE_PREFIX_MASK: u8 = 0b1100_0000;
		const CONTINUE_BYTE_MASK: u8 = 0b0011_1111;

		let byte = self.next().ok_or(CharError::Eof)?;
		if byte & CONTINUE_BYTE_PREFIX_MASK != 0b1000_0000 {
			return Err(CharError::Unicode);
		}

		Ok(byte & CONTINUE_BYTE_MASK)
	}

	/// Turns the already consumed byte `start` into a `char`, consuming the continuation
	/// bytes of a multi-byte character when `start` is not ASCII.
	///
	/// # Errors
	///
	/// See [`Self::complete_char`].
	pub fn convert_to_char(&mut self, start: u8) -> Result<char, CharError> {
		if start.is_ascii() {
			return Ok(start as char);
		}
		self.complete_char(start)
	}

	/// Decodes a multi-byte UTF-8 character whose lead byte `start` was already consumed,
	/// reading the continuation bytes from the reader.
	///
	/// # Errors
	///
	/// Returns [`CharError::Eof`] if the data ends before the character is complete.
	/// Returns [`CharError::Unicode`] if `start` is not a multi-byte lead byte, if a
	/// continuation byte is malformed, if the sequence is an overlong encoding, or if
	/// the decoded value is not a Unicode scalar value (surrogates and values above
	/// `U+10FFFF`).
	pub fn complete_char(&mut self, start: u8) -> Result<char, CharError> {
		// Per sequence length: the payload bits of the lead byte, the number of
		// continuation bytes that follow, and the smallest code point which actually
		// needs that many bytes.
		let (lead_bits, continuation_bytes, min_code_point) = match start {
			0b1100_0000..=0b1101_1111 => (start & 0b0001_1111, 1, 0x80),
			0b1110_0000..=0b1110_1111 => (start & 0b0000_1111, 2, 0x800),
			0b1111_0000..=0b1111_0111 => (start & 0b0000_0111, 3, 0x1_0000),
			_ => return Err(CharError::Unicode),
		};

		let mut val = lead_bits as u32;
		for _ in 0..continuation_bytes {
			val = (val << 6) | self.next_continue_byte()? as u32;
		}

		// A value below the minimum could have been written with fewer bytes. Such
		// overlong forms are not valid UTF-8 and would let bytes like `C0 80` smuggle
		// in characters (here NUL) past checks performed on the raw bytes.
		if val < min_code_point {
			return Err(CharError::Unicode);
		}

		// `char::from_u32` rejects surrogates and values beyond U+10FFFF.
		char::from_u32(val).ok_or(CharError::Unicode)
	}
}

impl Iterator for BytesReader<'_> {
	type Item = u8;

	/// Yields the byte at the read position and advances past it; returns `None`,
	/// leaving the position untouched, once all bytes were consumed.
	#[inline]
	fn next(&mut self) -> Option<Self::Item> {
		let byte = self.peek();
		if byte.is_some() {
			self.current += 1;
		}
		byte
	}

	/// Reports the exact number of bytes left as both the lower and the upper bound.
	fn size_hint(&self) -> (usize, Option<usize>) {
		let left = self.remaining().len();
		(left, Some(left))
	}
}

impl ExactSizeIterator for BytesReader<'_> {
	/// Returns the number of bytes which are still left to iterate over.
	fn len(&self) -> usize {
		// Computed from the remaining slice directly so the result does not hinge on
		// which `len` (inherent or trait) a method call would resolve to.
		self.remaining().len()
	}
}