lexical_util/iterator.rs
1//! Specialized iterator traits.
2//!
3//! The traits are for iterables containing bytes, and provide optimizations
4//! which then can be used for contiguous or non-contiguous iterables,
5//! including containers or iterators of any kind.
6
7#![cfg(feature = "parse")]
8
9use core::mem;
10
11// Re-export our digit iterators.
12#[cfg(not(feature = "format"))]
13pub use crate::noskip::{AsBytes, Bytes};
14#[cfg(feature = "format")]
15pub use crate::skip::{AsBytes, Bytes};
16
17/// A trait for working with iterables of bytes.
18///
19/// These iterators can either be contiguous or not contiguous and provide
20/// methods for reading data and accessing underlying data. The readers
21/// can either be contiguous or non-contiguous, although performance and
22/// some API methods may not be available for both.
23///
24/// # Safety
25///
26/// Safe if [`set_cursor`] is set to an index <= [`buffer_length`], so no
27/// out-of-bounds reads can occur. Also, [`get_buffer`] must return a slice of
28/// initialized bytes. The caller must also ensure that any calls that increment
29/// the cursor, such as [`step_by_unchecked`], [`step_unchecked`], and
30/// [`peek_many_unchecked`] never exceed [`buffer_length`] as well.
31///
32/// [`set_cursor`]: `Iter::set_cursor`
33/// [`buffer_length`]: `Iter::buffer_length`
34/// [`get_buffer`]: `Iter::get_buffer`
35/// [`step_by_unchecked`]: `Iter::step_by_unchecked`
36/// [`step_unchecked`]: `Iter::step_unchecked`
37/// [`peek_many_unchecked`]: `Iter::peek_many_unchecked`
38#[cfg(feature = "parse")]
39pub unsafe trait Iter<'a> {
40 /// Determine if the buffer is contiguous in memory.
41 const IS_CONTIGUOUS: bool;
42
43 // CURSORS
44 // -------
45
46 /// Get a ptr to the current start of the buffer.
47 #[inline(always)]
48 fn as_ptr(&self) -> *const u8 {
49 self.as_slice().as_ptr()
50 }
51
52 /// Get a slice to the current start of the buffer.
53 #[inline(always)]
54 fn as_slice(&self) -> &'a [u8] {
55 debug_assert!(self.cursor() <= self.buffer_length());
56 // SAFETY: safe since index must be in range.
57 unsafe { self.get_buffer().get_unchecked(self.cursor()..) }
58 }
59
60 /// Get a slice to the full underlying contiguous buffer,
61 fn get_buffer(&self) -> &'a [u8];
62
63 /// Get the total number of elements in the underlying buffer.
64 #[inline(always)]
65 fn buffer_length(&self) -> usize {
66 self.get_buffer().len()
67 }
68
69 /// Get if no bytes are available in the buffer.
70 ///
71 /// This operators on the underlying buffer: that is,
72 /// it returns if [`as_slice`] would return an empty slice.
73 ///
74 /// [as_slice]: Iter::as_slice
75 #[inline(always)]
76 fn is_buffer_empty(&self) -> bool {
77 self.cursor() >= self.get_buffer().len()
78 }
79
80 /// Get the current index of the iterator in the slice.
81 fn cursor(&self) -> usize;
82
83 /// Set the current index of the iterator in the slice.
84 ///
85 /// This is **NOT** the current position of the iterator,
86 /// since iterators may skip digits: this is the cursor
87 /// in the underlying buffer. For example, if `slc[2]` is
88 /// skipped, `set_cursor(3)` would be the 3rd element in
89 /// the iterator, not the 4th.
90 ///
91 /// # Safety
92 ///
93 /// Safe if `index <= self.buffer_length()`. Although this
94 /// won't affect safety, the caller also should be careful it
95 /// does not set the cursor within skipped characters
96 /// since this could affect correctness: an iterator that
97 /// only accepts non-consecutive digit separators would
98 /// pass if the cursor was set between the two.
99 unsafe fn set_cursor(&mut self, index: usize);
100
101 /// Get the current number of digits returned by the iterator.
102 ///
103 /// For contiguous iterators, this can include the sign character, decimal
104 /// point, and the exponent sign (that is, it is always the cursor). For
105 /// non-contiguous iterators, this must always be the only the number of
106 /// digits returned.
107 ///
108 /// This is never used for indexing but will be used for API detection.
109 fn current_count(&self) -> usize;
110
111 // PROPERTIES
112
113 /// Determine if the buffer is contiguous.
114 #[inline(always)]
115 fn is_contiguous(&self) -> bool {
116 Self::IS_CONTIGUOUS
117 }
118
119 /// Get the next value available without consuming it.
120 ///
121 /// This does **NOT** skip digits, and directly fetches the item
122 /// from the underlying buffer.
123 #[inline(always)]
124 fn first(&self) -> Option<&'a u8> {
125 self.get_buffer().get(self.cursor())
126 }
127
128 /// Check if the next element is a given value.
129 #[inline(always)]
130 fn first_is_cased(&self, value: u8) -> bool {
131 Some(&value) == self.first()
132 }
133
134 /// Check if the next element is a given value without case sensitivity.
135 #[inline(always)]
136 fn first_is_uncased(&self, value: u8) -> bool {
137 if let Some(&c) = self.first() {
138 c.eq_ignore_ascii_case(&value)
139 } else {
140 false
141 }
142 }
143
144 /// Check if the next item in buffer is a given value with optional case
145 /// sensitivity.
146 #[inline(always)]
147 fn first_is(&self, value: u8, is_cased: bool) -> bool {
148 if is_cased {
149 self.first_is_cased(value)
150 } else {
151 self.first_is_uncased(value)
152 }
153 }
154
155 // STEP BY
156 // -------
157
158 /// Advance the internal slice by `N` elements.
159 ///
160 /// This does not advance the iterator by `N` elements for
161 /// non-contiguous iterators: this just advances the internal,
162 /// underlying buffer. This is useful for multi-digit optimizations
163 /// for contiguous iterators.
164 ///
165 /// This does not increment the count of items: returns: this only
166 /// increments the index, not the total digits returned. You must use
167 /// this carefully: if stepping over a digit, you must then call
168 /// [`increment_count`] afterwards or else the internal count will
169 /// be incorrect.
170 ///
171 /// [`increment_count`]: DigitsIter::increment_count
172 ///
173 /// # Panics
174 ///
175 /// This will panic if the buffer advances for non-contiguous
176 /// iterators if the current byte is a digit separator, or if the
177 /// count is more than 1.
178 ///
179 /// # Safety
180 ///
181 /// As long as the iterator is at least `N` elements, this
182 /// is safe.
183 unsafe fn step_by_unchecked(&mut self, count: usize);
184
185 /// Advance the internal slice by 1 element.
186 ///
187 ///
188 /// This does not increment the count of items: returns: this only
189 /// increments the index, not the total digits returned. You must
190 /// use this carefully: if stepping over a digit, you must then call
191 /// [`increment_count`] afterwards or else the internal count will
192 /// be incorrect.
193 ///
194 /// [`increment_count`]: DigitsIter::increment_count
195 ///
196 /// # Panics
197 ///
198 /// This will panic if the buffer advances for non-contiguous
199 /// iterators if the current byte is a digit separator.
200 ///
201 /// # Safety
202 ///
203 /// Safe as long as the iterator is not empty.
204 #[inline(always)]
205 unsafe fn step_unchecked(&mut self) {
206 debug_assert!(!self.as_slice().is_empty());
207 // SAFETY: safe if `self.index < self.buffer_length()`.
208 unsafe { self.step_by_unchecked(1) };
209 }
210
211 // READ
212 // ----
213
214 /// Read a value of a difference type from the iterator.
215 ///
216 /// This does **not** advance the internal state of the iterator.
217 /// This can only be implemented for contiguous iterators: non-
218 /// contiguous iterators **MUST** panic.
219 ///
220 /// # Panics
221 ///
222 /// If the iterator is a non-contiguous iterator.
223 ///
224 /// # Safety
225 ///
226 /// Safe as long as the number of the buffer is contains as least as
227 /// many bytes as the size of V. This must be unimplemented for
228 /// non-contiguous iterators.
229 #[inline(always)]
230 unsafe fn peek_many_unchecked<V>(&self) -> V {
231 unimplemented!();
232 }
233
234 /// Try to read a the next four bytes as a u32.
235 ///
236 /// This does not advance the internal state of the iterator.
237 #[inline(always)]
238 fn peek_u32(&self) -> Option<u32> {
239 if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u32>() {
240 // SAFETY: safe since we've guaranteed the buffer is greater than
241 // the number of elements read. u32 is valid for all bit patterns
242 unsafe { Some(self.peek_many_unchecked()) }
243 } else {
244 None
245 }
246 }
247
248 /// Try to read the next eight bytes as a u64.
249 ///
250 /// This does not advance the internal state of the iterator.
251 #[inline(always)]
252 fn peek_u64(&self) -> Option<u64> {
253 if Self::IS_CONTIGUOUS && self.as_slice().len() >= mem::size_of::<u64>() {
254 // SAFETY: safe since we've guaranteed the buffer is greater than
255 // the number of elements read. u64 is valid for all bit patterns
256 unsafe { Some(self.peek_many_unchecked()) }
257 } else {
258 None
259 }
260 }
261}
262
263/// Iterator over a contiguous block of bytes.
264///
265/// This allows us to convert to-and-from-slices, raw pointers, and
266/// peek/query the data from either end cheaply.
267///
268/// A default implementation is provided for slice iterators.
269/// This trait **should never** return `null` from `as_ptr`, or be
270/// implemented for non-contiguous data.
271pub trait DigitsIter<'a>: Iterator<Item = &'a u8> + Iter<'a> {
272 /// Get if the iterator cannot return any more elements.
273 ///
274 /// This may advance the internal iterator state, but not
275 /// modify the next returned value.
276 ///
277 /// If this is an iterator, this is based on the number of items
278 /// left to be returned. We do not necessarly know the length of
279 /// the buffer. If this is a non-contiguous iterator, this **MUST**
280 /// advance the state until it knows a value can be returned.
281 ///
282 /// Any incorrect implementations of this affect all safety invariants
283 /// for the rest of the trait. For contiguous iterators, this can be
284 /// as simple as checking if `self.cursor >= self.slc.len()`, but for
285 /// non-contiguous iterators you **MUST** advance to the next element
286 /// to be returned, then check to see if a value exists. The safest
287 /// implementation is always to check if `self.peek().is_none()` and
288 /// ensure [`peek`] is always safe.
289 ///
290 /// If you would like to see if the cursor is at the end of the buffer,
291 /// see [`is_buffer_empty`] instead.
292 ///
293 /// [is_buffer_empty]: Iter::is_buffer_empty
294 /// [peek]: DigitsIter::peek
295 #[inline(always)]
296 #[allow(clippy::wrong_self_convention)] // reason="required for peeking next item"
297 fn is_consumed(&mut self) -> bool {
298 self.peek().is_none()
299 }
300
301 /// Increment the number of digits that have been returned by the iterator.
302 ///
303 /// For contiguous iterators, this is a no-op. For non-contiguous iterators,
304 /// this increments the count by 1.
305 fn increment_count(&mut self);
306
307 /// Peek the next value of the iterator, without consuming it.
308 ///
309 /// Note that this can modify the internal state, by skipping digits
310 /// for iterators that find the first non-zero value, etc. We optimize
311 /// this for the case where we have contiguous iterators, since
312 /// non-contiguous iterators already have a major performance penalty.
313 fn peek(&mut self) -> Option<Self::Item>;
314
315 /// Peek the next value of the iterator, and step only if it exists.
316 #[inline(always)]
317 fn try_read(&mut self) -> Option<Self::Item> {
318 if let Some(value) = self.peek() {
319 // SAFETY: the slice cannot be empty because we peeked a value.
320 unsafe { self.step_unchecked() };
321 Some(value)
322 } else {
323 None
324 }
325 }
326
327 /// Check if the next element is a given value.
328 #[inline(always)]
329 fn peek_is_cased(&mut self, value: u8) -> bool {
330 Some(&value) == self.peek()
331 }
332
333 /// Check if the next element is a given value without case sensitivity.
334 #[inline(always)]
335 fn peek_is_uncased(&mut self, value: u8) -> bool {
336 if let Some(&c) = self.peek() {
337 c.eq_ignore_ascii_case(&value)
338 } else {
339 false
340 }
341 }
342
343 /// Check if the next element is a given value with optional case
344 /// sensitivity.
345 #[inline(always)]
346 fn peek_is(&mut self, value: u8, is_cased: bool) -> bool {
347 if is_cased {
348 self.peek_is_cased(value)
349 } else {
350 self.peek_is_uncased(value)
351 }
352 }
353
354 /// Peek the next value and consume it if the read value matches the
355 /// expected one.
356 #[inline(always)]
357 fn read_if<Pred: FnOnce(u8) -> bool>(&mut self, pred: Pred) -> Option<u8> {
358 // NOTE: This was implemented to remove usage of unsafe throughout to code
359 // base, however, performance was really not up to scratch. I'm not sure
360 // the cause of this.
361 if let Some(&peeked) = self.peek() {
362 if pred(peeked) {
363 // SAFETY: the slice cannot be empty because we peeked a value.
364 unsafe { self.step_unchecked() };
365 Some(peeked)
366 } else {
367 None
368 }
369 } else {
370 None
371 }
372 }
373
374 /// Read a value if the value matches the provided one.
375 #[inline(always)]
376 fn read_if_value_cased(&mut self, value: u8) -> Option<u8> {
377 if self.peek() == Some(&value) {
378 // SAFETY: the slice cannot be empty because we peeked a value.
379 unsafe { self.step_unchecked() };
380 Some(value)
381 } else {
382 None
383 }
384 }
385
386 /// Read a value if the value matches the provided one without case
387 /// sensitivity.
388 #[inline(always)]
389 fn read_if_value_uncased(&mut self, value: u8) -> Option<u8> {
390 self.read_if(|x| x.eq_ignore_ascii_case(&value))
391 }
392
393 /// Read a value if the value matches the provided one.
394 #[inline(always)]
395 fn read_if_value(&mut self, value: u8, is_cased: bool) -> Option<u8> {
396 if is_cased {
397 self.read_if_value_cased(value)
398 } else {
399 self.read_if_value_uncased(value)
400 }
401 }
402
403 /// Skip zeros from the start of the iterator
404 #[inline(always)]
405 fn skip_zeros(&mut self) -> usize {
406 let start = self.current_count();
407 while self.read_if_value_cased(b'0').is_some() {
408 self.increment_count();
409 }
410 self.current_count() - start
411 }
412
413 /// Determine if the character is a digit.
414 fn is_digit(&self, value: u8) -> bool;
415}