polars_arrow/
offset.rs

1//! Contains the declaration of [`Offset`]
2use std::hint::unreachable_unchecked;
3use std::ops::Deref;
4
5use polars_error::{polars_bail, polars_err, PolarsError, PolarsResult};
6
7use crate::array::Splitable;
8use crate::buffer::Buffer;
9pub use crate::types::Offset;
10
11/// A wrapper type of [`Vec<O>`] representing the invariants of Arrow's offsets.
12/// It is guaranteed to (sound to assume that):
13/// * every element is `>= 0`
14/// * element at position `i` is >= than element at position `i-1`.
15#[derive(Debug, Clone, PartialEq, Eq)]
16pub struct Offsets<O: Offset>(Vec<O>);
17
18impl<O: Offset> Default for Offsets<O> {
19    #[inline]
20    fn default() -> Self {
21        Self::new()
22    }
23}
24
25impl<O: Offset> Deref for Offsets<O> {
26    type Target = [O];
27
28    fn deref(&self) -> &Self::Target {
29        self.as_slice()
30    }
31}
32
33impl<O: Offset> TryFrom<Vec<O>> for Offsets<O> {
34    type Error = PolarsError;
35
36    #[inline]
37    fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
38        try_check_offsets(&offsets)?;
39        Ok(Self(offsets))
40    }
41}
42
43impl<O: Offset> TryFrom<Buffer<O>> for OffsetsBuffer<O> {
44    type Error = PolarsError;
45
46    #[inline]
47    fn try_from(offsets: Buffer<O>) -> Result<Self, Self::Error> {
48        try_check_offsets(&offsets)?;
49        Ok(Self(offsets))
50    }
51}
52
53impl<O: Offset> TryFrom<Vec<O>> for OffsetsBuffer<O> {
54    type Error = PolarsError;
55
56    #[inline]
57    fn try_from(offsets: Vec<O>) -> Result<Self, Self::Error> {
58        try_check_offsets(&offsets)?;
59        Ok(Self(offsets.into()))
60    }
61}
62
63impl<O: Offset> From<Offsets<O>> for OffsetsBuffer<O> {
64    #[inline]
65    fn from(offsets: Offsets<O>) -> Self {
66        Self(offsets.0.into())
67    }
68}
69
70impl<O: Offset> Offsets<O> {
71    /// Returns an empty [`Offsets`] (i.e. with a single element, the zero)
72    #[inline]
73    pub fn new() -> Self {
74        Self(vec![O::zero()])
75    }
76
77    /// Returns an [`Offsets`] whose all lengths are zero.
78    #[inline]
79    pub fn new_zeroed(length: usize) -> Self {
80        Self(vec![O::zero(); length + 1])
81    }
82
83    /// Creates a new [`Offsets`] from an iterator of lengths
84    #[inline]
85    pub fn try_from_iter<I: IntoIterator<Item = usize>>(iter: I) -> PolarsResult<Self> {
86        let iterator = iter.into_iter();
87        let (lower, _) = iterator.size_hint();
88        let mut offsets = Self::with_capacity(lower);
89        for item in iterator {
90            offsets.try_push(item)?
91        }
92        Ok(offsets)
93    }
94
95    /// Returns a new [`Offsets`] with a capacity, allocating at least `capacity + 1` entries.
96    pub fn with_capacity(capacity: usize) -> Self {
97        let mut offsets = Vec::with_capacity(capacity + 1);
98        offsets.push(O::zero());
99        Self(offsets)
100    }
101
102    /// Returns the capacity of [`Offsets`].
103    pub fn capacity(&self) -> usize {
104        self.0.capacity() - 1
105    }
106
107    /// Reserves `additional` entries.
108    pub fn reserve(&mut self, additional: usize) {
109        self.0.reserve(additional);
110    }
111
112    /// Shrinks the capacity of self to fit.
113    pub fn shrink_to_fit(&mut self) {
114        self.0.shrink_to_fit();
115    }
116
117    /// Pushes a new element with a given length.
118    /// # Error
119    /// This function errors iff the new last item is larger than what `O` supports.
120    /// # Implementation
121    /// This function:
122    /// * checks that this length does not overflow
123    #[inline]
124    pub fn try_push(&mut self, length: usize) -> PolarsResult<()> {
125        if O::IS_LARGE {
126            let length = O::from_as_usize(length);
127            let old_length = self.last();
128            let new_length = *old_length + length;
129            self.0.push(new_length);
130            Ok(())
131        } else {
132            let length =
133                O::from_usize(length).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
134
135            let old_length = self.last();
136            let new_length = old_length
137                .checked_add(&length)
138                .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
139            self.0.push(new_length);
140            Ok(())
141        }
142    }
143
144    /// Returns [`Offsets`] assuming that `offsets` fulfills its invariants
145    ///
146    /// # Safety
147    /// This is safe iff the invariants of this struct are guaranteed in `offsets`.
148    #[inline]
149    pub unsafe fn new_unchecked(offsets: Vec<O>) -> Self {
150        #[cfg(debug_assertions)]
151        {
152            let mut prev_offset = O::default();
153            let mut is_monotonely_increasing = true;
154            for offset in &offsets {
155                is_monotonely_increasing &= *offset >= prev_offset;
156                prev_offset = *offset;
157            }
158            assert!(
159                is_monotonely_increasing,
160                "Unsafe precondition violated. Invariant of offsets broken."
161            );
162        }
163
164        Self(offsets)
165    }
166
167    /// Returns the last offset of this container.
168    #[inline]
169    pub fn last(&self) -> &O {
170        match self.0.last() {
171            Some(element) => element,
172            None => unsafe { unreachable_unchecked() },
173        }
174    }
175
176    /// Returns a `length` corresponding to the position `index`
177    /// # Panic
178    /// This function panics iff `index >= self.len()`
179    #[inline]
180    pub fn length_at(&self, index: usize) -> usize {
181        let (start, end) = self.start_end(index);
182        end - start
183    }
184
185    /// Returns a range (start, end) corresponding to the position `index`
186    /// # Panic
187    /// This function panics iff `index >= self.len_proxy()`
188    #[inline]
189    pub fn start_end(&self, index: usize) -> (usize, usize) {
190        // soundness: the invariant of the function
191        assert!(index < self.len_proxy());
192        unsafe { self.start_end_unchecked(index) }
193    }
194
195    /// Returns a range (start, end) corresponding to the position `index`
196    ///
197    /// # Safety
198    /// `index` must be `< self.len()`
199    #[inline]
200    pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
201        // soundness: the invariant of the function
202        let start = self.0.get_unchecked(index).to_usize();
203        let end = self.0.get_unchecked(index + 1).to_usize();
204        (start, end)
205    }
206
207    /// Returns the length an array with these offsets would be.
208    #[inline]
209    pub fn len_proxy(&self) -> usize {
210        self.0.len() - 1
211    }
212
213    #[inline]
214    /// Returns the number of offsets in this container.
215    pub fn len(&self) -> usize {
216        self.0.len()
217    }
218
219    /// Returns the byte slice stored in this buffer
220    #[inline]
221    pub fn as_slice(&self) -> &[O] {
222        self.0.as_slice()
223    }
224
225    /// Pops the last element
226    #[inline]
227    pub fn pop(&mut self) -> Option<O> {
228        if self.len_proxy() == 0 {
229            None
230        } else {
231            self.0.pop()
232        }
233    }
234
235    /// Extends itself with `additional` elements equal to the last offset.
236    /// This is useful to extend offsets with empty values, e.g. for null slots.
237    #[inline]
238    pub fn extend_constant(&mut self, additional: usize) {
239        let offset = *self.last();
240        if additional == 1 {
241            self.0.push(offset)
242        } else {
243            self.0.resize(self.len() + additional, offset)
244        }
245    }
246
247    /// Try to create a new [`Offsets`] from a sequence of `lengths`
248    /// # Errors
249    /// This function errors iff this operation overflows for the maximum value of `O`.
250    #[inline]
251    pub fn try_from_lengths<I: Iterator<Item = usize>>(lengths: I) -> PolarsResult<Self> {
252        let mut self_ = Self::with_capacity(lengths.size_hint().0);
253        self_.try_extend_from_lengths(lengths)?;
254        Ok(self_)
255    }
256
257    /// Try extend from an iterator of lengths
258    /// # Errors
259    /// This function errors iff this operation overflows for the maximum value of `O`.
260    #[inline]
261    pub fn try_extend_from_lengths<I: Iterator<Item = usize>>(
262        &mut self,
263        lengths: I,
264    ) -> PolarsResult<()> {
265        let mut total_length = 0;
266        let mut offset = *self.last();
267        let original_offset = offset.to_usize();
268
269        let lengths = lengths.map(|length| {
270            total_length += length;
271            O::from_as_usize(length)
272        });
273
274        let offsets = lengths.map(|length| {
275            offset += length; // this may overflow, checked below
276            offset
277        });
278        self.0.extend(offsets);
279
280        let last_offset = original_offset
281            .checked_add(total_length)
282            .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
283        O::from_usize(last_offset).ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
284        Ok(())
285    }
286
287    /// Extends itself from another [`Offsets`]
288    /// # Errors
289    /// This function errors iff this operation overflows for the maximum value of `O`.
290    pub fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
291        let mut length = *self.last();
292        let other_length = *other.last();
293        // check if the operation would overflow
294        length
295            .checked_add(&other_length)
296            .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
297
298        let lengths = other.as_slice().windows(2).map(|w| w[1] - w[0]);
299        let offsets = lengths.map(|new_length| {
300            length += new_length;
301            length
302        });
303        self.0.extend(offsets);
304        Ok(())
305    }
306
307    /// Extends itself from another [`Offsets`] sliced by `start, length`
308    /// # Errors
309    /// This function errors iff this operation overflows for the maximum value of `O`.
310    pub fn try_extend_from_slice(
311        &mut self,
312        other: &OffsetsBuffer<O>,
313        start: usize,
314        length: usize,
315    ) -> PolarsResult<()> {
316        if length == 0 {
317            return Ok(());
318        }
319        let other = &other.0[start..start + length + 1];
320        let other_length = other.last().expect("Length to be non-zero");
321        let mut length = *self.last();
322        // check if the operation would overflow
323        length
324            .checked_add(other_length)
325            .ok_or_else(|| polars_err!(ComputeError: "overflow"))?;
326
327        let lengths = other.windows(2).map(|w| w[1] - w[0]);
328        let offsets = lengths.map(|new_length| {
329            length += new_length;
330            length
331        });
332        self.0.extend(offsets);
333        Ok(())
334    }
335
336    /// Returns the inner [`Vec`].
337    #[inline]
338    pub fn into_inner(self) -> Vec<O> {
339        self.0
340    }
341}
342
343/// Checks that `offsets` is monotonically increasing.
344fn try_check_offsets<O: Offset>(offsets: &[O]) -> PolarsResult<()> {
345    // this code is carefully constructed to auto-vectorize, don't change naively!
346    match offsets.first() {
347        None => polars_bail!(ComputeError: "offsets must have at least one element"),
348        Some(first) => {
349            if *first < O::zero() {
350                polars_bail!(ComputeError: "offsets must be larger than 0")
351            }
352            let mut previous = *first;
353            let mut any_invalid = false;
354
355            // This loop will auto-vectorize because there is not any break,
356            // an invalid value will be returned once the whole offsets buffer is processed.
357            for offset in offsets {
358                if previous > *offset {
359                    any_invalid = true
360                }
361                previous = *offset;
362            }
363
364            if any_invalid {
365                polars_bail!(ComputeError: "offsets must be monotonically increasing")
366            } else {
367                Ok(())
368            }
369        },
370    }
371}
372
373/// A wrapper type of [`Buffer<O>`] that is guaranteed to:
374/// * Always contain an element
375/// * Every element is `>= 0`
376/// * element at position `i` is >= than element at position `i-1`.
377#[derive(Clone, PartialEq, Debug)]
378pub struct OffsetsBuffer<O: Offset>(Buffer<O>);
379
380impl<O: Offset> Default for OffsetsBuffer<O> {
381    #[inline]
382    fn default() -> Self {
383        Self(vec![O::zero()].into())
384    }
385}
386
387impl<O: Offset> OffsetsBuffer<O> {
388    /// # Safety
389    /// This is safe iff the invariants of this struct are guaranteed in `offsets`.
390    #[inline]
391    pub unsafe fn new_unchecked(offsets: Buffer<O>) -> Self {
392        Self(offsets)
393    }
394
395    /// Returns an empty [`OffsetsBuffer`] (i.e. with a single element, the zero)
396    #[inline]
397    pub fn new() -> Self {
398        Self(vec![O::zero()].into())
399    }
400
401    #[inline]
402    pub fn one_with_length(length: O) -> Self {
403        Self(vec![O::zero(), length].into())
404    }
405
406    /// Copy-on-write API to convert [`OffsetsBuffer`] into [`Offsets`].
407    #[inline]
408    pub fn into_mut(self) -> either::Either<Self, Offsets<O>> {
409        self.0
410            .into_mut()
411            // SAFETY: Offsets and OffsetsBuffer share invariants
412            .map_right(|offsets| unsafe { Offsets::new_unchecked(offsets) })
413            .map_left(Self)
414    }
415
416    /// Returns a reference to its internal [`Buffer`].
417    #[inline]
418    pub fn buffer(&self) -> &Buffer<O> {
419        &self.0
420    }
421
422    /// Returns what the length an array with these offsets would be.
423    #[inline]
424    pub fn len_proxy(&self) -> usize {
425        self.0.len() - 1
426    }
427
428    /// Returns the number of offsets in this container.
429    #[inline]
430    pub fn len(&self) -> usize {
431        self.0.len()
432    }
433
434    /// Returns the byte slice stored in this buffer
435    #[inline]
436    pub fn as_slice(&self) -> &[O] {
437        self.0.as_slice()
438    }
439
440    /// Returns the range of the offsets.
441    #[inline]
442    pub fn range(&self) -> O {
443        *self.last() - *self.first()
444    }
445
446    /// Returns the first offset.
447    #[inline]
448    pub fn first(&self) -> &O {
449        match self.0.first() {
450            Some(element) => element,
451            None => unsafe { unreachable_unchecked() },
452        }
453    }
454
455    /// Returns the last offset.
456    #[inline]
457    pub fn last(&self) -> &O {
458        match self.0.last() {
459            Some(element) => element,
460            None => unsafe { unreachable_unchecked() },
461        }
462    }
463
464    /// Returns a `length` corresponding to the position `index`
465    /// # Panic
466    /// This function panics iff `index >= self.len_proxy()`
467    #[inline]
468    pub fn length_at(&self, index: usize) -> usize {
469        let (start, end) = self.start_end(index);
470        end - start
471    }
472
473    /// Returns a range (start, end) corresponding to the position `index`
474    /// # Panic
475    /// This function panics iff `index >= self.len_proxy()`
476    #[inline]
477    pub fn start_end(&self, index: usize) -> (usize, usize) {
478        // soundness: the invariant of the function
479        assert!(index < self.len_proxy());
480        unsafe { self.start_end_unchecked(index) }
481    }
482
483    /// Returns a range (start, end) corresponding to the position `index`
484    ///
485    /// # Safety
486    /// `index` must be `< self.len()`
487    #[inline]
488    pub unsafe fn start_end_unchecked(&self, index: usize) -> (usize, usize) {
489        // soundness: the invariant of the function
490        let start = self.0.get_unchecked(index).to_usize();
491        let end = self.0.get_unchecked(index + 1).to_usize();
492        (start, end)
493    }
494
495    /// Slices this [`OffsetsBuffer`].
496    /// # Panics
497    /// Panics if `offset + length` is larger than `len`
498    /// or `length == 0`.
499    #[inline]
500    pub fn slice(&mut self, offset: usize, length: usize) {
501        assert!(length > 0);
502        self.0.slice(offset, length);
503    }
504
505    /// Slices this [`OffsetsBuffer`] starting at `offset`.
506    ///
507    /// # Safety
508    /// The caller must ensure `offset + length <= self.len()`
509    #[inline]
510    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
511        self.0.slice_unchecked(offset, length);
512    }
513
514    /// Returns an iterator with the lengths of the offsets
515    #[inline]
516    pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
517        self.0.windows(2).map(|w| (w[1] - w[0]).to_usize())
518    }
519
520    /// Returns `(offset, len)` pairs.
521    #[inline]
522    pub fn offset_and_length_iter(&self) -> impl ExactSizeIterator<Item = (usize, usize)> + '_ {
523        self.windows(2).map(|x| {
524            let [l, r] = x else { unreachable!() };
525            let l = l.to_usize();
526            let r = r.to_usize();
527            (l, r - l)
528        })
529    }
530
531    /// Offset and length of the primitive (leaf) array for a double+ nested list for every outer
532    /// row.
533    pub fn leaf_ranges_iter(
534        offsets: &[Self],
535    ) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
536        let others = &offsets[1..];
537
538        offsets[0].windows(2).map(move |x| {
539            let [l, r] = x else { unreachable!() };
540            let mut l = l.to_usize();
541            let mut r = r.to_usize();
542
543            for o in others {
544                let slc = o.as_slice();
545                l = slc[l].to_usize();
546                r = slc[r].to_usize();
547            }
548
549            l..r
550        })
551    }
552
553    /// Return the full range of the leaf array used by the list.
554    pub fn leaf_full_start_end(offsets: &[Self]) -> core::ops::Range<usize> {
555        let mut l = offsets[0].first().to_usize();
556        let mut r = offsets[0].last().to_usize();
557
558        for o in &offsets[1..] {
559            let slc = o.as_slice();
560            l = slc[l].to_usize();
561            r = slc[r].to_usize();
562        }
563
564        l..r
565    }
566
567    /// Returns the inner [`Buffer`].
568    #[inline]
569    pub fn into_inner(self) -> Buffer<O> {
570        self.0
571    }
572
573    /// Returns the offset difference between `start` and `end`.
574    #[inline]
575    pub fn delta(&self, start: usize, end: usize) -> usize {
576        assert!(start <= end);
577
578        (self.0[end + 1] - self.0[start]).to_usize()
579    }
580}
581
582impl From<&OffsetsBuffer<i32>> for OffsetsBuffer<i64> {
583    fn from(offsets: &OffsetsBuffer<i32>) -> Self {
584        // this conversion is lossless and uphelds all invariants
585        Self(
586            offsets
587                .buffer()
588                .iter()
589                .map(|x| *x as i64)
590                .collect::<Vec<_>>()
591                .into(),
592        )
593    }
594}
595
596impl TryFrom<&OffsetsBuffer<i64>> for OffsetsBuffer<i32> {
597    type Error = PolarsError;
598
599    fn try_from(offsets: &OffsetsBuffer<i64>) -> Result<Self, Self::Error> {
600        i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
601
602        // this conversion is lossless and uphelds all invariants
603        Ok(Self(
604            offsets
605                .buffer()
606                .iter()
607                .map(|x| *x as i32)
608                .collect::<Vec<_>>()
609                .into(),
610        ))
611    }
612}
613
614impl From<Offsets<i32>> for Offsets<i64> {
615    fn from(offsets: Offsets<i32>) -> Self {
616        // this conversion is lossless and uphelds all invariants
617        Self(
618            offsets
619                .as_slice()
620                .iter()
621                .map(|x| *x as i64)
622                .collect::<Vec<_>>(),
623        )
624    }
625}
626
627impl TryFrom<Offsets<i64>> for Offsets<i32> {
628    type Error = PolarsError;
629
630    fn try_from(offsets: Offsets<i64>) -> Result<Self, Self::Error> {
631        i32::try_from(*offsets.last()).map_err(|_| polars_err!(ComputeError: "overflow"))?;
632
633        // this conversion is lossless and uphelds all invariants
634        Ok(Self(
635            offsets
636                .as_slice()
637                .iter()
638                .map(|x| *x as i32)
639                .collect::<Vec<_>>(),
640        ))
641    }
642}
643
644impl<O: Offset> std::ops::Deref for OffsetsBuffer<O> {
645    type Target = [O];
646
647    #[inline]
648    fn deref(&self) -> &[O] {
649        self.0.as_slice()
650    }
651}
652
653impl<O: Offset> Splitable for OffsetsBuffer<O> {
654    fn check_bound(&self, offset: usize) -> bool {
655        offset <= self.len_proxy()
656    }
657
658    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
659        let mut lhs = self.0.clone();
660        let mut rhs = self.0.clone();
661
662        lhs.slice(0, offset + 1);
663        rhs.slice(offset, self.0.len() - offset);
664
665        (Self(lhs), Self(rhs))
666    }
667}