polars_arrow/array/binary/
mod.rs

1use either::Either;
2
3use super::specification::try_check_offsets_bounds;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::bitmap::utils::{BitmapIter, ZipValidity};
7use crate::bitmap::Bitmap;
8use crate::buffer::Buffer;
9use crate::datatypes::ArrowDataType;
10use crate::offset::{Offset, Offsets, OffsetsBuffer};
11use crate::trusted_len::TrustedLen;
12
13mod ffi;
14pub(super) mod fmt;
15mod iterator;
16pub use iterator::*;
17mod from;
18mod mutable_values;
19pub use mutable_values::*;
20mod mutable;
21pub use mutable::*;
22use polars_error::{polars_bail, PolarsResult};
23
24/// A [`BinaryArray`] is Arrow's semantically equivalent of an immutable `Vec<Option<Vec<u8>>>`.
25/// It implements [`Array`].
26///
27/// The size of this struct is `O(1)`, as all data is stored behind an [`std::sync::Arc`].
28/// # Example
29/// ```
30/// use polars_arrow::array::BinaryArray;
31/// use polars_arrow::bitmap::Bitmap;
32/// use polars_arrow::buffer::Buffer;
33///
34/// let array = BinaryArray::<i32>::from([Some([1, 2].as_ref()), None, Some([3].as_ref())]);
35/// assert_eq!(array.value(0), &[1, 2]);
36/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some([1, 2].as_ref()), None, Some([3].as_ref())]);
37/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec![[1, 2].as_ref(), &[], &[3]]);
38/// // the underlying representation:
39/// assert_eq!(array.values(), &Buffer::from(vec![1, 2, 3]));
40/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 3]));
41/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
42/// ```
43///
44/// # Generic parameter
45/// The generic parameter [`Offset`] can only be `i32` or `i64` and tradeoffs maximum array length with
46/// memory usage:
47/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
48/// * the total size of the underlying data is `array.len() * size_of::<Offset>() + sum of lengths of all elements`
49///
50/// # Safety
51/// The following invariants hold:
52/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
53/// * `len` is equal to `validity.len()`, when defined.
54#[derive(Clone)]
55pub struct BinaryArray<O: Offset> {
56    dtype: ArrowDataType,
57    offsets: OffsetsBuffer<O>,
58    values: Buffer<u8>,
59    validity: Option<Bitmap>,
60}
61
62impl<O: Offset> BinaryArray<O> {
63    /// Returns a [`BinaryArray`] created from its internal representation.
64    ///
65    /// # Errors
66    /// This function returns an error iff:
67    /// * The last offset is not equal to the values' length.
68    /// * the validity's length is not equal to `offsets.len()`.
69    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Binary` or `LargeBinary`.
70    /// # Implementation
71    /// This function is `O(1)`
72    pub fn try_new(
73        dtype: ArrowDataType,
74        offsets: OffsetsBuffer<O>,
75        values: Buffer<u8>,
76        validity: Option<Bitmap>,
77    ) -> PolarsResult<Self> {
78        try_check_offsets_bounds(&offsets, values.len())?;
79
80        if validity
81            .as_ref()
82            .is_some_and(|validity| validity.len() != offsets.len_proxy())
83        {
84            polars_bail!(ComputeError: "validity mask length must match the number of values")
85        }
86
87        if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
88            polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
89        }
90
91        Ok(Self {
92            dtype,
93            offsets,
94            values,
95            validity,
96        })
97    }
98
99    /// Creates a new [`BinaryArray`] without checking invariants.
100    ///
101    /// # Safety
102    ///
103    /// The invariants must be valid (see try_new).
104    pub unsafe fn new_unchecked(
105        dtype: ArrowDataType,
106        offsets: OffsetsBuffer<O>,
107        values: Buffer<u8>,
108        validity: Option<Bitmap>,
109    ) -> Self {
110        Self {
111            dtype,
112            offsets,
113            values,
114            validity,
115        }
116    }
117
118    /// Creates a new [`BinaryArray`] from slices of `&[u8]`.
119    pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
120        Self::from_trusted_len_values_iter(slice.as_ref().iter())
121    }
122
123    /// Creates a new [`BinaryArray`] from a slice of optional `&[u8]`.
124    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
125    pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
126        MutableBinaryArray::<O>::from(slice).into()
127    }
128
129    /// Returns an iterator of `Option<&[u8]>` over every element of this array.
130    pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
131        ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
132    }
133
134    /// Returns an iterator of `&[u8]` over every element of this array, ignoring the validity
135    pub fn values_iter(&self) -> BinaryValueIter<O> {
136        BinaryValueIter::new(self)
137    }
138
139    /// Returns an iterator of the non-null values.
140    #[inline]
141    pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
142        NonNullValuesIter::new(self, self.validity())
143    }
144
145    /// Returns the length of this array
146    #[inline]
147    pub fn len(&self) -> usize {
148        self.offsets.len_proxy()
149    }
150
151    /// Returns the element at index `i`
152    /// # Panics
153    /// iff `i >= self.len()`
154    #[inline]
155    pub fn value(&self, i: usize) -> &[u8] {
156        assert!(i < self.len());
157        unsafe { self.value_unchecked(i) }
158    }
159
160    /// Returns the element at index `i`
161    ///
162    /// # Safety
163    /// Assumes that the `i < self.len`.
164    #[inline]
165    pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
166        // soundness: the invariant of the function
167        let (start, end) = self.offsets.start_end_unchecked(i);
168
169        // soundness: the invariant of the struct
170        self.values.get_unchecked(start..end)
171    }
172
173    /// Returns the element at index `i` or `None` if it is null
174    /// # Panics
175    /// iff `i >= self.len()`
176    #[inline]
177    pub fn get(&self, i: usize) -> Option<&[u8]> {
178        if !self.is_null(i) {
179            // soundness: Array::is_null panics if i >= self.len
180            unsafe { Some(self.value_unchecked(i)) }
181        } else {
182            None
183        }
184    }
185
186    /// Returns the [`ArrowDataType`] of this array.
187    #[inline]
188    pub fn dtype(&self) -> &ArrowDataType {
189        &self.dtype
190    }
191
192    /// Returns the values of this [`BinaryArray`].
193    #[inline]
194    pub fn values(&self) -> &Buffer<u8> {
195        &self.values
196    }
197
198    /// Returns the offsets of this [`BinaryArray`].
199    #[inline]
200    pub fn offsets(&self) -> &OffsetsBuffer<O> {
201        &self.offsets
202    }
203
204    /// The optional validity.
205    #[inline]
206    pub fn validity(&self) -> Option<&Bitmap> {
207        self.validity.as_ref()
208    }
209
210    /// Slices this [`BinaryArray`].
211    /// # Implementation
212    /// This function is `O(1)`.
213    /// # Panics
214    /// iff `offset + length > self.len()`.
215    pub fn slice(&mut self, offset: usize, length: usize) {
216        assert!(
217            offset + length <= self.len(),
218            "the offset of the new Buffer cannot exceed the existing length"
219        );
220        unsafe { self.slice_unchecked(offset, length) }
221    }
222
223    /// Slices this [`BinaryArray`].
224    /// # Implementation
225    /// This function is `O(1)`.
226    ///
227    /// # Safety
228    /// The caller must ensure that `offset + length <= self.len()`.
229    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
230        self.validity = self
231            .validity
232            .take()
233            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
234            .filter(|bitmap| bitmap.unset_bits() > 0);
235        self.offsets.slice_unchecked(offset, length + 1);
236    }
237
238    impl_sliced!();
239    impl_mut_validity!();
240    impl_into_array!();
241
242    /// Returns its internal representation
243    #[must_use]
244    pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
245        let Self {
246            dtype,
247            offsets,
248            values,
249            validity,
250        } = self;
251        (dtype, offsets, values, validity)
252    }
253
254    /// Try to convert this `BinaryArray` to a `MutableBinaryArray`
255    #[must_use]
256    pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
257        use Either::*;
258        if let Some(bitmap) = self.validity {
259            match bitmap.into_mut() {
260                // SAFETY: invariants are preserved
261                Left(bitmap) => Left(BinaryArray::new(
262                    self.dtype,
263                    self.offsets,
264                    self.values,
265                    Some(bitmap),
266                )),
267                Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
268                    (Left(values), Left(offsets)) => Left(BinaryArray::new(
269                        self.dtype,
270                        offsets,
271                        values,
272                        Some(mutable_bitmap.into()),
273                    )),
274                    (Left(values), Right(offsets)) => Left(BinaryArray::new(
275                        self.dtype,
276                        offsets.into(),
277                        values,
278                        Some(mutable_bitmap.into()),
279                    )),
280                    (Right(values), Left(offsets)) => Left(BinaryArray::new(
281                        self.dtype,
282                        offsets,
283                        values.into(),
284                        Some(mutable_bitmap.into()),
285                    )),
286                    (Right(values), Right(offsets)) => Right(
287                        MutableBinaryArray::try_new(
288                            self.dtype,
289                            offsets,
290                            values,
291                            Some(mutable_bitmap),
292                        )
293                        .unwrap(),
294                    ),
295                },
296            }
297        } else {
298            match (self.values.into_mut(), self.offsets.into_mut()) {
299                (Left(values), Left(offsets)) => {
300                    Left(BinaryArray::new(self.dtype, offsets, values, None))
301                },
302                (Left(values), Right(offsets)) => {
303                    Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
304                },
305                (Right(values), Left(offsets)) => {
306                    Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
307                },
308                (Right(values), Right(offsets)) => {
309                    Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
310                },
311            }
312        }
313    }
314
315    /// Creates an empty [`BinaryArray`], i.e. whose `.len` is zero.
316    pub fn new_empty(dtype: ArrowDataType) -> Self {
317        Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
318    }
319
320    /// Creates an null [`BinaryArray`], i.e. whose `.null_count() == .len()`.
321    #[inline]
322    pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
323        unsafe {
324            Self::new_unchecked(
325                dtype,
326                Offsets::new_zeroed(length).into(),
327                Buffer::new(),
328                Some(Bitmap::new_zeroed(length)),
329            )
330        }
331    }
332
333    /// Returns the default [`ArrowDataType`], `DataType::Binary` or `DataType::LargeBinary`
334    pub fn default_dtype() -> ArrowDataType {
335        if O::IS_LARGE {
336            ArrowDataType::LargeBinary
337        } else {
338            ArrowDataType::Binary
339        }
340    }
341
342    /// Alias for unwrapping [`Self::try_new`]
343    pub fn new(
344        dtype: ArrowDataType,
345        offsets: OffsetsBuffer<O>,
346        values: Buffer<u8>,
347        validity: Option<Bitmap>,
348    ) -> Self {
349        Self::try_new(dtype, offsets, values, validity).unwrap()
350    }
351
352    /// Returns a [`BinaryArray`] from an iterator of trusted length.
353    ///
354    /// The [`BinaryArray`] is guaranteed to not have a validity
355    #[inline]
356    pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
357        iterator: I,
358    ) -> Self {
359        MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
360    }
361
362    /// Returns a new [`BinaryArray`] from a [`Iterator`] of `&[u8]`.
363    ///
364    /// The [`BinaryArray`] is guaranteed to not have a validity
365    pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
366        MutableBinaryArray::<O>::from_iter_values(iterator).into()
367    }
368
369    /// Creates a [`BinaryArray`] from an iterator of trusted length.
370    ///
371    /// # Safety
372    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
373    /// I.e. that `size_hint().1` correctly reports its length.
374    #[inline]
375    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
376    where
377        P: AsRef<[u8]>,
378        I: Iterator<Item = Option<P>>,
379    {
380        MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
381    }
382
383    /// Creates a [`BinaryArray`] from a [`TrustedLen`]
384    #[inline]
385    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
386    where
387        P: AsRef<[u8]>,
388        I: TrustedLen<Item = Option<P>>,
389    {
390        // soundness: I is `TrustedLen`
391        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
392    }
393
394    /// Creates a [`BinaryArray`] from an falible iterator of trusted length.
395    ///
396    /// # Safety
397    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
398    /// I.e. that `size_hint().1` correctly reports its length.
399    #[inline]
400    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
401    where
402        P: AsRef<[u8]>,
403        I: IntoIterator<Item = Result<Option<P>, E>>,
404    {
405        MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
406    }
407
408    /// Creates a [`BinaryArray`] from an fallible iterator of trusted length.
409    #[inline]
410    pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
411    where
412        P: AsRef<[u8]>,
413        I: TrustedLen<Item = Result<Option<P>, E>>,
414    {
415        // soundness: I: TrustedLen
416        unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
417    }
418}
419
420impl<O: Offset> Array for BinaryArray<O> {
421    impl_common_array!();
422
423    fn validity(&self) -> Option<&Bitmap> {
424        self.validity.as_ref()
425    }
426
427    #[inline]
428    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
429        Box::new(self.clone().with_validity(validity))
430    }
431}
432
433unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
434    #[inline]
435    fn values(&self) -> &[u8] {
436        self.values()
437    }
438
439    #[inline]
440    fn offsets(&self) -> &[O] {
441        self.offsets().buffer()
442    }
443}
444
445impl<O: Offset> Splitable for BinaryArray<O> {
446    #[inline(always)]
447    fn check_bound(&self, offset: usize) -> bool {
448        offset <= self.len()
449    }
450
451    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
452        let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
453        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
454
455        (
456            Self {
457                dtype: self.dtype.clone(),
458                offsets: lhs_offsets,
459                values: self.values.clone(),
460                validity: lhs_validity,
461            },
462            Self {
463                dtype: self.dtype.clone(),
464                offsets: rhs_offsets,
465                values: self.values.clone(),
466                validity: rhs_validity,
467            },
468        )
469    }
470}