polars_arrow/array/utf8/
mod.rs

1use either::Either;
2
3use super::specification::try_check_utf8;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::array::BinaryArray;
7use crate::bitmap::utils::{BitmapIter, ZipValidity};
8use crate::bitmap::Bitmap;
9use crate::buffer::Buffer;
10use crate::datatypes::ArrowDataType;
11use crate::offset::{Offset, Offsets, OffsetsBuffer};
12use crate::trusted_len::TrustedLen;
13
14mod ffi;
15pub(super) mod fmt;
16mod from;
17mod iterator;
18mod mutable;
19mod mutable_values;
20pub use iterator::*;
21pub use mutable::*;
22pub use mutable_values::MutableUtf8ValuesArray;
23use polars_error::*;
24
// Auxiliary newtype that allows presenting string-like values as `[u8]` to a
// generic function: it wraps any `P` and, when `P: AsRef<str>`, exposes the
// string's bytes through `AsRef<[u8]>`.
pub(super) struct StrAsBytes<P>(P);
27impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
28    #[inline(always)]
29    fn as_ref(&self) -> &[u8] {
30        self.0.as_ref().as_bytes()
31    }
32}
33
/// A [`Utf8Array`] is arrow's semantic equivalent of an immutable `Vec<Option<String>>`.
/// Cloning and slicing this struct is `O(1)`.
/// # Example
/// ```
/// use polars_arrow::bitmap::Bitmap;
/// use polars_arrow::buffer::Buffer;
/// use polars_arrow::array::Utf8Array;
/// # fn main() {
/// let array = Utf8Array::<i32>::from([Some("hi"), None, Some("there")]);
/// assert_eq!(array.value(0), "hi");
/// assert_eq!(array.iter().collect::<Vec<_>>(), vec![Some("hi"), None, Some("there")]);
/// assert_eq!(array.values_iter().collect::<Vec<_>>(), vec!["hi", "", "there"]);
/// // the underlying representation
/// assert_eq!(array.validity(), Some(&Bitmap::from([true, false, true])));
/// assert_eq!(array.values(), &Buffer::from(b"hithere".to_vec()));
/// assert_eq!(array.offsets().buffer(), &Buffer::from(vec![0, 2, 2, 2 + 5]));
/// # }
/// ```
///
/// # Generic parameter
/// The generic parameter [`Offset`] can only be `i32` or `i64` and trades off maximum array length
/// against memory usage:
/// * the sum of lengths of all elements cannot exceed `Offset::MAX`
/// * the total size of the underlying data is
///   `(array.len() + 1) * size_of::<Offset>() + sum of lengths of all elements`
///   (the offsets buffer holds one more entry than there are elements, see the example above)
///
/// # Safety
/// The following invariants hold:
/// * Two consecutive `offsets` cast (`as`) to `usize` are valid slices of `values`.
/// * A slice of `values` taken from two consecutive `offsets` is valid `utf8`.
/// * `len` is equal to `validity.len()`, when defined.
#[derive(Clone)]
pub struct Utf8Array<O: Offset> {
    // Logical type of the array; its physical type must be `Utf8` or `LargeUtf8`.
    dtype: ArrowDataType,
    // Boundaries of every element inside `values`; element `i` spans `offsets[i]..offsets[i + 1]`.
    offsets: OffsetsBuffer<O>,
    // Concatenated utf8 bytes of all elements.
    values: Buffer<u8>,
    // Optional validity mask; `None` is used when no mask is tracked.
    validity: Option<Bitmap>,
}
71
72// constructors
73impl<O: Offset> Utf8Array<O> {
74    /// Returns a [`Utf8Array`] created from its internal representation.
75    ///
76    /// # Errors
77    /// This function returns an error iff:
78    /// * The last offset is greater than the values' length.
79    /// * the validity's length is not equal to `offsets.len_proxy()`.
80    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
81    /// * The `values` between two consecutive `offsets` are not valid utf8
82    /// # Implementation
83    /// This function is `O(N)` - checking utf8 is `O(N)`
84    pub fn try_new(
85        dtype: ArrowDataType,
86        offsets: OffsetsBuffer<O>,
87        values: Buffer<u8>,
88        validity: Option<Bitmap>,
89    ) -> PolarsResult<Self> {
90        try_check_utf8(&offsets, &values)?;
91        if validity
92            .as_ref()
93            .is_some_and(|validity| validity.len() != offsets.len_proxy())
94        {
95            polars_bail!(ComputeError: "validity mask length must match the number of values");
96        }
97
98        if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
99            polars_bail!(ComputeError: "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
100        }
101
102        Ok(Self {
103            dtype,
104            offsets,
105            values,
106            validity,
107        })
108    }
109
110    /// Returns a [`Utf8Array`] from a slice of `&str`.
111    ///
112    /// A convenience method that uses [`Self::from_trusted_len_values_iter`].
113    pub fn from_slice<T: AsRef<str>, P: AsRef<[T]>>(slice: P) -> Self {
114        Self::from_trusted_len_values_iter(slice.as_ref().iter())
115    }
116
117    /// Returns a new [`Utf8Array`] from a slice of `&str`.
118    ///
119    /// A convenience method that uses [`Self::from_trusted_len_iter`].
120    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
121    pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
122        MutableUtf8Array::<O>::from(slice).into()
123    }
124
125    /// Returns an iterator of `Option<&str>`
126    pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter<O>, BitmapIter> {
127        ZipValidity::new_with_validity(self.values_iter(), self.validity())
128    }
129
    /// Returns an iterator of `&str` over every slot of the array.
    ///
    /// The validity is not consulted: null slots are yielded as well, as
    /// whatever string their offsets span.
    pub fn values_iter(&self) -> Utf8ValuesIter<O> {
        Utf8ValuesIter::new(self)
    }
134
135    /// Returns an iterator of the non-null values `&str.
136    #[inline]
137    pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, Utf8Array<O>> {
138        NonNullValuesIter::new(self, self.validity())
139    }
140
    /// Returns the length of this array, i.e. its number of slots (null or not).
    ///
    /// The length is derived from the offsets buffer via `len_proxy()`.
    #[inline]
    pub fn len(&self) -> usize {
        self.offsets.len_proxy()
    }
146
    /// Returns the value of the element at index `i`, ignoring the array's validity.
    ///
    /// Because the validity is ignored, a null slot still yields a `&str`; use
    /// [`Self::get`] to observe nulls.
    /// # Panics
    /// This function panics iff `i >= self.len()`.
    #[inline]
    pub fn value(&self, i: usize) -> &str {
        assert!(i < self.len());
        // SAFETY: the assertion above guarantees `i < self.len()`.
        unsafe { self.value_unchecked(i) }
    }
155
156    /// Returns the value of the element at index `i`, ignoring the array's validity.
157    ///
158    /// # Safety
159    /// This function is safe iff `i < self.len`.
160    #[inline]
161    pub unsafe fn value_unchecked(&self, i: usize) -> &str {
162        // soundness: the invariant of the function
163        let (start, end) = self.offsets.start_end_unchecked(i);
164
165        // soundness: the invariant of the struct
166        let slice = self.values.get_unchecked(start..end);
167
168        // soundness: the invariant of the struct
169        std::str::from_utf8_unchecked(slice)
170    }
171
172    /// Returns the element at index `i` or `None` if it is null
173    /// # Panics
174    /// iff `i >= self.len()`
175    #[inline]
176    pub fn get(&self, i: usize) -> Option<&str> {
177        if !self.is_null(i) {
178            // soundness: Array::is_null panics if i >= self.len
179            unsafe { Some(self.value_unchecked(i)) }
180        } else {
181            None
182        }
183    }
184
    /// Returns the [`ArrowDataType`] of this array, i.e. the logical type it was
    /// constructed with.
    #[inline]
    pub fn dtype(&self) -> &ArrowDataType {
        &self.dtype
    }
190
    /// Returns the values of this [`Utf8Array`]: the buffer holding the bytes of
    /// all elements, which is addressed through [`Self::offsets`].
    #[inline]
    pub fn values(&self) -> &Buffer<u8> {
        &self.values
    }
196
    /// Returns the offsets of this [`Utf8Array`]: element `i` spans the bytes
    /// between offset `i` and offset `i + 1` of [`Self::values`].
    #[inline]
    pub fn offsets(&self) -> &OffsetsBuffer<O> {
        &self.offsets
    }
202
    /// The optional validity.
    ///
    /// Returns `None` when the array does not carry a validity mask.
    #[inline]
    pub fn validity(&self) -> Option<&Bitmap> {
        self.validity.as_ref()
    }
208
209    /// Slices this [`Utf8Array`].
210    /// # Implementation
211    /// This function is `O(1)`.
212    /// # Panics
213    /// iff `offset + length > self.len()`.
214    pub fn slice(&mut self, offset: usize, length: usize) {
215        assert!(
216            offset + length <= self.len(),
217            "the offset of the new array cannot exceed the arrays' length"
218        );
219        unsafe { self.slice_unchecked(offset, length) }
220    }
221
222    /// Slices this [`Utf8Array`].
223    /// # Implementation
224    /// This function is `O(1)`
225    ///
226    /// # Safety
227    /// The caller must ensure that `offset + length <= self.len()`.
228    pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
229        self.validity = self
230            .validity
231            .take()
232            .map(|bitmap| bitmap.sliced_unchecked(offset, length))
233            .filter(|bitmap| bitmap.unset_bits() > 0);
234        self.offsets.slice_unchecked(offset, length + 1);
235    }
236
    // Macro-generated inherent methods; the macro definitions are outside this file.
    // NOTE(review): presumably these provide the owning `sliced` variants, the
    // validity mutators (`with_validity` / `set_validity`, both of which are called
    // elsewhere in this file) and conversions into boxed arrays - confirm against
    // the macro definitions.
    impl_sliced!();
    impl_mut_validity!();
    impl_into_array!();
240
241    /// Returns its internal representation
242    #[must_use]
243    pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
244        let Self {
245            dtype,
246            offsets,
247            values,
248            validity,
249        } = self;
250        (dtype, offsets, values, validity)
251    }
252
253    /// Try to convert this `Utf8Array` to a `MutableUtf8Array`
254    #[must_use]
255    pub fn into_mut(self) -> Either<Self, MutableUtf8Array<O>> {
256        use Either::*;
257        if let Some(bitmap) = self.validity {
258            match bitmap.into_mut() {
259                // SAFETY: invariants are preserved
260                Left(bitmap) => Left(unsafe {
261                    Utf8Array::new_unchecked(self.dtype, self.offsets, self.values, Some(bitmap))
262                }),
263                Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
264                    (Left(values), Left(offsets)) => {
265                        // SAFETY: invariants are preserved
266                        Left(unsafe {
267                            Utf8Array::new_unchecked(
268                                self.dtype,
269                                offsets,
270                                values,
271                                Some(mutable_bitmap.into()),
272                            )
273                        })
274                    },
275                    (Left(values), Right(offsets)) => {
276                        // SAFETY: invariants are preserved
277                        Left(unsafe {
278                            Utf8Array::new_unchecked(
279                                self.dtype,
280                                offsets.into(),
281                                values,
282                                Some(mutable_bitmap.into()),
283                            )
284                        })
285                    },
286                    (Right(values), Left(offsets)) => {
287                        // SAFETY: invariants are preserved
288                        Left(unsafe {
289                            Utf8Array::new_unchecked(
290                                self.dtype,
291                                offsets,
292                                values.into(),
293                                Some(mutable_bitmap.into()),
294                            )
295                        })
296                    },
297                    (Right(values), Right(offsets)) => Right(unsafe {
298                        MutableUtf8Array::new_unchecked(
299                            self.dtype,
300                            offsets,
301                            values,
302                            Some(mutable_bitmap),
303                        )
304                    }),
305                },
306            }
307        } else {
308            match (self.values.into_mut(), self.offsets.into_mut()) {
309                (Left(values), Left(offsets)) => {
310                    Left(unsafe { Utf8Array::new_unchecked(self.dtype, offsets, values, None) })
311                },
312                (Left(values), Right(offsets)) => Left(unsafe {
313                    Utf8Array::new_unchecked(self.dtype, offsets.into(), values, None)
314                }),
315                (Right(values), Left(offsets)) => Left(unsafe {
316                    Utf8Array::new_unchecked(self.dtype, offsets, values.into(), None)
317                }),
318                (Right(values), Right(offsets)) => Right(unsafe {
319                    MutableUtf8Array::new_unchecked(self.dtype, offsets, values, None)
320                }),
321            }
322        }
323    }
324
325    /// Returns a new empty [`Utf8Array`].
326    ///
327    /// The array is guaranteed to have no elements nor validity.
328    #[inline]
329    pub fn new_empty(dtype: ArrowDataType) -> Self {
330        unsafe { Self::new_unchecked(dtype, OffsetsBuffer::new(), Buffer::new(), None) }
331    }
332
333    /// Returns a new [`Utf8Array`] whose all slots are null / `None`.
334    #[inline]
335    pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
336        Self::new(
337            dtype,
338            Offsets::new_zeroed(length).into(),
339            Buffer::new(),
340            Some(Bitmap::new_zeroed(length)),
341        )
342    }
343
344    /// Returns a default [`ArrowDataType`] of this array, which depends on the generic parameter `O`: `DataType::Utf8` or `DataType::LargeUtf8`
345    pub fn default_dtype() -> ArrowDataType {
346        if O::IS_LARGE {
347            ArrowDataType::LargeUtf8
348        } else {
349            ArrowDataType::Utf8
350        }
351    }
352
353    /// Creates a new [`Utf8Array`] without checking for offsets monotinicity nor utf8-validity
354    ///
355    /// # Panic
356    /// This function panics (in debug mode only) iff:
357    /// * The last offset is greater than the values' length.
358    /// * the validity's length is not equal to `offsets.len_proxy()`.
359    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
360    ///
361    /// # Safety
362    /// This function is unsound iff:
363    /// * The `values` between two consecutive `offsets` are not valid utf8
364    /// # Implementation
365    /// This function is `O(1)`
366    pub unsafe fn new_unchecked(
367        dtype: ArrowDataType,
368        offsets: OffsetsBuffer<O>,
369        values: Buffer<u8>,
370        validity: Option<Bitmap>,
371    ) -> Self {
372        debug_assert!(
373            offsets.last().to_usize() <= values.len(),
374            "offsets must not exceed the values length"
375        );
376        debug_assert!(
377            validity
378                .as_ref()
379                .is_none_or(|validity| validity.len() == offsets.len_proxy()),
380            "validity mask length must match the number of values"
381        );
382        debug_assert!(
383            dtype.to_physical_type() == Self::default_dtype().to_physical_type(),
384            "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8"
385        );
386
387        Self {
388            dtype,
389            offsets,
390            values,
391            validity,
392        }
393    }
394
395    /// Creates a new [`Utf8Array`].
396    /// # Panics
397    /// This function panics iff:
398    /// * The last offset is greater than the values' length.
399    /// * the validity's length is not equal to `offsets.len_proxy()`.
400    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
401    /// * The `values` between two consecutive `offsets` are not valid utf8
402    /// # Implementation
403    /// This function is `O(N)` - checking utf8 is `O(N)`
404    pub fn new(
405        dtype: ArrowDataType,
406        offsets: OffsetsBuffer<O>,
407        values: Buffer<u8>,
408        validity: Option<Bitmap>,
409    ) -> Self {
410        Self::try_new(dtype, offsets, values, validity).unwrap()
411    }
412
413    /// Returns a (non-null) [`Utf8Array`] created from a [`TrustedLen`] of `&str`.
414    /// # Implementation
415    /// This function is `O(N)`
416    #[inline]
417    pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
418        iterator: I,
419    ) -> Self {
420        MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator).into()
421    }
422
423    /// Creates a new [`Utf8Array`] from a [`Iterator`] of `&str`.
424    pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
425        MutableUtf8Array::<O>::from_iter_values(iterator).into()
426    }
427
428    /// Creates a [`Utf8Array`] from an iterator of trusted length.
429    ///
430    /// # Safety
431    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
432    /// I.e. that `size_hint().1` correctly reports its length.
433    #[inline]
434    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
435    where
436        P: AsRef<str>,
437        I: Iterator<Item = Option<P>>,
438    {
439        MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator).into()
440    }
441
442    /// Creates a [`Utf8Array`] from an iterator of trusted length.
443    #[inline]
444    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
445    where
446        P: AsRef<str>,
447        I: TrustedLen<Item = Option<P>>,
448    {
449        MutableUtf8Array::<O>::from_trusted_len_iter(iterator).into()
450    }
451
452    /// Creates a [`Utf8Array`] from an falible iterator of trusted length.
453    ///
454    /// # Safety
455    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
456    /// I.e. that `size_hint().1` correctly reports its length.
457    #[inline]
458    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
459        iterator: I,
460    ) -> std::result::Result<Self, E>
461    where
462        P: AsRef<str>,
463        I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
464    {
465        MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
466    }
467
468    /// Creates a [`Utf8Array`] from an fallible iterator of trusted length.
469    #[inline]
470    pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> std::result::Result<Self, E>
471    where
472        P: AsRef<str>,
473        I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
474    {
475        MutableUtf8Array::<O>::try_from_trusted_len_iter(iter).map(|x| x.into())
476    }
477
478    /// Applies a function `f` to the validity of this array.
479    ///
480    /// This is an API to leverage clone-on-write
481    /// # Panics
482    /// This function panics if the function `f` modifies the length of the [`Bitmap`].
483    pub fn apply_validity<F: FnOnce(Bitmap) -> Bitmap>(&mut self, f: F) {
484        if let Some(validity) = std::mem::take(&mut self.validity) {
485            self.set_validity(Some(f(validity)))
486        }
487    }
488
489    // Convert this [`Utf8Array`] to a [`BinaryArray`].
490    pub fn to_binary(&self) -> BinaryArray<O> {
491        unsafe {
492            BinaryArray::new_unchecked(
493                BinaryArray::<O>::default_dtype(),
494                self.offsets.clone(),
495                self.values.clone(),
496                self.validity.clone(),
497            )
498        }
499    }
500}
501
502impl<O: Offset> Splitable for Utf8Array<O> {
503    #[inline(always)]
504    fn check_bound(&self, offset: usize) -> bool {
505        offset <= self.len()
506    }
507
508    unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
509        let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
510        let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
511
512        (
513            Self {
514                dtype: self.dtype.clone(),
515                offsets: lhs_offsets,
516                values: self.values.clone(),
517                validity: lhs_validity,
518            },
519            Self {
520                dtype: self.dtype.clone(),
521                offsets: rhs_offsets,
522                values: self.values.clone(),
523                validity: rhs_validity,
524            },
525        )
526    }
527}
528
529impl<O: Offset> Array for Utf8Array<O> {
530    impl_common_array!();
531
532    fn validity(&self) -> Option<&Bitmap> {
533        self.validity.as_ref()
534    }
535
536    #[inline]
537    fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
538        Box::new(self.clone().with_validity(validity))
539    }
540}
541
542unsafe impl<O: Offset> GenericBinaryArray<O> for Utf8Array<O> {
543    #[inline]
544    fn values(&self) -> &[u8] {
545        self.values()
546    }
547
548    #[inline]
549    fn offsets(&self) -> &[O] {
550        self.offsets().buffer()
551    }
552}
553
554impl<O: Offset> Default for Utf8Array<O> {
555    fn default() -> Self {
556        let dtype = if O::IS_LARGE {
557            ArrowDataType::LargeUtf8
558        } else {
559            ArrowDataType::Utf8
560        };
561        Utf8Array::new(dtype, Default::default(), Default::default(), None)
562    }
563}