polars_arrow/array/utf8/
mutable.rs

1use std::sync::Arc;
2
3use polars_error::{polars_bail, PolarsResult};
4
5use super::{MutableUtf8ValuesArray, MutableUtf8ValuesIter, StrAsBytes, Utf8Array};
6use crate::array::physical_binary::*;
7use crate::array::{Array, MutableArray, TryExtend, TryExtendFromSelf, TryPush};
8use crate::bitmap::utils::{BitmapIter, ZipValidity};
9use crate::bitmap::{Bitmap, MutableBitmap};
10use crate::datatypes::ArrowDataType;
11use crate::offset::{Offset, Offsets};
12use crate::trusted_len::TrustedLen;
13
14/// A [`MutableArray`] that builds a [`Utf8Array`]. It differs
15/// from [`MutableUtf8ValuesArray`] in that it can build nullable [`Utf8Array`]s.
16#[derive(Debug, Clone)]
17pub struct MutableUtf8Array<O: Offset> {
18    values: MutableUtf8ValuesArray<O>,
19    validity: Option<MutableBitmap>,
20}
21
22impl<O: Offset> From<MutableUtf8Array<O>> for Utf8Array<O> {
23    fn from(other: MutableUtf8Array<O>) -> Self {
24        let validity = other.validity.and_then(|x| {
25            let validity: Option<Bitmap> = x.into();
26            validity
27        });
28        let array: Utf8Array<O> = other.values.into();
29        array.with_validity(validity)
30    }
31}
32
33impl<O: Offset> Default for MutableUtf8Array<O> {
34    fn default() -> Self {
35        Self::new()
36    }
37}
38
39impl<O: Offset> MutableUtf8Array<O> {
40    /// Initializes a new empty [`MutableUtf8Array`].
41    pub fn new() -> Self {
42        Self {
43            values: Default::default(),
44            validity: None,
45        }
46    }
47
48    /// Returns a [`MutableUtf8Array`] created from its internal representation.
49    ///
50    /// # Errors
51    /// This function returns an error iff:
52    /// * The last offset is not equal to the values' length.
53    /// * the validity's length is not equal to `offsets.len()`.
54    /// * The `dtype`'s [`crate::datatypes::PhysicalType`] is not equal to either `Utf8` or `LargeUtf8`.
55    /// * The `values` between two consecutive `offsets` are not valid utf8
56    /// # Implementation
57    /// This function is `O(N)` - checking utf8 is `O(N)`
58    pub fn try_new(
59        dtype: ArrowDataType,
60        offsets: Offsets<O>,
61        values: Vec<u8>,
62        validity: Option<MutableBitmap>,
63    ) -> PolarsResult<Self> {
64        let values = MutableUtf8ValuesArray::try_new(dtype, offsets, values)?;
65
66        if validity
67            .as_ref()
68            .is_some_and(|validity| validity.len() != values.len())
69        {
70            polars_bail!(ComputeError: "validity's length must be equal to the number of values")
71        }
72
73        Ok(Self { values, validity })
74    }
75
76    /// Create a [`MutableUtf8Array`] out of low-end APIs.
77    ///
78    /// # Safety
79    /// The caller must ensure that every value between offsets is a valid utf8.
80    /// # Panics
81    /// This function panics iff:
82    /// * The `offsets` and `values` are inconsistent
83    /// * The validity is not `None` and its length is different from `offsets`'s length minus one.
84    pub unsafe fn new_unchecked(
85        dtype: ArrowDataType,
86        offsets: Offsets<O>,
87        values: Vec<u8>,
88        validity: Option<MutableBitmap>,
89    ) -> Self {
90        let values = MutableUtf8ValuesArray::new_unchecked(dtype, offsets, values);
91        if let Some(ref validity) = validity {
92            assert_eq!(values.len(), validity.len());
93        }
94        Self { values, validity }
95    }
96
97    /// Creates a new [`MutableUtf8Array`] from a slice of optional `&[u8]`.
98    // Note: this can't be `impl From` because Rust does not allow double `AsRef` on it.
99    pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
100        Self::from_trusted_len_iter(slice.as_ref().iter().map(|x| x.as_ref()))
101    }
102
103    fn default_dtype() -> ArrowDataType {
104        Utf8Array::<O>::default_dtype()
105    }
106
107    /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots.
108    pub fn with_capacity(capacity: usize) -> Self {
109        Self::with_capacities(capacity, 0)
110    }
111
112    /// Initializes a new [`MutableUtf8Array`] with a pre-allocated capacity of slots and values.
113    pub fn with_capacities(capacity: usize, values: usize) -> Self {
114        Self {
115            values: MutableUtf8ValuesArray::with_capacities(capacity, values),
116            validity: None,
117        }
118    }
119
120    /// Reserves `additional` elements and `additional_values` on the values buffer.
121    pub fn reserve(&mut self, additional: usize, additional_values: usize) {
122        self.values.reserve(additional, additional_values);
123        if let Some(x) = self.validity.as_mut() {
124            x.reserve(additional)
125        }
126    }
127
128    /// Reserves `additional` elements and `additional_values` on the values buffer.
129    pub fn capacity(&self) -> usize {
130        self.values.capacity()
131    }
132
133    /// Returns the length of this array
134    #[inline]
135    pub fn len(&self) -> usize {
136        self.values.len()
137    }
138
139    /// Pushes a new element to the array.
140    /// # Panic
141    /// This operation panics iff the length of all values (in bytes) exceeds `O` maximum value.
142    #[inline]
143    pub fn push<T: AsRef<str>>(&mut self, value: Option<T>) {
144        self.try_push(value).unwrap()
145    }
146
147    /// Returns the value of the element at index `i`, ignoring the array's validity.
148    #[inline]
149    pub fn value(&self, i: usize) -> &str {
150        self.values.value(i)
151    }
152
153    /// Returns the value of the element at index `i`, ignoring the array's validity.
154    ///
155    /// # Safety
156    /// This function is safe iff `i < self.len`.
157    #[inline]
158    pub unsafe fn value_unchecked(&self, i: usize) -> &str {
159        self.values.value_unchecked(i)
160    }
161
162    /// Pop the last entry from [`MutableUtf8Array`].
163    /// This function returns `None` iff this array is empty.
164    pub fn pop(&mut self) -> Option<String> {
165        let value = self.values.pop()?;
166        self.validity
167            .as_mut()
168            .map(|x| x.pop()?.then(|| ()))
169            .unwrap_or_else(|| Some(()))
170            .map(|_| value)
171    }
172
173    fn init_validity(&mut self) {
174        let mut validity = MutableBitmap::with_capacity(self.values.capacity());
175        validity.extend_constant(self.len(), true);
176        validity.set(self.len() - 1, false);
177        self.validity = Some(validity);
178    }
179
180    /// Returns an iterator of `Option<&str>`
181    pub fn iter(&self) -> ZipValidity<&str, MutableUtf8ValuesIter<O>, BitmapIter> {
182        ZipValidity::new(self.values_iter(), self.validity.as_ref().map(|x| x.iter()))
183    }
184
185    /// Converts itself into an [`Array`].
186    pub fn into_arc(self) -> Arc<dyn Array> {
187        let a: Utf8Array<O> = self.into();
188        Arc::new(a)
189    }
190
191    /// Shrinks the capacity of the [`MutableUtf8Array`] to fit its current length.
192    pub fn shrink_to_fit(&mut self) {
193        self.values.shrink_to_fit();
194        if let Some(validity) = &mut self.validity {
195            validity.shrink_to_fit()
196        }
197    }
198
199    /// Extract the low-end APIs from the [`MutableUtf8Array`].
200    pub fn into_data(self) -> (ArrowDataType, Offsets<O>, Vec<u8>, Option<MutableBitmap>) {
201        let (dtype, offsets, values) = self.values.into_inner();
202        (dtype, offsets, values, self.validity)
203    }
204
205    /// Returns an iterator of `&str`
206    pub fn values_iter(&self) -> MutableUtf8ValuesIter<O> {
207        self.values.iter()
208    }
209
210    /// Sets the validity.
211    /// # Panic
212    /// Panics iff the validity's len is not equal to the existing values' length.
213    pub fn set_validity(&mut self, validity: Option<MutableBitmap>) {
214        if let Some(validity) = &validity {
215            assert_eq!(self.values.len(), validity.len())
216        }
217        self.validity = validity;
218    }
219
220    /// Applies a function `f` to the validity of this array.
221    ///
222    /// This is an API to leverage clone-on-write
223    /// # Panics
224    /// This function panics if the function `f` modifies the length of the [`Bitmap`].
225    pub fn apply_validity<F: FnOnce(MutableBitmap) -> MutableBitmap>(&mut self, f: F) {
226        if let Some(validity) = std::mem::take(&mut self.validity) {
227            self.set_validity(Some(f(validity)))
228        }
229    }
230}
231
232impl<O: Offset> MutableUtf8Array<O> {
233    /// returns its values.
234    pub fn values(&self) -> &Vec<u8> {
235        self.values.values()
236    }
237
238    /// returns its offsets.
239    pub fn offsets(&self) -> &Offsets<O> {
240        self.values.offsets()
241    }
242}
243
244impl<O: Offset> MutableArray for MutableUtf8Array<O> {
245    fn len(&self) -> usize {
246        self.len()
247    }
248
249    fn validity(&self) -> Option<&MutableBitmap> {
250        self.validity.as_ref()
251    }
252
253    fn as_box(&mut self) -> Box<dyn Array> {
254        let array: Utf8Array<O> = std::mem::take(self).into();
255        array.boxed()
256    }
257
258    fn as_arc(&mut self) -> Arc<dyn Array> {
259        let array: Utf8Array<O> = std::mem::take(self).into();
260        array.arced()
261    }
262
263    fn dtype(&self) -> &ArrowDataType {
264        if O::IS_LARGE {
265            &ArrowDataType::LargeUtf8
266        } else {
267            &ArrowDataType::Utf8
268        }
269    }
270
271    fn as_any(&self) -> &dyn std::any::Any {
272        self
273    }
274
275    fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
276        self
277    }
278
279    #[inline]
280    fn push_null(&mut self) {
281        self.push::<&str>(None)
282    }
283
284    fn reserve(&mut self, additional: usize) {
285        self.reserve(additional, 0)
286    }
287
288    fn shrink_to_fit(&mut self) {
289        self.shrink_to_fit()
290    }
291}
292
293impl<O: Offset, P: AsRef<str>> FromIterator<Option<P>> for MutableUtf8Array<O> {
294    fn from_iter<I: IntoIterator<Item = Option<P>>>(iter: I) -> Self {
295        Self::try_from_iter(iter).unwrap()
296    }
297}
298
299impl<O: Offset> MutableUtf8Array<O> {
300    /// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.
301    /// This differs from `extended_trusted_len` which accepts iterator of optional values.
302    #[inline]
303    pub fn extend_trusted_len_values<I, P>(&mut self, iterator: I)
304    where
305        P: AsRef<str>,
306        I: TrustedLen<Item = P>,
307    {
308        unsafe { self.extend_trusted_len_values_unchecked(iterator) }
309    }
310
311    /// Extends the [`MutableUtf8Array`] from an iterator of values.
312    /// This differs from `extended_trusted_len` which accepts iterator of optional values.
313    #[inline]
314    pub fn extend_values<I, P>(&mut self, iterator: I)
315    where
316        P: AsRef<str>,
317        I: Iterator<Item = P>,
318    {
319        let length = self.values.len();
320        self.values.extend(iterator);
321        let additional = self.values.len() - length;
322
323        if let Some(validity) = self.validity.as_mut() {
324            validity.extend_constant(additional, true);
325        }
326    }
327
328    /// Extends the [`MutableUtf8Array`] from an iterator of values of trusted len.
329    /// This differs from `extended_trusted_len_unchecked` which accepts iterator of optional
330    /// values.
331    ///
332    /// # Safety
333    /// The iterator must be trusted len.
334    #[inline]
335    pub unsafe fn extend_trusted_len_values_unchecked<I, P>(&mut self, iterator: I)
336    where
337        P: AsRef<str>,
338        I: Iterator<Item = P>,
339    {
340        let length = self.values.len();
341        self.values.extend_trusted_len_unchecked(iterator);
342        let additional = self.values.len() - length;
343
344        if let Some(validity) = self.validity.as_mut() {
345            validity.extend_constant(additional, true);
346        }
347    }
348
349    /// Extends the [`MutableUtf8Array`] from an iterator of trusted len.
350    #[inline]
351    pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
352    where
353        P: AsRef<str>,
354        I: TrustedLen<Item = Option<P>>,
355    {
356        unsafe { self.extend_trusted_len_unchecked(iterator) }
357    }
358
359    /// Extends [`MutableUtf8Array`] from an iterator of trusted len.
360    ///
361    /// # Safety
362    /// The iterator must be trusted len.
363    #[inline]
364    pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
365    where
366        P: AsRef<str>,
367        I: Iterator<Item = Option<P>>,
368    {
369        if self.validity.is_none() {
370            let mut validity = MutableBitmap::new();
371            validity.extend_constant(self.len(), true);
372            self.validity = Some(validity);
373        }
374
375        self.values
376            .extend_from_trusted_len_iter(self.validity.as_mut().unwrap(), iterator);
377    }
378
379    /// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
380    ///
381    /// # Safety
382    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
383    /// I.e. that `size_hint().1` correctly reports its length.
384    #[inline]
385    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
386    where
387        P: AsRef<str>,
388        I: Iterator<Item = Option<P>>,
389    {
390        let iterator = iterator.map(|x| x.map(StrAsBytes));
391        let (validity, offsets, values) = trusted_len_unzip(iterator);
392
393        // soundness: P is `str`
394        Self::new_unchecked(Self::default_dtype(), offsets, values, validity)
395    }
396
397    /// Creates a [`MutableUtf8Array`] from an iterator of trusted length.
398    #[inline]
399    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
400    where
401        P: AsRef<str>,
402        I: TrustedLen<Item = Option<P>>,
403    {
404        // soundness: I is `TrustedLen`
405        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
406    }
407
408    /// Creates a [`MutableUtf8Array`] from an iterator of trusted length of `&str`.
409    ///
410    /// # Safety
411    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
412    /// I.e. that `size_hint().1` correctly reports its length.
413    #[inline]
414    pub unsafe fn from_trusted_len_values_iter_unchecked<T: AsRef<str>, I: Iterator<Item = T>>(
415        iterator: I,
416    ) -> Self {
417        MutableUtf8ValuesArray::from_trusted_len_iter_unchecked(iterator).into()
418    }
419
420    /// Creates a new [`MutableUtf8Array`] from a [`TrustedLen`] of `&str`.
421    #[inline]
422    pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
423        iterator: I,
424    ) -> Self {
425        // soundness: I is `TrustedLen`
426        unsafe { Self::from_trusted_len_values_iter_unchecked(iterator) }
427    }
428
429    /// Creates a new [`MutableUtf8Array`] from an iterator.
430    /// # Error
431    /// This operation errors iff the total length in bytes on the iterator exceeds `O`'s maximum value.
432    /// (`i32::MAX` or `i64::MAX` respectively).
433    fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = Option<P>>>(
434        iter: I,
435    ) -> PolarsResult<Self> {
436        let iterator = iter.into_iter();
437        let (lower, _) = iterator.size_hint();
438        let mut array = Self::with_capacity(lower);
439        for item in iterator {
440            array.try_push(item)?;
441        }
442        Ok(array)
443    }
444
445    /// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.
446    ///
447    /// # Safety
448    /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html).
449    /// I.e. that `size_hint().1` correctly reports its length.
450    #[inline]
451    pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
452        iterator: I,
453    ) -> std::result::Result<Self, E>
454    where
455        P: AsRef<str>,
456        I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
457    {
458        let iterator = iterator.into_iter();
459
460        let iterator = iterator.map(|x| x.map(|x| x.map(StrAsBytes)));
461        let (validity, offsets, values) = try_trusted_len_unzip(iterator)?;
462
463        // soundness: P is `str`
464        Ok(Self::new_unchecked(
465            Self::default_dtype(),
466            offsets,
467            values,
468            validity,
469        ))
470    }
471
472    /// Creates a [`MutableUtf8Array`] from an falible iterator of trusted length.
473    #[inline]
474    pub fn try_from_trusted_len_iter<E, I, P>(iterator: I) -> std::result::Result<Self, E>
475    where
476        P: AsRef<str>,
477        I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
478    {
479        // soundness: I: TrustedLen
480        unsafe { Self::try_from_trusted_len_iter_unchecked(iterator) }
481    }
482
483    /// Creates a new [`MutableUtf8Array`] from a [`Iterator`] of `&str`.
484    pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
485        MutableUtf8ValuesArray::from_iter(iterator).into()
486    }
487
488    /// Extend with a fallible iterator
489    pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
490    where
491        E: std::error::Error,
492        I: IntoIterator<Item = std::result::Result<Option<T>, E>>,
493        T: AsRef<str>,
494    {
495        let mut iter = iter.into_iter();
496        self.reserve(iter.size_hint().0, 0);
497        iter.try_for_each(|x| {
498            self.push(x?);
499            Ok(())
500        })
501    }
502}
503
504impl<O: Offset, T: AsRef<str>> Extend<Option<T>> for MutableUtf8Array<O> {
505    fn extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) {
506        self.try_extend(iter).unwrap();
507    }
508}
509
510impl<O: Offset, T: AsRef<str>> TryExtend<Option<T>> for MutableUtf8Array<O> {
511    fn try_extend<I: IntoIterator<Item = Option<T>>>(&mut self, iter: I) -> PolarsResult<()> {
512        let mut iter = iter.into_iter();
513        self.reserve(iter.size_hint().0, 0);
514        iter.try_for_each(|x| self.try_push(x))
515    }
516}
517
518impl<O: Offset, T: AsRef<str>> TryPush<Option<T>> for MutableUtf8Array<O> {
519    #[inline]
520    fn try_push(&mut self, value: Option<T>) -> PolarsResult<()> {
521        match value {
522            Some(value) => {
523                self.values.try_push(value.as_ref())?;
524
525                if let Some(validity) = &mut self.validity {
526                    validity.push(true)
527                }
528            },
529            None => {
530                self.values.push("");
531                match &mut self.validity {
532                    Some(validity) => validity.push(false),
533                    None => self.init_validity(),
534                }
535            },
536        }
537        Ok(())
538    }
539}
540
541impl<O: Offset> PartialEq for MutableUtf8Array<O> {
542    fn eq(&self, other: &Self) -> bool {
543        self.iter().eq(other.iter())
544    }
545}
546
547impl<O: Offset> TryExtendFromSelf for MutableUtf8Array<O> {
548    fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
549        extend_validity(self.len(), &mut self.validity, &other.validity);
550
551        self.values.try_extend_from_self(&other.values)
552    }
553}