arrow_array/builder/
generic_byte_run_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::types::bytes::ByteArrayNativeType;
19use std::{any::Any, sync::Arc};
20
21use crate::{
22    types::{BinaryType, ByteArrayType, LargeBinaryType, LargeUtf8Type, RunEndIndexType, Utf8Type},
23    ArrayRef, ArrowPrimitiveType, RunArray,
24};
25
26use super::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
27
28use arrow_buffer::ArrowNativeType;
29
30/// Builder for [`RunArray`] of [`GenericByteArray`](crate::array::GenericByteArray)
31///
32/// # Example:
33///
34/// ```
35///
36/// # use arrow_array::builder::GenericByteRunBuilder;
37/// # use arrow_array::{GenericByteArray, BinaryArray};
38/// # use arrow_array::types::{BinaryType, Int16Type};
39/// # use arrow_array::{Array, Int16Array};
40/// # use arrow_array::cast::AsArray;
41///
42/// let mut builder =
43/// GenericByteRunBuilder::<Int16Type, BinaryType>::new();
44/// builder.extend([Some(b"abc"), Some(b"abc"), None, Some(b"def")].into_iter());
45/// builder.append_value(b"def");
46/// builder.append_null();
47/// let array = builder.finish();
48///
49/// assert_eq!(array.run_ends().values(), &[2, 3, 5, 6]);
50///
51/// let av = array.values();
52///
53/// assert!(!av.is_null(0));
54/// assert!(av.is_null(1));
55/// assert!(!av.is_null(2));
56/// assert!(av.is_null(3));
57///
58/// // Values are polymorphic and so require a downcast.
59/// let ava: &BinaryArray = av.as_binary();
60///
61/// assert_eq!(ava.value(0), b"abc");
62/// assert_eq!(ava.value(2), b"def");
63/// ```
64#[derive(Debug)]
65pub struct GenericByteRunBuilder<R, V>
66where
67    R: ArrowPrimitiveType,
68    V: ByteArrayType,
69{
70    run_ends_builder: PrimitiveBuilder<R>,
71    values_builder: GenericByteBuilder<V>,
72    current_value: Vec<u8>,
73    has_current_value: bool,
74    current_run_end_index: usize,
75    prev_run_end_index: usize,
76}
77
78impl<R, V> Default for GenericByteRunBuilder<R, V>
79where
80    R: ArrowPrimitiveType,
81    V: ByteArrayType,
82{
83    fn default() -> Self {
84        Self::new()
85    }
86}
87
88impl<R, V> GenericByteRunBuilder<R, V>
89where
90    R: ArrowPrimitiveType,
91    V: ByteArrayType,
92{
93    /// Creates a new `GenericByteRunBuilder`
94    pub fn new() -> Self {
95        Self {
96            run_ends_builder: PrimitiveBuilder::new(),
97            values_builder: GenericByteBuilder::<V>::new(),
98            current_value: Vec::new(),
99            has_current_value: false,
100            current_run_end_index: 0,
101            prev_run_end_index: 0,
102        }
103    }
104
105    /// Creates a new `GenericByteRunBuilder` with the provided capacity
106    ///
107    /// `capacity`: the expected number of run-end encoded values.
108    /// `data_capacity`: the expected number of bytes of run end encoded values
109    pub fn with_capacity(capacity: usize, data_capacity: usize) -> Self {
110        Self {
111            run_ends_builder: PrimitiveBuilder::with_capacity(capacity),
112            values_builder: GenericByteBuilder::<V>::with_capacity(capacity, data_capacity),
113            current_value: Vec::new(),
114            has_current_value: false,
115            current_run_end_index: 0,
116            prev_run_end_index: 0,
117        }
118    }
119}
120
121impl<R, V> ArrayBuilder for GenericByteRunBuilder<R, V>
122where
123    R: RunEndIndexType,
124    V: ByteArrayType,
125{
126    /// Returns the builder as a non-mutable `Any` reference.
127    fn as_any(&self) -> &dyn Any {
128        self
129    }
130
131    /// Returns the builder as a mutable `Any` reference.
132    fn as_any_mut(&mut self) -> &mut dyn Any {
133        self
134    }
135
136    /// Returns the boxed builder as a box of `Any`.
137    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
138        self
139    }
140
141    /// Returns the length of logical array encoded by
142    /// the eventual runs array.
143    fn len(&self) -> usize {
144        self.current_run_end_index
145    }
146
147    /// Builds the array and reset this builder.
148    fn finish(&mut self) -> ArrayRef {
149        Arc::new(self.finish())
150    }
151
152    /// Builds the array without resetting the builder.
153    fn finish_cloned(&self) -> ArrayRef {
154        Arc::new(self.finish_cloned())
155    }
156}
157
158impl<R, V> GenericByteRunBuilder<R, V>
159where
160    R: RunEndIndexType,
161    V: ByteArrayType,
162{
163    /// Appends optional value to the logical array encoded by the RunArray.
164    pub fn append_option(&mut self, input_value: Option<impl AsRef<V::Native>>) {
165        match input_value {
166            Some(value) => self.append_value(value),
167            None => self.append_null(),
168        }
169    }
170
171    /// Appends value to the logical array encoded by the RunArray.
172    pub fn append_value(&mut self, input_value: impl AsRef<V::Native>) {
173        let value: &[u8] = input_value.as_ref().as_ref();
174        if !self.has_current_value {
175            self.append_run_end();
176            self.current_value.extend_from_slice(value);
177            self.has_current_value = true;
178        } else if self.current_value.as_slice() != value {
179            self.append_run_end();
180            self.current_value.clear();
181            self.current_value.extend_from_slice(value);
182        }
183        self.current_run_end_index += 1;
184    }
185
186    /// Appends null to the logical array encoded by the RunArray.
187    pub fn append_null(&mut self) {
188        if self.has_current_value {
189            self.append_run_end();
190            self.current_value.clear();
191            self.has_current_value = false;
192        }
193        self.current_run_end_index += 1;
194    }
195
196    /// Creates the RunArray and resets the builder.
197    /// Panics if RunArray cannot be built.
198    pub fn finish(&mut self) -> RunArray<R> {
199        // write the last run end to the array.
200        self.append_run_end();
201
202        // reset the run end index to zero.
203        self.current_value.clear();
204        self.has_current_value = false;
205        self.current_run_end_index = 0;
206        self.prev_run_end_index = 0;
207
208        // build the run encoded array by adding run_ends and values array as its children.
209        let run_ends_array = self.run_ends_builder.finish();
210        let values_array = self.values_builder.finish();
211        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
212    }
213
214    /// Creates the RunArray and without resetting the builder.
215    /// Panics if RunArray cannot be built.
216    pub fn finish_cloned(&self) -> RunArray<R> {
217        let mut run_ends_array = self.run_ends_builder.finish_cloned();
218        let mut values_array = self.values_builder.finish_cloned();
219
220        // Add current run if one exists
221        if self.prev_run_end_index != self.current_run_end_index {
222            let mut run_end_builder = run_ends_array.into_builder().unwrap();
223            let mut values_builder = values_array.into_builder().unwrap();
224            self.append_run_end_with_builders(&mut run_end_builder, &mut values_builder);
225            run_ends_array = run_end_builder.finish();
226            values_array = values_builder.finish();
227        }
228
229        RunArray::<R>::try_new(&run_ends_array, &values_array).unwrap()
230    }
231
232    // Appends the current run to the array.
233    fn append_run_end(&mut self) {
234        // empty array or the function called without appending any value.
235        if self.prev_run_end_index == self.current_run_end_index {
236            return;
237        }
238        let run_end_index = self.run_end_index_as_native();
239        self.run_ends_builder.append_value(run_end_index);
240        if self.has_current_value {
241            let slice = self.current_value.as_slice();
242            let native = unsafe {
243                // Safety:
244                // As self.current_value is created from V::Native. The value V::Native can be
245                // built back from the bytes without validations
246                V::Native::from_bytes_unchecked(slice)
247            };
248            self.values_builder.append_value(native);
249        } else {
250            self.values_builder.append_null();
251        }
252        self.prev_run_end_index = self.current_run_end_index;
253    }
254
255    // Similar to `append_run_end` but on custom builders.
256    // Used in `finish_cloned` which is not suppose to mutate `self`.
257    fn append_run_end_with_builders(
258        &self,
259        run_ends_builder: &mut PrimitiveBuilder<R>,
260        values_builder: &mut GenericByteBuilder<V>,
261    ) {
262        let run_end_index = self.run_end_index_as_native();
263        run_ends_builder.append_value(run_end_index);
264        if self.has_current_value {
265            let slice = self.current_value.as_slice();
266            let native = unsafe {
267                // Safety:
268                // As self.current_value is created from V::Native. The value V::Native can be
269                // built back from the bytes without validations
270                V::Native::from_bytes_unchecked(slice)
271            };
272            values_builder.append_value(native);
273        } else {
274            values_builder.append_null();
275        }
276    }
277
278    fn run_end_index_as_native(&self) -> R::Native {
279        R::Native::from_usize(self.current_run_end_index).unwrap_or_else(|| {
280            panic!(
281                "Cannot convert the value {} from `usize` to native form of arrow datatype {}",
282                self.current_run_end_index,
283                R::DATA_TYPE
284            )
285        })
286    }
287}
288
289impl<R, V, S> Extend<Option<S>> for GenericByteRunBuilder<R, V>
290where
291    R: RunEndIndexType,
292    V: ByteArrayType,
293    S: AsRef<V::Native>,
294{
295    fn extend<T: IntoIterator<Item = Option<S>>>(&mut self, iter: T) {
296        for elem in iter {
297            self.append_option(elem);
298        }
299    }
300}
301
302/// Builder for [`RunArray`] of [`StringArray`](crate::array::StringArray)
303///
304/// ```
305/// // Create a run-end encoded array with run-end indexes data type as `i16`.
306/// // The encoded values are Strings.
307///
308/// # use arrow_array::builder::StringRunBuilder;
309/// # use arrow_array::{Int16Array, StringArray};
310/// # use arrow_array::types::Int16Type;
311/// # use arrow_array::cast::AsArray;
312/// #
313/// let mut builder = StringRunBuilder::<Int16Type>::new();
314///
315/// // The builder builds the dictionary value by value
316/// builder.append_value("abc");
317/// builder.append_null();
318/// builder.extend([Some("def"), Some("def"), Some("abc")]);
319/// let array = builder.finish();
320///
321/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
322///
323/// // Values are polymorphic and so require a downcast.
324/// let av = array.values();
325/// let ava: &StringArray = av.as_string::<i32>();
326///
327/// assert_eq!(ava.value(0), "abc");
328/// assert!(av.is_null(1));
329/// assert_eq!(ava.value(2), "def");
330/// assert_eq!(ava.value(3), "abc");
331///
332/// ```
333pub type StringRunBuilder<K> = GenericByteRunBuilder<K, Utf8Type>;
334
335/// Builder for [`RunArray`] of [`LargeStringArray`](crate::array::LargeStringArray)
336pub type LargeStringRunBuilder<K> = GenericByteRunBuilder<K, LargeUtf8Type>;
337
338/// Builder for [`RunArray`] of [`BinaryArray`](crate::array::BinaryArray)
339///
340/// ```
341/// // Create a run-end encoded array with run-end indexes data type as `i16`.
342/// // The encoded data is binary values.
343///
344/// # use arrow_array::builder::BinaryRunBuilder;
345/// # use arrow_array::{BinaryArray, Int16Array};
346/// # use arrow_array::cast::AsArray;
347/// # use arrow_array::types::Int16Type;
348///
349/// let mut builder = BinaryRunBuilder::<Int16Type>::new();
350///
351/// // The builder builds the dictionary value by value
352/// builder.append_value(b"abc");
353/// builder.append_null();
354/// builder.extend([Some(b"def"), Some(b"def"), Some(b"abc")]);
355/// let array = builder.finish();
356///
357/// assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
358///
359/// // Values are polymorphic and so require a downcast.
360/// let av = array.values();
361/// let ava: &BinaryArray = av.as_binary();
362///
363/// assert_eq!(ava.value(0), b"abc");
364/// assert!(av.is_null(1));
365/// assert_eq!(ava.value(2), b"def");
366/// assert_eq!(ava.value(3), b"abc");
367///
368/// ```
369pub type BinaryRunBuilder<K> = GenericByteRunBuilder<K, BinaryType>;
370
371/// Builder for [`RunArray`] of [`LargeBinaryArray`](crate::array::LargeBinaryArray)
372pub type LargeBinaryRunBuilder<K> = GenericByteRunBuilder<K, LargeBinaryType>;
373
374#[cfg(test)]
375mod tests {
376    use super::*;
377
378    use crate::array::Array;
379    use crate::cast::AsArray;
380    use crate::types::{Int16Type, Int32Type};
381    use crate::GenericByteArray;
382    use crate::Int16RunArray;
383
384    fn test_bytes_run_builder<T>(values: Vec<&T::Native>)
385    where
386        T: ByteArrayType,
387        <T as ByteArrayType>::Native: PartialEq,
388        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
389    {
390        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
391        builder.append_value(values[0]);
392        builder.append_value(values[0]);
393        builder.append_value(values[0]);
394        builder.append_null();
395        builder.append_null();
396        builder.append_value(values[1]);
397        builder.append_value(values[1]);
398        builder.append_value(values[2]);
399        builder.append_value(values[2]);
400        builder.append_value(values[2]);
401        builder.append_value(values[2]);
402        let array = builder.finish();
403
404        assert_eq!(array.len(), 11);
405        assert_eq!(array.null_count(), 0);
406        assert_eq!(array.logical_null_count(), 2);
407
408        assert_eq!(array.run_ends().values(), &[3, 5, 7, 11]);
409
410        // Values are polymorphic and so require a downcast.
411        let av = array.values();
412        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
413
414        assert_eq!(*ava.value(0), *values[0]);
415        assert!(ava.is_null(1));
416        assert_eq!(*ava.value(2), *values[1]);
417        assert_eq!(*ava.value(3), *values[2]);
418    }
419
420    #[test]
421    fn test_string_run_builder() {
422        test_bytes_run_builder::<Utf8Type>(vec!["abc", "def", "ghi"]);
423    }
424
425    #[test]
426    fn test_string_run_builder_with_empty_strings() {
427        test_bytes_run_builder::<Utf8Type>(vec!["abc", "", "ghi"]);
428    }
429
430    #[test]
431    fn test_binary_run_builder() {
432        test_bytes_run_builder::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
433    }
434
435    fn test_bytes_run_builder_finish_cloned<T>(values: Vec<&T::Native>)
436    where
437        T: ByteArrayType,
438        <T as ByteArrayType>::Native: PartialEq,
439        <T as ByteArrayType>::Native: AsRef<<T as ByteArrayType>::Native>,
440    {
441        let mut builder = GenericByteRunBuilder::<Int16Type, T>::new();
442
443        builder.append_value(values[0]);
444        builder.append_null();
445        builder.append_value(values[1]);
446        builder.append_value(values[1]);
447        builder.append_value(values[0]);
448        let mut array: Int16RunArray = builder.finish_cloned();
449
450        assert_eq!(array.len(), 5);
451        assert_eq!(array.null_count(), 0);
452        assert_eq!(array.logical_null_count(), 1);
453
454        assert_eq!(array.run_ends().values(), &[1, 2, 4, 5]);
455
456        // Values are polymorphic and so require a downcast.
457        let av = array.values();
458        let ava: &GenericByteArray<T> = av.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
459
460        assert_eq!(ava.value(0), values[0]);
461        assert!(ava.is_null(1));
462        assert_eq!(ava.value(2), values[1]);
463        assert_eq!(ava.value(3), values[0]);
464
465        // Append last value before `finish_cloned` (`value[0]`) again and ensure it has only
466        // one entry in final output.
467        builder.append_value(values[0]);
468        builder.append_value(values[0]);
469        builder.append_value(values[1]);
470        array = builder.finish();
471
472        assert_eq!(array.len(), 8);
473        assert_eq!(array.null_count(), 0);
474        assert_eq!(array.logical_null_count(), 1);
475
476        assert_eq!(array.run_ends().values(), &[1, 2, 4, 7, 8]);
477
478        // Values are polymorphic and so require a downcast.
479        let av2 = array.values();
480        let ava2: &GenericByteArray<T> =
481            av2.as_any().downcast_ref::<GenericByteArray<T>>().unwrap();
482
483        assert_eq!(ava2.value(0), values[0]);
484        assert!(ava2.is_null(1));
485        assert_eq!(ava2.value(2), values[1]);
486        // The value appended before and after `finish_cloned` has only one entry.
487        assert_eq!(ava2.value(3), values[0]);
488        assert_eq!(ava2.value(4), values[1]);
489    }
490
491    #[test]
492    fn test_string_run_builder_finish_cloned() {
493        test_bytes_run_builder_finish_cloned::<Utf8Type>(vec!["abc", "def", "ghi"]);
494    }
495
496    #[test]
497    fn test_binary_run_builder_finish_cloned() {
498        test_bytes_run_builder_finish_cloned::<BinaryType>(vec![b"abc", b"def", b"ghi"]);
499    }
500
501    #[test]
502    fn test_extend() {
503        let mut builder = StringRunBuilder::<Int32Type>::new();
504        builder.extend(["a", "a", "a", "", "", "b", "b"].into_iter().map(Some));
505        builder.extend(["b", "cupcakes", "cupcakes"].into_iter().map(Some));
506        let array = builder.finish();
507
508        assert_eq!(array.len(), 10);
509        assert_eq!(array.run_ends().values(), &[3, 5, 8, 10]);
510
511        let str_array = array.values().as_string::<i32>();
512        assert_eq!(str_array.value(0), "a");
513        assert_eq!(str_array.value(1), "");
514        assert_eq!(str_array.value(2), "b");
515        assert_eq!(str_array.value(3), "cupcakes");
516    }
517}