arrow_array/builder/
generic_bytes_builder.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::builder::{ArrayBuilder, BufferBuilder, UInt8BufferBuilder};
19use crate::types::{ByteArrayType, GenericBinaryType, GenericStringType};
20use crate::{ArrayRef, GenericByteArray, OffsetSizeTrait};
21use arrow_buffer::NullBufferBuilder;
22use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
23use arrow_data::ArrayDataBuilder;
24use std::any::Any;
25use std::sync::Arc;
26
27/// Builder for [`GenericByteArray`]
28///
29/// For building strings, see docs on [`GenericStringBuilder`].
30/// For building binary, see docs on [`GenericBinaryBuilder`].
31pub struct GenericByteBuilder<T: ByteArrayType> {
32    value_builder: UInt8BufferBuilder,
33    offsets_builder: BufferBuilder<T::Offset>,
34    null_buffer_builder: NullBufferBuilder,
35}
36
37impl<T: ByteArrayType> GenericByteBuilder<T> {
38    /// Creates a new [`GenericByteBuilder`].
39    pub fn new() -> Self {
40        Self::with_capacity(1024, 1024)
41    }
42
43    /// Creates a new [`GenericByteBuilder`].
44    ///
45    /// - `item_capacity` is the number of items to pre-allocate.
46    ///   The size of the preallocated buffer of offsets is the number of items plus one.
47    /// - `data_capacity` is the total number of bytes of data to pre-allocate
48    ///   (for all items, not per item).
49    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
50        let mut offsets_builder = BufferBuilder::<T::Offset>::new(item_capacity + 1);
51        offsets_builder.append(T::Offset::from_usize(0).unwrap());
52        Self {
53            value_builder: UInt8BufferBuilder::new(data_capacity),
54            offsets_builder,
55            null_buffer_builder: NullBufferBuilder::new(item_capacity),
56        }
57    }
58
59    /// Creates a new  [`GenericByteBuilder`] from buffers.
60    ///
61    /// # Safety
62    ///
63    /// This doesn't verify buffer contents as it assumes the buffers are from
64    /// existing and valid [`GenericByteArray`].
65    pub unsafe fn new_from_buffer(
66        offsets_buffer: MutableBuffer,
67        value_buffer: MutableBuffer,
68        null_buffer: Option<MutableBuffer>,
69    ) -> Self {
70        let offsets_builder = BufferBuilder::<T::Offset>::new_from_buffer(offsets_buffer);
71        let value_builder = BufferBuilder::<u8>::new_from_buffer(value_buffer);
72
73        let null_buffer_builder = null_buffer
74            .map(|buffer| NullBufferBuilder::new_from_buffer(buffer, offsets_builder.len() - 1))
75            .unwrap_or_else(|| NullBufferBuilder::new_with_len(offsets_builder.len() - 1));
76
77        Self {
78            offsets_builder,
79            value_builder,
80            null_buffer_builder,
81        }
82    }
83
84    #[inline]
85    fn next_offset(&self) -> T::Offset {
86        T::Offset::from_usize(self.value_builder.len()).expect("byte array offset overflow")
87    }
88
89    /// Appends a value into the builder.
90    ///
91    /// See the [GenericStringBuilder] documentation for examples of
92    /// incrementally building string values with multiple `write!` calls.
93    ///
94    /// # Panics
95    ///
96    /// Panics if the resulting length of [`Self::values_slice`] would exceed
97    /// `T::Offset::MAX` bytes.
98    ///
99    /// For example, this can happen with [`StringArray`] or [`BinaryArray`]
100    /// where the total length of all values exceeds 2GB
101    ///
102    /// [`StringArray`]: crate::StringArray
103    /// [`BinaryArray`]: crate::BinaryArray
104    #[inline]
105    pub fn append_value(&mut self, value: impl AsRef<T::Native>) {
106        self.value_builder.append_slice(value.as_ref().as_ref());
107        self.null_buffer_builder.append(true);
108        self.offsets_builder.append(self.next_offset());
109    }
110
111    /// Append an `Option` value into the builder.
112    ///
113    /// - A `None` value will append a null value.
114    /// - A `Some` value will append the value.
115    ///
116    /// See [`Self::append_value`] for more panic information.
117    #[inline]
118    pub fn append_option(&mut self, value: Option<impl AsRef<T::Native>>) {
119        match value {
120            None => self.append_null(),
121            Some(v) => self.append_value(v),
122        };
123    }
124
125    /// Append a null value into the builder.
126    #[inline]
127    pub fn append_null(&mut self) {
128        self.null_buffer_builder.append(false);
129        self.offsets_builder.append(self.next_offset());
130    }
131
132    /// Builds the [`GenericByteArray`] and reset this builder.
133    pub fn finish(&mut self) -> GenericByteArray<T> {
134        let array_type = T::DATA_TYPE;
135        let array_builder = ArrayDataBuilder::new(array_type)
136            .len(self.len())
137            .add_buffer(self.offsets_builder.finish())
138            .add_buffer(self.value_builder.finish())
139            .nulls(self.null_buffer_builder.finish());
140
141        self.offsets_builder.append(self.next_offset());
142        let array_data = unsafe { array_builder.build_unchecked() };
143        GenericByteArray::from(array_data)
144    }
145
146    /// Builds the [`GenericByteArray`] without resetting the builder.
147    pub fn finish_cloned(&self) -> GenericByteArray<T> {
148        let array_type = T::DATA_TYPE;
149        let offset_buffer = Buffer::from_slice_ref(self.offsets_builder.as_slice());
150        let value_buffer = Buffer::from_slice_ref(self.value_builder.as_slice());
151        let array_builder = ArrayDataBuilder::new(array_type)
152            .len(self.len())
153            .add_buffer(offset_buffer)
154            .add_buffer(value_buffer)
155            .nulls(self.null_buffer_builder.finish_cloned());
156
157        let array_data = unsafe { array_builder.build_unchecked() };
158        GenericByteArray::from(array_data)
159    }
160
161    /// Returns the current values buffer as a slice
162    pub fn values_slice(&self) -> &[u8] {
163        self.value_builder.as_slice()
164    }
165
166    /// Returns the current offsets buffer as a slice
167    pub fn offsets_slice(&self) -> &[T::Offset] {
168        self.offsets_builder.as_slice()
169    }
170
171    /// Returns the current null buffer as a slice
172    pub fn validity_slice(&self) -> Option<&[u8]> {
173        self.null_buffer_builder.as_slice()
174    }
175
176    /// Returns the current null buffer as a mutable slice
177    pub fn validity_slice_mut(&mut self) -> Option<&mut [u8]> {
178        self.null_buffer_builder.as_slice_mut()
179    }
180}
181
182impl<T: ByteArrayType> std::fmt::Debug for GenericByteBuilder<T> {
183    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
184        write!(f, "{}{}Builder", T::Offset::PREFIX, T::PREFIX)?;
185        f.debug_struct("")
186            .field("value_builder", &self.value_builder)
187            .field("offsets_builder", &self.offsets_builder)
188            .field("null_buffer_builder", &self.null_buffer_builder)
189            .finish()
190    }
191}
192
193impl<T: ByteArrayType> Default for GenericByteBuilder<T> {
194    fn default() -> Self {
195        Self::new()
196    }
197}
198
199impl<T: ByteArrayType> ArrayBuilder for GenericByteBuilder<T> {
200    /// Returns the number of binary slots in the builder
201    fn len(&self) -> usize {
202        self.null_buffer_builder.len()
203    }
204
205    /// Builds the array and reset this builder.
206    fn finish(&mut self) -> ArrayRef {
207        Arc::new(self.finish())
208    }
209
210    /// Builds the array without resetting the builder.
211    fn finish_cloned(&self) -> ArrayRef {
212        Arc::new(self.finish_cloned())
213    }
214
215    /// Returns the builder as a non-mutable `Any` reference.
216    fn as_any(&self) -> &dyn Any {
217        self
218    }
219
220    /// Returns the builder as a mutable `Any` reference.
221    fn as_any_mut(&mut self) -> &mut dyn Any {
222        self
223    }
224
225    /// Returns the boxed builder as a box of `Any`.
226    fn into_box_any(self: Box<Self>) -> Box<dyn Any> {
227        self
228    }
229}
230
231impl<T: ByteArrayType, V: AsRef<T::Native>> Extend<Option<V>> for GenericByteBuilder<T> {
232    #[inline]
233    fn extend<I: IntoIterator<Item = Option<V>>>(&mut self, iter: I) {
234        for v in iter {
235            self.append_option(v)
236        }
237    }
238}
239
240/// Array builder for [`GenericStringArray`][crate::GenericStringArray]
241///
242/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
243/// [`GenericByteBuilder::append_null`].
244///
245/// This builder also implements [`std::fmt::Write`] with any written data
246/// included in the next appended value. This allows using [`std::fmt::Display`]
247/// with standard Rust idioms like `write!` and `writeln!` to write data
248/// directly to the builder without intermediate allocations.
249///
250/// # Example writing strings with `append_value`
251/// ```
252/// # use arrow_array::builder::GenericStringBuilder;
253/// let mut builder = GenericStringBuilder::<i32>::new();
254///
255/// // Write one string value
256/// builder.append_value("foobarbaz");
257///
258/// // Write a second string
259/// builder.append_value("v2");
260///
261/// let array = builder.finish();
262/// assert_eq!(array.value(0), "foobarbaz");
263/// assert_eq!(array.value(1), "v2");
264/// ```
265///
266/// # Example incrementally writing strings with `std::fmt::Write`
267///
268/// ```
269/// # use std::fmt::Write;
270/// # use arrow_array::builder::GenericStringBuilder;
271/// let mut builder = GenericStringBuilder::<i32>::new();
272///
273/// // Write data in multiple `write!` calls
274/// write!(builder, "foo").unwrap();
275/// write!(builder, "bar").unwrap();
276/// // The next call to append_value finishes the current string
277/// // including all previously written strings.
278/// builder.append_value("baz");
279///
280/// // Write second value with a single write call
281/// write!(builder, "v2").unwrap();
282/// // finish the value by calling append_value with an empty string
283/// builder.append_value("");
284///
285/// let array = builder.finish();
286/// assert_eq!(array.value(0), "foobarbaz");
287/// assert_eq!(array.value(1), "v2");
288/// ```
289pub type GenericStringBuilder<O> = GenericByteBuilder<GenericStringType<O>>;
290
291impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
292    fn write_str(&mut self, s: &str) -> std::fmt::Result {
293        self.value_builder.append_slice(s.as_bytes());
294        Ok(())
295    }
296}
297
298///  Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
299///
300/// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
301/// [`GenericByteBuilder::append_null`].
302///
303/// # Example
304/// ```
305/// # use arrow_array::builder::GenericBinaryBuilder;
306/// let mut builder = GenericBinaryBuilder::<i32>::new();
307///
308/// // Write data
309/// builder.append_value("foo");
310///
311/// // Write second value
312/// builder.append_value(&[0,1,2]);
313///
314/// let array = builder.finish();
315/// // binary values
316/// assert_eq!(array.value(0), b"foo");
317/// assert_eq!(array.value(1), b"\x00\x01\x02");
318/// ```
319///
320/// # Example incrementally writing bytes with `write_bytes`
321///
322/// ```
323/// # use std::io::Write;
324/// # use arrow_array::builder::GenericBinaryBuilder;
325/// let mut builder = GenericBinaryBuilder::<i32>::new();
326///
327/// // Write data in multiple `write_bytes` calls
328/// write!(builder, "foo").unwrap();
329/// write!(builder, "bar").unwrap();
330/// // The next call to append_value finishes the current string
331/// // including all previously written strings.
332/// builder.append_value("baz");
333///
334/// // Write second value with a single write call
335/// write!(builder, "v2").unwrap();
336/// // finish the value by calling append_value with an empty string
337/// builder.append_value("");
338///
339/// let array = builder.finish();
340/// assert_eq!(array.value(0), "foobarbaz".as_bytes());
341/// assert_eq!(array.value(1), "v2".as_bytes());
342/// ```
343pub type GenericBinaryBuilder<O> = GenericByteBuilder<GenericBinaryType<O>>;
344
345impl<O: OffsetSizeTrait> std::io::Write for GenericBinaryBuilder<O> {
346    fn write(&mut self, bs: &[u8]) -> std::io::Result<usize> {
347        self.value_builder.append_slice(bs);
348        Ok(bs.len())
349    }
350
351    fn flush(&mut self) -> std::io::Result<()> {
352        Ok(())
353    }
354}
355
#[cfg(test)]
mod tests {
    use super::*;
    use crate::array::Array;
    use crate::GenericStringArray;
    use std::fmt::Write as _;
    use std::io::Write as _;

    /// Values, an empty value and a null round-trip through the binary builder.
    fn _test_generic_binary_builder<O: OffsetSizeTrait>() {
        let mut builder = GenericBinaryBuilder::<O>::new();

        builder.append_value(b"hello");
        builder.append_value(b"");
        builder.append_null();
        builder.append_value(b"rust");

        let array = builder.finish();

        assert_eq!(4, array.len());
        assert_eq!(1, array.null_count());
        assert_eq!(b"hello", array.value(0));
        assert_eq!([] as [u8; 0], array.value(1));
        assert!(array.is_null(2));
        assert_eq!(b"rust", array.value(3));
        assert_eq!(O::from_usize(5).unwrap(), array.value_offsets()[2]);
        assert_eq!(O::from_usize(4).unwrap(), array.value_length(3));
    }

    #[test]
    fn test_binary_builder() {
        _test_generic_binary_builder::<i32>()
    }

    #[test]
    fn test_large_binary_builder() {
        _test_generic_binary_builder::<i64>()
    }

    /// A builder that only ever received nulls still reports the right length.
    fn _test_generic_binary_builder_all_nulls<O: OffsetSizeTrait>() {
        let mut builder = GenericBinaryBuilder::<O>::new();
        builder.append_null();
        builder.append_null();
        builder.append_null();
        assert_eq!(3, builder.len());
        assert!(!builder.is_empty());

        let array = builder.finish();
        assert_eq!(3, array.null_count());
        assert_eq!(3, array.len());
        assert!(array.is_null(0));
        assert!(array.is_null(1));
        assert!(array.is_null(2));
    }

    #[test]
    fn test_binary_builder_all_nulls() {
        _test_generic_binary_builder_all_nulls::<i32>()
    }

    #[test]
    fn test_large_binary_builder_all_nulls() {
        _test_generic_binary_builder_all_nulls::<i64>()
    }

    /// `finish` resets the builder so a second, independent array can be built.
    fn _test_generic_binary_builder_reset<O: OffsetSizeTrait>() {
        let mut builder = GenericBinaryBuilder::<O>::new();

        builder.append_value(b"hello");
        builder.append_value(b"");
        builder.append_null();
        builder.append_value(b"rust");
        builder.finish();

        assert!(builder.is_empty());

        builder.append_value(b"parquet");
        builder.append_null();
        builder.append_value(b"arrow");
        builder.append_value(b"");
        let array = builder.finish();

        assert_eq!(4, array.len());
        assert_eq!(1, array.null_count());
        assert_eq!(b"parquet", array.value(0));
        assert!(array.is_null(1));
        assert_eq!(b"arrow", array.value(2));
        // Slot 3 holds the trailing empty value; slot 1 is the null checked above.
        assert_eq!(b"", array.value(3));
        assert_eq!(O::zero(), array.value_offsets()[0]);
        assert_eq!(O::from_usize(7).unwrap(), array.value_offsets()[2]);
        assert_eq!(O::from_usize(5).unwrap(), array.value_length(2));
    }

    #[test]
    fn test_binary_builder_reset() {
        _test_generic_binary_builder_reset::<i32>()
    }

    #[test]
    fn test_large_binary_builder_reset() {
        _test_generic_binary_builder_reset::<i64>()
    }

    /// Borrowed, owned and optional inputs all produce the expected string array.
    fn _test_generic_string_array_builder<O: OffsetSizeTrait>() {
        let mut builder = GenericStringBuilder::<O>::new();
        let owned = "arrow".to_owned();

        builder.append_value("hello");
        builder.append_value("");
        builder.append_value(&owned);
        builder.append_null();
        builder.append_option(Some("rust"));
        builder.append_option(None::<&str>);
        builder.append_option(None::<String>);
        assert_eq!(7, builder.len());

        assert_eq!(
            GenericStringArray::<O>::from(vec![
                Some("hello"),
                Some(""),
                Some("arrow"),
                None,
                Some("rust"),
                None,
                None
            ]),
            builder.finish()
        );
    }

    #[test]
    fn test_string_array_builder() {
        _test_generic_string_array_builder::<i32>()
    }

    #[test]
    fn test_large_string_array_builder() {
        _test_generic_string_array_builder::<i64>()
    }

    /// After `finish` the builder is empty and holds only the initial zero offset.
    fn _test_generic_string_array_builder_finish<O: OffsetSizeTrait>() {
        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);

        builder.append_value("hello");
        builder.append_value("rust");
        builder.append_null();

        builder.finish();
        assert!(builder.is_empty());
        assert_eq!(&[O::zero()], builder.offsets_slice());

        builder.append_value("arrow");
        builder.append_value("parquet");
        let arr = builder.finish();
        // array should not have null buffer because there is not `null` value.
        assert!(arr.nulls().is_none());
        assert_eq!(GenericStringArray::<O>::from(vec!["arrow", "parquet"]), arr,)
    }

    #[test]
    fn test_string_array_builder_finish() {
        _test_generic_string_array_builder_finish::<i32>()
    }

    #[test]
    fn test_large_string_array_builder_finish() {
        _test_generic_string_array_builder_finish::<i64>()
    }

    /// `finish_cloned` leaves the builder's contents in place for further appends.
    fn _test_generic_string_array_builder_finish_cloned<O: OffsetSizeTrait>() {
        let mut builder = GenericStringBuilder::<O>::with_capacity(3, 11);

        builder.append_value("hello");
        builder.append_value("rust");
        builder.append_null();

        let mut arr = builder.finish_cloned();
        assert!(!builder.is_empty());
        assert_eq!(3, arr.len());

        builder.append_value("arrow");
        builder.append_value("parquet");
        arr = builder.finish();

        assert!(arr.nulls().is_some());
        assert_eq!(&[O::zero()], builder.offsets_slice());
        assert_eq!(5, arr.len());
    }

    #[test]
    fn test_string_array_builder_finish_cloned() {
        _test_generic_string_array_builder_finish_cloned::<i32>()
    }

    #[test]
    fn test_large_string_array_builder_finish_cloned() {
        _test_generic_string_array_builder_finish_cloned::<i64>()
    }

    #[test]
    fn test_extend() {
        let mut builder = GenericStringBuilder::<i32>::new();
        builder.extend(["a", "b", "c", "", "a", "b", "c"].into_iter().map(Some));
        builder.extend(["d", "cupcakes", "hello"].into_iter().map(Some));
        let array = builder.finish();
        assert_eq!(array.value_offsets(), &[0, 1, 2, 3, 3, 4, 5, 6, 7, 15, 20]);
        assert_eq!(array.value_data(), b"abcabcdcupcakeshello");
    }

    #[test]
    fn test_write_str() {
        let mut builder = GenericStringBuilder::<i32>::new();
        write!(builder, "foo").unwrap();
        builder.append_value("");
        writeln!(builder, "bar").unwrap();
        builder.append_value("");
        write!(builder, "fiz").unwrap();
        write!(builder, "buz").unwrap();
        builder.append_value("");
        let a = builder.finish();
        let r: Vec<_> = a.iter().flatten().collect();
        assert_eq!(r, &["foo", "bar\n", "fizbuz"])
    }

    #[test]
    fn test_write_bytes() {
        let mut builder = GenericBinaryBuilder::<i32>::new();
        write!(builder, "foo").unwrap();
        builder.append_value("");
        writeln!(builder, "bar").unwrap();
        builder.append_value("");
        write!(builder, "fiz").unwrap();
        write!(builder, "buz").unwrap();
        builder.append_value("");
        let a = builder.finish();
        let r: Vec<_> = a.iter().flatten().collect();
        assert_eq!(
            r,
            &["foo".as_bytes(), "bar\n".as_bytes(), "fizbuz".as_bytes()]
        )
    }
}