arrow_data/transform/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Low-level array data abstractions.
19//!
20//! Provides utilities for creating, manipulating, and converting Arrow arrays
21//! made of primitive types, strings, and nested types.
22
23use super::{data::new_buffers, ArrayData, ArrayDataBuilder, ByteView};
24use crate::bit_mask::set_bits;
25use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
26use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer};
27use arrow_schema::{ArrowError, DataType, IntervalUnit, UnionMode};
28use half::f16;
29use num::Integer;
30use std::mem;
31
32mod boolean;
33mod fixed_binary;
34mod fixed_size_list;
35mod list;
36mod null;
37mod primitive;
38mod structure;
39mod union;
40mod utils;
41mod variable_size;
42
// function that extends the validity bitmap of the mutable array with the bits
// of one source array. It is called as `(mutable, start, len)`, i.e. it handles
// the source slots `[start..start+len]`.
type ExtendNullBits<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize) + 'a>;
// function that extends `[start..start+len]` to the mutable array.
// this is dynamic because different data_types influence how buffers and children are extended.
// It is called as `(mutable, index, start, len)` where `index` is the position of
// the source array in the list of input arrays.
type Extend<'a> = Box<dyn Fn(&mut _MutableArrayData, usize, usize, usize) + 'a>;

// function that extends the mutable array with null slots; called as `(mutable, len)`.
// It does not read from any input array and therefore carries no lifetime.
type ExtendNulls = Box<dyn Fn(&mut _MutableArrayData, usize)>;
49
/// A mutable [ArrayData] that knows how to freeze itself into an [ArrayData].
/// This is just a data container.
///
/// The `Extend*` closures stored in [MutableArrayData] receive a `&mut` to this
/// struct and append to its buffers and children.
#[derive(Debug)]
struct _MutableArrayData<'a> {
    /// Data type of the array being built.
    pub data_type: DataType,
    /// Number of null slots written so far.
    pub null_count: usize,

    /// Number of slots written so far.
    pub len: usize,
    /// Validity bitmap under construction; `None` when the array is built
    /// without null support.
    pub null_buffer: Option<MutableBuffer>,

    // arrow specification only allows up to 3 buffers (2 ignoring the nulls above).
    // Thus, we place them in the stack to avoid bound checks and greater data locality.
    /// First value buffer (its content depends on `data_type`).
    pub buffer1: MutableBuffer,
    /// Second value buffer; only emitted for the data types that need two
    /// buffers (see [MutableArrayData::into_builder]).
    pub buffer2: MutableBuffer,
    /// In-progress children of nested types.
    pub child_data: Vec<MutableArrayData<'a>>,
}
66
67impl _MutableArrayData<'_> {
68    fn null_buffer(&mut self) -> &mut MutableBuffer {
69        self.null_buffer
70            .as_mut()
71            .expect("MutableArrayData not nullable")
72    }
73}
74
75fn build_extend_null_bits(array: &ArrayData, use_nulls: bool) -> ExtendNullBits {
76    if let Some(nulls) = array.nulls() {
77        let bytes = nulls.validity();
78        Box::new(move |mutable, start, len| {
79            let mutable_len = mutable.len;
80            let out = mutable.null_buffer();
81            utils::resize_for_bits(out, mutable_len + len);
82            mutable.null_count += set_bits(
83                out.as_slice_mut(),
84                bytes,
85                mutable_len,
86                nulls.offset() + start,
87                len,
88            );
89        })
90    } else if use_nulls {
91        Box::new(|mutable, _, len| {
92            let mutable_len = mutable.len;
93            let out = mutable.null_buffer();
94            utils::resize_for_bits(out, mutable_len + len);
95            let write_data = out.as_slice_mut();
96            (0..len).for_each(|i| {
97                bit_util::set_bit(write_data, mutable_len + i);
98            });
99        })
100    } else {
101        Box::new(|_, _, _| {})
102    }
103}
104
/// Efficiently create an [ArrayData] from one or more existing [ArrayData]s by
/// copying chunks.
///
/// The main use case of this struct is to perform unary operations to arrays of
/// arbitrary types, such as `filter` and `take`.
///
/// # Example
/// ```
/// use arrow_buffer::Buffer;
/// use arrow_data::ArrayData;
/// use arrow_data::transform::MutableArrayData;
/// use arrow_schema::DataType;
/// fn i32_array(values: &[i32]) -> ArrayData {
///   ArrayData::try_new(DataType::Int32, values.len(), None, 0, vec![Buffer::from_slice_ref(values)], vec![]).unwrap()
/// }
/// let arr1  = i32_array(&[1, 2, 3, 4, 5]);
/// let arr2  = i32_array(&[6, 7, 8, 9, 10]);
/// // Create a mutable array for copying values from arr1 and arr2, with a capacity for 6 elements
/// let mut mutable = MutableArrayData::new(vec![&arr1, &arr2], false, 6);
/// // Copy the first 3 elements from arr1
/// mutable.extend(0, 0, 3);
/// // Copy the last 3 elements from arr2 (the end index is exclusive)
/// mutable.extend(1, 2, 5);
/// // Complete the MutableArrayData into a new ArrayData
/// let frozen = mutable.freeze();
/// assert_eq!(frozen, i32_array(&[1, 2, 3, 8, 9, 10]));
/// ```
pub struct MutableArrayData<'a> {
    /// Input arrays: the data being read FROM.
    ///
    /// Note this is "dead code" because all actual references to the arrays are
    /// stored in closures for extending values and nulls.
    #[allow(dead_code)]
    arrays: Vec<&'a ArrayData>,

    /// In progress output array: The data being written TO
    ///
    /// Note these fields are in a separate struct, [_MutableArrayData], as they
    /// cannot be in [MutableArrayData] itself due to mutability invariants (interior
    /// mutability): [MutableArrayData] contains a function that can only mutate
    /// [_MutableArrayData], not [MutableArrayData] itself
    data: _MutableArrayData<'a>,

    /// The child data of the `Array` in Dictionary arrays.
    ///
    /// This is not stored in `_MutableArrayData` because these values are
    /// constant and only needed at the end, when freezing [_MutableArrayData].
    dictionary: Option<ArrayData>,

    /// Variadic data buffers referenced by views.
    ///
    /// Note this is not stored in `_MutableArrayData` because these values
    /// are constant and only needed at the end, when freezing
    /// [_MutableArrayData]
    variadic_data_buffers: Vec<Buffer>,

    /// Functions used to extend the output array with values from the input
    /// arrays, one per input array (indexed like `arrays`).
    ///
    /// This function's lifetime is bound to the input arrays because it reads
    /// values from them.
    extend_values: Vec<Extend<'a>>,

    /// Functions used to extend the output array with nulls from the input
    /// arrays, one per input array (indexed like `arrays`).
    ///
    /// This function's lifetime is bound to the input arrays because it reads
    /// nulls from it.
    extend_null_bits: Vec<ExtendNullBits<'a>>,

    /// function used to extend the output array with null elements.
    ///
    /// This function is independent of the arrays and therefore has no lifetime.
    extend_nulls: ExtendNulls,
}
179
180impl std::fmt::Debug for MutableArrayData<'_> {
181    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
182        // ignores the closures.
183        f.debug_struct("MutableArrayData")
184            .field("data", &self.data)
185            .finish()
186    }
187}
188
189/// Builds an extend that adds `offset` to the source primitive
190/// Additionally validates that `max` fits into the
191/// the underlying primitive returning None if not
192fn build_extend_dictionary(array: &ArrayData, offset: usize, max: usize) -> Option<Extend> {
193    macro_rules! validate_and_build {
194        ($dt: ty) => {{
195            let _: $dt = max.try_into().ok()?;
196            let offset: $dt = offset.try_into().ok()?;
197            Some(primitive::build_extend_with_offset(array, offset))
198        }};
199    }
200    match array.data_type() {
201        DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
202            DataType::UInt8 => validate_and_build!(u8),
203            DataType::UInt16 => validate_and_build!(u16),
204            DataType::UInt32 => validate_and_build!(u32),
205            DataType::UInt64 => validate_and_build!(u64),
206            DataType::Int8 => validate_and_build!(i8),
207            DataType::Int16 => validate_and_build!(i16),
208            DataType::Int32 => validate_and_build!(i32),
209            DataType::Int64 => validate_and_build!(i64),
210            _ => unreachable!(),
211        },
212        _ => None,
213    }
214}
215
216/// Builds an extend that adds `buffer_offset` to any buffer indices encountered
217fn build_extend_view(array: &ArrayData, buffer_offset: u32) -> Extend {
218    let views = array.buffer::<u128>(0);
219    Box::new(
220        move |mutable: &mut _MutableArrayData, _, start: usize, len: usize| {
221            mutable
222                .buffer1
223                .extend(views[start..start + len].iter().map(|v| {
224                    let len = *v as u32;
225                    if len <= 12 {
226                        return *v; // Stored inline
227                    }
228                    let mut view = ByteView::from(*v);
229                    view.buffer_index += buffer_offset;
230                    view.into()
231                }))
232        },
233    )
234}
235
236fn build_extend(array: &ArrayData) -> Extend {
237    match array.data_type() {
238        DataType::Null => null::build_extend(array),
239        DataType::Boolean => boolean::build_extend(array),
240        DataType::UInt8 => primitive::build_extend::<u8>(array),
241        DataType::UInt16 => primitive::build_extend::<u16>(array),
242        DataType::UInt32 => primitive::build_extend::<u32>(array),
243        DataType::UInt64 => primitive::build_extend::<u64>(array),
244        DataType::Int8 => primitive::build_extend::<i8>(array),
245        DataType::Int16 => primitive::build_extend::<i16>(array),
246        DataType::Int32 => primitive::build_extend::<i32>(array),
247        DataType::Int64 => primitive::build_extend::<i64>(array),
248        DataType::Float32 => primitive::build_extend::<f32>(array),
249        DataType::Float64 => primitive::build_extend::<f64>(array),
250        DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => {
251            primitive::build_extend::<i32>(array)
252        }
253        DataType::Date64
254        | DataType::Time64(_)
255        | DataType::Timestamp(_, _)
256        | DataType::Duration(_)
257        | DataType::Interval(IntervalUnit::DayTime) => primitive::build_extend::<i64>(array),
258        DataType::Interval(IntervalUnit::MonthDayNano) => primitive::build_extend::<i128>(array),
259        DataType::Decimal128(_, _) => primitive::build_extend::<i128>(array),
260        DataType::Decimal256(_, _) => primitive::build_extend::<i256>(array),
261        DataType::Utf8 | DataType::Binary => variable_size::build_extend::<i32>(array),
262        DataType::LargeUtf8 | DataType::LargeBinary => variable_size::build_extend::<i64>(array),
263        DataType::BinaryView | DataType::Utf8View => unreachable!("should use build_extend_view"),
264        DataType::Map(_, _) | DataType::List(_) => list::build_extend::<i32>(array),
265        DataType::ListView(_) | DataType::LargeListView(_) => {
266            unimplemented!("ListView/LargeListView not implemented")
267        }
268        DataType::LargeList(_) => list::build_extend::<i64>(array),
269        DataType::Dictionary(_, _) => unreachable!("should use build_extend_dictionary"),
270        DataType::Struct(_) => structure::build_extend(array),
271        DataType::FixedSizeBinary(_) => fixed_binary::build_extend(array),
272        DataType::Float16 => primitive::build_extend::<f16>(array),
273        DataType::FixedSizeList(_, _) => fixed_size_list::build_extend(array),
274        DataType::Union(_, mode) => match mode {
275            UnionMode::Sparse => union::build_extend_sparse(array),
276            UnionMode::Dense => union::build_extend_dense(array),
277        },
278        DataType::RunEndEncoded(_, _) => todo!(),
279    }
280}
281
282fn build_extend_nulls(data_type: &DataType) -> ExtendNulls {
283    Box::new(match data_type {
284        DataType::Null => null::extend_nulls,
285        DataType::Boolean => boolean::extend_nulls,
286        DataType::UInt8 => primitive::extend_nulls::<u8>,
287        DataType::UInt16 => primitive::extend_nulls::<u16>,
288        DataType::UInt32 => primitive::extend_nulls::<u32>,
289        DataType::UInt64 => primitive::extend_nulls::<u64>,
290        DataType::Int8 => primitive::extend_nulls::<i8>,
291        DataType::Int16 => primitive::extend_nulls::<i16>,
292        DataType::Int32 => primitive::extend_nulls::<i32>,
293        DataType::Int64 => primitive::extend_nulls::<i64>,
294        DataType::Float32 => primitive::extend_nulls::<f32>,
295        DataType::Float64 => primitive::extend_nulls::<f64>,
296        DataType::Date32 | DataType::Time32(_) | DataType::Interval(IntervalUnit::YearMonth) => {
297            primitive::extend_nulls::<i32>
298        }
299        DataType::Date64
300        | DataType::Time64(_)
301        | DataType::Timestamp(_, _)
302        | DataType::Duration(_)
303        | DataType::Interval(IntervalUnit::DayTime) => primitive::extend_nulls::<i64>,
304        DataType::Interval(IntervalUnit::MonthDayNano) => primitive::extend_nulls::<i128>,
305        DataType::Decimal128(_, _) => primitive::extend_nulls::<i128>,
306        DataType::Decimal256(_, _) => primitive::extend_nulls::<i256>,
307        DataType::Utf8 | DataType::Binary => variable_size::extend_nulls::<i32>,
308        DataType::LargeUtf8 | DataType::LargeBinary => variable_size::extend_nulls::<i64>,
309        DataType::BinaryView | DataType::Utf8View => primitive::extend_nulls::<u128>,
310        DataType::Map(_, _) | DataType::List(_) => list::extend_nulls::<i32>,
311        DataType::ListView(_) | DataType::LargeListView(_) => {
312            unimplemented!("ListView/LargeListView not implemented")
313        }
314        DataType::LargeList(_) => list::extend_nulls::<i64>,
315        DataType::Dictionary(child_data_type, _) => match child_data_type.as_ref() {
316            DataType::UInt8 => primitive::extend_nulls::<u8>,
317            DataType::UInt16 => primitive::extend_nulls::<u16>,
318            DataType::UInt32 => primitive::extend_nulls::<u32>,
319            DataType::UInt64 => primitive::extend_nulls::<u64>,
320            DataType::Int8 => primitive::extend_nulls::<i8>,
321            DataType::Int16 => primitive::extend_nulls::<i16>,
322            DataType::Int32 => primitive::extend_nulls::<i32>,
323            DataType::Int64 => primitive::extend_nulls::<i64>,
324            _ => unreachable!(),
325        },
326        DataType::Struct(_) => structure::extend_nulls,
327        DataType::FixedSizeBinary(_) => fixed_binary::extend_nulls,
328        DataType::Float16 => primitive::extend_nulls::<f16>,
329        DataType::FixedSizeList(_, _) => fixed_size_list::extend_nulls,
330        DataType::Union(_, mode) => match mode {
331            UnionMode::Sparse => union::extend_nulls_sparse,
332            UnionMode::Dense => union::extend_nulls_dense,
333        },
334        DataType::RunEndEncoded(_, _) => todo!(),
335    })
336}
337
338fn preallocate_offset_and_binary_buffer<Offset: ArrowNativeType + Integer>(
339    capacity: usize,
340    binary_size: usize,
341) -> [MutableBuffer; 2] {
342    // offsets
343    let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<Offset>());
344    // safety: `unsafe` code assumes that this buffer is initialized with one element
345    buffer.push(Offset::zero());
346
347    [
348        buffer,
349        MutableBuffer::new(binary_size * mem::size_of::<u8>()),
350    ]
351}
352
/// Define capacities to pre-allocate for child data or data buffers.
///
/// Unless noted otherwise, capacities are expressed in number of slots
/// (elements), not in bytes.
#[derive(Debug, Clone)]
pub enum Capacities {
    /// Binary, LargeBinary, Utf8 and LargeUtf8 data types
    ///
    /// Defines
    /// * the capacity of the array offsets
    /// * the capacity of the binary/ str buffer, in bytes
    Binary(usize, Option<usize>),
    /// List and LargeList data types
    ///
    /// Defines
    /// * the capacity of the array offsets
    /// * the capacity of the child data
    List(usize, Option<Box<Capacities>>),
    /// Struct type
    ///
    /// Defines
    /// * the capacity of the array
    /// * the capacities of the fields
    Struct(usize, Option<Vec<Capacities>>),
    /// Dictionary type
    ///
    /// Defines
    /// * the capacity of the array/keys
    /// * the capacity of the values
    ///
    /// Note: [`MutableArrayData::with_capacities`] does not support this
    /// variant yet and panics when it is passed.
    Dictionary(usize, Option<Box<Capacities>>),
    /// Don't preallocate inner buffers and rely on array growth strategy
    ///
    /// Defines
    /// * the capacity of the array
    Array(usize),
}
383
384impl<'a> MutableArrayData<'a> {
385    /// Returns a new [MutableArrayData] with capacity to `capacity` slots and
386    /// specialized to create an [ArrayData] from multiple `arrays`.
387    ///
388    /// # Arguments
389    /// * `arrays` - the source arrays to copy from
390    /// * `use_nulls` - a flag used to optimize insertions
391    ///   - `false` if the only source of nulls are the arrays themselves
392    ///   - `true` if the user plans to call [MutableArrayData::extend_nulls].
393    /// * capacity - the preallocated capacity of the output array, in bytes
394    ///
395    /// Thus, if `use_nulls` is `false`, calling
396    /// [MutableArrayData::extend_nulls] should not be used.
397    pub fn new(arrays: Vec<&'a ArrayData>, use_nulls: bool, capacity: usize) -> Self {
398        Self::with_capacities(arrays, use_nulls, Capacities::Array(capacity))
399    }
400
401    /// Similar to [MutableArrayData::new], but lets users define the
402    /// preallocated capacities of the array with more granularity.
403    ///
404    /// See [MutableArrayData::new] for more information on the arguments.
405    ///
406    /// # Panics
407    ///
408    /// This function panics if the given `capacities` don't match the data type
409    /// of `arrays`. Or when a [Capacities] variant is not yet supported.
410    pub fn with_capacities(
411        arrays: Vec<&'a ArrayData>,
412        use_nulls: bool,
413        capacities: Capacities,
414    ) -> Self {
415        let data_type = arrays[0].data_type();
416
417        for a in arrays.iter().skip(1) {
418            assert_eq!(
419                data_type,
420                a.data_type(),
421                "Arrays with inconsistent types passed to MutableArrayData"
422            )
423        }
424
425        // if any of the arrays has nulls, insertions from any array requires setting bits
426        // as there is at least one array with nulls.
427        let use_nulls = use_nulls | arrays.iter().any(|array| array.null_count() > 0);
428
429        let mut array_capacity;
430
431        let [buffer1, buffer2] = match (data_type, &capacities) {
432            (
433                DataType::LargeUtf8 | DataType::LargeBinary,
434                Capacities::Binary(capacity, Some(value_cap)),
435            ) => {
436                array_capacity = *capacity;
437                preallocate_offset_and_binary_buffer::<i64>(*capacity, *value_cap)
438            }
439            (DataType::Utf8 | DataType::Binary, Capacities::Binary(capacity, Some(value_cap))) => {
440                array_capacity = *capacity;
441                preallocate_offset_and_binary_buffer::<i32>(*capacity, *value_cap)
442            }
443            (_, Capacities::Array(capacity)) => {
444                array_capacity = *capacity;
445                new_buffers(data_type, *capacity)
446            }
447            (
448                DataType::List(_) | DataType::LargeList(_) | DataType::FixedSizeList(_, _),
449                Capacities::List(capacity, _),
450            ) => {
451                array_capacity = *capacity;
452                new_buffers(data_type, *capacity)
453            }
454            _ => panic!("Capacities: {capacities:?} not yet supported"),
455        };
456
457        let child_data = match &data_type {
458            DataType::Decimal128(_, _)
459            | DataType::Decimal256(_, _)
460            | DataType::Null
461            | DataType::Boolean
462            | DataType::UInt8
463            | DataType::UInt16
464            | DataType::UInt32
465            | DataType::UInt64
466            | DataType::Int8
467            | DataType::Int16
468            | DataType::Int32
469            | DataType::Int64
470            | DataType::Float16
471            | DataType::Float32
472            | DataType::Float64
473            | DataType::Date32
474            | DataType::Date64
475            | DataType::Time32(_)
476            | DataType::Time64(_)
477            | DataType::Duration(_)
478            | DataType::Timestamp(_, _)
479            | DataType::Utf8
480            | DataType::Binary
481            | DataType::LargeUtf8
482            | DataType::LargeBinary
483            | DataType::BinaryView
484            | DataType::Utf8View
485            | DataType::Interval(_)
486            | DataType::FixedSizeBinary(_) => vec![],
487            DataType::ListView(_) | DataType::LargeListView(_) => {
488                unimplemented!("ListView/LargeListView not implemented")
489            }
490            DataType::Map(_, _) | DataType::List(_) | DataType::LargeList(_) => {
491                let children = arrays
492                    .iter()
493                    .map(|array| &array.child_data()[0])
494                    .collect::<Vec<_>>();
495
496                let capacities =
497                    if let Capacities::List(capacity, ref child_capacities) = capacities {
498                        child_capacities
499                            .clone()
500                            .map(|c| *c)
501                            .unwrap_or(Capacities::Array(capacity))
502                    } else {
503                        Capacities::Array(array_capacity)
504                    };
505
506                vec![MutableArrayData::with_capacities(
507                    children, use_nulls, capacities,
508                )]
509            }
510            // the dictionary type just appends keys and clones the values.
511            DataType::Dictionary(_, _) => vec![],
512            DataType::Struct(fields) => match capacities {
513                Capacities::Struct(capacity, Some(ref child_capacities)) => {
514                    array_capacity = capacity;
515                    (0..fields.len())
516                        .zip(child_capacities)
517                        .map(|(i, child_cap)| {
518                            let child_arrays = arrays
519                                .iter()
520                                .map(|array| &array.child_data()[i])
521                                .collect::<Vec<_>>();
522                            MutableArrayData::with_capacities(
523                                child_arrays,
524                                use_nulls,
525                                child_cap.clone(),
526                            )
527                        })
528                        .collect::<Vec<_>>()
529                }
530                Capacities::Struct(capacity, None) => {
531                    array_capacity = capacity;
532                    (0..fields.len())
533                        .map(|i| {
534                            let child_arrays = arrays
535                                .iter()
536                                .map(|array| &array.child_data()[i])
537                                .collect::<Vec<_>>();
538                            MutableArrayData::new(child_arrays, use_nulls, capacity)
539                        })
540                        .collect::<Vec<_>>()
541                }
542                _ => (0..fields.len())
543                    .map(|i| {
544                        let child_arrays = arrays
545                            .iter()
546                            .map(|array| &array.child_data()[i])
547                            .collect::<Vec<_>>();
548                        MutableArrayData::new(child_arrays, use_nulls, array_capacity)
549                    })
550                    .collect::<Vec<_>>(),
551            },
552            DataType::RunEndEncoded(_, _) => {
553                let run_ends_child = arrays
554                    .iter()
555                    .map(|array| &array.child_data()[0])
556                    .collect::<Vec<_>>();
557                let value_child = arrays
558                    .iter()
559                    .map(|array| &array.child_data()[1])
560                    .collect::<Vec<_>>();
561                vec![
562                    MutableArrayData::new(run_ends_child, false, array_capacity),
563                    MutableArrayData::new(value_child, use_nulls, array_capacity),
564                ]
565            }
566            DataType::FixedSizeList(_, size) => {
567                let children = arrays
568                    .iter()
569                    .map(|array| &array.child_data()[0])
570                    .collect::<Vec<_>>();
571                let capacities =
572                    if let Capacities::List(capacity, ref child_capacities) = capacities {
573                        child_capacities
574                            .clone()
575                            .map(|c| *c)
576                            .unwrap_or(Capacities::Array(capacity * *size as usize))
577                    } else {
578                        Capacities::Array(array_capacity * *size as usize)
579                    };
580                vec![MutableArrayData::with_capacities(
581                    children, use_nulls, capacities,
582                )]
583            }
584            DataType::Union(fields, _) => (0..fields.len())
585                .map(|i| {
586                    let child_arrays = arrays
587                        .iter()
588                        .map(|array| &array.child_data()[i])
589                        .collect::<Vec<_>>();
590                    MutableArrayData::new(child_arrays, use_nulls, array_capacity)
591                })
592                .collect::<Vec<_>>(),
593        };
594
595        // Get the dictionary if any, and if it is a concatenation of multiple
596        let (dictionary, dict_concat) = match &data_type {
597            DataType::Dictionary(_, _) => {
598                // If more than one dictionary, concatenate dictionaries together
599                let dict_concat = !arrays
600                    .windows(2)
601                    .all(|a| a[0].child_data()[0].ptr_eq(&a[1].child_data()[0]));
602
603                match dict_concat {
604                    false => (Some(arrays[0].child_data()[0].clone()), false),
605                    true => {
606                        if let Capacities::Dictionary(_, _) = capacities {
607                            panic!("dictionary capacity not yet supported")
608                        }
609                        let dictionaries: Vec<_> =
610                            arrays.iter().map(|array| &array.child_data()[0]).collect();
611                        let lengths: Vec<_> = dictionaries
612                            .iter()
613                            .map(|dictionary| dictionary.len())
614                            .collect();
615                        let capacity = lengths.iter().sum();
616
617                        let mut mutable = MutableArrayData::new(dictionaries, false, capacity);
618
619                        for (i, len) in lengths.iter().enumerate() {
620                            mutable.extend(i, 0, *len)
621                        }
622
623                        (Some(mutable.freeze()), true)
624                    }
625                }
626            }
627            _ => (None, false),
628        };
629
630        let variadic_data_buffers = match &data_type {
631            DataType::BinaryView | DataType::Utf8View => arrays
632                .iter()
633                .flat_map(|x| x.buffers().iter().skip(1))
634                .map(Buffer::clone)
635                .collect(),
636            _ => vec![],
637        };
638
639        let extend_nulls = build_extend_nulls(data_type);
640
641        let extend_null_bits = arrays
642            .iter()
643            .map(|array| build_extend_null_bits(array, use_nulls))
644            .collect();
645
646        let null_buffer = use_nulls.then(|| {
647            let null_bytes = bit_util::ceil(array_capacity, 8);
648            MutableBuffer::from_len_zeroed(null_bytes)
649        });
650
651        let extend_values = match &data_type {
652            DataType::Dictionary(_, _) => {
653                let mut next_offset = 0;
654                let extend_values: Result<Vec<_>, _> = arrays
655                    .iter()
656                    .map(|array| {
657                        let offset = next_offset;
658                        let dict_len = array.child_data()[0].len();
659
660                        if dict_concat {
661                            next_offset += dict_len;
662                        }
663
664                        build_extend_dictionary(array, offset, offset + dict_len)
665                            .ok_or(ArrowError::DictionaryKeyOverflowError)
666                    })
667                    .collect();
668
669                extend_values.expect("MutableArrayData::new is infallible")
670            }
671            DataType::BinaryView | DataType::Utf8View => {
672                let mut next_offset = 0u32;
673                arrays
674                    .iter()
675                    .map(|arr| {
676                        let num_data_buffers = (arr.buffers().len() - 1) as u32;
677                        let offset = next_offset;
678                        next_offset = next_offset
679                            .checked_add(num_data_buffers)
680                            .expect("view buffer index overflow");
681                        build_extend_view(arr, offset)
682                    })
683                    .collect()
684            }
685            _ => arrays.iter().map(|array| build_extend(array)).collect(),
686        };
687
688        let data = _MutableArrayData {
689            data_type: data_type.clone(),
690            len: 0,
691            null_count: 0,
692            null_buffer,
693            buffer1,
694            buffer2,
695            child_data,
696        };
697        Self {
698            arrays,
699            data,
700            dictionary,
701            variadic_data_buffers,
702            extend_values,
703            extend_null_bits,
704            extend_nulls,
705        }
706    }
707
708    /// Extends the in progress array with a region of the input arrays
709    ///
710    /// # Arguments
711    /// * `index` - the index of array that you what to copy values from
712    /// * `start` - the start index of the chunk (inclusive)
713    /// * `end` - the end index of the chunk (exclusive)
714    ///
715    /// # Panic
716    /// This function panics if there is an invalid index,
717    /// i.e. `index` >= the number of source arrays
718    /// or `end` > the length of the `index`th array
719    pub fn extend(&mut self, index: usize, start: usize, end: usize) {
720        let len = end - start;
721        (self.extend_null_bits[index])(&mut self.data, start, len);
722        (self.extend_values[index])(&mut self.data, index, start, len);
723        self.data.len += len;
724    }
725
726    /// Extends the in progress array with null elements, ignoring the input arrays.
727    ///
728    /// # Panics
729    ///
730    /// Panics if [`MutableArrayData`] not created with `use_nulls` or nullable source arrays
731    pub fn extend_nulls(&mut self, len: usize) {
732        self.data.len += len;
733        let bit_len = bit_util::ceil(self.data.len, 8);
734        let nulls = self.data.null_buffer();
735        nulls.resize(bit_len, 0);
736        self.data.null_count += len;
737        (self.extend_nulls)(&mut self.data, len);
738    }
739
    /// Returns the current length, i.e. the number of slots appended so far
    /// through [MutableArrayData::extend] and [MutableArrayData::extend_nulls].
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len
    }
745
746    /// Returns true if len is 0
747    #[inline]
748    pub fn is_empty(&self) -> bool {
749        self.data.len == 0
750    }
751
    /// Returns the current null count, i.e. the number of null slots
    /// appended so far.
    #[inline]
    pub fn null_count(&self) -> usize {
        self.data.null_count
    }
757
758    /// Creates a [ArrayData] from the in progress array, consuming `self`.
759    pub fn freeze(self) -> ArrayData {
760        unsafe { self.into_builder().build_unchecked() }
761    }
762
763    /// Consume self and returns the in progress array as [`ArrayDataBuilder`].
764    ///
765    /// This is useful for extending the default behavior of MutableArrayData.
766    pub fn into_builder(self) -> ArrayDataBuilder {
767        let data = self.data;
768
769        let buffers = match data.data_type {
770            DataType::Null | DataType::Struct(_) | DataType::FixedSizeList(_, _) => {
771                vec![]
772            }
773            DataType::BinaryView | DataType::Utf8View => {
774                let mut b = self.variadic_data_buffers;
775                b.insert(0, data.buffer1.into());
776                b
777            }
778            DataType::Utf8 | DataType::Binary | DataType::LargeUtf8 | DataType::LargeBinary => {
779                vec![data.buffer1.into(), data.buffer2.into()]
780            }
781            DataType::Union(_, mode) => {
782                match mode {
783                    // Based on Union's DataTypeLayout
784                    UnionMode::Sparse => vec![data.buffer1.into()],
785                    UnionMode::Dense => vec![data.buffer1.into(), data.buffer2.into()],
786                }
787            }
788            _ => vec![data.buffer1.into()],
789        };
790
791        let child_data = match data.data_type {
792            DataType::Dictionary(_, _) => vec![self.dictionary.unwrap()],
793            _ => data.child_data.into_iter().map(|x| x.freeze()).collect(),
794        };
795
796        let nulls = data
797            .null_buffer
798            .map(|nulls| {
799                let bools = BooleanBuffer::new(nulls.into(), 0, data.len);
800                unsafe { NullBuffer::new_unchecked(bools, data.null_count) }
801            })
802            .filter(|n| n.null_count() > 0);
803
804        ArrayDataBuilder::new(data.data_type)
805            .offset(0)
806            .len(data.len)
807            .nulls(nulls)
808            .buffers(buffers)
809            .child_data(child_data)
810    }
811}
812
813// See arrow/tests/array_transform.rs for tests of transform functionality
814
#[cfg(test)]
mod test {
    use super::*;
    use arrow_schema::Field;
    use std::sync::Arc;

    /// Explicit list capacities must reach both the buffer of the list itself
    /// and the buffer of its child.
    #[test]
    fn test_list_append_with_capacities() {
        let element = Arc::new(Field::new("element", DataType::Int64, false));
        let source = ArrayData::new_empty(&DataType::List(element));

        let child_capacity = Box::new(Capacities::Array(17));
        let capacities = Capacities::List(6, Some(child_capacity));
        let mutable = MutableArrayData::with_capacities(vec![&source], false, capacities);

        // capacities are rounded up to multiples of 64 by MutableBuffer
        let list_buffer = &mutable.data.buffer1;
        assert_eq!(list_buffer.capacity(), 64);
        // 17 Int64 slots need 136 bytes, which rounds up to 192
        let child_buffer = &mutable.data.child_data[0].data.buffer1;
        assert_eq!(child_buffer.capacity(), 192);
    }
}