arrow_data/
data.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains [`ArrayData`], a generic representation of Arrow array data which encapsulates
19//! common attributes and operations for Arrow array.
20
21use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24    bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::data::private::UnsafeFlag;
32use crate::{equal, validate_binary_view, validate_string_view};
33
34#[inline]
35pub(crate) fn contains_nulls(
36    null_bit_buffer: Option<&NullBuffer>,
37    offset: usize,
38    len: usize,
39) -> bool {
40    match null_bit_buffer {
41        Some(buffer) => {
42            match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
43                Some((start, end)) => start != 0 || end != len,
44                None => len != 0, // No non-null values
45            }
46        }
47        None => false, // No null buffer
48    }
49}
50
51#[inline]
52pub(crate) fn count_nulls(
53    null_bit_buffer: Option<&NullBuffer>,
54    offset: usize,
55    len: usize,
56) -> usize {
57    if let Some(buf) = null_bit_buffer {
58        let buffer = buf.buffer();
59        len - buffer.count_set_bits_offset(offset + buf.offset(), len)
60    } else {
61        0
62    }
63}
64
65/// creates 2 [`MutableBuffer`]s with a given `capacity` (in slots).
66#[inline]
67pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
68    let empty_buffer = MutableBuffer::new(0);
69    match data_type {
70        DataType::Null => [empty_buffer, MutableBuffer::new(0)],
71        DataType::Boolean => {
72            let bytes = bit_util::ceil(capacity, 8);
73            let buffer = MutableBuffer::new(bytes);
74            [buffer, empty_buffer]
75        }
76        DataType::UInt8
77        | DataType::UInt16
78        | DataType::UInt32
79        | DataType::UInt64
80        | DataType::Int8
81        | DataType::Int16
82        | DataType::Int32
83        | DataType::Int64
84        | DataType::Float16
85        | DataType::Float32
86        | DataType::Float64
87        | DataType::Decimal128(_, _)
88        | DataType::Decimal256(_, _)
89        | DataType::Date32
90        | DataType::Time32(_)
91        | DataType::Date64
92        | DataType::Time64(_)
93        | DataType::Duration(_)
94        | DataType::Timestamp(_, _)
95        | DataType::Interval(_) => [
96            MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
97            empty_buffer,
98        ],
99        DataType::Utf8 | DataType::Binary => {
100            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
101            // safety: `unsafe` code assumes that this buffer is initialized with one element
102            buffer.push(0i32);
103            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
104        }
105        DataType::LargeUtf8 | DataType::LargeBinary => {
106            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
107            // safety: `unsafe` code assumes that this buffer is initialized with one element
108            buffer.push(0i64);
109            [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
110        }
111        DataType::BinaryView | DataType::Utf8View => [
112            MutableBuffer::new(capacity * mem::size_of::<u128>()),
113            empty_buffer,
114        ],
115        DataType::List(_) | DataType::Map(_, _) => {
116            // offset buffer always starts with a zero
117            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
118            buffer.push(0i32);
119            [buffer, empty_buffer]
120        }
121        DataType::ListView(_) => [
122            MutableBuffer::new(capacity * mem::size_of::<i32>()),
123            MutableBuffer::new(capacity * mem::size_of::<i32>()),
124        ],
125        DataType::LargeList(_) => {
126            // offset buffer always starts with a zero
127            let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
128            buffer.push(0i64);
129            [buffer, empty_buffer]
130        }
131        DataType::LargeListView(_) => [
132            MutableBuffer::new(capacity * mem::size_of::<i64>()),
133            MutableBuffer::new(capacity * mem::size_of::<i64>()),
134        ],
135        DataType::FixedSizeBinary(size) => {
136            [MutableBuffer::new(capacity * *size as usize), empty_buffer]
137        }
138        DataType::Dictionary(k, _) => [
139            MutableBuffer::new(capacity * k.primitive_width().unwrap()),
140            empty_buffer,
141        ],
142        DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
143            [empty_buffer, MutableBuffer::new(0)]
144        }
145        DataType::Union(_, mode) => {
146            let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
147            match mode {
148                UnionMode::Sparse => [type_ids, empty_buffer],
149                UnionMode::Dense => {
150                    let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
151                    [type_ids, offsets]
152                }
153            }
154        }
155    }
156}
157
/// A generic representation of Arrow array data which encapsulates common attributes
/// and operations for Arrow array.
///
/// Specific operations for different arrays types (e.g., primitive, list, struct)
/// are implemented in `Array`.
///
/// # Memory Layout
///
/// `ArrayData` has references to one or more underlying data buffers
/// and optional child ArrayData, depending on type as illustrated
/// below. Bitmaps are not shown for simplicity but they are stored
/// similarly to the buffers.
///
/// ```text
///                        offset
///                       points to
/// ┌───────────────────┐ start of  ┌───────┐       Different
/// │                   │   data    │       │     ArrayData may
/// │ArrayData {        │           │....   │     also refer to
/// │  data_type: ...   │   ─ ─ ─ ─▶│1234   │  ┌ ─  the same
/// │  offset: ... ─ ─ ─│─ ┘        │4372   │      underlying
/// │  len: ...    ─ ─ ─│─ ┐        │4888   │  │     buffer with different offset/len
/// │  buffers: [       │           │5882   │◀─
/// │    ...            │  │        │4323   │
/// │  ]                │   ─ ─ ─ ─▶│4859   │
/// │  child_data: [    │           │....   │
/// │    ...            │           │       │
/// │  ]                │           └───────┘
/// │}                  │
/// │                   │            Shared Buffer uses
/// │               │   │            bytes::Bytes to hold
/// └───────────────────┘            actual data values
///           ┌ ─ ─ ┘
///
///           ▼
/// ┌───────────────────┐
/// │ArrayData {        │
/// │  ...              │
/// │}                  │
/// │                   │
/// └───────────────────┘
///
/// Child ArrayData may also have its own buffers and children
/// ```
#[derive(Debug, Clone)]
pub struct ArrayData {
    /// The data type for this array data
    data_type: DataType,

    /// The number of elements in this array data
    len: usize,

    /// The offset into this array data, in number of items
    offset: usize,

    /// The buffers for this array data. Note that depending on the array types, this
    /// could hold different kinds of buffers (e.g., value buffer, value offset buffer)
    /// at different positions.
    buffers: Vec<Buffer>,

    /// The child(ren) of this array. Only non-empty for types that store data in
    /// child arrays, e.g. lists, maps, structs, unions, dictionaries and
    /// run-end encoded arrays.
    child_data: Vec<ArrayData>,

    /// The null bitmap. A `None` value for this indicates all values are non-null in
    /// this array.
    nulls: Option<NullBuffer>,
}
227
/// A thread-safe, shared reference to the Arrow array data.
///
/// This is an [`Arc`] around [`ArrayData`], so cloning it only bumps a
/// reference count and does not copy the array data.
pub type ArrayDataRef = Arc<ArrayData>;
230
231impl ArrayData {
232    /// Create a new ArrayData instance;
233    ///
234    /// If `null_count` is not specified, the number of nulls in
235    /// null_bit_buffer is calculated.
236    ///
237    /// If the number of nulls is 0 then the null_bit_buffer
238    /// is set to `None`.
239    ///
240    /// # Safety
241    ///
242    /// The input values *must* form a valid Arrow array for
243    /// `data_type`, or undefined behavior can result.
244    ///
245    /// Note: This is a low level API and most users of the arrow
246    /// crate should create arrays using the methods in the `array`
247    /// module.
248    pub unsafe fn new_unchecked(
249        data_type: DataType,
250        len: usize,
251        null_count: Option<usize>,
252        null_bit_buffer: Option<Buffer>,
253        offset: usize,
254        buffers: Vec<Buffer>,
255        child_data: Vec<ArrayData>,
256    ) -> Self {
257        let mut skip_validation = UnsafeFlag::new();
258        // SAFETY: caller responsible for ensuring data is valid
259        skip_validation.set(true);
260
261        ArrayDataBuilder {
262            data_type,
263            len,
264            null_count,
265            null_bit_buffer,
266            nulls: None,
267            offset,
268            buffers,
269            child_data,
270            align_buffers: false,
271            skip_validation,
272        }
273        .build()
274        .unwrap()
275    }
276
277    /// Create a new ArrayData, validating that the provided buffers form a valid
278    /// Arrow array of the specified data type.
279    ///
280    /// If the number of nulls in `null_bit_buffer` is 0 then the null_bit_buffer
281    /// is set to `None`.
282    ///
283    /// Internally this calls through to [`Self::validate_data`]
284    ///
285    /// Note: This is a low level API and most users of the arrow crate should create
286    /// arrays using the builders found in [arrow_array](https://docs.rs/arrow-array)
287    pub fn try_new(
288        data_type: DataType,
289        len: usize,
290        null_bit_buffer: Option<Buffer>,
291        offset: usize,
292        buffers: Vec<Buffer>,
293        child_data: Vec<ArrayData>,
294    ) -> Result<Self, ArrowError> {
295        // we must check the length of `null_bit_buffer` first
296        // because we use this buffer to calculate `null_count`
297        // in `Self::new_unchecked`.
298        if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
299            let needed_len = bit_util::ceil(len + offset, 8);
300            if null_bit_buffer.len() < needed_len {
301                return Err(ArrowError::InvalidArgumentError(format!(
302                    "null_bit_buffer size too small. got {} needed {}",
303                    null_bit_buffer.len(),
304                    needed_len
305                )));
306            }
307        }
308        // Safety justification: `validate_full` is called below
309        let new_self = unsafe {
310            Self::new_unchecked(
311                data_type,
312                len,
313                None,
314                null_bit_buffer,
315                offset,
316                buffers,
317                child_data,
318            )
319        };
320
321        // As the data is not trusted, do a full validation of its contents
322        // We don't need to validate children as we can assume that the
323        // [`ArrayData`] in `child_data` have already been validated through
324        // a call to `ArrayData::try_new` or created using unsafe
325        new_self.validate_data()?;
326        Ok(new_self)
327    }
328
    /// Returns an [`ArrayDataBuilder`] for constructing an [`ArrayData`] of the
    /// given `data_type`.
    ///
    /// Shorthand for [`ArrayDataBuilder::new`].
    #[inline]
    pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
        ArrayDataBuilder::new(data_type)
    }
334
    /// Returns a reference to the [`DataType`] of this [`ArrayData`]
    #[inline]
    pub const fn data_type(&self) -> &DataType {
        &self.data_type
    }
340
    /// Returns the [`Buffer`]s storing data for this [`ArrayData`]
    ///
    /// The null bitmap is not part of this slice; see [`Self::nulls`]. Which
    /// kind of buffer sits at which position depends on the data type.
    pub fn buffers(&self) -> &[Buffer] {
        &self.buffers
    }
345
346    /// Returns a slice of children [`ArrayData`]. This will be non
347    /// empty for type such as lists and structs.
348    pub fn child_data(&self) -> &[ArrayData] {
349        &self.child_data[..]
350    }
351
352    /// Returns whether the element at index `i` is null
353    #[inline]
354    pub fn is_null(&self, i: usize) -> bool {
355        match &self.nulls {
356            Some(v) => v.is_null(i),
357            None => false,
358        }
359    }
360
    /// Returns a reference to the null buffer of this [`ArrayData`] if any
    ///
    /// Note: [`ArrayData::offset`] does NOT apply to the returned [`NullBuffer`].
    /// [`Self::slice`] slices the null buffer along with the array, so
    /// position `i` of the returned buffer describes element `i` of this array.
    #[inline]
    pub fn nulls(&self) -> Option<&NullBuffer> {
        self.nulls.as_ref()
    }
368
    /// Returns whether the element at index `i` is not null
    ///
    /// This is the exact negation of [`Self::is_null`].
    #[inline]
    pub fn is_valid(&self, i: usize) -> bool {
        !self.is_null(i)
    }
374
    /// Returns the length (i.e., number of elements) of this [`ArrayData`].
    ///
    /// The length does not include the elements skipped by [`Self::offset`].
    #[inline]
    pub const fn len(&self) -> usize {
        self.len
    }
380
    /// Returns whether this [`ArrayData`] is empty, i.e. whether [`Self::len`] is `0`
    #[inline]
    pub const fn is_empty(&self) -> bool {
        self.len == 0
    }
386
    /// Returns the offset of this [`ArrayData`], in number of elements
    #[inline]
    pub const fn offset(&self) -> usize {
        self.offset
    }
392
393    /// Returns the total number of nulls in this array
394    #[inline]
395    pub fn null_count(&self) -> usize {
396        self.nulls
397            .as_ref()
398            .map(|x| x.null_count())
399            .unwrap_or_default()
400    }
401
402    /// Returns the total number of bytes of memory occupied by the
403    /// buffers owned by this [`ArrayData`] and all of its
404    /// children. (See also diagram on [`ArrayData`]).
405    ///
406    /// Note that this [`ArrayData`] may only refer to a subset of the
407    /// data in the underlying [`Buffer`]s (due to `offset` and
408    /// `length`), but the size returned includes the entire size of
409    /// the buffers.
410    ///
411    /// If multiple [`ArrayData`]s refer to the same underlying
412    /// [`Buffer`]s they will both report the same size.
413    pub fn get_buffer_memory_size(&self) -> usize {
414        let mut size = 0;
415        for buffer in &self.buffers {
416            size += buffer.capacity();
417        }
418        if let Some(bitmap) = &self.nulls {
419            size += bitmap.buffer().capacity()
420        }
421        for child in &self.child_data {
422            size += child.get_buffer_memory_size();
423        }
424        size
425    }
426
427    /// Returns the total number of the bytes of memory occupied by
428    /// the buffers by this slice of [`ArrayData`] (See also diagram on [`ArrayData`]).
429    ///
430    /// This is approximately the number of bytes if a new
431    /// [`ArrayData`] was formed by creating new [`Buffer`]s with
432    /// exactly the data needed.
433    ///
434    /// For example, a [`DataType::Int64`] with `100` elements,
435    /// [`Self::get_slice_memory_size`] would return `100 * 8 = 800`. If
436    /// the [`ArrayData`] was then [`Self::slice`]ed to refer to its
437    /// first `20` elements, then [`Self::get_slice_memory_size`] on the
438    /// sliced [`ArrayData`] would return `20 * 8 = 160`.
439    pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
440        let mut result: usize = 0;
441        let layout = layout(&self.data_type);
442
443        for spec in layout.buffers.iter() {
444            match spec {
445                BufferSpec::FixedWidth { byte_width, .. } => {
446                    let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
447                        ArrowError::ComputeError(
448                            "Integer overflow computing buffer size".to_string(),
449                        )
450                    })?;
451                    result += buffer_size;
452                }
453                BufferSpec::VariableWidth => {
454                    let buffer_len: usize;
455                    match self.data_type {
456                        DataType::Utf8 | DataType::Binary => {
457                            let offsets = self.typed_offsets::<i32>()?;
458                            buffer_len = (offsets[self.len] - offsets[0] ) as usize;
459                        }
460                        DataType::LargeUtf8 | DataType::LargeBinary => {
461                            let offsets = self.typed_offsets::<i64>()?;
462                            buffer_len = (offsets[self.len] - offsets[0]) as usize;
463                        }
464                        _ => {
465                            return Err(ArrowError::NotYetImplemented(format!(
466                            "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
467                            self.data_type
468                            )))
469                        }
470                    };
471                    result += buffer_len;
472                }
473                BufferSpec::BitMap => {
474                    let buffer_size = bit_util::ceil(self.len, 8);
475                    result += buffer_size;
476                }
477                BufferSpec::AlwaysNull => {
478                    // Nothing to do
479                }
480            }
481        }
482
483        if self.nulls().is_some() {
484            result += bit_util::ceil(self.len, 8);
485        }
486
487        for child in &self.child_data {
488            result += child.get_slice_memory_size()?;
489        }
490        Ok(result)
491    }
492
493    /// Returns the total number of bytes of memory occupied
494    /// physically by this [`ArrayData`] and all its [`Buffer`]s and
495    /// children. (See also diagram on [`ArrayData`]).
496    ///
497    /// Equivalent to:
498    ///  `size_of_val(self)` +
499    ///  [`Self::get_buffer_memory_size`] +
500    ///  `size_of_val(child)` for all children
501    pub fn get_array_memory_size(&self) -> usize {
502        let mut size = mem::size_of_val(self);
503
504        // Calculate rest of the fields top down which contain actual data
505        for buffer in &self.buffers {
506            size += mem::size_of::<Buffer>();
507            size += buffer.capacity();
508        }
509        if let Some(nulls) = &self.nulls {
510            size += nulls.buffer().capacity();
511        }
512        for child in &self.child_data {
513            size += child.get_array_memory_size();
514        }
515
516        size
517    }
518
519    /// Creates a zero-copy slice of itself. This creates a new
520    /// [`ArrayData`] pointing at the same underlying [`Buffer`]s with a
521    /// different offset and len
522    ///
523    /// # Panics
524    ///
525    /// Panics if `offset + length > self.len()`.
526    pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
527        assert!((offset + length) <= self.len());
528
529        if let DataType::Struct(_) = self.data_type() {
530            // Slice into children
531            let new_offset = self.offset + offset;
532            let new_data = ArrayData {
533                data_type: self.data_type().clone(),
534                len: length,
535                offset: new_offset,
536                buffers: self.buffers.clone(),
537                // Slice child data, to propagate offsets down to them
538                child_data: self
539                    .child_data()
540                    .iter()
541                    .map(|data| data.slice(offset, length))
542                    .collect(),
543                nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
544            };
545
546            new_data
547        } else {
548            let mut new_data = self.clone();
549
550            new_data.len = length;
551            new_data.offset = offset + self.offset;
552            new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
553
554            new_data
555        }
556    }
557
558    /// Returns the `buffer` as a slice of type `T` starting at self.offset
559    /// # Panics
560    /// This function panics if:
561    /// * the buffer is not byte-aligned with type T, or
562    /// * the datatype is `Boolean` (it corresponds to a bit-packed buffer where the offset is not applicable)
563    pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
564        &self.buffers()[buffer].typed_data()[self.offset..]
565    }
566
567    /// Returns a new [`ArrayData`] valid for `data_type` containing `len` null values
568    pub fn new_null(data_type: &DataType, len: usize) -> Self {
569        let bit_len = bit_util::ceil(len, 8);
570        let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
571
572        let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
573            Some(width) => (vec![zeroed(width * len)], vec![], true),
574            None => match data_type {
575                DataType::Null => (vec![], vec![], false),
576                DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
577                DataType::Binary | DataType::Utf8 => {
578                    (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
579                }
580                DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
581                DataType::LargeBinary | DataType::LargeUtf8 => {
582                    (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
583                }
584                DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
585                DataType::List(f) | DataType::Map(f, _) => (
586                    vec![zeroed((len + 1) * 4)],
587                    vec![ArrayData::new_empty(f.data_type())],
588                    true,
589                ),
590                DataType::LargeList(f) => (
591                    vec![zeroed((len + 1) * 8)],
592                    vec![ArrayData::new_empty(f.data_type())],
593                    true,
594                ),
595                DataType::FixedSizeList(f, list_len) => (
596                    vec![],
597                    vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
598                    true,
599                ),
600                DataType::Struct(fields) => (
601                    vec![],
602                    fields
603                        .iter()
604                        .map(|f| Self::new_null(f.data_type(), len))
605                        .collect(),
606                    true,
607                ),
608                DataType::Dictionary(k, v) => (
609                    vec![zeroed(k.primitive_width().unwrap() * len)],
610                    vec![ArrayData::new_empty(v.as_ref())],
611                    true,
612                ),
613                DataType::Union(f, mode) => {
614                    let (id, _) = f.iter().next().unwrap();
615                    let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
616                    let buffers = match mode {
617                        UnionMode::Sparse => vec![ids],
618                        UnionMode::Dense => {
619                            let end_offset = i32::from_usize(len).unwrap();
620                            vec![ids, Buffer::from_iter(0_i32..end_offset)]
621                        }
622                    };
623
624                    let children = f
625                        .iter()
626                        .enumerate()
627                        .map(|(idx, (_, f))| {
628                            if idx == 0 || *mode == UnionMode::Sparse {
629                                Self::new_null(f.data_type(), len)
630                            } else {
631                                Self::new_empty(f.data_type())
632                            }
633                        })
634                        .collect();
635
636                    (buffers, children, false)
637                }
638                DataType::RunEndEncoded(r, v) => {
639                    let runs = match r.data_type() {
640                        DataType::Int16 => {
641                            let i = i16::from_usize(len).expect("run overflow");
642                            Buffer::from_slice_ref([i])
643                        }
644                        DataType::Int32 => {
645                            let i = i32::from_usize(len).expect("run overflow");
646                            Buffer::from_slice_ref([i])
647                        }
648                        DataType::Int64 => {
649                            let i = i64::from_usize(len).expect("run overflow");
650                            Buffer::from_slice_ref([i])
651                        }
652                        dt => unreachable!("Invalid run ends data type {dt}"),
653                    };
654
655                    let builder = ArrayData::builder(r.data_type().clone())
656                        .len(1)
657                        .buffers(vec![runs]);
658
659                    // SAFETY:
660                    // Valid by construction
661                    let runs = unsafe { builder.build_unchecked() };
662                    (
663                        vec![],
664                        vec![runs, ArrayData::new_null(v.data_type(), 1)],
665                        false,
666                    )
667                }
668                d => unreachable!("{d}"),
669            },
670        };
671
672        let mut builder = ArrayDataBuilder::new(data_type.clone())
673            .len(len)
674            .buffers(buffers)
675            .child_data(child_data);
676
677        if has_nulls {
678            builder = builder.nulls(Some(NullBuffer::new_null(len)))
679        }
680
681        // SAFETY:
682        // Data valid by construction
683        unsafe { builder.build_unchecked() }
684    }
685
    /// Returns a new empty [ArrayData] valid for `data_type`.
    ///
    /// Equivalent to [`Self::new_null`] with a length of `0`.
    pub fn new_empty(data_type: &DataType) -> Self {
        Self::new_null(data_type, 0)
    }
690
691    /// Verifies that the buffers meet the minimum alignment requirements for the data type
692    ///
693    /// Buffers that are not adequately aligned will be copied to a new aligned allocation
694    ///
695    /// This can be useful for when interacting with data sent over IPC or FFI, that may
696    /// not meet the minimum alignment requirements
697    ///
698    /// This also aligns buffers of children data
699    pub fn align_buffers(&mut self) {
700        let layout = layout(&self.data_type);
701        for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
702            if let BufferSpec::FixedWidth { alignment, .. } = spec {
703                if buffer.as_ptr().align_offset(*alignment) != 0 {
704                    *buffer = Buffer::from_slice_ref(buffer.as_ref());
705                }
706            }
707        }
708        // align children data recursively
709        for data in self.child_data.iter_mut() {
710            data.align_buffers()
711        }
712    }
713
714    /// "cheap" validation of an `ArrayData`. Ensures buffers are
715    /// sufficiently sized to store `len` + `offset` total elements of
716    /// `data_type` and performs other inexpensive consistency checks.
717    ///
718    /// This check is "cheap" in the sense that it does not validate the
719    /// contents of the buffers (e.g. that all offsets for UTF8 arrays
720    /// are within the bounds of the values buffer).
721    ///
722    /// See [ArrayData::validate_data] to validate fully the offset content
723    /// and the validity of utf8 data
724    pub fn validate(&self) -> Result<(), ArrowError> {
        // Need at least this much space in each buffer
726        let len_plus_offset = self.len + self.offset;
727
728        // Check that the data layout conforms to the spec
729        let layout = layout(&self.data_type);
730
731        if !layout.can_contain_null_mask && self.nulls.is_some() {
732            return Err(ArrowError::InvalidArgumentError(format!(
733                "Arrays of type {:?} cannot contain a null bitmask",
734                self.data_type,
735            )));
736        }
737
738        // Check data buffers length for view types and other types
739        if self.buffers.len() < layout.buffers.len()
740            || (!layout.variadic && self.buffers.len() != layout.buffers.len())
741        {
742            return Err(ArrowError::InvalidArgumentError(format!(
743                "Expected {} buffers in array of type {:?}, got {}",
744                layout.buffers.len(),
745                self.data_type,
746                self.buffers.len(),
747            )));
748        }
749
750        for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
751            match spec {
752                BufferSpec::FixedWidth {
753                    byte_width,
754                    alignment,
755                } => {
756                    let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
757
758                    if buffer.len() < min_buffer_size {
759                        return Err(ArrowError::InvalidArgumentError(format!(
760                            "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
761                            min_buffer_size, i, self.data_type, buffer.len()
762                        )));
763                    }
764
765                    let align_offset = buffer.as_ptr().align_offset(*alignment);
766                    if align_offset != 0 {
767                        return Err(ArrowError::InvalidArgumentError(format!(
768                            "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
769                            self.data_type, align_offset.min(alignment - align_offset)
770                        )));
771                    }
772                }
773                BufferSpec::VariableWidth => {
774                    // not cheap to validate (need to look at the
775                    // data). Partially checked in validate_offsets
776                    // called below. Can check with `validate_full`
777                }
778                BufferSpec::BitMap => {
779                    let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
780                    if buffer.len() < min_buffer_size {
781                        return Err(ArrowError::InvalidArgumentError(format!(
782                            "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
783                            min_buffer_size, i, self.data_type, buffer.len()
784                        )));
785                    }
786                }
787                BufferSpec::AlwaysNull => {
788                    // Nothing to validate
789                }
790            }
791        }
792
793        // check null bit buffer size
794        if let Some(nulls) = self.nulls() {
795            if nulls.null_count() > self.len {
796                return Err(ArrowError::InvalidArgumentError(format!(
797                    "null_count {} for an array exceeds length of {} elements",
798                    nulls.null_count(),
799                    self.len
800                )));
801            }
802
803            let actual_len = nulls.validity().len();
804            let needed_len = bit_util::ceil(len_plus_offset, 8);
805            if actual_len < needed_len {
806                return Err(ArrowError::InvalidArgumentError(format!(
807                    "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
808                )));
809            }
810
811            if nulls.len() != self.len {
812                return Err(ArrowError::InvalidArgumentError(format!(
813                    "null buffer incorrect size. got {} expected {}",
814                    nulls.len(),
815                    self.len
816                )));
817            }
818        }
819
820        self.validate_child_data()?;
821
822        // Additional Type specific checks
823        match &self.data_type {
824            DataType::Utf8 | DataType::Binary => {
825                self.validate_offsets::<i32>(self.buffers[1].len())?;
826            }
827            DataType::LargeUtf8 | DataType::LargeBinary => {
828                self.validate_offsets::<i64>(self.buffers[1].len())?;
829            }
830            DataType::Dictionary(key_type, _value_type) => {
831                // At the moment, constructing a DictionaryArray will also check this
832                if !DataType::is_dictionary_key_type(key_type) {
833                    return Err(ArrowError::InvalidArgumentError(format!(
834                        "Dictionary key type must be integer, but was {key_type}"
835                    )));
836                }
837            }
838            DataType::RunEndEncoded(run_ends_type, _) => {
839                if run_ends_type.is_nullable() {
840                    return Err(ArrowError::InvalidArgumentError(
841                        "The nullable should be set to false for the field defining run_ends array.".to_string()
842                    ));
843                }
844                if !DataType::is_run_ends_type(run_ends_type.data_type()) {
845                    return Err(ArrowError::InvalidArgumentError(format!(
846                        "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
847                        run_ends_type.data_type()
848                    )));
849                }
850            }
851            _ => {}
852        };
853
854        Ok(())
855    }
856
857    /// Returns a reference to the data in `buffer` as a typed slice
858    /// (typically `&[i32]` or `&[i64]`) after validating. The
859    /// returned slice is guaranteed to have at least `self.len + 1`
860    /// entries.
861    ///
862    /// For an empty array, the `buffer` can also be empty.
863    fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
864        // An empty list-like array can have 0 offsets
865        if self.len == 0 && self.buffers[0].is_empty() {
866            return Ok(&[]);
867        }
868
869        self.typed_buffer(0, self.len + 1)
870    }
871
872    /// Returns a reference to the data in `buffers[idx]` as a typed slice after validating
873    fn typed_buffer<T: ArrowNativeType + num::Num>(
874        &self,
875        idx: usize,
876        len: usize,
877    ) -> Result<&[T], ArrowError> {
878        let buffer = &self.buffers[idx];
879
880        let required_len = (len + self.offset) * mem::size_of::<T>();
881
882        if buffer.len() < required_len {
883            return Err(ArrowError::InvalidArgumentError(format!(
884                "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
885                idx,
886                self.data_type,
887                required_len,
888                buffer.len()
889            )));
890        }
891
892        Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
893    }
894
895    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
896    /// offsets (of type T) into some other buffer of `values_length` bytes long
897    fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
898        &self,
899        values_length: usize,
900    ) -> Result<(), ArrowError> {
901        // Justification: buffer size was validated above
902        let offsets = self.typed_offsets::<T>()?;
903        if offsets.is_empty() {
904            return Ok(());
905        }
906
907        let first_offset = offsets[0].to_usize().ok_or_else(|| {
908            ArrowError::InvalidArgumentError(format!(
909                "Error converting offset[0] ({}) to usize for {}",
910                offsets[0], self.data_type
911            ))
912        })?;
913
914        let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
915            ArrowError::InvalidArgumentError(format!(
916                "Error converting offset[{}] ({}) to usize for {}",
917                self.len, offsets[self.len], self.data_type
918            ))
919        })?;
920
921        if first_offset > values_length {
922            return Err(ArrowError::InvalidArgumentError(format!(
923                "First offset {} of {} is larger than values length {}",
924                first_offset, self.data_type, values_length,
925            )));
926        }
927
928        if last_offset > values_length {
929            return Err(ArrowError::InvalidArgumentError(format!(
930                "Last offset {} of {} is larger than values length {}",
931                last_offset, self.data_type, values_length,
932            )));
933        }
934
935        if first_offset > last_offset {
936            return Err(ArrowError::InvalidArgumentError(format!(
937                "First offset {} in {} is smaller than last offset {}",
938                first_offset, self.data_type, last_offset,
939            )));
940        }
941
942        Ok(())
943    }
944
945    /// Does a cheap sanity check that the `self.len` values in `buffer` are valid
946    /// offsets and sizes (of type T) into some other buffer of `values_length` bytes long
947    fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
948        &self,
949        values_length: usize,
950    ) -> Result<(), ArrowError> {
951        let offsets: &[T] = self.typed_buffer(0, self.len)?;
952        let sizes: &[T] = self.typed_buffer(1, self.len)?;
953        for i in 0..values_length {
954            let size = sizes[i].to_usize().ok_or_else(|| {
955                ArrowError::InvalidArgumentError(format!(
956                    "Error converting size[{}] ({}) to usize for {}",
957                    i, sizes[i], self.data_type
958                ))
959            })?;
960            let offset = offsets[i].to_usize().ok_or_else(|| {
961                ArrowError::InvalidArgumentError(format!(
962                    "Error converting offset[{}] ({}) to usize for {}",
963                    i, offsets[i], self.data_type
964                ))
965            })?;
966            if size
967                .checked_add(offset)
968                .expect("Offset and size have exceeded the usize boundary")
969                > values_length
970            {
971                return Err(ArrowError::InvalidArgumentError(format!(
972                    "Size {} at index {} is larger than the remaining values for {}",
973                    size, i, self.data_type
974                )));
975            }
976        }
977        Ok(())
978    }
979
980    /// Validates the layout of `child_data` ArrayData structures
981    fn validate_child_data(&self) -> Result<(), ArrowError> {
982        match &self.data_type {
983            DataType::List(field) | DataType::Map(field, _) => {
984                let values_data = self.get_single_valid_child_data(field.data_type())?;
985                self.validate_offsets::<i32>(values_data.len)?;
986                Ok(())
987            }
988            DataType::LargeList(field) => {
989                let values_data = self.get_single_valid_child_data(field.data_type())?;
990                self.validate_offsets::<i64>(values_data.len)?;
991                Ok(())
992            }
993            DataType::ListView(field) => {
994                let values_data = self.get_single_valid_child_data(field.data_type())?;
995                self.validate_offsets_and_sizes::<i32>(values_data.len)?;
996                Ok(())
997            }
998            DataType::LargeListView(field) => {
999                let values_data = self.get_single_valid_child_data(field.data_type())?;
1000                self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1001                Ok(())
1002            }
1003            DataType::FixedSizeList(field, list_size) => {
1004                let values_data = self.get_single_valid_child_data(field.data_type())?;
1005
1006                let list_size: usize = (*list_size).try_into().map_err(|_| {
1007                    ArrowError::InvalidArgumentError(format!(
1008                        "{} has a negative list_size {}",
1009                        self.data_type, list_size
1010                    ))
1011                })?;
1012
1013                let expected_values_len = self.len
1014                    .checked_mul(list_size)
1015                    .expect("integer overflow computing expected number of expected values in FixedListSize");
1016
1017                if values_data.len < expected_values_len {
1018                    return Err(ArrowError::InvalidArgumentError(format!(
1019                        "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1020                        values_data.len, list_size, list_size, self.data_type
1021                    )));
1022                }
1023
1024                Ok(())
1025            }
1026            DataType::Struct(fields) => {
1027                self.validate_num_child_data(fields.len())?;
1028                for (i, field) in fields.iter().enumerate() {
1029                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1030
1031                    // Ensure child field has sufficient size
1032                    if field_data.len < self.len {
1033                        return Err(ArrowError::InvalidArgumentError(format!(
1034                            "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1035                            self.data_type, i, field.name(), field_data.len, self.len
1036                        )));
1037                    }
1038                }
1039                Ok(())
1040            }
1041            DataType::RunEndEncoded(run_ends_field, values_field) => {
1042                self.validate_num_child_data(2)?;
1043                let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1044                let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1045                if run_ends_data.len != values_data.len {
1046                    return Err(ArrowError::InvalidArgumentError(format!(
1047                        "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1048                        run_ends_data.len, values_data.len
1049                    )));
1050                }
1051                if run_ends_data.nulls.is_some() {
1052                    return Err(ArrowError::InvalidArgumentError(
1053                        "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1054                    ));
1055                }
1056                Ok(())
1057            }
1058            DataType::Union(fields, mode) => {
1059                self.validate_num_child_data(fields.len())?;
1060
1061                for (i, (_, field)) in fields.iter().enumerate() {
1062                    let field_data = self.get_valid_child_data(i, field.data_type())?;
1063
1064                    if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1065                        return Err(ArrowError::InvalidArgumentError(format!(
1066                            "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1067                            i, field_data.len, self.len + self.offset
1068                        )));
1069                    }
1070                }
1071                Ok(())
1072            }
1073            DataType::Dictionary(_key_type, value_type) => {
1074                self.get_single_valid_child_data(value_type)?;
1075                Ok(())
1076            }
1077            _ => {
1078                // other types do not have child data
1079                if !self.child_data.is_empty() {
1080                    return Err(ArrowError::InvalidArgumentError(format!(
1081                        "Expected no child arrays for type {} but got {}",
1082                        self.data_type,
1083                        self.child_data.len()
1084                    )));
1085                }
1086                Ok(())
1087            }
1088        }
1089    }
1090
1091    /// Ensures that this array data has a single child_data with the
1092    /// expected type, and calls `validate()` on it. Returns a
1093    /// reference to that child_data
1094    fn get_single_valid_child_data(
1095        &self,
1096        expected_type: &DataType,
1097    ) -> Result<&ArrayData, ArrowError> {
1098        self.validate_num_child_data(1)?;
1099        self.get_valid_child_data(0, expected_type)
1100    }
1101
1102    /// Returns `Err` if self.child_data does not have exactly `expected_len` elements
1103    fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1104        if self.child_data.len() != expected_len {
1105            Err(ArrowError::InvalidArgumentError(format!(
1106                "Value data for {} should contain {} child data array(s), had {}",
1107                self.data_type,
1108                expected_len,
1109                self.child_data.len()
1110            )))
1111        } else {
1112            Ok(())
1113        }
1114    }
1115
1116    /// Ensures that `child_data[i]` has the expected type, calls
1117    /// `validate()` on it, and returns a reference to that child_data
1118    fn get_valid_child_data(
1119        &self,
1120        i: usize,
1121        expected_type: &DataType,
1122    ) -> Result<&ArrayData, ArrowError> {
1123        let values_data = self.child_data.get(i).ok_or_else(|| {
1124            ArrowError::InvalidArgumentError(format!(
1125                "{} did not have enough child arrays. Expected at least {} but had only {}",
1126                self.data_type,
1127                i + 1,
1128                self.child_data.len()
1129            ))
1130        })?;
1131
1132        if expected_type != &values_data.data_type {
1133            return Err(ArrowError::InvalidArgumentError(format!(
1134                "Child type mismatch for {}. Expected {} but child data had {}",
1135                self.data_type, expected_type, values_data.data_type
1136            )));
1137        }
1138
1139        values_data.validate()?;
1140        Ok(values_data)
1141    }
1142
1143    /// Validate that the data contained within this [`ArrayData`] is valid
1144    ///
1145    /// 1. Null count is correct
1146    /// 2. All offsets are valid
1147    /// 3. All String data is valid UTF-8
1148    /// 4. All dictionary offsets are valid
1149    ///
1150    /// Internally this calls:
1151    ///
1152    /// * [`Self::validate`]
1153    /// * [`Self::validate_nulls`]
1154    /// * [`Self::validate_values`]
1155    ///
1156    /// Note: this does not recurse into children, for a recursive variant
1157    /// see [`Self::validate_full`]
1158    pub fn validate_data(&self) -> Result<(), ArrowError> {
1159        self.validate()?;
1160
1161        self.validate_nulls()?;
1162        self.validate_values()?;
1163        Ok(())
1164    }
1165
1166    /// Performs a full recursive validation of this [`ArrayData`] and all its children
1167    ///
1168    /// This is equivalent to calling [`Self::validate_data`] on this [`ArrayData`]
1169    /// and all its children recursively
1170    pub fn validate_full(&self) -> Result<(), ArrowError> {
1171        self.validate_data()?;
1172        // validate all children recursively
1173        self.child_data
1174            .iter()
1175            .enumerate()
1176            .try_for_each(|(i, child_data)| {
1177                child_data.validate_full().map_err(|e| {
1178                    ArrowError::InvalidArgumentError(format!(
1179                        "{} child #{} invalid: {}",
1180                        self.data_type, i, e
1181                    ))
1182                })
1183            })?;
1184        Ok(())
1185    }
1186
1187    /// Validates the values stored within this [`ArrayData`] are valid
1188    /// without recursing into child [`ArrayData`]
1189    ///
1190    /// Does not (yet) check
1191    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1192    /// 2. the the null count is correct and that any
1193    /// 3. nullability requirements of its children are correct
1194    ///
1195    /// [#85]: https://github.com/apache/arrow-rs/issues/85
1196    pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1197        if let Some(nulls) = &self.nulls {
1198            let actual = nulls.len() - nulls.inner().count_set_bits();
1199            if actual != nulls.null_count() {
1200                return Err(ArrowError::InvalidArgumentError(format!(
1201                    "null_count value ({}) doesn't match actual number of nulls in array ({})",
1202                    nulls.null_count(),
1203                    actual
1204                )));
1205            }
1206        }
1207
1208        // In general non-nullable children should not contain nulls, however, for certain
1209        // types, such as StructArray and FixedSizeList, nulls in the parent take up
1210        // space in the child. As such we permit nulls in the children in the corresponding
1211        // positions for such types
1212        match &self.data_type {
1213            DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1214                if !f.is_nullable() {
1215                    self.validate_non_nullable(None, &self.child_data[0])?
1216                }
1217            }
1218            DataType::FixedSizeList(field, len) => {
1219                let child = &self.child_data[0];
1220                if !field.is_nullable() {
1221                    match &self.nulls {
1222                        Some(nulls) => {
1223                            let element_len = *len as usize;
1224                            let expanded = nulls.expand(element_len);
1225                            self.validate_non_nullable(Some(&expanded), child)?;
1226                        }
1227                        None => self.validate_non_nullable(None, child)?,
1228                    }
1229                }
1230            }
1231            DataType::Struct(fields) => {
1232                for (field, child) in fields.iter().zip(&self.child_data) {
1233                    if !field.is_nullable() {
1234                        self.validate_non_nullable(self.nulls(), child)?
1235                    }
1236                }
1237            }
1238            _ => {}
1239        }
1240
1241        Ok(())
1242    }
1243
1244    /// Verifies that `child` contains no nulls not present in `mask`
1245    fn validate_non_nullable(
1246        &self,
1247        mask: Option<&NullBuffer>,
1248        child: &ArrayData,
1249    ) -> Result<(), ArrowError> {
1250        let mask = match mask {
1251            Some(mask) => mask,
1252            None => {
1253                return match child.null_count() {
1254                    0 => Ok(()),
1255                    _ => Err(ArrowError::InvalidArgumentError(format!(
1256                        "non-nullable child of type {} contains nulls not present in parent {}",
1257                        child.data_type, self.data_type
1258                    ))),
1259                }
1260            }
1261        };
1262
1263        match child.nulls() {
1264            Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1265                "non-nullable child of type {} contains nulls not present in parent",
1266                child.data_type
1267            ))),
1268            _ => Ok(()),
1269        }
1270    }
1271
1272    /// Validates the values stored within this [`ArrayData`] are valid
1273    /// without recursing into child [`ArrayData`]
1274    ///
1275    /// Does not (yet) check
1276    /// 1. Union type_ids are valid see [#85](https://github.com/apache/arrow-rs/issues/85)
1277    pub fn validate_values(&self) -> Result<(), ArrowError> {
1278        match &self.data_type {
1279            DataType::Utf8 => self.validate_utf8::<i32>(),
1280            DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1281            DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1282            DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1283            DataType::BinaryView => {
1284                let views = self.typed_buffer::<u128>(0, self.len)?;
1285                validate_binary_view(views, &self.buffers[1..])
1286            }
1287            DataType::Utf8View => {
1288                let views = self.typed_buffer::<u128>(0, self.len)?;
1289                validate_string_view(views, &self.buffers[1..])
1290            }
1291            DataType::List(_) | DataType::Map(_, _) => {
1292                let child = &self.child_data[0];
1293                self.validate_offsets_full::<i32>(child.len)
1294            }
1295            DataType::LargeList(_) => {
1296                let child = &self.child_data[0];
1297                self.validate_offsets_full::<i64>(child.len)
1298            }
1299            DataType::Union(_, _) => {
1300                // Validate Union Array as part of implementing new Union semantics
1301                // See comments in `ArrayData::validate()`
1302                // https://github.com/apache/arrow-rs/issues/85
1303                //
1304                // TODO file follow on ticket for full union validation
1305                Ok(())
1306            }
1307            DataType::Dictionary(key_type, _value_type) => {
1308                let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1309                let max_value = dictionary_length - 1;
1310                match key_type.as_ref() {
1311                    DataType::UInt8 => self.check_bounds::<u8>(max_value),
1312                    DataType::UInt16 => self.check_bounds::<u16>(max_value),
1313                    DataType::UInt32 => self.check_bounds::<u32>(max_value),
1314                    DataType::UInt64 => self.check_bounds::<u64>(max_value),
1315                    DataType::Int8 => self.check_bounds::<i8>(max_value),
1316                    DataType::Int16 => self.check_bounds::<i16>(max_value),
1317                    DataType::Int32 => self.check_bounds::<i32>(max_value),
1318                    DataType::Int64 => self.check_bounds::<i64>(max_value),
1319                    _ => unreachable!(),
1320                }
1321            }
1322            DataType::RunEndEncoded(run_ends, _values) => {
1323                let run_ends_data = self.child_data()[0].clone();
1324                match run_ends.data_type() {
1325                    DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1326                    DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1327                    DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1328                    _ => unreachable!(),
1329                }
1330            }
1331            _ => {
1332                // No extra validation check required for other types
1333                Ok(())
1334            }
1335        }
1336    }
1337
1338    /// Calls the `validate(item_index, range)` function for each of
1339    /// the ranges specified in the arrow offsets buffer of type
1340    /// `T`. Also validates that each offset is smaller than
1341    /// `offset_limit`
1342    ///
1343    /// For an empty array, the offsets buffer can either be empty
1344    /// or contain a single `0`.
1345    ///
1346    /// For example, the offsets buffer contained `[1, 2, 4]`, this
1347    /// function would call `validate([1,2])`, and `validate([2,4])`
1348    fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1349    where
1350        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1351        V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1352    {
1353        self.typed_offsets::<T>()?
1354            .iter()
1355            .enumerate()
1356            .map(|(i, x)| {
1357                // check if the offset can be converted to usize
1358                let r = x.to_usize().ok_or_else(|| {
1359                    ArrowError::InvalidArgumentError(format!(
1360                        "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1361                    );
1362                // check if the offset exceeds the limit
1363                match r {
1364                    Ok(n) if n <= offset_limit => Ok((i, n)),
1365                    Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1366                        "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1367                    ),
1368                    Err(e) => Err(e),
1369                }
1370            })
1371            .scan(0_usize, |start, end| {
1372                // check offsets are monotonically increasing
1373                match end {
1374                    Ok((i, end)) if *start <= end => {
1375                        let range = Some(Ok((i, *start..end)));
1376                        *start = end;
1377                        range
1378                    }
1379                    Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1380                        "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1381                        i - 1, start, end))
1382                    )),
1383                    Err(err) => Some(Err(err)),
1384                }
1385            })
1386            .skip(1) // the first element is meaningless
1387            .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1388                let (item_index, range) = res?;
1389                validate(item_index-1, range)
1390            })
1391    }
1392
1393    /// Ensures that all strings formed by the offsets in `buffers[0]`
1394    /// into `buffers[1]` are valid utf8 sequences
1395    fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1396    where
1397        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1398    {
1399        let values_buffer = &self.buffers[1].as_slice();
1400        if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1401            // Validate Offsets are correct
1402            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1403                if !values_str.is_char_boundary(range.start)
1404                    || !values_str.is_char_boundary(range.end)
1405                {
1406                    return Err(ArrowError::InvalidArgumentError(format!(
1407                        "incomplete utf-8 byte sequence from index {string_index}"
1408                    )));
1409                }
1410                Ok(())
1411            })
1412        } else {
1413            // find specific offset that failed utf8 validation
1414            self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1415                std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1416                    ArrowError::InvalidArgumentError(format!(
1417                        "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1418                    ))
1419                })?;
1420                Ok(())
1421            })
1422        }
1423    }
1424
1425    /// Ensures that all offsets in `buffers[0]` into `buffers[1]` are
1426    /// between `0` and `offset_limit`
1427    fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1428    where
1429        T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1430    {
1431        self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1432            // No validation applied to each value, but the iteration
1433            // itself applies bounds checking to each range
1434            Ok(())
1435        })
1436    }
1437
1438    /// Validates that each value in self.buffers (typed as T)
1439    /// is within the range [0, max_value], inclusive
1440    fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1441    where
1442        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1443    {
1444        let required_len = self.len + self.offset;
1445        let buffer = &self.buffers[0];
1446
1447        // This should have been checked as part of `validate()` prior
1448        // to calling `validate_full()` but double check to be sure
1449        assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1450
1451        // Justification: buffer size was validated above
1452        let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1453
1454        indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1455            // Do not check the value is null (value can be arbitrary)
1456            if self.is_null(i) {
1457                return Ok(());
1458            }
1459            let dict_index: i64 = dict_index.try_into().map_err(|_| {
1460                ArrowError::InvalidArgumentError(format!(
1461                    "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1462                ))
1463            })?;
1464
1465            if dict_index < 0 || dict_index > max_value {
1466                return Err(ArrowError::InvalidArgumentError(format!(
1467                    "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1468                )));
1469            }
1470            Ok(())
1471        })
1472    }
1473
1474    /// Validates that each value in run_ends array is positive and strictly increasing.
1475    fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1476    where
1477        T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1478    {
1479        let values = self.typed_buffer::<T>(0, self.len)?;
1480        let mut prev_value: i64 = 0_i64;
1481        values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1482            let value: i64 = inp_value.try_into().map_err(|_| {
1483                ArrowError::InvalidArgumentError(format!(
1484                    "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1485                ))
1486            })?;
1487            if value <= 0_i64 {
1488                return Err(ArrowError::InvalidArgumentError(format!(
1489                    "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1490                )));
1491            }
1492            if ix > 0 && value <= prev_value {
1493                return Err(ArrowError::InvalidArgumentError(format!(
1494                    "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1495                )));
1496            }
1497
1498            prev_value = value;
1499            Ok(())
1500        })?;
1501
1502        if prev_value.as_usize() < (self.offset + self.len) {
1503            return Err(ArrowError::InvalidArgumentError(format!(
1504                "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1505                self.offset + self.len
1506            )));
1507        }
1508        Ok(())
1509    }
1510
1511    /// Returns true if this `ArrayData` is equal to `other`, using pointer comparisons
1512    /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
1513    /// return false when the arrays are logically equal
1514    pub fn ptr_eq(&self, other: &Self) -> bool {
1515        if self.offset != other.offset
1516            || self.len != other.len
1517            || self.data_type != other.data_type
1518            || self.buffers.len() != other.buffers.len()
1519            || self.child_data.len() != other.child_data.len()
1520        {
1521            return false;
1522        }
1523
1524        match (&self.nulls, &other.nulls) {
1525            (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1526            (Some(_), None) | (None, Some(_)) => return false,
1527            _ => {}
1528        };
1529
1530        if !self
1531            .buffers
1532            .iter()
1533            .zip(other.buffers.iter())
1534            .all(|(a, b)| a.as_ptr() == b.as_ptr())
1535        {
1536            return false;
1537        }
1538
1539        self.child_data
1540            .iter()
1541            .zip(other.child_data.iter())
1542            .all(|(a, b)| a.ptr_eq(b))
1543    }
1544
1545    /// Converts this [`ArrayData`] into an [`ArrayDataBuilder`]
1546    pub fn into_builder(self) -> ArrayDataBuilder {
1547        self.into()
1548    }
1549}
1550
1551/// Return the expected [`DataTypeLayout`] Arrays of this data
1552/// type are expected to have
1553pub fn layout(data_type: &DataType) -> DataTypeLayout {
1554    // based on C/C++ implementation in
1555    // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc)
1556    use arrow_schema::IntervalUnit::*;
1557
1558    match data_type {
1559        DataType::Null => DataTypeLayout {
1560            buffers: vec![],
1561            can_contain_null_mask: false,
1562            variadic: false,
1563        },
1564        DataType::Boolean => DataTypeLayout {
1565            buffers: vec![BufferSpec::BitMap],
1566            can_contain_null_mask: true,
1567            variadic: false,
1568        },
1569        DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1570        DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1571        DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1572        DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1573        DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1574        DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1575        DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1576        DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1577        DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1578        DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1579        DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1580        DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1581        DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1582        DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1583        DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1584        DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1585        DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1586        DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1587        DataType::Interval(MonthDayNano) => {
1588            DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1589        }
1590        DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1591        DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1592        DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1593        DataType::FixedSizeBinary(size) => {
1594            let spec = BufferSpec::FixedWidth {
1595                byte_width: (*size).try_into().unwrap(),
1596                alignment: mem::align_of::<u8>(),
1597            };
1598            DataTypeLayout {
1599                buffers: vec![spec],
1600                can_contain_null_mask: true,
1601                variadic: false,
1602            }
1603        }
1604        DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1605        DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1606        DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1607        DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1608        DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1609        DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), // all in child data
1610        DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1611        DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1612        DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1613        DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1614        DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1615        DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), // all in child data,
1616        DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data,
1617        DataType::Union(_, mode) => {
1618            let type_ids = BufferSpec::FixedWidth {
1619                byte_width: mem::size_of::<i8>(),
1620                alignment: mem::align_of::<i8>(),
1621            };
1622
1623            DataTypeLayout {
1624                buffers: match mode {
1625                    UnionMode::Sparse => {
1626                        vec![type_ids]
1627                    }
1628                    UnionMode::Dense => {
1629                        vec![
1630                            type_ids,
1631                            BufferSpec::FixedWidth {
1632                                byte_width: mem::size_of::<i32>(),
1633                                alignment: mem::align_of::<i32>(),
1634                            },
1635                        ]
1636                    }
1637                },
1638                can_contain_null_mask: false,
1639                variadic: false,
1640            }
1641        }
1642        DataType::Dictionary(key_type, _value_type) => layout(key_type),
1643    }
1644}
1645
/// Layout specification for a data type
///
/// Lists the data buffers an array of the type is expected to have and states
/// whether such an array may carry a null bitmask.
#[derive(Debug, PartialEq, Eq)]
// Note: Follows structure from C++: https://github.com/apache/arrow/blob/master/cpp/src/arrow/type.h#L91
pub struct DataTypeLayout {
    /// A vector of buffer layout specifications, one for each expected buffer
    pub buffers: Vec<BufferSpec>,

    /// Whether arrays of this type can contain a null bitmask
    pub can_contain_null_mask: bool,

    /// This field only applies to the view type [`DataType::BinaryView`] and [`DataType::Utf8View`]
    ///
    /// If `variadic` is true, the number of buffers expected is only lower-bounded by
    /// `buffers.len()`. Buffers that exceed the lower bound are legal.
    pub variadic: bool,
}
1661
1662impl DataTypeLayout {
1663    /// Describes a basic numeric array where each element has type `T`
1664    pub fn new_fixed_width<T>() -> Self {
1665        Self {
1666            buffers: vec![BufferSpec::FixedWidth {
1667                byte_width: mem::size_of::<T>(),
1668                alignment: mem::align_of::<T>(),
1669            }],
1670            can_contain_null_mask: true,
1671            variadic: false,
1672        }
1673    }
1674
1675    /// Describes arrays which have no data of their own
1676    /// but may still have a Null Bitmap (e.g. FixedSizeList)
1677    pub fn new_nullable_empty() -> Self {
1678        Self {
1679            buffers: vec![],
1680            can_contain_null_mask: true,
1681            variadic: false,
1682        }
1683    }
1684
1685    /// Describes arrays which have no data of their own
1686    /// (e.g. RunEndEncoded).
1687    pub fn new_empty() -> Self {
1688        Self {
1689            buffers: vec![],
1690            can_contain_null_mask: false,
1691            variadic: false,
1692        }
1693    }
1694
1695    /// Describes a basic numeric array where each element has a fixed
1696    /// with offset buffer of type `T`, followed by a
1697    /// variable width data buffer
1698    pub fn new_binary<T>() -> Self {
1699        Self {
1700            buffers: vec![
1701                // offsets
1702                BufferSpec::FixedWidth {
1703                    byte_width: mem::size_of::<T>(),
1704                    alignment: mem::align_of::<T>(),
1705                },
1706                // values
1707                BufferSpec::VariableWidth,
1708            ],
1709            can_contain_null_mask: true,
1710            variadic: false,
1711        }
1712    }
1713
1714    /// Describes a view type
1715    pub fn new_view() -> Self {
1716        Self {
1717            buffers: vec![BufferSpec::FixedWidth {
1718                byte_width: mem::size_of::<u128>(),
1719                alignment: mem::align_of::<u128>(),
1720            }],
1721            can_contain_null_mask: true,
1722            variadic: true,
1723        }
1724    }
1725
1726    /// Describes a list view type
1727    pub fn new_list_view<T>() -> Self {
1728        Self {
1729            buffers: vec![
1730                BufferSpec::FixedWidth {
1731                    byte_width: mem::size_of::<T>(),
1732                    alignment: mem::align_of::<T>(),
1733                },
1734                BufferSpec::FixedWidth {
1735                    byte_width: mem::size_of::<T>(),
1736                    alignment: mem::align_of::<T>(),
1737                },
1738            ],
1739            can_contain_null_mask: true,
1740            variadic: true,
1741        }
1742    }
1743}
1744
/// Layout specification for a single data type buffer
///
/// A [`DataTypeLayout`] holds one `BufferSpec` for each buffer it expects.
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Each element is a fixed width primitive, with the given `byte_width` and `alignment`
    ///
    /// `alignment` is the alignment required by Rust for an array of the corresponding primitive,
    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
    ///
    /// Arrow-rs requires that all buffers have at least this alignment, to allow for
    /// [slice](std::slice) based APIs. Alignment in excess of this is not required to allow
    /// for array slicing and interoperability with `Vec`, which cannot be over-aligned.
    ///
    /// Note that these alignment requirements will vary between architectures
    FixedWidth {
        /// The width of each element in bytes
        byte_width: usize,
        /// The alignment required by Rust for an array of the corresponding primitive
        alignment: usize,
    },
    /// Variable width, such as string data for utf8 data
    VariableWidth,
    /// Buffer holds a bitmap.
    ///
    /// Note: Unlike the C++ implementation, the null/validity buffer
    /// is handled specially rather than as another of the buffers in
    /// the spec, so this variant is only used for the Boolean type.
    BitMap,
    /// Buffer is always null. Currently unused in the Rust implementation
    /// (used in C++ for the Union type)
    #[allow(dead_code)]
    AlwaysNull,
}
1777
/// Equality for [`ArrayData`], delegated to [`equal::equal`]
impl PartialEq for ArrayData {
    /// Returns the result of `equal::equal(self, other)`
    fn eq(&self, other: &Self) -> bool {
        equal::equal(self, other)
    }
}
1783
mod private {
    /// A boolean flag whose value can only be changed from unsafe code.
    ///
    /// The flag starts out as `false`.
    ///
    /// It is used to enforce safety in the [`ArrayDataBuilder`]
    ///
    /// [`ArrayDataBuilder`]: super::ArrayDataBuilder
    #[derive(Debug)]
    pub struct UnsafeFlag(bool);

    impl UnsafeFlag {
        /// Creates a new `UnsafeFlag` holding `false`
        #[inline]
        pub const fn new() -> Self {
            UnsafeFlag(false)
        }

        /// Overwrites the flag with `val`
        ///
        /// # Safety
        ///
        /// This is `unsafe` so that the flag cannot be changed from safe code;
        /// the caller takes responsibility for whatever the flag guards.
        #[inline]
        pub unsafe fn set(&mut self, val: bool) {
            let UnsafeFlag(flag) = self;
            *flag = val;
        }

        /// Returns the current value of the flag
        #[inline]
        pub fn get(&self) -> bool {
            let UnsafeFlag(flag) = self;
            *flag
        }
    }
}
1813
/// Builder for [`ArrayData`] type
///
/// Fields are set through the chained builder methods and turned into an
/// [`ArrayData`] by `build()`.
#[derive(Debug)]
pub struct ArrayDataBuilder {
    /// Data type of the array being built
    data_type: DataType,
    /// Number of elements in the array
    len: usize,
    /// Explicit null count; only consulted by `build()` when the null buffer
    /// is created from `null_bit_buffer`
    null_count: Option<usize>,
    /// Raw validity bitmap; only used by `build()` when `nulls` is `None`
    null_bit_buffer: Option<Buffer>,
    /// Null buffer; takes precedence over `null_bit_buffer` in `build()`
    nulls: Option<NullBuffer>,
    /// Offset of the array into its buffers
    offset: usize,
    /// Data buffers of the array
    buffers: Vec<Buffer>,
    /// Child data of the array
    child_data: Vec<ArrayData>,
    /// Should buffers be realigned (copying if necessary)?
    ///
    /// Defaults to false.
    align_buffers: bool,
    /// Should data validation be skipped for this [`ArrayData`]?
    ///
    /// Defaults to false.
    ///
    /// # Safety
    ///
    /// This flag can only be set to true using `unsafe` APIs. However, once true
    /// subsequent calls to `build()` may result in undefined behavior if the data
    /// is not valid.
    skip_validation: UnsafeFlag,
}
1840
1841impl ArrayDataBuilder {
1842    #[inline]
1843    /// Creates a new array data builder
1844    pub const fn new(data_type: DataType) -> Self {
1845        Self {
1846            data_type,
1847            len: 0,
1848            null_count: None,
1849            null_bit_buffer: None,
1850            nulls: None,
1851            offset: 0,
1852            buffers: vec![],
1853            child_data: vec![],
1854            align_buffers: false,
1855            skip_validation: UnsafeFlag::new(),
1856        }
1857    }
1858
1859    /// Creates a new array data builder from an existing one, changing the data type
1860    pub fn data_type(self, data_type: DataType) -> Self {
1861        Self { data_type, ..self }
1862    }
1863
1864    #[inline]
1865    #[allow(clippy::len_without_is_empty)]
1866    /// Sets the length of the [ArrayData]
1867    pub const fn len(mut self, n: usize) -> Self {
1868        self.len = n;
1869        self
1870    }
1871
1872    /// Sets the null buffer of the [ArrayData]
1873    pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1874        self.nulls = nulls;
1875        self.null_count = None;
1876        self.null_bit_buffer = None;
1877        self
1878    }
1879
1880    /// Sets the null count of the [ArrayData]
1881    pub fn null_count(mut self, null_count: usize) -> Self {
1882        self.null_count = Some(null_count);
1883        self
1884    }
1885
1886    /// Sets the `null_bit_buffer` of the [ArrayData]
1887    pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1888        self.nulls = None;
1889        self.null_bit_buffer = buf;
1890        self
1891    }
1892
1893    /// Sets the offset of the [ArrayData]
1894    #[inline]
1895    pub const fn offset(mut self, n: usize) -> Self {
1896        self.offset = n;
1897        self
1898    }
1899
1900    /// Sets the buffers of the [ArrayData]
1901    pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1902        self.buffers = v;
1903        self
1904    }
1905
1906    /// Adds a single buffer to the [ArrayData]'s buffers
1907    pub fn add_buffer(mut self, b: Buffer) -> Self {
1908        self.buffers.push(b);
1909        self
1910    }
1911
1912    /// Adds multiple buffers to the [ArrayData]'s buffers
1913    pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1914        self.buffers.extend(bs);
1915        self
1916    }
1917
1918    /// Sets the child data of the [ArrayData]
1919    pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1920        self.child_data = v;
1921        self
1922    }
1923
1924    /// Adds a single child data to the [ArrayData]'s child data
1925    pub fn add_child_data(mut self, r: ArrayData) -> Self {
1926        self.child_data.push(r);
1927        self
1928    }
1929
1930    /// Creates an array data, without any validation
1931    ///
1932    /// Note: This is shorthand for `self.skip_validation(true).build().unwrap()`
1933    ///
1934    /// # Safety
1935    ///
1936    /// The same caveats as [`ArrayData::new_unchecked`]
1937    /// apply.
1938    pub unsafe fn build_unchecked(self) -> ArrayData {
1939        self.skip_validation(true).build().unwrap()
1940    }
1941
1942    /// Creates an `ArrayData`, consuming `self`
1943    ///
1944    /// # Safety
1945    ///
1946    /// By default the underlying buffers are checked to ensure they are valid
1947    /// Arrow data. However, if the [`Self::skip_validation`] flag has been set
1948    /// to true (by the `unsafe` API) this validation is skipped. If the data is
1949    /// not valid, undefined behavior will result.
1950    pub fn build(self) -> Result<ArrayData, ArrowError> {
1951        let Self {
1952            data_type,
1953            len,
1954            null_count,
1955            null_bit_buffer,
1956            nulls,
1957            offset,
1958            buffers,
1959            child_data,
1960            align_buffers,
1961            skip_validation,
1962        } = self;
1963
1964        let nulls = nulls
1965            .or_else(|| {
1966                let buffer = null_bit_buffer?;
1967                let buffer = BooleanBuffer::new(buffer, offset, len);
1968                Some(match null_count {
1969                    Some(n) => {
1970                        // SAFETY: call to `data.validate_data()` below validates the null buffer is valid
1971                        unsafe { NullBuffer::new_unchecked(buffer, n) }
1972                    }
1973                    None => NullBuffer::new(buffer),
1974                })
1975            })
1976            .filter(|b| b.null_count() != 0);
1977
1978        let mut data = ArrayData {
1979            data_type,
1980            len,
1981            offset,
1982            buffers,
1983            child_data,
1984            nulls,
1985        };
1986
1987        if align_buffers {
1988            data.align_buffers();
1989        }
1990
1991        // SAFETY: `skip_validation` is only set to true using `unsafe` APIs
1992        if !skip_validation.get() || cfg!(feature = "force_validate") {
1993            data.validate_data()?;
1994        }
1995        Ok(data)
1996    }
1997
1998    /// Creates an array data, validating all inputs, and aligning any buffers
1999    #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2000    pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2001        self.align_buffers(true).build()
2002    }
2003
2004    /// Ensure that all buffers are aligned, copying data if necessary
2005    ///
2006    /// Rust requires that arrays are aligned to their corresponding primitive,
2007    /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`].
2008    ///
2009    /// [`ArrayData`] therefore requires that all buffers have at least this alignment,
2010    /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`].
2011    ///
2012    /// As this alignment is architecture specific, and not guaranteed by all arrow implementations,
2013    /// this flag is provided to automatically copy buffers to a new correctly aligned allocation
2014    /// when necessary, making it useful when interacting with buffers produced by other systems,
2015    /// e.g. IPC or FFI.
2016    ///
2017    /// If this flag is not enabled, `[Self::build`] return an error on encountering
2018    /// insufficiently aligned buffers.
2019    pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2020        self.align_buffers = align_buffers;
2021        self
2022    }
2023
2024    /// Skips validation of the data.
2025    ///
2026    /// If this flag is enabled, `[Self::build`] will skip validation of the
2027    /// data
2028    ///
2029    /// If this flag is not enabled, `[Self::build`] will validate that all
2030    /// buffers are valid and will return an error if any data is invalid.
2031    /// Validation can be expensive.
2032    ///
2033    /// # Safety
2034    ///
2035    /// If validation is skipped, the buffers must form a valid Arrow array,
2036    /// otherwise undefined behavior will result
2037    pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2038        self.skip_validation.set(skip_validation);
2039        self
2040    }
2041}
2042
2043impl From<ArrayData> for ArrayDataBuilder {
2044    fn from(d: ArrayData) -> Self {
2045        Self {
2046            data_type: d.data_type,
2047            len: d.len,
2048            offset: d.offset,
2049            buffers: d.buffers,
2050            child_data: d.child_data,
2051            nulls: d.nulls,
2052            null_bit_buffer: None,
2053            null_count: None,
2054            align_buffers: false,
2055            skip_validation: UnsafeFlag::new(),
2056        }
2057    }
2058}
2059
2060#[cfg(test)]
2061mod tests {
2062    use super::*;
2063    use arrow_schema::{Field, Fields};
2064
2065    // See arrow/tests/array_data_validation.rs for test of array validation
2066
2067    /// returns a buffer initialized with some constant value for tests
2068    fn make_i32_buffer(n: usize) -> Buffer {
2069        Buffer::from_slice_ref(vec![42i32; n])
2070    }
2071
2072    /// returns a buffer initialized with some constant value for tests
2073    fn make_f32_buffer(n: usize) -> Buffer {
2074        Buffer::from_slice_ref(vec![42f32; n])
2075    }
2076
2077    #[test]
2078    fn test_builder() {
2079        // Buffer needs to be at least 25 long
2080        let v = (0..25).collect::<Vec<i32>>();
2081        let b1 = Buffer::from_slice_ref(&v);
2082        let arr_data = ArrayData::builder(DataType::Int32)
2083            .len(20)
2084            .offset(5)
2085            .add_buffer(b1)
2086            .null_bit_buffer(Some(Buffer::from([
2087                0b01011111, 0b10110101, 0b01100011, 0b00011110,
2088            ])))
2089            .build()
2090            .unwrap();
2091
2092        assert_eq!(20, arr_data.len());
2093        assert_eq!(10, arr_data.null_count());
2094        assert_eq!(5, arr_data.offset());
2095        assert_eq!(1, arr_data.buffers().len());
2096        assert_eq!(
2097            Buffer::from_slice_ref(&v).as_slice(),
2098            arr_data.buffers()[0].as_slice()
2099        );
2100    }
2101
2102    #[test]
2103    fn test_builder_with_child_data() {
2104        let child_arr_data = ArrayData::try_new(
2105            DataType::Int32,
2106            5,
2107            None,
2108            0,
2109            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
2110            vec![],
2111        )
2112        .unwrap();
2113
2114        let field = Arc::new(Field::new("x", DataType::Int32, true));
2115        let data_type = DataType::Struct(vec![field].into());
2116
2117        let arr_data = ArrayData::builder(data_type)
2118            .len(5)
2119            .offset(0)
2120            .add_child_data(child_arr_data.clone())
2121            .build()
2122            .unwrap();
2123
2124        assert_eq!(5, arr_data.len());
2125        assert_eq!(1, arr_data.child_data().len());
2126        assert_eq!(child_arr_data, arr_data.child_data()[0]);
2127    }
2128
2129    #[test]
2130    fn test_null_count() {
2131        let mut bit_v: [u8; 2] = [0; 2];
2132        bit_util::set_bit(&mut bit_v, 0);
2133        bit_util::set_bit(&mut bit_v, 3);
2134        bit_util::set_bit(&mut bit_v, 10);
2135        let arr_data = ArrayData::builder(DataType::Int32)
2136            .len(16)
2137            .add_buffer(make_i32_buffer(16))
2138            .null_bit_buffer(Some(Buffer::from(bit_v)))
2139            .build()
2140            .unwrap();
2141        assert_eq!(13, arr_data.null_count());
2142
2143        // Test with offset
2144        let mut bit_v: [u8; 2] = [0; 2];
2145        bit_util::set_bit(&mut bit_v, 0);
2146        bit_util::set_bit(&mut bit_v, 3);
2147        bit_util::set_bit(&mut bit_v, 10);
2148        let arr_data = ArrayData::builder(DataType::Int32)
2149            .len(12)
2150            .offset(2)
2151            .add_buffer(make_i32_buffer(14)) // requires at least 14 bytes of space,
2152            .null_bit_buffer(Some(Buffer::from(bit_v)))
2153            .build()
2154            .unwrap();
2155        assert_eq!(10, arr_data.null_count());
2156    }
2157
2158    #[test]
2159    fn test_null_buffer_ref() {
2160        let mut bit_v: [u8; 2] = [0; 2];
2161        bit_util::set_bit(&mut bit_v, 0);
2162        bit_util::set_bit(&mut bit_v, 3);
2163        bit_util::set_bit(&mut bit_v, 10);
2164        let arr_data = ArrayData::builder(DataType::Int32)
2165            .len(16)
2166            .add_buffer(make_i32_buffer(16))
2167            .null_bit_buffer(Some(Buffer::from(bit_v)))
2168            .build()
2169            .unwrap();
2170        assert!(arr_data.nulls().is_some());
2171        assert_eq!(&bit_v, arr_data.nulls().unwrap().validity());
2172    }
2173
2174    #[test]
2175    fn test_slice() {
2176        let mut bit_v: [u8; 2] = [0; 2];
2177        bit_util::set_bit(&mut bit_v, 0);
2178        bit_util::set_bit(&mut bit_v, 3);
2179        bit_util::set_bit(&mut bit_v, 10);
2180        let data = ArrayData::builder(DataType::Int32)
2181            .len(16)
2182            .add_buffer(make_i32_buffer(16))
2183            .null_bit_buffer(Some(Buffer::from(bit_v)))
2184            .build()
2185            .unwrap();
2186        let new_data = data.slice(1, 15);
2187        assert_eq!(data.len() - 1, new_data.len());
2188        assert_eq!(1, new_data.offset());
2189        assert_eq!(data.null_count(), new_data.null_count());
2190
2191        // slice of a slice (removes one null)
2192        let new_data = new_data.slice(1, 14);
2193        assert_eq!(data.len() - 2, new_data.len());
2194        assert_eq!(2, new_data.offset());
2195        assert_eq!(data.null_count() - 1, new_data.null_count());
2196    }
2197
2198    #[test]
2199    fn test_equality() {
2200        let int_data = ArrayData::builder(DataType::Int32)
2201            .len(1)
2202            .add_buffer(make_i32_buffer(1))
2203            .build()
2204            .unwrap();
2205
2206        let float_data = ArrayData::builder(DataType::Float32)
2207            .len(1)
2208            .add_buffer(make_f32_buffer(1))
2209            .build()
2210            .unwrap();
2211        assert_ne!(int_data, float_data);
2212        assert!(!int_data.ptr_eq(&float_data));
2213        assert!(int_data.ptr_eq(&int_data));
2214
2215        #[allow(clippy::redundant_clone)]
2216        let int_data_clone = int_data.clone();
2217        assert_eq!(int_data, int_data_clone);
2218        assert!(int_data.ptr_eq(&int_data_clone));
2219        assert!(int_data_clone.ptr_eq(&int_data));
2220
2221        let int_data_slice = int_data_clone.slice(1, 0);
2222        assert!(int_data_slice.ptr_eq(&int_data_slice));
2223        assert!(!int_data.ptr_eq(&int_data_slice));
2224        assert!(!int_data_slice.ptr_eq(&int_data));
2225
2226        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2227        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2228        let string_data = ArrayData::try_new(
2229            DataType::Utf8,
2230            3,
2231            Some(Buffer::from_iter(vec![true, false, true])),
2232            0,
2233            vec![offsets_buffer, data_buffer],
2234            vec![],
2235        )
2236        .unwrap();
2237
2238        assert_ne!(float_data, string_data);
2239        assert!(!float_data.ptr_eq(&string_data));
2240
2241        assert!(string_data.ptr_eq(&string_data));
2242
2243        #[allow(clippy::redundant_clone)]
2244        let string_data_cloned = string_data.clone();
2245        assert!(string_data_cloned.ptr_eq(&string_data));
2246        assert!(string_data.ptr_eq(&string_data_cloned));
2247
2248        let string_data_slice = string_data.slice(1, 2);
2249        assert!(string_data_slice.ptr_eq(&string_data_slice));
2250        assert!(!string_data_slice.ptr_eq(&string_data))
2251    }
2252
2253    #[test]
2254    fn test_slice_memory_size() {
2255        let mut bit_v: [u8; 2] = [0; 2];
2256        bit_util::set_bit(&mut bit_v, 0);
2257        bit_util::set_bit(&mut bit_v, 3);
2258        bit_util::set_bit(&mut bit_v, 10);
2259        let data = ArrayData::builder(DataType::Int32)
2260            .len(16)
2261            .add_buffer(make_i32_buffer(16))
2262            .null_bit_buffer(Some(Buffer::from(bit_v)))
2263            .build()
2264            .unwrap();
2265        let new_data = data.slice(1, 14);
2266        assert_eq!(
2267            data.get_slice_memory_size().unwrap() - 8,
2268            new_data.get_slice_memory_size().unwrap()
2269        );
2270        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
2271        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
2272        let string_data = ArrayData::try_new(
2273            DataType::Utf8,
2274            3,
2275            Some(Buffer::from_iter(vec![true, false, true])),
2276            0,
2277            vec![offsets_buffer, data_buffer],
2278            vec![],
2279        )
2280        .unwrap();
2281        let string_data_slice = string_data.slice(1, 2);
2282        //4 bytes of offset and 2 bytes of data reduced by slicing.
2283        assert_eq!(
2284            string_data.get_slice_memory_size().unwrap() - 6,
2285            string_data_slice.get_slice_memory_size().unwrap()
2286        );
2287    }
2288
2289    #[test]
2290    fn test_count_nulls() {
2291        let buffer = Buffer::from([0b00010110, 0b10011111]);
2292        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 16));
2293        let count = count_nulls(Some(&buffer), 0, 16);
2294        assert_eq!(count, 7);
2295
2296        let count = count_nulls(Some(&buffer), 4, 8);
2297        assert_eq!(count, 3);
2298    }
2299
2300    #[test]
2301    fn test_contains_nulls() {
2302        let buffer: Buffer =
2303            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
2304        let buffer = NullBuffer::new(BooleanBuffer::new(buffer, 0, 6));
2305        assert!(contains_nulls(Some(&buffer), 0, 6));
2306        assert!(contains_nulls(Some(&buffer), 0, 3));
2307        assert!(!contains_nulls(Some(&buffer), 3, 2));
2308        assert!(!contains_nulls(Some(&buffer), 0, 0));
2309    }
2310
2311    #[test]
2312    fn test_alignment() {
2313        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2314        let sliced = buffer.slice(1);
2315
2316        let mut data = ArrayData {
2317            data_type: DataType::Int32,
2318            len: 0,
2319            offset: 0,
2320            buffers: vec![buffer],
2321            child_data: vec![],
2322            nulls: None,
2323        };
2324        data.validate_full().unwrap();
2325
2326        // break alignment in data
2327        data.buffers[0] = sliced;
2328        let err = data.validate().unwrap_err();
2329
2330        assert_eq!(
2331            err.to_string(),
2332            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2333        );
2334
2335        data.align_buffers();
2336        data.validate_full().unwrap();
2337    }
2338
2339    #[test]
2340    fn test_alignment_struct() {
2341        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
2342        let sliced = buffer.slice(1);
2343
2344        let child_data = ArrayData {
2345            data_type: DataType::Int32,
2346            len: 0,
2347            offset: 0,
2348            buffers: vec![buffer],
2349            child_data: vec![],
2350            nulls: None,
2351        };
2352
2353        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
2354        let mut data = ArrayData {
2355            data_type: schema,
2356            len: 0,
2357            offset: 0,
2358            buffers: vec![],
2359            child_data: vec![child_data],
2360            nulls: None,
2361        };
2362        data.validate_full().unwrap();
2363
2364        // break alignment in child data
2365        data.child_data[0].buffers[0] = sliced;
2366        let err = data.validate().unwrap_err();
2367
2368        assert_eq!(
2369            err.to_string(),
2370            "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1"
2371        );
2372
2373        data.align_buffers();
2374        data.validate_full().unwrap();
2375    }
2376
2377    #[test]
2378    fn test_null_view_types() {
2379        let array_len = 32;
2380        let array = ArrayData::new_null(&DataType::BinaryView, array_len);
2381        assert_eq!(array.len(), array_len);
2382        for i in 0..array.len() {
2383            assert!(array.is_null(i));
2384        }
2385
2386        let array = ArrayData::new_null(&DataType::Utf8View, array_len);
2387        assert_eq!(array.len(), array_len);
2388        for i in 0..array.len() {
2389            assert!(array.is_null(i));
2390        }
2391    }
2392}