1use crate::bit_iterator::BitSliceIterator;
22use arrow_buffer::buffer::{BooleanBuffer, NullBuffer};
23use arrow_buffer::{
24 bit_util, i256, ArrowNativeType, Buffer, IntervalDayTime, IntervalMonthDayNano, MutableBuffer,
25};
26use arrow_schema::{ArrowError, DataType, UnionMode};
27use std::mem;
28use std::ops::Range;
29use std::sync::Arc;
30
31use crate::data::private::UnsafeFlag;
32use crate::{equal, validate_binary_view, validate_string_view};
33
34#[inline]
35pub(crate) fn contains_nulls(
36 null_bit_buffer: Option<&NullBuffer>,
37 offset: usize,
38 len: usize,
39) -> bool {
40 match null_bit_buffer {
41 Some(buffer) => {
42 match BitSliceIterator::new(buffer.validity(), buffer.offset() + offset, len).next() {
43 Some((start, end)) => start != 0 || end != len,
44 None => len != 0, }
46 }
47 None => false, }
49}
50
51#[inline]
52pub(crate) fn count_nulls(
53 null_bit_buffer: Option<&NullBuffer>,
54 offset: usize,
55 len: usize,
56) -> usize {
57 if let Some(buf) = null_bit_buffer {
58 let buffer = buf.buffer();
59 len - buffer.count_set_bits_offset(offset + buf.offset(), len)
60 } else {
61 0
62 }
63}
64
65#[inline]
67pub(crate) fn new_buffers(data_type: &DataType, capacity: usize) -> [MutableBuffer; 2] {
68 let empty_buffer = MutableBuffer::new(0);
69 match data_type {
70 DataType::Null => [empty_buffer, MutableBuffer::new(0)],
71 DataType::Boolean => {
72 let bytes = bit_util::ceil(capacity, 8);
73 let buffer = MutableBuffer::new(bytes);
74 [buffer, empty_buffer]
75 }
76 DataType::UInt8
77 | DataType::UInt16
78 | DataType::UInt32
79 | DataType::UInt64
80 | DataType::Int8
81 | DataType::Int16
82 | DataType::Int32
83 | DataType::Int64
84 | DataType::Float16
85 | DataType::Float32
86 | DataType::Float64
87 | DataType::Decimal128(_, _)
88 | DataType::Decimal256(_, _)
89 | DataType::Date32
90 | DataType::Time32(_)
91 | DataType::Date64
92 | DataType::Time64(_)
93 | DataType::Duration(_)
94 | DataType::Timestamp(_, _)
95 | DataType::Interval(_) => [
96 MutableBuffer::new(capacity * data_type.primitive_width().unwrap()),
97 empty_buffer,
98 ],
99 DataType::Utf8 | DataType::Binary => {
100 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
101 buffer.push(0i32);
103 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
104 }
105 DataType::LargeUtf8 | DataType::LargeBinary => {
106 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
107 buffer.push(0i64);
109 [buffer, MutableBuffer::new(capacity * mem::size_of::<u8>())]
110 }
111 DataType::BinaryView | DataType::Utf8View => [
112 MutableBuffer::new(capacity * mem::size_of::<u128>()),
113 empty_buffer,
114 ],
115 DataType::List(_) | DataType::Map(_, _) => {
116 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i32>());
118 buffer.push(0i32);
119 [buffer, empty_buffer]
120 }
121 DataType::ListView(_) => [
122 MutableBuffer::new(capacity * mem::size_of::<i32>()),
123 MutableBuffer::new(capacity * mem::size_of::<i32>()),
124 ],
125 DataType::LargeList(_) => {
126 let mut buffer = MutableBuffer::new((1 + capacity) * mem::size_of::<i64>());
128 buffer.push(0i64);
129 [buffer, empty_buffer]
130 }
131 DataType::LargeListView(_) => [
132 MutableBuffer::new(capacity * mem::size_of::<i64>()),
133 MutableBuffer::new(capacity * mem::size_of::<i64>()),
134 ],
135 DataType::FixedSizeBinary(size) => {
136 [MutableBuffer::new(capacity * *size as usize), empty_buffer]
137 }
138 DataType::Dictionary(k, _) => [
139 MutableBuffer::new(capacity * k.primitive_width().unwrap()),
140 empty_buffer,
141 ],
142 DataType::FixedSizeList(_, _) | DataType::Struct(_) | DataType::RunEndEncoded(_, _) => {
143 [empty_buffer, MutableBuffer::new(0)]
144 }
145 DataType::Union(_, mode) => {
146 let type_ids = MutableBuffer::new(capacity * mem::size_of::<i8>());
147 match mode {
148 UnionMode::Sparse => [type_ids, empty_buffer],
149 UnionMode::Dense => {
150 let offsets = MutableBuffer::new(capacity * mem::size_of::<i32>());
151 [type_ids, offsets]
152 }
153 }
154 }
155 }
156}
157
158#[derive(Debug, Clone)]
204pub struct ArrayData {
205 data_type: DataType,
207
208 len: usize,
210
211 offset: usize,
213
214 buffers: Vec<Buffer>,
218
219 child_data: Vec<ArrayData>,
222
223 nulls: Option<NullBuffer>,
226}
227
228pub type ArrayDataRef = Arc<ArrayData>;
230
231impl ArrayData {
232 pub unsafe fn new_unchecked(
249 data_type: DataType,
250 len: usize,
251 null_count: Option<usize>,
252 null_bit_buffer: Option<Buffer>,
253 offset: usize,
254 buffers: Vec<Buffer>,
255 child_data: Vec<ArrayData>,
256 ) -> Self {
257 let mut skip_validation = UnsafeFlag::new();
258 skip_validation.set(true);
260
261 ArrayDataBuilder {
262 data_type,
263 len,
264 null_count,
265 null_bit_buffer,
266 nulls: None,
267 offset,
268 buffers,
269 child_data,
270 align_buffers: false,
271 skip_validation,
272 }
273 .build()
274 .unwrap()
275 }
276
277 pub fn try_new(
288 data_type: DataType,
289 len: usize,
290 null_bit_buffer: Option<Buffer>,
291 offset: usize,
292 buffers: Vec<Buffer>,
293 child_data: Vec<ArrayData>,
294 ) -> Result<Self, ArrowError> {
295 if let Some(null_bit_buffer) = null_bit_buffer.as_ref() {
299 let needed_len = bit_util::ceil(len + offset, 8);
300 if null_bit_buffer.len() < needed_len {
301 return Err(ArrowError::InvalidArgumentError(format!(
302 "null_bit_buffer size too small. got {} needed {}",
303 null_bit_buffer.len(),
304 needed_len
305 )));
306 }
307 }
308 let new_self = unsafe {
310 Self::new_unchecked(
311 data_type,
312 len,
313 None,
314 null_bit_buffer,
315 offset,
316 buffers,
317 child_data,
318 )
319 };
320
321 new_self.validate_data()?;
326 Ok(new_self)
327 }
328
329 #[inline]
331 pub const fn builder(data_type: DataType) -> ArrayDataBuilder {
332 ArrayDataBuilder::new(data_type)
333 }
334
335 #[inline]
337 pub const fn data_type(&self) -> &DataType {
338 &self.data_type
339 }
340
341 pub fn buffers(&self) -> &[Buffer] {
343 &self.buffers
344 }
345
346 pub fn child_data(&self) -> &[ArrayData] {
349 &self.child_data[..]
350 }
351
352 #[inline]
354 pub fn is_null(&self, i: usize) -> bool {
355 match &self.nulls {
356 Some(v) => v.is_null(i),
357 None => false,
358 }
359 }
360
361 #[inline]
365 pub fn nulls(&self) -> Option<&NullBuffer> {
366 self.nulls.as_ref()
367 }
368
369 #[inline]
371 pub fn is_valid(&self, i: usize) -> bool {
372 !self.is_null(i)
373 }
374
375 #[inline]
377 pub const fn len(&self) -> usize {
378 self.len
379 }
380
381 #[inline]
383 pub const fn is_empty(&self) -> bool {
384 self.len == 0
385 }
386
387 #[inline]
389 pub const fn offset(&self) -> usize {
390 self.offset
391 }
392
393 #[inline]
395 pub fn null_count(&self) -> usize {
396 self.nulls
397 .as_ref()
398 .map(|x| x.null_count())
399 .unwrap_or_default()
400 }
401
402 pub fn get_buffer_memory_size(&self) -> usize {
414 let mut size = 0;
415 for buffer in &self.buffers {
416 size += buffer.capacity();
417 }
418 if let Some(bitmap) = &self.nulls {
419 size += bitmap.buffer().capacity()
420 }
421 for child in &self.child_data {
422 size += child.get_buffer_memory_size();
423 }
424 size
425 }
426
427 pub fn get_slice_memory_size(&self) -> Result<usize, ArrowError> {
440 let mut result: usize = 0;
441 let layout = layout(&self.data_type);
442
443 for spec in layout.buffers.iter() {
444 match spec {
445 BufferSpec::FixedWidth { byte_width, .. } => {
446 let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| {
447 ArrowError::ComputeError(
448 "Integer overflow computing buffer size".to_string(),
449 )
450 })?;
451 result += buffer_size;
452 }
453 BufferSpec::VariableWidth => {
454 let buffer_len: usize;
455 match self.data_type {
456 DataType::Utf8 | DataType::Binary => {
457 let offsets = self.typed_offsets::<i32>()?;
458 buffer_len = (offsets[self.len] - offsets[0] ) as usize;
459 }
460 DataType::LargeUtf8 | DataType::LargeBinary => {
461 let offsets = self.typed_offsets::<i64>()?;
462 buffer_len = (offsets[self.len] - offsets[0]) as usize;
463 }
464 _ => {
465 return Err(ArrowError::NotYetImplemented(format!(
466 "Invalid data type for VariableWidth buffer. Expected Utf8, LargeUtf8, Binary or LargeBinary. Got {}",
467 self.data_type
468 )))
469 }
470 };
471 result += buffer_len;
472 }
473 BufferSpec::BitMap => {
474 let buffer_size = bit_util::ceil(self.len, 8);
475 result += buffer_size;
476 }
477 BufferSpec::AlwaysNull => {
478 }
480 }
481 }
482
483 if self.nulls().is_some() {
484 result += bit_util::ceil(self.len, 8);
485 }
486
487 for child in &self.child_data {
488 result += child.get_slice_memory_size()?;
489 }
490 Ok(result)
491 }
492
493 pub fn get_array_memory_size(&self) -> usize {
502 let mut size = mem::size_of_val(self);
503
504 for buffer in &self.buffers {
506 size += mem::size_of::<Buffer>();
507 size += buffer.capacity();
508 }
509 if let Some(nulls) = &self.nulls {
510 size += nulls.buffer().capacity();
511 }
512 for child in &self.child_data {
513 size += child.get_array_memory_size();
514 }
515
516 size
517 }
518
519 pub fn slice(&self, offset: usize, length: usize) -> ArrayData {
527 assert!((offset + length) <= self.len());
528
529 if let DataType::Struct(_) = self.data_type() {
530 let new_offset = self.offset + offset;
532 let new_data = ArrayData {
533 data_type: self.data_type().clone(),
534 len: length,
535 offset: new_offset,
536 buffers: self.buffers.clone(),
537 child_data: self
539 .child_data()
540 .iter()
541 .map(|data| data.slice(offset, length))
542 .collect(),
543 nulls: self.nulls.as_ref().map(|x| x.slice(offset, length)),
544 };
545
546 new_data
547 } else {
548 let mut new_data = self.clone();
549
550 new_data.len = length;
551 new_data.offset = offset + self.offset;
552 new_data.nulls = self.nulls.as_ref().map(|x| x.slice(offset, length));
553
554 new_data
555 }
556 }
557
558 pub fn buffer<T: ArrowNativeType>(&self, buffer: usize) -> &[T] {
564 &self.buffers()[buffer].typed_data()[self.offset..]
565 }
566
567 pub fn new_null(data_type: &DataType, len: usize) -> Self {
569 let bit_len = bit_util::ceil(len, 8);
570 let zeroed = |len: usize| Buffer::from(MutableBuffer::from_len_zeroed(len));
571
572 let (buffers, child_data, has_nulls) = match data_type.primitive_width() {
573 Some(width) => (vec![zeroed(width * len)], vec![], true),
574 None => match data_type {
575 DataType::Null => (vec![], vec![], false),
576 DataType::Boolean => (vec![zeroed(bit_len)], vec![], true),
577 DataType::Binary | DataType::Utf8 => {
578 (vec![zeroed((len + 1) * 4), zeroed(0)], vec![], true)
579 }
580 DataType::BinaryView | DataType::Utf8View => (vec![zeroed(len * 16)], vec![], true),
581 DataType::LargeBinary | DataType::LargeUtf8 => {
582 (vec![zeroed((len + 1) * 8), zeroed(0)], vec![], true)
583 }
584 DataType::FixedSizeBinary(i) => (vec![zeroed(*i as usize * len)], vec![], true),
585 DataType::List(f) | DataType::Map(f, _) => (
586 vec![zeroed((len + 1) * 4)],
587 vec![ArrayData::new_empty(f.data_type())],
588 true,
589 ),
590 DataType::LargeList(f) => (
591 vec![zeroed((len + 1) * 8)],
592 vec![ArrayData::new_empty(f.data_type())],
593 true,
594 ),
595 DataType::FixedSizeList(f, list_len) => (
596 vec![],
597 vec![ArrayData::new_null(f.data_type(), *list_len as usize * len)],
598 true,
599 ),
600 DataType::Struct(fields) => (
601 vec![],
602 fields
603 .iter()
604 .map(|f| Self::new_null(f.data_type(), len))
605 .collect(),
606 true,
607 ),
608 DataType::Dictionary(k, v) => (
609 vec![zeroed(k.primitive_width().unwrap() * len)],
610 vec![ArrayData::new_empty(v.as_ref())],
611 true,
612 ),
613 DataType::Union(f, mode) => {
614 let (id, _) = f.iter().next().unwrap();
615 let ids = Buffer::from_iter(std::iter::repeat(id).take(len));
616 let buffers = match mode {
617 UnionMode::Sparse => vec![ids],
618 UnionMode::Dense => {
619 let end_offset = i32::from_usize(len).unwrap();
620 vec![ids, Buffer::from_iter(0_i32..end_offset)]
621 }
622 };
623
624 let children = f
625 .iter()
626 .enumerate()
627 .map(|(idx, (_, f))| {
628 if idx == 0 || *mode == UnionMode::Sparse {
629 Self::new_null(f.data_type(), len)
630 } else {
631 Self::new_empty(f.data_type())
632 }
633 })
634 .collect();
635
636 (buffers, children, false)
637 }
638 DataType::RunEndEncoded(r, v) => {
639 let runs = match r.data_type() {
640 DataType::Int16 => {
641 let i = i16::from_usize(len).expect("run overflow");
642 Buffer::from_slice_ref([i])
643 }
644 DataType::Int32 => {
645 let i = i32::from_usize(len).expect("run overflow");
646 Buffer::from_slice_ref([i])
647 }
648 DataType::Int64 => {
649 let i = i64::from_usize(len).expect("run overflow");
650 Buffer::from_slice_ref([i])
651 }
652 dt => unreachable!("Invalid run ends data type {dt}"),
653 };
654
655 let builder = ArrayData::builder(r.data_type().clone())
656 .len(1)
657 .buffers(vec![runs]);
658
659 let runs = unsafe { builder.build_unchecked() };
662 (
663 vec![],
664 vec![runs, ArrayData::new_null(v.data_type(), 1)],
665 false,
666 )
667 }
668 d => unreachable!("{d}"),
669 },
670 };
671
672 let mut builder = ArrayDataBuilder::new(data_type.clone())
673 .len(len)
674 .buffers(buffers)
675 .child_data(child_data);
676
677 if has_nulls {
678 builder = builder.nulls(Some(NullBuffer::new_null(len)))
679 }
680
681 unsafe { builder.build_unchecked() }
684 }
685
686 pub fn new_empty(data_type: &DataType) -> Self {
688 Self::new_null(data_type, 0)
689 }
690
691 pub fn align_buffers(&mut self) {
700 let layout = layout(&self.data_type);
701 for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) {
702 if let BufferSpec::FixedWidth { alignment, .. } = spec {
703 if buffer.as_ptr().align_offset(*alignment) != 0 {
704 *buffer = Buffer::from_slice_ref(buffer.as_ref());
705 }
706 }
707 }
708 for data in self.child_data.iter_mut() {
710 data.align_buffers()
711 }
712 }
713
714 pub fn validate(&self) -> Result<(), ArrowError> {
725 let len_plus_offset = self.len + self.offset;
727
728 let layout = layout(&self.data_type);
730
731 if !layout.can_contain_null_mask && self.nulls.is_some() {
732 return Err(ArrowError::InvalidArgumentError(format!(
733 "Arrays of type {:?} cannot contain a null bitmask",
734 self.data_type,
735 )));
736 }
737
738 if self.buffers.len() < layout.buffers.len()
740 || (!layout.variadic && self.buffers.len() != layout.buffers.len())
741 {
742 return Err(ArrowError::InvalidArgumentError(format!(
743 "Expected {} buffers in array of type {:?}, got {}",
744 layout.buffers.len(),
745 self.data_type,
746 self.buffers.len(),
747 )));
748 }
749
750 for (i, (buffer, spec)) in self.buffers.iter().zip(layout.buffers.iter()).enumerate() {
751 match spec {
752 BufferSpec::FixedWidth {
753 byte_width,
754 alignment,
755 } => {
756 let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
757
758 if buffer.len() < min_buffer_size {
759 return Err(ArrowError::InvalidArgumentError(format!(
760 "Need at least {} bytes in buffers[{}] in array of type {:?}, but got {}",
761 min_buffer_size, i, self.data_type, buffer.len()
762 )));
763 }
764
765 let align_offset = buffer.as_ptr().align_offset(*alignment);
766 if align_offset != 0 {
767 return Err(ArrowError::InvalidArgumentError(format!(
768 "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}",
769 self.data_type, align_offset.min(alignment - align_offset)
770 )));
771 }
772 }
773 BufferSpec::VariableWidth => {
774 }
778 BufferSpec::BitMap => {
779 let min_buffer_size = bit_util::ceil(len_plus_offset, 8);
780 if buffer.len() < min_buffer_size {
781 return Err(ArrowError::InvalidArgumentError(format!(
782 "Need at least {} bytes for bitmap in buffers[{}] in array of type {:?}, but got {}",
783 min_buffer_size, i, self.data_type, buffer.len()
784 )));
785 }
786 }
787 BufferSpec::AlwaysNull => {
788 }
790 }
791 }
792
793 if let Some(nulls) = self.nulls() {
795 if nulls.null_count() > self.len {
796 return Err(ArrowError::InvalidArgumentError(format!(
797 "null_count {} for an array exceeds length of {} elements",
798 nulls.null_count(),
799 self.len
800 )));
801 }
802
803 let actual_len = nulls.validity().len();
804 let needed_len = bit_util::ceil(len_plus_offset, 8);
805 if actual_len < needed_len {
806 return Err(ArrowError::InvalidArgumentError(format!(
807 "null_bit_buffer size too small. got {actual_len} needed {needed_len}",
808 )));
809 }
810
811 if nulls.len() != self.len {
812 return Err(ArrowError::InvalidArgumentError(format!(
813 "null buffer incorrect size. got {} expected {}",
814 nulls.len(),
815 self.len
816 )));
817 }
818 }
819
820 self.validate_child_data()?;
821
822 match &self.data_type {
824 DataType::Utf8 | DataType::Binary => {
825 self.validate_offsets::<i32>(self.buffers[1].len())?;
826 }
827 DataType::LargeUtf8 | DataType::LargeBinary => {
828 self.validate_offsets::<i64>(self.buffers[1].len())?;
829 }
830 DataType::Dictionary(key_type, _value_type) => {
831 if !DataType::is_dictionary_key_type(key_type) {
833 return Err(ArrowError::InvalidArgumentError(format!(
834 "Dictionary key type must be integer, but was {key_type}"
835 )));
836 }
837 }
838 DataType::RunEndEncoded(run_ends_type, _) => {
839 if run_ends_type.is_nullable() {
840 return Err(ArrowError::InvalidArgumentError(
841 "The nullable should be set to false for the field defining run_ends array.".to_string()
842 ));
843 }
844 if !DataType::is_run_ends_type(run_ends_type.data_type()) {
845 return Err(ArrowError::InvalidArgumentError(format!(
846 "RunArray run_ends types must be Int16, Int32 or Int64, but was {}",
847 run_ends_type.data_type()
848 )));
849 }
850 }
851 _ => {}
852 };
853
854 Ok(())
855 }
856
857 fn typed_offsets<T: ArrowNativeType + num::Num>(&self) -> Result<&[T], ArrowError> {
864 if self.len == 0 && self.buffers[0].is_empty() {
866 return Ok(&[]);
867 }
868
869 self.typed_buffer(0, self.len + 1)
870 }
871
872 fn typed_buffer<T: ArrowNativeType + num::Num>(
874 &self,
875 idx: usize,
876 len: usize,
877 ) -> Result<&[T], ArrowError> {
878 let buffer = &self.buffers[idx];
879
880 let required_len = (len + self.offset) * mem::size_of::<T>();
881
882 if buffer.len() < required_len {
883 return Err(ArrowError::InvalidArgumentError(format!(
884 "Buffer {} of {} isn't large enough. Expected {} bytes got {}",
885 idx,
886 self.data_type,
887 required_len,
888 buffer.len()
889 )));
890 }
891
892 Ok(&buffer.typed_data::<T>()[self.offset..self.offset + len])
893 }
894
895 fn validate_offsets<T: ArrowNativeType + num::Num + std::fmt::Display>(
898 &self,
899 values_length: usize,
900 ) -> Result<(), ArrowError> {
901 let offsets = self.typed_offsets::<T>()?;
903 if offsets.is_empty() {
904 return Ok(());
905 }
906
907 let first_offset = offsets[0].to_usize().ok_or_else(|| {
908 ArrowError::InvalidArgumentError(format!(
909 "Error converting offset[0] ({}) to usize for {}",
910 offsets[0], self.data_type
911 ))
912 })?;
913
914 let last_offset = offsets[self.len].to_usize().ok_or_else(|| {
915 ArrowError::InvalidArgumentError(format!(
916 "Error converting offset[{}] ({}) to usize for {}",
917 self.len, offsets[self.len], self.data_type
918 ))
919 })?;
920
921 if first_offset > values_length {
922 return Err(ArrowError::InvalidArgumentError(format!(
923 "First offset {} of {} is larger than values length {}",
924 first_offset, self.data_type, values_length,
925 )));
926 }
927
928 if last_offset > values_length {
929 return Err(ArrowError::InvalidArgumentError(format!(
930 "Last offset {} of {} is larger than values length {}",
931 last_offset, self.data_type, values_length,
932 )));
933 }
934
935 if first_offset > last_offset {
936 return Err(ArrowError::InvalidArgumentError(format!(
937 "First offset {} in {} is smaller than last offset {}",
938 first_offset, self.data_type, last_offset,
939 )));
940 }
941
942 Ok(())
943 }
944
945 fn validate_offsets_and_sizes<T: ArrowNativeType + num::Num + std::fmt::Display>(
948 &self,
949 values_length: usize,
950 ) -> Result<(), ArrowError> {
951 let offsets: &[T] = self.typed_buffer(0, self.len)?;
952 let sizes: &[T] = self.typed_buffer(1, self.len)?;
953 for i in 0..values_length {
954 let size = sizes[i].to_usize().ok_or_else(|| {
955 ArrowError::InvalidArgumentError(format!(
956 "Error converting size[{}] ({}) to usize for {}",
957 i, sizes[i], self.data_type
958 ))
959 })?;
960 let offset = offsets[i].to_usize().ok_or_else(|| {
961 ArrowError::InvalidArgumentError(format!(
962 "Error converting offset[{}] ({}) to usize for {}",
963 i, offsets[i], self.data_type
964 ))
965 })?;
966 if size
967 .checked_add(offset)
968 .expect("Offset and size have exceeded the usize boundary")
969 > values_length
970 {
971 return Err(ArrowError::InvalidArgumentError(format!(
972 "Size {} at index {} is larger than the remaining values for {}",
973 size, i, self.data_type
974 )));
975 }
976 }
977 Ok(())
978 }
979
980 fn validate_child_data(&self) -> Result<(), ArrowError> {
982 match &self.data_type {
983 DataType::List(field) | DataType::Map(field, _) => {
984 let values_data = self.get_single_valid_child_data(field.data_type())?;
985 self.validate_offsets::<i32>(values_data.len)?;
986 Ok(())
987 }
988 DataType::LargeList(field) => {
989 let values_data = self.get_single_valid_child_data(field.data_type())?;
990 self.validate_offsets::<i64>(values_data.len)?;
991 Ok(())
992 }
993 DataType::ListView(field) => {
994 let values_data = self.get_single_valid_child_data(field.data_type())?;
995 self.validate_offsets_and_sizes::<i32>(values_data.len)?;
996 Ok(())
997 }
998 DataType::LargeListView(field) => {
999 let values_data = self.get_single_valid_child_data(field.data_type())?;
1000 self.validate_offsets_and_sizes::<i64>(values_data.len)?;
1001 Ok(())
1002 }
1003 DataType::FixedSizeList(field, list_size) => {
1004 let values_data = self.get_single_valid_child_data(field.data_type())?;
1005
1006 let list_size: usize = (*list_size).try_into().map_err(|_| {
1007 ArrowError::InvalidArgumentError(format!(
1008 "{} has a negative list_size {}",
1009 self.data_type, list_size
1010 ))
1011 })?;
1012
1013 let expected_values_len = self.len
1014 .checked_mul(list_size)
1015 .expect("integer overflow computing expected number of expected values in FixedListSize");
1016
1017 if values_data.len < expected_values_len {
1018 return Err(ArrowError::InvalidArgumentError(format!(
1019 "Values length {} is less than the length ({}) multiplied by the value size ({}) for {}",
1020 values_data.len, list_size, list_size, self.data_type
1021 )));
1022 }
1023
1024 Ok(())
1025 }
1026 DataType::Struct(fields) => {
1027 self.validate_num_child_data(fields.len())?;
1028 for (i, field) in fields.iter().enumerate() {
1029 let field_data = self.get_valid_child_data(i, field.data_type())?;
1030
1031 if field_data.len < self.len {
1033 return Err(ArrowError::InvalidArgumentError(format!(
1034 "{} child array #{} for field {} has length smaller than expected for struct array ({} < {})",
1035 self.data_type, i, field.name(), field_data.len, self.len
1036 )));
1037 }
1038 }
1039 Ok(())
1040 }
1041 DataType::RunEndEncoded(run_ends_field, values_field) => {
1042 self.validate_num_child_data(2)?;
1043 let run_ends_data = self.get_valid_child_data(0, run_ends_field.data_type())?;
1044 let values_data = self.get_valid_child_data(1, values_field.data_type())?;
1045 if run_ends_data.len != values_data.len {
1046 return Err(ArrowError::InvalidArgumentError(format!(
1047 "The run_ends array length should be the same as values array length. Run_ends array length is {}, values array length is {}",
1048 run_ends_data.len, values_data.len
1049 )));
1050 }
1051 if run_ends_data.nulls.is_some() {
1052 return Err(ArrowError::InvalidArgumentError(
1053 "Found null values in run_ends array. The run_ends array should not have null values.".to_string(),
1054 ));
1055 }
1056 Ok(())
1057 }
1058 DataType::Union(fields, mode) => {
1059 self.validate_num_child_data(fields.len())?;
1060
1061 for (i, (_, field)) in fields.iter().enumerate() {
1062 let field_data = self.get_valid_child_data(i, field.data_type())?;
1063
1064 if mode == &UnionMode::Sparse && field_data.len < (self.len + self.offset) {
1065 return Err(ArrowError::InvalidArgumentError(format!(
1066 "Sparse union child array #{} has length smaller than expected for union array ({} < {})",
1067 i, field_data.len, self.len + self.offset
1068 )));
1069 }
1070 }
1071 Ok(())
1072 }
1073 DataType::Dictionary(_key_type, value_type) => {
1074 self.get_single_valid_child_data(value_type)?;
1075 Ok(())
1076 }
1077 _ => {
1078 if !self.child_data.is_empty() {
1080 return Err(ArrowError::InvalidArgumentError(format!(
1081 "Expected no child arrays for type {} but got {}",
1082 self.data_type,
1083 self.child_data.len()
1084 )));
1085 }
1086 Ok(())
1087 }
1088 }
1089 }
1090
1091 fn get_single_valid_child_data(
1095 &self,
1096 expected_type: &DataType,
1097 ) -> Result<&ArrayData, ArrowError> {
1098 self.validate_num_child_data(1)?;
1099 self.get_valid_child_data(0, expected_type)
1100 }
1101
1102 fn validate_num_child_data(&self, expected_len: usize) -> Result<(), ArrowError> {
1104 if self.child_data.len() != expected_len {
1105 Err(ArrowError::InvalidArgumentError(format!(
1106 "Value data for {} should contain {} child data array(s), had {}",
1107 self.data_type,
1108 expected_len,
1109 self.child_data.len()
1110 )))
1111 } else {
1112 Ok(())
1113 }
1114 }
1115
1116 fn get_valid_child_data(
1119 &self,
1120 i: usize,
1121 expected_type: &DataType,
1122 ) -> Result<&ArrayData, ArrowError> {
1123 let values_data = self.child_data.get(i).ok_or_else(|| {
1124 ArrowError::InvalidArgumentError(format!(
1125 "{} did not have enough child arrays. Expected at least {} but had only {}",
1126 self.data_type,
1127 i + 1,
1128 self.child_data.len()
1129 ))
1130 })?;
1131
1132 if expected_type != &values_data.data_type {
1133 return Err(ArrowError::InvalidArgumentError(format!(
1134 "Child type mismatch for {}. Expected {} but child data had {}",
1135 self.data_type, expected_type, values_data.data_type
1136 )));
1137 }
1138
1139 values_data.validate()?;
1140 Ok(values_data)
1141 }
1142
1143 pub fn validate_data(&self) -> Result<(), ArrowError> {
1159 self.validate()?;
1160
1161 self.validate_nulls()?;
1162 self.validate_values()?;
1163 Ok(())
1164 }
1165
1166 pub fn validate_full(&self) -> Result<(), ArrowError> {
1171 self.validate_data()?;
1172 self.child_data
1174 .iter()
1175 .enumerate()
1176 .try_for_each(|(i, child_data)| {
1177 child_data.validate_full().map_err(|e| {
1178 ArrowError::InvalidArgumentError(format!(
1179 "{} child #{} invalid: {}",
1180 self.data_type, i, e
1181 ))
1182 })
1183 })?;
1184 Ok(())
1185 }
1186
1187 pub fn validate_nulls(&self) -> Result<(), ArrowError> {
1197 if let Some(nulls) = &self.nulls {
1198 let actual = nulls.len() - nulls.inner().count_set_bits();
1199 if actual != nulls.null_count() {
1200 return Err(ArrowError::InvalidArgumentError(format!(
1201 "null_count value ({}) doesn't match actual number of nulls in array ({})",
1202 nulls.null_count(),
1203 actual
1204 )));
1205 }
1206 }
1207
1208 match &self.data_type {
1213 DataType::List(f) | DataType::LargeList(f) | DataType::Map(f, _) => {
1214 if !f.is_nullable() {
1215 self.validate_non_nullable(None, &self.child_data[0])?
1216 }
1217 }
1218 DataType::FixedSizeList(field, len) => {
1219 let child = &self.child_data[0];
1220 if !field.is_nullable() {
1221 match &self.nulls {
1222 Some(nulls) => {
1223 let element_len = *len as usize;
1224 let expanded = nulls.expand(element_len);
1225 self.validate_non_nullable(Some(&expanded), child)?;
1226 }
1227 None => self.validate_non_nullable(None, child)?,
1228 }
1229 }
1230 }
1231 DataType::Struct(fields) => {
1232 for (field, child) in fields.iter().zip(&self.child_data) {
1233 if !field.is_nullable() {
1234 self.validate_non_nullable(self.nulls(), child)?
1235 }
1236 }
1237 }
1238 _ => {}
1239 }
1240
1241 Ok(())
1242 }
1243
1244 fn validate_non_nullable(
1246 &self,
1247 mask: Option<&NullBuffer>,
1248 child: &ArrayData,
1249 ) -> Result<(), ArrowError> {
1250 let mask = match mask {
1251 Some(mask) => mask,
1252 None => {
1253 return match child.null_count() {
1254 0 => Ok(()),
1255 _ => Err(ArrowError::InvalidArgumentError(format!(
1256 "non-nullable child of type {} contains nulls not present in parent {}",
1257 child.data_type, self.data_type
1258 ))),
1259 }
1260 }
1261 };
1262
1263 match child.nulls() {
1264 Some(nulls) if !mask.contains(nulls) => Err(ArrowError::InvalidArgumentError(format!(
1265 "non-nullable child of type {} contains nulls not present in parent",
1266 child.data_type
1267 ))),
1268 _ => Ok(()),
1269 }
1270 }
1271
1272 pub fn validate_values(&self) -> Result<(), ArrowError> {
1278 match &self.data_type {
1279 DataType::Utf8 => self.validate_utf8::<i32>(),
1280 DataType::LargeUtf8 => self.validate_utf8::<i64>(),
1281 DataType::Binary => self.validate_offsets_full::<i32>(self.buffers[1].len()),
1282 DataType::LargeBinary => self.validate_offsets_full::<i64>(self.buffers[1].len()),
1283 DataType::BinaryView => {
1284 let views = self.typed_buffer::<u128>(0, self.len)?;
1285 validate_binary_view(views, &self.buffers[1..])
1286 }
1287 DataType::Utf8View => {
1288 let views = self.typed_buffer::<u128>(0, self.len)?;
1289 validate_string_view(views, &self.buffers[1..])
1290 }
1291 DataType::List(_) | DataType::Map(_, _) => {
1292 let child = &self.child_data[0];
1293 self.validate_offsets_full::<i32>(child.len)
1294 }
1295 DataType::LargeList(_) => {
1296 let child = &self.child_data[0];
1297 self.validate_offsets_full::<i64>(child.len)
1298 }
1299 DataType::Union(_, _) => {
1300 Ok(())
1306 }
1307 DataType::Dictionary(key_type, _value_type) => {
1308 let dictionary_length: i64 = self.child_data[0].len.try_into().unwrap();
1309 let max_value = dictionary_length - 1;
1310 match key_type.as_ref() {
1311 DataType::UInt8 => self.check_bounds::<u8>(max_value),
1312 DataType::UInt16 => self.check_bounds::<u16>(max_value),
1313 DataType::UInt32 => self.check_bounds::<u32>(max_value),
1314 DataType::UInt64 => self.check_bounds::<u64>(max_value),
1315 DataType::Int8 => self.check_bounds::<i8>(max_value),
1316 DataType::Int16 => self.check_bounds::<i16>(max_value),
1317 DataType::Int32 => self.check_bounds::<i32>(max_value),
1318 DataType::Int64 => self.check_bounds::<i64>(max_value),
1319 _ => unreachable!(),
1320 }
1321 }
1322 DataType::RunEndEncoded(run_ends, _values) => {
1323 let run_ends_data = self.child_data()[0].clone();
1324 match run_ends.data_type() {
1325 DataType::Int16 => run_ends_data.check_run_ends::<i16>(),
1326 DataType::Int32 => run_ends_data.check_run_ends::<i32>(),
1327 DataType::Int64 => run_ends_data.check_run_ends::<i64>(),
1328 _ => unreachable!(),
1329 }
1330 }
1331 _ => {
1332 Ok(())
1334 }
1335 }
1336 }
1337
1338 fn validate_each_offset<T, V>(&self, offset_limit: usize, validate: V) -> Result<(), ArrowError>
1349 where
1350 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1351 V: Fn(usize, Range<usize>) -> Result<(), ArrowError>,
1352 {
1353 self.typed_offsets::<T>()?
1354 .iter()
1355 .enumerate()
1356 .map(|(i, x)| {
1357 let r = x.to_usize().ok_or_else(|| {
1359 ArrowError::InvalidArgumentError(format!(
1360 "Offset invariant failure: Could not convert offset {x} to usize at position {i}"))}
1361 );
1362 match r {
1364 Ok(n) if n <= offset_limit => Ok((i, n)),
1365 Ok(_) => Err(ArrowError::InvalidArgumentError(format!(
1366 "Offset invariant failure: offset at position {i} out of bounds: {x} > {offset_limit}"))
1367 ),
1368 Err(e) => Err(e),
1369 }
1370 })
1371 .scan(0_usize, |start, end| {
1372 match end {
1374 Ok((i, end)) if *start <= end => {
1375 let range = Some(Ok((i, *start..end)));
1376 *start = end;
1377 range
1378 }
1379 Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!(
1380 "Offset invariant failure: non-monotonic offset at slot {}: {} > {}",
1381 i - 1, start, end))
1382 )),
1383 Err(err) => Some(Err(err)),
1384 }
1385 })
1386 .skip(1) .try_for_each(|res: Result<(usize, Range<usize>), ArrowError>| {
1388 let (item_index, range) = res?;
1389 validate(item_index-1, range)
1390 })
1391 }
1392
1393 fn validate_utf8<T>(&self) -> Result<(), ArrowError>
1396 where
1397 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1398 {
1399 let values_buffer = &self.buffers[1].as_slice();
1400 if let Ok(values_str) = std::str::from_utf8(values_buffer) {
1401 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1403 if !values_str.is_char_boundary(range.start)
1404 || !values_str.is_char_boundary(range.end)
1405 {
1406 return Err(ArrowError::InvalidArgumentError(format!(
1407 "incomplete utf-8 byte sequence from index {string_index}"
1408 )));
1409 }
1410 Ok(())
1411 })
1412 } else {
1413 self.validate_each_offset::<T, _>(values_buffer.len(), |string_index, range| {
1415 std::str::from_utf8(&values_buffer[range.clone()]).map_err(|e| {
1416 ArrowError::InvalidArgumentError(format!(
1417 "Invalid UTF8 sequence at string index {string_index} ({range:?}): {e}"
1418 ))
1419 })?;
1420 Ok(())
1421 })
1422 }
1423 }
1424
1425 fn validate_offsets_full<T>(&self, offset_limit: usize) -> Result<(), ArrowError>
1428 where
1429 T: ArrowNativeType + TryInto<usize> + num::Num + std::fmt::Display,
1430 {
1431 self.validate_each_offset::<T, _>(offset_limit, |_string_index, _range| {
1432 Ok(())
1435 })
1436 }
1437
1438 fn check_bounds<T>(&self, max_value: i64) -> Result<(), ArrowError>
1441 where
1442 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1443 {
1444 let required_len = self.len + self.offset;
1445 let buffer = &self.buffers[0];
1446
1447 assert!(buffer.len() / mem::size_of::<T>() >= required_len);
1450
1451 let indexes: &[T] = &buffer.typed_data::<T>()[self.offset..self.offset + self.len];
1453
1454 indexes.iter().enumerate().try_for_each(|(i, &dict_index)| {
1455 if self.is_null(i) {
1457 return Ok(());
1458 }
1459 let dict_index: i64 = dict_index.try_into().map_err(|_| {
1460 ArrowError::InvalidArgumentError(format!(
1461 "Value at position {i} out of bounds: {dict_index} (can not convert to i64)"
1462 ))
1463 })?;
1464
1465 if dict_index < 0 || dict_index > max_value {
1466 return Err(ArrowError::InvalidArgumentError(format!(
1467 "Value at position {i} out of bounds: {dict_index} (should be in [0, {max_value}])"
1468 )));
1469 }
1470 Ok(())
1471 })
1472 }
1473
1474 fn check_run_ends<T>(&self) -> Result<(), ArrowError>
1476 where
1477 T: ArrowNativeType + TryInto<i64> + num::Num + std::fmt::Display,
1478 {
1479 let values = self.typed_buffer::<T>(0, self.len)?;
1480 let mut prev_value: i64 = 0_i64;
1481 values.iter().enumerate().try_for_each(|(ix, &inp_value)| {
1482 let value: i64 = inp_value.try_into().map_err(|_| {
1483 ArrowError::InvalidArgumentError(format!(
1484 "Value at position {ix} out of bounds: {inp_value} (can not convert to i64)"
1485 ))
1486 })?;
1487 if value <= 0_i64 {
1488 return Err(ArrowError::InvalidArgumentError(format!(
1489 "The values in run_ends array should be strictly positive. Found value {value} at index {ix} that does not match the criteria."
1490 )));
1491 }
1492 if ix > 0 && value <= prev_value {
1493 return Err(ArrowError::InvalidArgumentError(format!(
1494 "The values in run_ends array should be strictly increasing. Found value {value} at index {ix} with previous value {prev_value} that does not match the criteria."
1495 )));
1496 }
1497
1498 prev_value = value;
1499 Ok(())
1500 })?;
1501
1502 if prev_value.as_usize() < (self.offset + self.len) {
1503 return Err(ArrowError::InvalidArgumentError(format!(
1504 "The offset + length of array should be less or equal to last value in the run_ends array. The last value of run_ends array is {prev_value} and offset + length of array is {}.",
1505 self.offset + self.len
1506 )));
1507 }
1508 Ok(())
1509 }
1510
1511 pub fn ptr_eq(&self, other: &Self) -> bool {
1515 if self.offset != other.offset
1516 || self.len != other.len
1517 || self.data_type != other.data_type
1518 || self.buffers.len() != other.buffers.len()
1519 || self.child_data.len() != other.child_data.len()
1520 {
1521 return false;
1522 }
1523
1524 match (&self.nulls, &other.nulls) {
1525 (Some(a), Some(b)) if !a.inner().ptr_eq(b.inner()) => return false,
1526 (Some(_), None) | (None, Some(_)) => return false,
1527 _ => {}
1528 };
1529
1530 if !self
1531 .buffers
1532 .iter()
1533 .zip(other.buffers.iter())
1534 .all(|(a, b)| a.as_ptr() == b.as_ptr())
1535 {
1536 return false;
1537 }
1538
1539 self.child_data
1540 .iter()
1541 .zip(other.child_data.iter())
1542 .all(|(a, b)| a.ptr_eq(b))
1543 }
1544
1545 pub fn into_builder(self) -> ArrayDataBuilder {
1547 self.into()
1548 }
1549}
1550
1551pub fn layout(data_type: &DataType) -> DataTypeLayout {
1554 use arrow_schema::IntervalUnit::*;
1557
1558 match data_type {
1559 DataType::Null => DataTypeLayout {
1560 buffers: vec![],
1561 can_contain_null_mask: false,
1562 variadic: false,
1563 },
1564 DataType::Boolean => DataTypeLayout {
1565 buffers: vec![BufferSpec::BitMap],
1566 can_contain_null_mask: true,
1567 variadic: false,
1568 },
1569 DataType::Int8 => DataTypeLayout::new_fixed_width::<i8>(),
1570 DataType::Int16 => DataTypeLayout::new_fixed_width::<i16>(),
1571 DataType::Int32 => DataTypeLayout::new_fixed_width::<i32>(),
1572 DataType::Int64 => DataTypeLayout::new_fixed_width::<i64>(),
1573 DataType::UInt8 => DataTypeLayout::new_fixed_width::<u8>(),
1574 DataType::UInt16 => DataTypeLayout::new_fixed_width::<u16>(),
1575 DataType::UInt32 => DataTypeLayout::new_fixed_width::<u32>(),
1576 DataType::UInt64 => DataTypeLayout::new_fixed_width::<u64>(),
1577 DataType::Float16 => DataTypeLayout::new_fixed_width::<half::f16>(),
1578 DataType::Float32 => DataTypeLayout::new_fixed_width::<f32>(),
1579 DataType::Float64 => DataTypeLayout::new_fixed_width::<f64>(),
1580 DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::<i64>(),
1581 DataType::Date32 => DataTypeLayout::new_fixed_width::<i32>(),
1582 DataType::Date64 => DataTypeLayout::new_fixed_width::<i64>(),
1583 DataType::Time32(_) => DataTypeLayout::new_fixed_width::<i32>(),
1584 DataType::Time64(_) => DataTypeLayout::new_fixed_width::<i64>(),
1585 DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::<i32>(),
1586 DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::<IntervalDayTime>(),
1587 DataType::Interval(MonthDayNano) => {
1588 DataTypeLayout::new_fixed_width::<IntervalMonthDayNano>()
1589 }
1590 DataType::Duration(_) => DataTypeLayout::new_fixed_width::<i64>(),
1591 DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::<i128>(),
1592 DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::<i256>(),
1593 DataType::FixedSizeBinary(size) => {
1594 let spec = BufferSpec::FixedWidth {
1595 byte_width: (*size).try_into().unwrap(),
1596 alignment: mem::align_of::<u8>(),
1597 };
1598 DataTypeLayout {
1599 buffers: vec![spec],
1600 can_contain_null_mask: true,
1601 variadic: false,
1602 }
1603 }
1604 DataType::Binary => DataTypeLayout::new_binary::<i32>(),
1605 DataType::LargeBinary => DataTypeLayout::new_binary::<i64>(),
1606 DataType::Utf8 => DataTypeLayout::new_binary::<i32>(),
1607 DataType::LargeUtf8 => DataTypeLayout::new_binary::<i64>(),
1608 DataType::BinaryView | DataType::Utf8View => DataTypeLayout::new_view(),
1609 DataType::FixedSizeList(_, _) => DataTypeLayout::new_nullable_empty(), DataType::List(_) => DataTypeLayout::new_fixed_width::<i32>(),
1611 DataType::ListView(_) => DataTypeLayout::new_list_view::<i32>(),
1612 DataType::LargeListView(_) => DataTypeLayout::new_list_view::<i64>(),
1613 DataType::LargeList(_) => DataTypeLayout::new_fixed_width::<i64>(),
1614 DataType::Map(_, _) => DataTypeLayout::new_fixed_width::<i32>(),
1615 DataType::Struct(_) => DataTypeLayout::new_nullable_empty(), DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), DataType::Union(_, mode) => {
1618 let type_ids = BufferSpec::FixedWidth {
1619 byte_width: mem::size_of::<i8>(),
1620 alignment: mem::align_of::<i8>(),
1621 };
1622
1623 DataTypeLayout {
1624 buffers: match mode {
1625 UnionMode::Sparse => {
1626 vec![type_ids]
1627 }
1628 UnionMode::Dense => {
1629 vec![
1630 type_ids,
1631 BufferSpec::FixedWidth {
1632 byte_width: mem::size_of::<i32>(),
1633 alignment: mem::align_of::<i32>(),
1634 },
1635 ]
1636 }
1637 },
1638 can_contain_null_mask: false,
1639 variadic: false,
1640 }
1641 }
1642 DataType::Dictionary(key_type, _value_type) => layout(key_type),
1643 }
1644}
1645
1646#[derive(Debug, PartialEq, Eq)]
1648pub struct DataTypeLayout {
1650 pub buffers: Vec<BufferSpec>,
1652
1653 pub can_contain_null_mask: bool,
1655
1656 pub variadic: bool,
1660}
1661
1662impl DataTypeLayout {
1663 pub fn new_fixed_width<T>() -> Self {
1665 Self {
1666 buffers: vec![BufferSpec::FixedWidth {
1667 byte_width: mem::size_of::<T>(),
1668 alignment: mem::align_of::<T>(),
1669 }],
1670 can_contain_null_mask: true,
1671 variadic: false,
1672 }
1673 }
1674
1675 pub fn new_nullable_empty() -> Self {
1678 Self {
1679 buffers: vec![],
1680 can_contain_null_mask: true,
1681 variadic: false,
1682 }
1683 }
1684
1685 pub fn new_empty() -> Self {
1688 Self {
1689 buffers: vec![],
1690 can_contain_null_mask: false,
1691 variadic: false,
1692 }
1693 }
1694
1695 pub fn new_binary<T>() -> Self {
1699 Self {
1700 buffers: vec![
1701 BufferSpec::FixedWidth {
1703 byte_width: mem::size_of::<T>(),
1704 alignment: mem::align_of::<T>(),
1705 },
1706 BufferSpec::VariableWidth,
1708 ],
1709 can_contain_null_mask: true,
1710 variadic: false,
1711 }
1712 }
1713
1714 pub fn new_view() -> Self {
1716 Self {
1717 buffers: vec![BufferSpec::FixedWidth {
1718 byte_width: mem::size_of::<u128>(),
1719 alignment: mem::align_of::<u128>(),
1720 }],
1721 can_contain_null_mask: true,
1722 variadic: true,
1723 }
1724 }
1725
1726 pub fn new_list_view<T>() -> Self {
1728 Self {
1729 buffers: vec![
1730 BufferSpec::FixedWidth {
1731 byte_width: mem::size_of::<T>(),
1732 alignment: mem::align_of::<T>(),
1733 },
1734 BufferSpec::FixedWidth {
1735 byte_width: mem::size_of::<T>(),
1736 alignment: mem::align_of::<T>(),
1737 },
1738 ],
1739 can_contain_null_mask: true,
1740 variadic: true,
1741 }
1742 }
1743}
1744
/// Description of a single buffer inside a [`DataTypeLayout`].
#[derive(Debug, PartialEq, Eq)]
pub enum BufferSpec {
    /// Buffer of equally sized elements.
    FixedWidth {
        /// Size of one element in bytes.
        byte_width: usize,
        /// Alignment of one element in bytes.
        alignment: usize,
    },
    /// Buffer without a fixed element size; `new_binary` lists it after the
    /// fixed-width buffer.
    VariableWidth,
    /// Bit-packed buffer; this is the only buffer listed for `Boolean`.
    BitMap,
    /// Carries `allow(dead_code)`, which suggests nothing in this crate
    /// constructs it.
    #[allow(dead_code)]
    AlwaysNull,
}
1777
1778impl PartialEq for ArrayData {
1779 fn eq(&self, other: &Self) -> bool {
1780 equal::equal(self, other)
1781 }
1782}
1783
mod private {
    /// Boolean flag that can only be changed through an `unsafe` call.
    ///
    /// The wrapped field is private to this module, so code outside it cannot
    /// write the value except through [`UnsafeFlag::set`].
    #[derive(Debug)]
    pub struct UnsafeFlag(bool);

    impl UnsafeFlag {
        /// Creates a flag holding `false`.
        #[inline]
        pub const fn new() -> Self {
            UnsafeFlag(false)
        }

        /// Overwrites the stored value with `val`.
        ///
        /// # Safety
        ///
        /// The obligations come from whatever the flag guards at the use
        /// site. In this file it backs the builder's `skip_validation`, so
        /// setting it to `true` makes `build` skip data validation.
        #[inline]
        pub unsafe fn set(&mut self, val: bool) {
            *self = UnsafeFlag(val);
        }

        /// Returns the stored value.
        #[inline]
        pub fn get(&self) -> bool {
            let UnsafeFlag(current) = *self;
            current
        }
    }
}
1813
1814#[derive(Debug)]
1816pub struct ArrayDataBuilder {
1817 data_type: DataType,
1818 len: usize,
1819 null_count: Option<usize>,
1820 null_bit_buffer: Option<Buffer>,
1821 nulls: Option<NullBuffer>,
1822 offset: usize,
1823 buffers: Vec<Buffer>,
1824 child_data: Vec<ArrayData>,
1825 align_buffers: bool,
1829 skip_validation: UnsafeFlag,
1839}
1840
1841impl ArrayDataBuilder {
1842 #[inline]
1843 pub const fn new(data_type: DataType) -> Self {
1845 Self {
1846 data_type,
1847 len: 0,
1848 null_count: None,
1849 null_bit_buffer: None,
1850 nulls: None,
1851 offset: 0,
1852 buffers: vec![],
1853 child_data: vec![],
1854 align_buffers: false,
1855 skip_validation: UnsafeFlag::new(),
1856 }
1857 }
1858
1859 pub fn data_type(self, data_type: DataType) -> Self {
1861 Self { data_type, ..self }
1862 }
1863
1864 #[inline]
1865 #[allow(clippy::len_without_is_empty)]
1866 pub const fn len(mut self, n: usize) -> Self {
1868 self.len = n;
1869 self
1870 }
1871
1872 pub fn nulls(mut self, nulls: Option<NullBuffer>) -> Self {
1874 self.nulls = nulls;
1875 self.null_count = None;
1876 self.null_bit_buffer = None;
1877 self
1878 }
1879
1880 pub fn null_count(mut self, null_count: usize) -> Self {
1882 self.null_count = Some(null_count);
1883 self
1884 }
1885
1886 pub fn null_bit_buffer(mut self, buf: Option<Buffer>) -> Self {
1888 self.nulls = None;
1889 self.null_bit_buffer = buf;
1890 self
1891 }
1892
1893 #[inline]
1895 pub const fn offset(mut self, n: usize) -> Self {
1896 self.offset = n;
1897 self
1898 }
1899
1900 pub fn buffers(mut self, v: Vec<Buffer>) -> Self {
1902 self.buffers = v;
1903 self
1904 }
1905
1906 pub fn add_buffer(mut self, b: Buffer) -> Self {
1908 self.buffers.push(b);
1909 self
1910 }
1911
1912 pub fn add_buffers<I: IntoIterator<Item = Buffer>>(mut self, bs: I) -> Self {
1914 self.buffers.extend(bs);
1915 self
1916 }
1917
1918 pub fn child_data(mut self, v: Vec<ArrayData>) -> Self {
1920 self.child_data = v;
1921 self
1922 }
1923
1924 pub fn add_child_data(mut self, r: ArrayData) -> Self {
1926 self.child_data.push(r);
1927 self
1928 }
1929
1930 pub unsafe fn build_unchecked(self) -> ArrayData {
1939 self.skip_validation(true).build().unwrap()
1940 }
1941
1942 pub fn build(self) -> Result<ArrayData, ArrowError> {
1951 let Self {
1952 data_type,
1953 len,
1954 null_count,
1955 null_bit_buffer,
1956 nulls,
1957 offset,
1958 buffers,
1959 child_data,
1960 align_buffers,
1961 skip_validation,
1962 } = self;
1963
1964 let nulls = nulls
1965 .or_else(|| {
1966 let buffer = null_bit_buffer?;
1967 let buffer = BooleanBuffer::new(buffer, offset, len);
1968 Some(match null_count {
1969 Some(n) => {
1970 unsafe { NullBuffer::new_unchecked(buffer, n) }
1972 }
1973 None => NullBuffer::new(buffer),
1974 })
1975 })
1976 .filter(|b| b.null_count() != 0);
1977
1978 let mut data = ArrayData {
1979 data_type,
1980 len,
1981 offset,
1982 buffers,
1983 child_data,
1984 nulls,
1985 };
1986
1987 if align_buffers {
1988 data.align_buffers();
1989 }
1990
1991 if !skip_validation.get() || cfg!(feature = "force_validate") {
1993 data.validate_data()?;
1994 }
1995 Ok(data)
1996 }
1997
1998 #[deprecated(since = "54.1.0", note = "Use ArrayData::align_buffers instead")]
2000 pub fn build_aligned(self) -> Result<ArrayData, ArrowError> {
2001 self.align_buffers(true).build()
2002 }
2003
2004 pub fn align_buffers(mut self, align_buffers: bool) -> Self {
2020 self.align_buffers = align_buffers;
2021 self
2022 }
2023
2024 pub unsafe fn skip_validation(mut self, skip_validation: bool) -> Self {
2038 self.skip_validation.set(skip_validation);
2039 self
2040 }
2041}
2042
2043impl From<ArrayData> for ArrayDataBuilder {
2044 fn from(d: ArrayData) -> Self {
2045 Self {
2046 data_type: d.data_type,
2047 len: d.len,
2048 offset: d.offset,
2049 buffers: d.buffers,
2050 child_data: d.child_data,
2051 nulls: d.nulls,
2052 null_bit_buffer: None,
2053 null_count: None,
2054 align_buffers: false,
2055 skip_validation: UnsafeFlag::new(),
2056 }
2057 }
2058}
2059
#[cfg(test)]
mod tests {
    use super::*;
    use arrow_schema::{Field, Fields};

    /// Error text expected when the first buffer of an `Int32` array is off by
    /// one byte.
    const MISALIGNED_INT32: &str = "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1";

    /// Buffer of `n` `i32` values, all equal to 42.
    fn make_i32_buffer(n: usize) -> Buffer {
        let values = vec![42i32; n];
        Buffer::from_slice_ref(values)
    }

    /// Buffer of `n` `f32` values, all equal to 42.
    fn make_f32_buffer(n: usize) -> Buffer {
        let values = vec![42f32; n];
        Buffer::from_slice_ref(values)
    }

    /// Two-byte validity bitmap with bits 0, 3 and 10 set.
    fn sample_validity() -> [u8; 2] {
        let mut bits: [u8; 2] = [0; 2];
        for position in [0, 3, 10] {
            bit_util::set_bit(&mut bits, position);
        }
        bits
    }

    /// 16-element `Int32` array whose validity bitmap is `sample_validity`.
    fn sample_int32_data() -> ArrayData {
        ArrayData::builder(DataType::Int32)
            .len(16)
            .add_buffer(make_i32_buffer(16))
            .null_bit_buffer(Some(Buffer::from(sample_validity())))
            .build()
            .unwrap()
    }

    /// Three-element `Utf8` array over "abcdef" with the middle slot null.
    fn sample_utf8_data() -> ArrayData {
        let data_buffer = Buffer::from_slice_ref("abcdef".as_bytes());
        let offsets_buffer = Buffer::from_slice_ref([0_i32, 2_i32, 2_i32, 5_i32]);
        ArrayData::try_new(
            DataType::Utf8,
            3,
            Some(Buffer::from_iter(vec![true, false, true])),
            0,
            vec![offsets_buffer, data_buffer],
            vec![],
        )
        .unwrap()
    }

    /// An aligned `i32` buffer together with a copy sliced by one byte.
    fn aligned_and_sliced() -> (Buffer, Buffer) {
        let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]);
        let sliced = buffer.slice(1);
        (buffer, sliced)
    }

    #[test]
    fn test_builder() {
        // The buffer has to cover offset + len = 25 elements.
        let v = (0..25).collect::<Vec<i32>>();
        let validity = Buffer::from([0b01011111, 0b10110101, 0b01100011, 0b00011110]);
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(20)
            .offset(5)
            .add_buffer(Buffer::from_slice_ref(&v))
            .null_bit_buffer(Some(validity))
            .build()
            .unwrap();

        assert_eq!(20, arr_data.len());
        assert_eq!(10, arr_data.null_count());
        assert_eq!(5, arr_data.offset());
        assert_eq!(1, arr_data.buffers().len());

        let expected = Buffer::from_slice_ref(&v);
        assert_eq!(expected.as_slice(), arr_data.buffers()[0].as_slice());
    }

    #[test]
    fn test_builder_with_child_data() {
        let child_arr_data = ArrayData::try_new(
            DataType::Int32,
            5,
            None,
            0,
            vec![Buffer::from_slice_ref([1i32, 2, 3, 4, 5])],
            vec![],
        )
        .unwrap();

        let fields = vec![Arc::new(Field::new("x", DataType::Int32, true))];
        let arr_data = ArrayData::builder(DataType::Struct(fields.into()))
            .len(5)
            .offset(0)
            .add_child_data(child_arr_data.clone())
            .build()
            .unwrap();

        assert_eq!(5, arr_data.len());
        assert_eq!(1, arr_data.child_data().len());
        assert_eq!(child_arr_data, arr_data.child_data()[0]);
    }

    #[test]
    fn test_null_count() {
        // Whole bitmap: 16 slots, 3 valid.
        assert_eq!(13, sample_int32_data().null_count());

        // Same bitmap seen through offset 2 and length 12.
        let arr_data = ArrayData::builder(DataType::Int32)
            .len(12)
            .offset(2)
            .add_buffer(make_i32_buffer(14))
            .null_bit_buffer(Some(Buffer::from(sample_validity())))
            .build()
            .unwrap();
        assert_eq!(10, arr_data.null_count());
    }

    #[test]
    fn test_null_buffer_ref() {
        let bit_v = sample_validity();
        let arr_data = sample_int32_data();

        let nulls = arr_data.nulls();
        assert!(nulls.is_some());
        assert_eq!(&bit_v, nulls.unwrap().validity());
    }

    #[test]
    fn test_slice() {
        let data = sample_int32_data();

        let once = data.slice(1, 15);
        assert_eq!(data.len() - 1, once.len());
        assert_eq!(1, once.offset());
        assert_eq!(data.null_count(), once.null_count());

        // Slicing an already sliced array accumulates the offset.
        let twice = once.slice(1, 14);
        assert_eq!(data.len() - 2, twice.len());
        assert_eq!(2, twice.offset());
        assert_eq!(data.null_count() - 1, twice.null_count());
    }

    #[test]
    fn test_equality() {
        let int_data = ArrayData::builder(DataType::Int32)
            .len(1)
            .add_buffer(make_i32_buffer(1))
            .build()
            .unwrap();
        let float_data = ArrayData::builder(DataType::Float32)
            .len(1)
            .add_buffer(make_f32_buffer(1))
            .build()
            .unwrap();

        assert_ne!(int_data, float_data);
        assert!(!int_data.ptr_eq(&float_data));
        assert!(int_data.ptr_eq(&int_data));

        #[allow(clippy::redundant_clone)]
        let int_data_clone = int_data.clone();
        assert_eq!(int_data, int_data_clone);
        assert!(int_data.ptr_eq(&int_data_clone));
        assert!(int_data_clone.ptr_eq(&int_data));

        let int_data_slice = int_data_clone.slice(1, 0);
        assert!(int_data_slice.ptr_eq(&int_data_slice));
        assert!(!int_data.ptr_eq(&int_data_slice));
        assert!(!int_data_slice.ptr_eq(&int_data));

        let string_data = sample_utf8_data();

        assert_ne!(float_data, string_data);
        assert!(!float_data.ptr_eq(&string_data));

        assert!(string_data.ptr_eq(&string_data));

        #[allow(clippy::redundant_clone)]
        let string_data_cloned = string_data.clone();
        assert!(string_data_cloned.ptr_eq(&string_data));
        assert!(string_data.ptr_eq(&string_data_cloned));

        let string_data_slice = string_data.slice(1, 2);
        assert!(string_data_slice.ptr_eq(&string_data_slice));
        assert!(!string_data_slice.ptr_eq(&string_data))
    }

    #[test]
    fn test_slice_memory_size() {
        let data = sample_int32_data();
        let new_data = data.slice(1, 14);
        assert_eq!(
            data.get_slice_memory_size().unwrap() - 8,
            new_data.get_slice_memory_size().unwrap()
        );

        let string_data = sample_utf8_data();
        let string_data_slice = string_data.slice(1, 2);
        assert_eq!(
            string_data.get_slice_memory_size().unwrap() - 6,
            string_data_slice.get_slice_memory_size().unwrap()
        );
    }

    #[test]
    fn test_count_nulls() {
        let bitmap = Buffer::from([0b00010110, 0b10011111]);
        let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 16));

        assert_eq!(count_nulls(Some(&nulls), 0, 16), 7);
        assert_eq!(count_nulls(Some(&nulls), 4, 8), 3);
    }

    #[test]
    fn test_contains_nulls() {
        let bitmap: Buffer =
            MutableBuffer::from_iter([false, false, false, true, true, false]).into();
        let nulls = NullBuffer::new(BooleanBuffer::new(bitmap, 0, 6));
        let nulls = Some(&nulls);

        assert!(contains_nulls(nulls, 0, 6));
        assert!(contains_nulls(nulls, 0, 3));
        assert!(!contains_nulls(nulls, 3, 2));
        assert!(!contains_nulls(nulls, 0, 0));
    }

    #[test]
    fn test_alignment() {
        let (buffer, sliced) = aligned_and_sliced();

        let mut data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Swap in the buffer that starts one byte later.
        data.buffers[0] = sliced;
        let err = data.validate().unwrap_err();
        assert_eq!(err.to_string(), MISALIGNED_INT32);

        data.align_buffers();
        data.validate_full().unwrap();
    }

    #[test]
    fn test_alignment_struct() {
        let (buffer, sliced) = aligned_and_sliced();

        let child_data = ArrayData {
            data_type: DataType::Int32,
            len: 0,
            offset: 0,
            buffers: vec![buffer],
            child_data: vec![],
            nulls: None,
        };

        let schema = DataType::Struct(Fields::from(vec![Field::new("a", DataType::Int32, false)]));
        let mut data = ArrayData {
            data_type: schema,
            len: 0,
            offset: 0,
            buffers: vec![],
            child_data: vec![child_data],
            nulls: None,
        };
        data.validate_full().unwrap();

        // Misalign the child's buffer; validating the parent must report it.
        data.child_data[0].buffers[0] = sliced;
        let err = data.validate().unwrap_err();
        assert_eq!(err.to_string(), MISALIGNED_INT32);

        data.align_buffers();
        data.validate_full().unwrap();
    }

    #[test]
    fn test_null_view_types() {
        let array_len = 32;
        for data_type in [DataType::BinaryView, DataType::Utf8View] {
            let array = ArrayData::new_null(&data_type, array_len);
            assert_eq!(array.len(), array_len);
            for i in 0..array.len() {
                assert!(array.is_null(i));
            }
        }
    }
}