1use std::{
5 fmt::{self},
6 hash::{Hash, RandomState},
7 sync::Arc,
8};
9
10use arrow::{array::AsArray, datatypes::UInt64Type};
11use arrow_array::{Array, ArrowPrimitiveType, UInt64Array};
12use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
13use num_traits::PrimInt;
14
15use crate::data::{
16 AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
17 NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
18};
19
/// Identifies one kind of statistic that can be recorded for a data block.
///
/// The `ComputeStat` implementations in this file use a `Stat` as the key when
/// they insert a statistic into a block's `block_info` map, and `GetStat`
/// looks statistics up by the same key.
#[derive(Clone, Copy, PartialEq, Eq, Hash)]
pub enum Stat {
    /// Bit width of the values. Fixed-width blocks record one entry per chunk
    /// of 1024 values when the values are 8, 16, 32 or 64 bits wide, and a
    /// single entry equal to `bits_per_value` otherwise.
    BitWidth,
    /// The value returned by a block's `data_size()`; all-null blocks report 0.
    DataSize,
    /// HyperLogLog++ estimate of the number of distinct values. Recorded for
    /// variable-width blocks and for fixed-width blocks with 128-bit values.
    Cardinality,
    /// NOTE(review): nothing in the visible part of this file produces or
    /// reads this statistic; confirm its meaning against its users.
    FixedSize,
    /// Number of null values; all-null blocks report their `num_values`.
    NullCount,
    /// Length of the longest value. Variable-width blocks take it from the
    /// largest gap between consecutive offsets, fixed-width blocks use
    /// `bits_per_value / 8`, fixed-size-list blocks scale their child's value
    /// by `dimension`, and struct blocks sum the values of their children.
    MaxLength,
}
29
30impl fmt::Debug for Stat {
31 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32 match self {
33 Self::BitWidth => write!(f, "BitWidth"),
34 Self::DataSize => write!(f, "DataSize"),
35 Self::Cardinality => write!(f, "Cardinality"),
36 Self::FixedSize => write!(f, "FixedSize"),
37 Self::NullCount => write!(f, "NullCount"),
38 Self::MaxLength => write!(f, "MaxLength"),
39 }
40 }
41}
42
43impl fmt::Display for Stat {
44 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45 write!(f, "{:?}", self)
46 }
47}
48
/// Implemented by blocks that can compute their own statistics.
pub trait ComputeStat {
    /// Computes the block's statistics.
    ///
    /// The implementations in this file that record anything store the
    /// results in the block's `block_info` map, keyed by `Stat`.
    fn compute_stat(&mut self);
}
52
53impl ComputeStat for DataBlock {
54 fn compute_stat(&mut self) {
55 match self {
56 Self::Empty() => {}
57 Self::Constant(_) => {}
58 Self::AllNull(_) => {}
59 Self::Nullable(data_block) => data_block.data.compute_stat(),
60 Self::FixedWidth(data_block) => data_block.compute_stat(),
61 Self::FixedSizeList(data_block) => data_block.compute_stat(),
62 Self::VariableWidth(data_block) => data_block.compute_stat(),
63 Self::Opaque(data_block) => data_block.compute_stat(),
64 Self::Struct(data_block) => data_block.compute_stat(),
65 Self::Dictionary(_) => {}
66 }
67 }
68}
69
70impl ComputeStat for VariableWidthBlock {
71 fn compute_stat(&mut self) {
72 if !self.block_info.0.read().unwrap().is_empty() {
73 panic!("compute_stat should only be called once during DataBlock construction");
74 }
75 let data_size = self.data_size();
76 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
77
78 let cardinality_array = self.cardinality();
79
80 let max_length_array = self.max_length();
81
82 let mut info = self.block_info.0.write().unwrap();
83 info.insert(Stat::DataSize, data_size_array);
84 info.insert(Stat::Cardinality, cardinality_array);
85 info.insert(Stat::MaxLength, max_length_array);
86 }
87}
88
89impl ComputeStat for FixedWidthDataBlock {
90 fn compute_stat(&mut self) {
91 let data_size = self.data_size();
93 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
94
95 let max_bit_widths = self.max_bit_widths();
97
98 let max_len = self.bits_per_value / 8;
100 let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
101
102 let cardidinality_array = if self.bits_per_value == 128 {
103 Some(self.cardinality())
104 } else {
105 None
106 };
107
108 let mut info = self.block_info.0.write().unwrap();
109 info.insert(Stat::DataSize, data_size_array);
110 info.insert(Stat::BitWidth, max_bit_widths);
111 info.insert(Stat::MaxLength, max_len_array);
112 if let Some(cardinality_array) = cardidinality_array {
113 info.insert(Stat::Cardinality, cardinality_array);
114 }
115 }
116}
117
118impl ComputeStat for FixedSizeListBlock {
119 fn compute_stat(&mut self) {
120 self.child.compute_stat();
128 }
129}
130
131impl ComputeStat for OpaqueBlock {
132 fn compute_stat(&mut self) {
133 let data_size = self.data_size();
135 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
136 let mut info = self.block_info.0.write().unwrap();
137 info.insert(Stat::DataSize, data_size_array);
138 }
139}
140
141pub trait GetStat: fmt::Debug {
142 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
143
144 fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
145 self.get_stat(stat)
146 .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
147 }
148
149 fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
150 let stat_value = self.expect_stat(stat);
151 let stat_value = stat_value.as_primitive::<T>();
152 if stat_value.len() != 1 {
153 panic!(
154 "{:?} DataBlock does not have exactly one value for `{} statistics.",
155 self, stat
156 );
157 }
158 stat_value.value(0)
159 }
160}
161
162impl GetStat for DataBlock {
163 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
164 match self {
165 Self::Empty() => None,
166 Self::Constant(_) => None,
167 Self::AllNull(data_block) => data_block.get_stat(stat),
168 Self::Nullable(data_block) => data_block.get_stat(stat),
169 Self::FixedWidth(data_block) => data_block.get_stat(stat),
170 Self::FixedSizeList(data_block) => data_block.get_stat(stat),
171 Self::VariableWidth(data_block) => data_block.get_stat(stat),
172 Self::Opaque(data_block) => data_block.get_stat(stat),
173 Self::Struct(data_block) => data_block.get_stat(stat),
174 Self::Dictionary(data_block) => data_block.get_stat(stat),
175 }
176 }
177}
178
179impl GetStat for NullableDataBlock {
181 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
184 self.data.get_stat(stat)
185 }
186}
187
188impl GetStat for VariableWidthBlock {
189 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
190 let block_info = self.block_info.0.read().unwrap();
191
192 if block_info.is_empty() {
193 panic!("get_stat should be called after statistics are computed.");
194 }
195 block_info.get(&stat).cloned()
196 }
197}
198
199impl GetStat for FixedSizeListBlock {
200 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
201 let child_stat = self.child.get_stat(stat);
202 match stat {
203 Stat::MaxLength => child_stat.map(|max_length| {
204 let max_length = max_length.as_primitive::<UInt64Type>().value(0);
207 Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
208 }),
209 _ => child_stat,
210 }
211 }
212}
213
214impl VariableWidthBlock {
215 fn cardinality(&mut self) -> Arc<dyn Array> {
218 const PRECISION: u8 = 4;
219 let mut hll: HyperLogLogPlus<&[u8], RandomState> =
220 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
221
222 match self.bits_per_offset {
223 32 => {
224 let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
225 let offsets: &[u32] = offsets_ref.as_ref();
226
227 offsets
228 .iter()
229 .zip(offsets.iter().skip(1))
230 .for_each(|(&start, &end)| {
231 hll.insert(&self.data[start as usize..end as usize]);
232 });
233 let cardinality = hll.count() as u64;
234 Arc::new(UInt64Array::from(vec![cardinality]))
235 }
236 64 => {
237 let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
238 let offsets: &[u64] = offsets_ref.as_ref();
239
240 offsets
241 .iter()
242 .zip(offsets.iter().skip(1))
243 .for_each(|(&start, &end)| {
244 hll.insert(&self.data[start as usize..end as usize]);
245 });
246
247 let cardinality = hll.count() as u64;
248 Arc::new(UInt64Array::from(vec![cardinality]))
249 }
250 _ => {
251 unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
252 }
253 }
254 }
255
256 fn max_length(&mut self) -> Arc<dyn Array> {
257 match self.bits_per_offset {
258 32 => {
259 let offsets = self.offsets.borrow_to_typed_slice::<u32>();
260 let offsets = offsets.as_ref();
261 let max_len = offsets
262 .windows(2)
263 .map(|pair| pair[1] - pair[0])
264 .max()
265 .unwrap_or(0);
266 Arc::new(UInt64Array::from(vec![max_len as u64]))
267 }
268 64 => {
269 let offsets = self.offsets.borrow_to_typed_slice::<u64>();
270 let offsets = offsets.as_ref();
271 let max_len = offsets
272 .windows(2)
273 .map(|pair| pair[1] - pair[0])
274 .max()
275 .unwrap_or(0);
276 Arc::new(UInt64Array::from(vec![max_len]))
277 }
278 _ => {
279 unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
280 }
281 }
282 }
283}
284
285impl GetStat for AllNullDataBlock {
286 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
287 match stat {
288 Stat::NullCount => {
289 let null_count = self.num_values;
290 Some(Arc::new(UInt64Array::from(vec![null_count])))
291 }
292 Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
293 _ => None,
294 }
295 }
296}
297
298impl GetStat for FixedWidthDataBlock {
299 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
300 let block_info = self.block_info.0.read().unwrap();
301
302 if block_info.is_empty() {
303 panic!("get_stat should be called after statistics are computed.");
304 }
305 block_info.get(&stat).cloned()
306 }
307}
308
309impl FixedWidthDataBlock {
310 fn max_bit_widths(&mut self) -> Arc<dyn Array> {
311 assert!(self.num_values > 0);
312
313 const CHUNK_SIZE: usize = 1024;
314
315 fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
316 slice
317 .chunks(CHUNK_SIZE)
318 .map(|chunk| {
319 let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
320 bits_per_value - max_value.leading_zeros() as u64
321 })
322 .collect()
323 }
324
325 match self.bits_per_value {
326 8 => {
327 let u8_slice = self.data.borrow_to_typed_slice::<u8>();
328 let u8_slice = u8_slice.as_ref();
329 Arc::new(UInt64Array::from(calculate_max_bit_width(
330 u8_slice,
331 self.bits_per_value,
332 )))
333 }
334 16 => {
335 let u16_slice = self.data.borrow_to_typed_slice::<u16>();
336 let u16_slice = u16_slice.as_ref();
337 Arc::new(UInt64Array::from(calculate_max_bit_width(
338 u16_slice,
339 self.bits_per_value,
340 )))
341 }
342 32 => {
343 let u32_slice = self.data.borrow_to_typed_slice::<u32>();
344 let u32_slice = u32_slice.as_ref();
345 Arc::new(UInt64Array::from(calculate_max_bit_width(
346 u32_slice,
347 self.bits_per_value,
348 )))
349 }
350 64 => {
351 let u64_slice = self.data.borrow_to_typed_slice::<u64>();
352 let u64_slice = u64_slice.as_ref();
353 Arc::new(UInt64Array::from(calculate_max_bit_width(
354 u64_slice,
355 self.bits_per_value,
356 )))
357 }
358 _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
359 }
360 }
361
362 fn cardinality(&mut self) -> Arc<dyn Array> {
363 match self.bits_per_value {
364 128 => {
365 let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
366 let u128_slice = u128_slice_ref.as_ref();
367
368 const PRECISION: u8 = 4;
369 let mut hll: HyperLogLogPlus<u128, RandomState> =
370 HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
371 for val in u128_slice {
372 hll.insert(val);
373 }
374 let cardinality = hll.count() as u64;
375 Arc::new(UInt64Array::from(vec![cardinality]))
376 }
377 _ => unreachable!(),
378 }
379 }
380}
381
382impl GetStat for OpaqueBlock {
383 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
384 let block_info = self.block_info.0.read().unwrap();
385
386 if block_info.is_empty() {
387 panic!("get_stat should be called after statistics are computed.");
388 }
389 block_info.get(&stat).cloned()
390 }
391}
392
/// Dictionary blocks expose no statistics through `GetStat`.
impl GetStat for DictionaryDataBlock {
    // The requested statistic is ignored: the answer is always `None`.
    fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
        None
    }
}
398
399impl GetStat for StructDataBlock {
400 fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
401 let block_info = self.block_info.0.read().unwrap();
402 if block_info.is_empty() {
403 panic!("get_stat should be called after statistics are computed.")
404 }
405 block_info.get(&stat).cloned()
406 }
407}
408
409impl ComputeStat for StructDataBlock {
410 fn compute_stat(&mut self) {
411 let data_size = self.data_size();
412 let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
413
414 let max_len = self
415 .children
416 .iter()
417 .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
418 .sum::<u64>();
419 let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
420
421 let mut info = self.block_info.0.write().unwrap();
422 info.insert(Stat::DataSize, data_size_array);
423 info.insert(Stat::MaxLength, max_len_array);
424 }
425}
426
427#[cfg(test)]
428mod tests {
429 use std::sync::Arc;
430
431 use arrow_array::{
432 ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
433 UInt16Array, UInt32Array, UInt64Array, UInt8Array,
434 };
435 use arrow_schema::{DataType, Field};
436 use lance_arrow::DataTypeExt;
437 use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
438 use rand::SeedableRng;
439
440 use crate::statistics::{GetStat, Stat};
441
442 use super::DataBlock;
443
444 use arrow::{
445 array::AsArray,
446 compute::concat,
447 datatypes::{Int32Type, UInt64Type},
448 };
449 use arrow_array::Array;
450 #[test]
451 fn test_data_size_stat() {
452 let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
453 let mut gen = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
454 let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap();
455 let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap();
456 let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap();
457 let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
458
459 let concatenated_array = concat(&[
460 &*Arc::new(arr1.clone()) as &dyn Array,
461 &*Arc::new(arr2.clone()) as &dyn Array,
462 &*Arc::new(arr3.clone()) as &dyn Array,
463 ])
464 .unwrap();
465
466 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
467
468 let total_buffer_size: usize = concatenated_array
469 .to_data()
470 .buffers()
471 .iter()
472 .map(|buffer| buffer.len())
473 .sum();
474 assert!(data_size == total_buffer_size as u64);
475
476 let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
478 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
479 let block = DataBlock::from_array(arr.clone());
480 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
481
482 let total_buffer_size: usize = arr
483 .to_data()
484 .buffers()
485 .iter()
486 .map(|buffer| buffer.len())
487 .sum();
488 assert!(data_size == total_buffer_size as u64);
489
490 let fields = vec![
492 Arc::new(Field::new("int_field", DataType::Int32, false)),
493 Arc::new(Field::new("float_field", DataType::Float32, false)),
494 ]
495 .into();
496
497 let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields));
498 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
499 let block = DataBlock::from_array(arr.clone());
500 let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
501 let total_buffer_size: usize = arr_parts
502 .iter()
503 .map(|arr| {
504 arr.to_data()
505 .buffers()
506 .iter()
507 .map(|buffer| buffer.len())
508 .sum::<usize>()
509 })
510 .sum();
511 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
512 assert!(data_size == total_buffer_size as u64);
513
514 let mut gen = array::rand_type(&DataType::Dictionary(
516 Box::new(DataType::Int32),
517 Box::new(DataType::Utf8),
518 ));
519 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
520 let block = DataBlock::from_array(arr.clone());
521 assert!(block.get_stat(Stat::DataSize).is_none());
522
523 let mut gen = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
524 let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
525 let block = DataBlock::from_array(arr.clone());
526 let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
527 let total_buffer_size: usize = arr
528 .to_data()
529 .buffers()
530 .iter()
531 .map(|buffer| buffer.len())
532 .sum();
533
534 assert!(data_size == total_buffer_size as u64);
535 }
536
537 #[test]
538 fn test_bit_width_stat_for_integers() {
539 let int8_array = Int8Array::from(vec![1, 2, 3]);
540 let array_ref: ArrayRef = Arc::new(int8_array);
541 let block = DataBlock::from_array(array_ref);
542
543 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
544 let actual_bit_width = block.expect_stat(Stat::BitWidth);
545
546 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
547
548 let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
549 let array_ref: ArrayRef = Arc::new(int8_array);
550 let block = DataBlock::from_array(array_ref);
551
552 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
553 let actual_bit_width = block.expect_stat(Stat::BitWidth);
554 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
555
556 let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
557 let array_ref: ArrayRef = Arc::new(int8_array);
558 let block = DataBlock::from_array(array_ref);
559
560 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
561 let actual_bit_width = block.expect_stat(Stat::BitWidth);
562 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
563
564 let int8_array = Int8Array::from(vec![-1, 2, 3]);
565 let array_ref: ArrayRef = Arc::new(int8_array);
566 let block = DataBlock::from_array(array_ref);
567
568 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
569 let actual_bit_width = block.expect_stat(Stat::BitWidth);
570 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
571
572 let int16_array = Int16Array::from(vec![1, 2, 3]);
573 let array_ref: ArrayRef = Arc::new(int16_array);
574 let block = DataBlock::from_array(array_ref);
575
576 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
577 let actual_bit_width = block.expect_stat(Stat::BitWidth);
578 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
579
580 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
581 let array_ref: ArrayRef = Arc::new(int16_array);
582 let block = DataBlock::from_array(array_ref);
583
584 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
585 let actual_bit_width = block.expect_stat(Stat::BitWidth);
586 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
587
588 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
589 let array_ref: ArrayRef = Arc::new(int16_array);
590 let block = DataBlock::from_array(array_ref);
591
592 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
593 let actual_bit_width = block.expect_stat(Stat::BitWidth);
594 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
595
596 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
597 let array_ref: ArrayRef = Arc::new(int16_array);
598 let block = DataBlock::from_array(array_ref);
599
600 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
601 let actual_bit_width = block.expect_stat(Stat::BitWidth);
602 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
603
604 let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
605 let array_ref: ArrayRef = Arc::new(int16_array);
606 let block = DataBlock::from_array(array_ref);
607
608 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
609 let actual_bit_width = block.expect_stat(Stat::BitWidth);
610 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
611
612 let int16_array = Int16Array::from(vec![-1, 2, 3]);
613 let array_ref: ArrayRef = Arc::new(int16_array);
614 let block = DataBlock::from_array(array_ref);
615
616 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
617 let actual_bit_width = block.expect_stat(Stat::BitWidth);
618 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
619
620 let int32_array = Int32Array::from(vec![1, 2, 3]);
621 let array_ref: ArrayRef = Arc::new(int32_array);
622 let block = DataBlock::from_array(array_ref);
623
624 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
625 let actual_bit_width = block.expect_stat(Stat::BitWidth);
626 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
627
628 let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
629 let array_ref: ArrayRef = Arc::new(int32_array);
630 let block = DataBlock::from_array(array_ref);
631
632 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
633 let actual_bit_width = block.expect_stat(Stat::BitWidth);
634 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
635
636 let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
637 let array_ref: ArrayRef = Arc::new(int32_array);
638 let block = DataBlock::from_array(array_ref);
639
640 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
641 let actual_bit_width = block.expect_stat(Stat::BitWidth);
642 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
643
644 let int32_array = Int32Array::from(vec![-1, 2, 3]);
645 let array_ref: ArrayRef = Arc::new(int32_array);
646 let block = DataBlock::from_array(array_ref);
647
648 let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
649 let actual_bit_width = block.expect_stat(Stat::BitWidth);
650 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
651
652 let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
653 let array_ref: ArrayRef = Arc::new(int32_array);
654 let block = DataBlock::from_array(array_ref);
655
656 let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
657 let actual_bit_width = block.expect_stat(Stat::BitWidth);
658 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
659
660 let int64_array = Int64Array::from(vec![1, 2, 3]);
661 let array_ref: ArrayRef = Arc::new(int64_array);
662 let block = DataBlock::from_array(array_ref);
663
664 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
665 let actual_bit_width = block.expect_stat(Stat::BitWidth);
666 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
667
668 let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
669 let array_ref: ArrayRef = Arc::new(int64_array);
670 let block = DataBlock::from_array(array_ref);
671
672 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
673 let actual_bit_width = block.expect_stat(Stat::BitWidth);
674 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
675
676 let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
677 let array_ref: ArrayRef = Arc::new(int64_array);
678 let block = DataBlock::from_array(array_ref);
679
680 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
681 let actual_bit_width = block.expect_stat(Stat::BitWidth);
682 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
683
684 let int64_array = Int64Array::from(vec![-1, 2, 3]);
685 let array_ref: ArrayRef = Arc::new(int64_array);
686 let block = DataBlock::from_array(array_ref);
687
688 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
689 let actual_bit_width = block.expect_stat(Stat::BitWidth);
690 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
691
692 let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
693 let array_ref: ArrayRef = Arc::new(int64_array);
694 let block = DataBlock::from_array(array_ref);
695
696 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
697 let actual_bit_width = block.expect_stat(Stat::BitWidth);
698 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
699
700 let uint8_array = UInt8Array::from(vec![1, 2, 3]);
701 let array_ref: ArrayRef = Arc::new(uint8_array);
702 let block = DataBlock::from_array(array_ref);
703
704 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
705 let actual_bit_width = block.expect_stat(Stat::BitWidth);
706 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
707
708 let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
709 let array_ref: ArrayRef = Arc::new(uint8_array);
710 let block = DataBlock::from_array(array_ref);
711
712 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
713 let actual_bit_width = block.expect_stat(Stat::BitWidth);
714 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
715
716 let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
717 let array_ref: ArrayRef = Arc::new(uint8_array);
718 let block = DataBlock::from_array(array_ref);
719
720 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
721 let actual_bit_width = block.expect_stat(Stat::BitWidth);
722 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
723
724 let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
725 let array_ref: ArrayRef = Arc::new(uint8_array);
726 let block = DataBlock::from_array(array_ref);
727
728 let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
729 let actual_bit_width = block.expect_stat(Stat::BitWidth);
730 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
731
732 let uint16_array = UInt16Array::from(vec![1, 2, 3]);
733 let array_ref: ArrayRef = Arc::new(uint16_array);
734 let block = DataBlock::from_array(array_ref);
735
736 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
737 let actual_bit_width = block.expect_stat(Stat::BitWidth);
738 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
739
740 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
741 let array_ref: ArrayRef = Arc::new(uint16_array);
742 let block = DataBlock::from_array(array_ref);
743
744 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
745 let actual_bit_width = block.expect_stat(Stat::BitWidth);
746 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
747
748 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
749 let array_ref: ArrayRef = Arc::new(uint16_array);
750 let block = DataBlock::from_array(array_ref);
751
752 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
753 let actual_bit_width = block.expect_stat(Stat::BitWidth);
754 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
755
756 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
757 let array_ref: ArrayRef = Arc::new(uint16_array);
758 let block = DataBlock::from_array(array_ref);
759
760 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
761 let actual_bit_width = block.expect_stat(Stat::BitWidth);
762 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
763
764 let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
765 let array_ref: ArrayRef = Arc::new(uint16_array);
766 let block = DataBlock::from_array(array_ref);
767
768 let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
769 let actual_bit_width = block.expect_stat(Stat::BitWidth);
770 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
771
772 let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
773 let array_ref: ArrayRef = Arc::new(uint16_array);
774 let block = DataBlock::from_array(array_ref);
775
776 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
777 let actual_bit_width = block.expect_stat(Stat::BitWidth);
778 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
779
780 let uint32_array = UInt32Array::from(vec![1, 2, 3]);
781 let array_ref: ArrayRef = Arc::new(uint32_array);
782 let block = DataBlock::from_array(array_ref);
783
784 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
785 let actual_bit_width = block.expect_stat(Stat::BitWidth);
786 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
787
788 let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
789 let array_ref: ArrayRef = Arc::new(uint32_array);
790 let block = DataBlock::from_array(array_ref);
791
792 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
793 let actual_bit_width = block.expect_stat(Stat::BitWidth);
794 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
795
796 let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
797 let array_ref: ArrayRef = Arc::new(uint32_array);
798 let block = DataBlock::from_array(array_ref);
799
800 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
801 let actual_bit_width = block.expect_stat(Stat::BitWidth);
802 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
803
804 let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
805 let array_ref: ArrayRef = Arc::new(uint32_array);
806 let block = DataBlock::from_array(array_ref);
807
808 let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
809 let actual_bit_width = block.expect_stat(Stat::BitWidth);
810 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
811
812 let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
813 let array_ref: ArrayRef = Arc::new(uint32_array);
814 let block = DataBlock::from_array(array_ref);
815
816 let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
817 let actual_bit_width = block.expect_stat(Stat::BitWidth);
818 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
819
820 let uint64_array = UInt64Array::from(vec![1, 2, 3]);
821 let array_ref: ArrayRef = Arc::new(uint64_array);
822 let block = DataBlock::from_array(array_ref);
823
824 let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
825 let actual_bit_width = block.expect_stat(Stat::BitWidth);
826 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
827
828 let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
829 let array_ref: ArrayRef = Arc::new(uint64_array);
830 let block = DataBlock::from_array(array_ref);
831
832 let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
833 let actual_bit_width = block.expect_stat(Stat::BitWidth);
834 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
835
836 let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
837 let array_ref: ArrayRef = Arc::new(uint64_array);
838 let block = DataBlock::from_array(array_ref);
839
840 let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
841 let actual_bit_width = block.expect_stat(Stat::BitWidth);
842 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
843
844 let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
845 let array_ref: ArrayRef = Arc::new(uint64_array);
846 let block = DataBlock::from_array(array_ref);
847
848 let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
849 let actual_bit_width = block.expect_stat(Stat::BitWidth);
850 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
851
852 let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
853 let array_ref: ArrayRef = Arc::new(uint64_array);
854 let block = DataBlock::from_array(array_ref);
855
856 let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
857 let actual_bit_width = block.expect_stat(Stat::BitWidth);
858 assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
859 }
860
/// Checks the `BitWidth` stat for signed integer arrays holding more than 1024 values.
///
/// For each signed integer type the input is 1024 threes, followed by 1024 eights,
/// followed by ten `-1`s. Three bit widths are expected back: 2, 4 and the full width
/// of the type (presumably one entry per 1024-value chunk -- confirm against the
/// `BitWidth` implementation).
#[test]
fn test_bit_width_stat_more_than_1024() {
    for data_type in [
        DataType::Int8,
        DataType::Int16,
        DataType::Int32,
        DataType::Int64,
    ] {
        // Builds `len` copies of `value` as an Int64 array, then casts to the type under test.
        let repeated = |value: i64, len: usize| {
            arrow_cast::cast(&Int64Array::from(vec![value; len]), &data_type).unwrap()
        };
        let threes = repeated(3, 1024);
        let eights = repeated(8, 1024);
        let minus_ones = repeated(-1, 10);

        let parts: Vec<&dyn arrow::array::Array> =
            vec![threes.as_ref(), eights.as_ref(), minus_ones.as_ref()];
        let block = DataBlock::from_array(concat(&parts).unwrap());

        // The run of `-1`s is expected to need every bit of the type.
        let full_width = (data_type.byte_width() * 8) as u64;
        let expected_bit_width = Arc::new(UInt64Array::from(vec![2, 4, full_width])) as ArrayRef;
        let actual_bit_widths = block.expect_stat(Stat::BitWidth);
        assert_eq!(actual_bit_widths.as_ref(), expected_bit_width.as_ref());
    }
}
890
/// Verifies that no `BitWidth` stat is available for a randomly generated `Binary` array.
#[test]
fn test_bit_width_when_none() {
    let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
    // Named `generator` rather than `gen`: `gen` is a reserved keyword in the 2024 edition.
    let mut generator = lance_datagen::array::rand_type(&DataType::Binary);
    let arr = generator.generate(RowCount::from(3), &mut rng).unwrap();
    // The array is not used again, so it is moved into the block instead of cloned.
    let block = DataBlock::from_array(arr);
    assert!(
        block.get_stat(Stat::BitWidth).is_none(),
        "a Binary array should not report a BitWidth stat"
    );
}
899
/// Checks the `Cardinality` stat of blocks built from `StringArray` and `LargeStringArray`.
///
/// Each case pairs a list of strings with the number of distinct values it contains.
/// Every case is run first against `StringArray`, then against `LargeStringArray`.
#[test]
fn test_cardinality_variable_width_datablock() {
    let cases: [(Vec<Option<&str>>, u64); 3] = [
        (vec![Some("hello"), Some("world")], 2),
        (
            vec![
                Some("to be named by variables"),
                Some("to be passed as arguments to procedures"),
                Some("to be returned as values of procedures"),
            ],
            3,
        ),
        // The first and last entries are equal, so only two distinct values are expected.
        (
            vec![
                Some("Samuel Eilenberg"),
                Some("Saunders Mac Lane"),
                Some("Samuel Eilenberg"),
            ],
            2,
        ),
    ];

    // `StringArray` inputs.
    for (values, expected_cardinality) in &cases {
        let block = DataBlock::from_array(StringArray::from(values.clone()));
        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(actual_cardinality, *expected_cardinality);
    }

    // `LargeStringArray` inputs.
    for (values, expected_cardinality) in &cases {
        let block = DataBlock::from_array(LargeStringArray::from(values.clone()));
        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
        assert_eq!(actual_cardinality, *expected_cardinality);
    }
}
955
/// Checks the `MaxLength` stat of blocks built from `StringArray` and `LargeStringArray`.
///
/// Every case is run against both array types. The expected value is the byte length of
/// the longest string in the case, computed from the input itself so the test does not
/// depend on which row happens to hold the longest value.
#[test]
fn test_max_length_variable_width_datablock() {
    let cases: [Vec<Option<&str>>; 3] = [
        vec![Some("hello"), Some("world")],
        vec![
            Some("to be named by variables"),
            Some("to be passed as arguments to procedures"),
            Some("to be returned as values of procedures"),
        ],
        // Contains a repeated value; the longest entry is still the middle one.
        vec![
            Some("Samuel Eilenberg"),
            Some("Saunders Mac Lane"),
            Some("Samuel Eilenberg"),
        ],
    ];

    for values in cases {
        let expected_max_length = values
            .iter()
            .flatten()
            .map(|value| value.len() as u64)
            .max()
            .expect("every case contains at least one string");

        let block = DataBlock::from_array(StringArray::from(values.clone()));
        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(
            actual_max_length, expected_max_length,
            "StringArray built from {values:?}"
        );

        let block = DataBlock::from_array(LargeStringArray::from(values.clone()));
        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
        assert_eq!(
            actual_max_length, expected_max_length,
            "LargeStringArray built from {values:?}"
        );
    }
}
1001}