lance_encoding/
statistics.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4use std::{
5    fmt::{self},
6    hash::{Hash, RandomState},
7    sync::Arc,
8};
9
10use arrow::{array::AsArray, datatypes::UInt64Type};
11use arrow_array::{Array, ArrowPrimitiveType, UInt64Array};
12use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
13use num_traits::PrimInt;
14
15use crate::data::{
16    AllNullDataBlock, DataBlock, DictionaryDataBlock, FixedSizeListBlock, FixedWidthDataBlock,
17    NullableDataBlock, OpaqueBlock, StructDataBlock, VariableWidthBlock,
18};
19
/// Identifies one kind of statistic that can be recorded for a data block.
///
/// Statistics are stored as Arrow arrays keyed by these variants.
#[derive(Copy, Clone, Eq, PartialEq, Hash)]
pub enum Stat {
    /// Number of bits needed to represent the block's values.
    BitWidth,
    /// Size of the block's data, as reported by the block's `data_size()`.
    DataSize,
    /// Approximate number of distinct values (a HyperLogLog++ estimate where computed here).
    Cardinality,
    /// NOTE(review): not populated by any code visible in this file — confirm its meaning.
    FixedSize,
    /// Number of null values in the block.
    NullCount,
    /// Length, in bytes, of the longest value in the block.
    MaxLength,
}
29
30impl fmt::Debug for Stat {
31    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
32        match self {
33            Self::BitWidth => write!(f, "BitWidth"),
34            Self::DataSize => write!(f, "DataSize"),
35            Self::Cardinality => write!(f, "Cardinality"),
36            Self::FixedSize => write!(f, "FixedSize"),
37            Self::NullCount => write!(f, "NullCount"),
38            Self::MaxLength => write!(f, "MaxLength"),
39        }
40    }
41}
42
43impl fmt::Display for Stat {
44    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
45        write!(f, "{:?}", self)
46    }
47}
48
/// Implemented by data blocks that can compute and record their own statistics.
pub trait ComputeStat {
    /// Computes the block's statistics and stores them on the block itself.
    fn compute_stat(&mut self);
}
52
53impl ComputeStat for DataBlock {
54    fn compute_stat(&mut self) {
55        match self {
56            Self::Empty() => {}
57            Self::Constant(_) => {}
58            Self::AllNull(_) => {}
59            Self::Nullable(data_block) => data_block.data.compute_stat(),
60            Self::FixedWidth(data_block) => data_block.compute_stat(),
61            Self::FixedSizeList(data_block) => data_block.compute_stat(),
62            Self::VariableWidth(data_block) => data_block.compute_stat(),
63            Self::Opaque(data_block) => data_block.compute_stat(),
64            Self::Struct(data_block) => data_block.compute_stat(),
65            Self::Dictionary(_) => {}
66        }
67    }
68}
69
70impl ComputeStat for VariableWidthBlock {
71    fn compute_stat(&mut self) {
72        if !self.block_info.0.read().unwrap().is_empty() {
73            panic!("compute_stat should only be called once during DataBlock construction");
74        }
75        let data_size = self.data_size();
76        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
77
78        let cardinality_array = self.cardinality();
79
80        let max_length_array = self.max_length();
81
82        let mut info = self.block_info.0.write().unwrap();
83        info.insert(Stat::DataSize, data_size_array);
84        info.insert(Stat::Cardinality, cardinality_array);
85        info.insert(Stat::MaxLength, max_length_array);
86    }
87}
88
89impl ComputeStat for FixedWidthDataBlock {
90    fn compute_stat(&mut self) {
91        // compute this datablock's data_size
92        let data_size = self.data_size();
93        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
94
95        // compute this datablock's max_bit_width
96        let max_bit_widths = self.max_bit_widths();
97
98        // the MaxLength of FixedWidthDataBlock is it's self.bits_per_value / 8
99        let max_len = self.bits_per_value / 8;
100        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
101
102        let cardidinality_array = if self.bits_per_value == 128 {
103            Some(self.cardinality())
104        } else {
105            None
106        };
107
108        let mut info = self.block_info.0.write().unwrap();
109        info.insert(Stat::DataSize, data_size_array);
110        info.insert(Stat::BitWidth, max_bit_widths);
111        info.insert(Stat::MaxLength, max_len_array);
112        if let Some(cardinality_array) = cardidinality_array {
113            info.insert(Stat::Cardinality, cardinality_array);
114        }
115    }
116}
117
118impl ComputeStat for FixedSizeListBlock {
119    fn compute_stat(&mut self) {
120        // We leave the child stats unchanged.  This may seem odd (e.g. should bit width be the
121        // bit width of the child * dimension?) but it's because we use these stats to determine
122        // compression and we are currently just compressing the child data.
123        //
124        // There is a potential opportunity here to do better.  For example, if we have a FSL of
125        // 4 32-bit integers then we should probably treat them as a single 128-bit integer or maybe
126        // even 4 columns of 32-bit integers.  This might yield better compression.
127        self.child.compute_stat();
128    }
129}
130
131impl ComputeStat for OpaqueBlock {
132    fn compute_stat(&mut self) {
133        // compute this datablock's data_size
134        let data_size = self.data_size();
135        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
136        let mut info = self.block_info.0.write().unwrap();
137        info.insert(Stat::DataSize, data_size_array);
138    }
139}
140
141pub trait GetStat: fmt::Debug {
142    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>>;
143
144    fn expect_stat(&self, stat: Stat) -> Arc<dyn Array> {
145        self.get_stat(stat)
146            .unwrap_or_else(|| panic!("{:?} DataBlock does not have `{}` statistics.", self, stat))
147    }
148
149    fn expect_single_stat<T: ArrowPrimitiveType>(&self, stat: Stat) -> T::Native {
150        let stat_value = self.expect_stat(stat);
151        let stat_value = stat_value.as_primitive::<T>();
152        if stat_value.len() != 1 {
153            panic!(
154                "{:?} DataBlock does not have exactly one value for `{} statistics.",
155                self, stat
156            );
157        }
158        stat_value.value(0)
159    }
160}
161
162impl GetStat for DataBlock {
163    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
164        match self {
165            Self::Empty() => None,
166            Self::Constant(_) => None,
167            Self::AllNull(data_block) => data_block.get_stat(stat),
168            Self::Nullable(data_block) => data_block.get_stat(stat),
169            Self::FixedWidth(data_block) => data_block.get_stat(stat),
170            Self::FixedSizeList(data_block) => data_block.get_stat(stat),
171            Self::VariableWidth(data_block) => data_block.get_stat(stat),
172            Self::Opaque(data_block) => data_block.get_stat(stat),
173            Self::Struct(data_block) => data_block.get_stat(stat),
174            Self::Dictionary(data_block) => data_block.get_stat(stat),
175        }
176    }
177}
178
179// NullableDataBlock will be deprecated in Lance 2.1.
180impl GetStat for NullableDataBlock {
181    // This function simply returns the statistics of the inner `DataBlock` of `NullableDataBlock`,
182    // this is not accurate but `NullableDataBlock` is going to be deprecated in Lance 2.1 anyway.
183    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
184        self.data.get_stat(stat)
185    }
186}
187
188impl GetStat for VariableWidthBlock {
189    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
190        let block_info = self.block_info.0.read().unwrap();
191
192        if block_info.is_empty() {
193            panic!("get_stat should be called after statistics are computed.");
194        }
195        block_info.get(&stat).cloned()
196    }
197}
198
199impl GetStat for FixedSizeListBlock {
200    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
201        let child_stat = self.child.get_stat(stat);
202        match stat {
203            Stat::MaxLength => child_stat.map(|max_length| {
204                // this is conservative when working with variable length data as we shouldn't assume
205                // that we have a list of all max-length elements but it's cheap and easy to calculate
206                let max_length = max_length.as_primitive::<UInt64Type>().value(0);
207                Arc::new(UInt64Array::from(vec![max_length * self.dimension])) as Arc<dyn Array>
208            }),
209            _ => child_stat,
210        }
211    }
212}
213
214impl VariableWidthBlock {
215    // Caveat: the computation here assumes VariableWidthBlock.offsets maps directly to VariableWidthBlock.data
216    // without any adjustment(for example, no null_adjustment for offsets)
217    fn cardinality(&mut self) -> Arc<dyn Array> {
218        const PRECISION: u8 = 4;
219        let mut hll: HyperLogLogPlus<&[u8], RandomState> =
220            HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
221
222        match self.bits_per_offset {
223            32 => {
224                let offsets_ref = self.offsets.borrow_to_typed_slice::<u32>();
225                let offsets: &[u32] = offsets_ref.as_ref();
226
227                offsets
228                    .iter()
229                    .zip(offsets.iter().skip(1))
230                    .for_each(|(&start, &end)| {
231                        hll.insert(&self.data[start as usize..end as usize]);
232                    });
233                let cardinality = hll.count() as u64;
234                Arc::new(UInt64Array::from(vec![cardinality]))
235            }
236            64 => {
237                let offsets_ref = self.offsets.borrow_to_typed_slice::<u64>();
238                let offsets: &[u64] = offsets_ref.as_ref();
239
240                offsets
241                    .iter()
242                    .zip(offsets.iter().skip(1))
243                    .for_each(|(&start, &end)| {
244                        hll.insert(&self.data[start as usize..end as usize]);
245                    });
246
247                let cardinality = hll.count() as u64;
248                Arc::new(UInt64Array::from(vec![cardinality]))
249            }
250            _ => {
251                unreachable!("the bits_per_offset of VariableWidthBlock can only be 32 or 64")
252            }
253        }
254    }
255
256    fn max_length(&mut self) -> Arc<dyn Array> {
257        match self.bits_per_offset {
258            32 => {
259                let offsets = self.offsets.borrow_to_typed_slice::<u32>();
260                let offsets = offsets.as_ref();
261                let max_len = offsets
262                    .windows(2)
263                    .map(|pair| pair[1] - pair[0])
264                    .max()
265                    .unwrap_or(0);
266                Arc::new(UInt64Array::from(vec![max_len as u64]))
267            }
268            64 => {
269                let offsets = self.offsets.borrow_to_typed_slice::<u64>();
270                let offsets = offsets.as_ref();
271                let max_len = offsets
272                    .windows(2)
273                    .map(|pair| pair[1] - pair[0])
274                    .max()
275                    .unwrap_or(0);
276                Arc::new(UInt64Array::from(vec![max_len]))
277            }
278            _ => {
279                unreachable!("the type of offsets in VariableWidth can only be u32 or u64");
280            }
281        }
282    }
283}
284
285impl GetStat for AllNullDataBlock {
286    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
287        match stat {
288            Stat::NullCount => {
289                let null_count = self.num_values;
290                Some(Arc::new(UInt64Array::from(vec![null_count])))
291            }
292            Stat::DataSize => Some(Arc::new(UInt64Array::from(vec![0]))),
293            _ => None,
294        }
295    }
296}
297
298impl GetStat for FixedWidthDataBlock {
299    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
300        let block_info = self.block_info.0.read().unwrap();
301
302        if block_info.is_empty() {
303            panic!("get_stat should be called after statistics are computed.");
304        }
305        block_info.get(&stat).cloned()
306    }
307}
308
309impl FixedWidthDataBlock {
310    fn max_bit_widths(&mut self) -> Arc<dyn Array> {
311        assert!(self.num_values > 0);
312
313        const CHUNK_SIZE: usize = 1024;
314
315        fn calculate_max_bit_width<T: PrimInt>(slice: &[T], bits_per_value: u64) -> Vec<u64> {
316            slice
317                .chunks(CHUNK_SIZE)
318                .map(|chunk| {
319                    let max_value = chunk.iter().fold(T::zero(), |acc, &x| acc | x);
320                    bits_per_value - max_value.leading_zeros() as u64
321                })
322                .collect()
323        }
324
325        match self.bits_per_value {
326            8 => {
327                let u8_slice = self.data.borrow_to_typed_slice::<u8>();
328                let u8_slice = u8_slice.as_ref();
329                Arc::new(UInt64Array::from(calculate_max_bit_width(
330                    u8_slice,
331                    self.bits_per_value,
332                )))
333            }
334            16 => {
335                let u16_slice = self.data.borrow_to_typed_slice::<u16>();
336                let u16_slice = u16_slice.as_ref();
337                Arc::new(UInt64Array::from(calculate_max_bit_width(
338                    u16_slice,
339                    self.bits_per_value,
340                )))
341            }
342            32 => {
343                let u32_slice = self.data.borrow_to_typed_slice::<u32>();
344                let u32_slice = u32_slice.as_ref();
345                Arc::new(UInt64Array::from(calculate_max_bit_width(
346                    u32_slice,
347                    self.bits_per_value,
348                )))
349            }
350            64 => {
351                let u64_slice = self.data.borrow_to_typed_slice::<u64>();
352                let u64_slice = u64_slice.as_ref();
353                Arc::new(UInt64Array::from(calculate_max_bit_width(
354                    u64_slice,
355                    self.bits_per_value,
356                )))
357            }
358            _ => Arc::new(UInt64Array::from(vec![self.bits_per_value])),
359        }
360    }
361
362    fn cardinality(&mut self) -> Arc<dyn Array> {
363        match self.bits_per_value {
364            128 => {
365                let u128_slice_ref = self.data.borrow_to_typed_slice::<u128>();
366                let u128_slice = u128_slice_ref.as_ref();
367
368                const PRECISION: u8 = 4;
369                let mut hll: HyperLogLogPlus<u128, RandomState> =
370                    HyperLogLogPlus::new(PRECISION, RandomState::new()).unwrap();
371                for val in u128_slice {
372                    hll.insert(val);
373                }
374                let cardinality = hll.count() as u64;
375                Arc::new(UInt64Array::from(vec![cardinality]))
376            }
377            _ => unreachable!(),
378        }
379    }
380}
381
382impl GetStat for OpaqueBlock {
383    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
384        let block_info = self.block_info.0.read().unwrap();
385
386        if block_info.is_empty() {
387            panic!("get_stat should be called after statistics are computed.");
388        }
389        block_info.get(&stat).cloned()
390    }
391}
392
393impl GetStat for DictionaryDataBlock {
394    fn get_stat(&self, _stat: Stat) -> Option<Arc<dyn Array>> {
395        None
396    }
397}
398
399impl GetStat for StructDataBlock {
400    fn get_stat(&self, stat: Stat) -> Option<Arc<dyn Array>> {
401        let block_info = self.block_info.0.read().unwrap();
402        if block_info.is_empty() {
403            panic!("get_stat should be called after statistics are computed.")
404        }
405        block_info.get(&stat).cloned()
406    }
407}
408
409impl ComputeStat for StructDataBlock {
410    fn compute_stat(&mut self) {
411        let data_size = self.data_size();
412        let data_size_array = Arc::new(UInt64Array::from(vec![data_size]));
413
414        let max_len = self
415            .children
416            .iter()
417            .map(|child| child.expect_single_stat::<UInt64Type>(Stat::MaxLength))
418            .sum::<u64>();
419        let max_len_array = Arc::new(UInt64Array::from(vec![max_len]));
420
421        let mut info = self.block_info.0.write().unwrap();
422        info.insert(Stat::DataSize, data_size_array);
423        info.insert(Stat::MaxLength, max_len_array);
424    }
425}
426
427#[cfg(test)]
428mod tests {
429    use std::sync::Arc;
430
431    use arrow_array::{
432        ArrayRef, Int16Array, Int32Array, Int64Array, Int8Array, LargeStringArray, StringArray,
433        UInt16Array, UInt32Array, UInt64Array, UInt8Array,
434    };
435    use arrow_schema::{DataType, Field};
436    use lance_arrow::DataTypeExt;
437    use lance_datagen::{array, ArrayGeneratorExt, RowCount, DEFAULT_SEED};
438    use rand::SeedableRng;
439
440    use crate::statistics::{GetStat, Stat};
441
442    use super::DataBlock;
443
444    use arrow::{
445        array::AsArray,
446        compute::concat,
447        datatypes::{Int32Type, UInt64Type},
448    };
449    use arrow_array::Array;
450    #[test]
451    fn test_data_size_stat() {
452        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
453        let mut gen = array::rand::<Int32Type>().with_nulls(&[false, false, false]);
454        let arr1 = gen.generate(RowCount::from(3), &mut rng).unwrap();
455        let arr2 = gen.generate(RowCount::from(3), &mut rng).unwrap();
456        let arr3 = gen.generate(RowCount::from(3), &mut rng).unwrap();
457        let block = DataBlock::from_arrays(&[arr1.clone(), arr2.clone(), arr3.clone()], 9);
458
459        let concatenated_array = concat(&[
460            &*Arc::new(arr1.clone()) as &dyn Array,
461            &*Arc::new(arr2.clone()) as &dyn Array,
462            &*Arc::new(arr3.clone()) as &dyn Array,
463        ])
464        .unwrap();
465
466        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
467
468        let total_buffer_size: usize = concatenated_array
469            .to_data()
470            .buffers()
471            .iter()
472            .map(|buffer| buffer.len())
473            .sum();
474        assert!(data_size == total_buffer_size as u64);
475
476        // test DataType::Binary
477        let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
478        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
479        let block = DataBlock::from_array(arr.clone());
480        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
481
482        let total_buffer_size: usize = arr
483            .to_data()
484            .buffers()
485            .iter()
486            .map(|buffer| buffer.len())
487            .sum();
488        assert!(data_size == total_buffer_size as u64);
489
490        // test DataType::Struct
491        let fields = vec![
492            Arc::new(Field::new("int_field", DataType::Int32, false)),
493            Arc::new(Field::new("float_field", DataType::Float32, false)),
494        ]
495        .into();
496
497        let mut gen = lance_datagen::array::rand_type(&DataType::Struct(fields));
498        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
499        let block = DataBlock::from_array(arr.clone());
500        let (_, arr_parts, _) = arr.as_struct().clone().into_parts();
501        let total_buffer_size: usize = arr_parts
502            .iter()
503            .map(|arr| {
504                arr.to_data()
505                    .buffers()
506                    .iter()
507                    .map(|buffer| buffer.len())
508                    .sum::<usize>()
509            })
510            .sum();
511        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
512        assert!(data_size == total_buffer_size as u64);
513
514        // test DataType::Dictionary
515        let mut gen = array::rand_type(&DataType::Dictionary(
516            Box::new(DataType::Int32),
517            Box::new(DataType::Utf8),
518        ));
519        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
520        let block = DataBlock::from_array(arr.clone());
521        assert!(block.get_stat(Stat::DataSize).is_none());
522
523        let mut gen = array::rand::<Int32Type>().with_nulls(&[false, true, false]);
524        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
525        let block = DataBlock::from_array(arr.clone());
526        let data_size = block.expect_single_stat::<UInt64Type>(Stat::DataSize);
527        let total_buffer_size: usize = arr
528            .to_data()
529            .buffers()
530            .iter()
531            .map(|buffer| buffer.len())
532            .sum();
533
534        assert!(data_size == total_buffer_size as u64);
535    }
536
537    #[test]
538    fn test_bit_width_stat_for_integers() {
539        let int8_array = Int8Array::from(vec![1, 2, 3]);
540        let array_ref: ArrayRef = Arc::new(int8_array);
541        let block = DataBlock::from_array(array_ref);
542
543        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
544        let actual_bit_width = block.expect_stat(Stat::BitWidth);
545
546        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
547
548        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
549        let array_ref: ArrayRef = Arc::new(int8_array);
550        let block = DataBlock::from_array(array_ref);
551
552        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
553        let actual_bit_width = block.expect_stat(Stat::BitWidth);
554        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
555
556        let int8_array = Int8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
557        let array_ref: ArrayRef = Arc::new(int8_array);
558        let block = DataBlock::from_array(array_ref);
559
560        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
561        let actual_bit_width = block.expect_stat(Stat::BitWidth);
562        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
563
564        let int8_array = Int8Array::from(vec![-1, 2, 3]);
565        let array_ref: ArrayRef = Arc::new(int8_array);
566        let block = DataBlock::from_array(array_ref);
567
568        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
569        let actual_bit_width = block.expect_stat(Stat::BitWidth);
570        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
571
572        let int16_array = Int16Array::from(vec![1, 2, 3]);
573        let array_ref: ArrayRef = Arc::new(int16_array);
574        let block = DataBlock::from_array(array_ref);
575
576        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
577        let actual_bit_width = block.expect_stat(Stat::BitWidth);
578        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
579
580        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
581        let array_ref: ArrayRef = Arc::new(int16_array);
582        let block = DataBlock::from_array(array_ref);
583
584        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
585        let actual_bit_width = block.expect_stat(Stat::BitWidth);
586        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
587
588        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
589        let array_ref: ArrayRef = Arc::new(int16_array);
590        let block = DataBlock::from_array(array_ref);
591
592        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
593        let actual_bit_width = block.expect_stat(Stat::BitWidth);
594        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
595
596        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
597        let array_ref: ArrayRef = Arc::new(int16_array);
598        let block = DataBlock::from_array(array_ref);
599
600        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
601        let actual_bit_width = block.expect_stat(Stat::BitWidth);
602        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
603
604        let int16_array = Int16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
605        let array_ref: ArrayRef = Arc::new(int16_array);
606        let block = DataBlock::from_array(array_ref);
607
608        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
609        let actual_bit_width = block.expect_stat(Stat::BitWidth);
610        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
611
612        let int16_array = Int16Array::from(vec![-1, 2, 3]);
613        let array_ref: ArrayRef = Arc::new(int16_array);
614        let block = DataBlock::from_array(array_ref);
615
616        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
617        let actual_bit_width = block.expect_stat(Stat::BitWidth);
618        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
619
620        let int32_array = Int32Array::from(vec![1, 2, 3]);
621        let array_ref: ArrayRef = Arc::new(int32_array);
622        let block = DataBlock::from_array(array_ref);
623
624        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
625        let actual_bit_width = block.expect_stat(Stat::BitWidth);
626        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
627
628        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
629        let array_ref: ArrayRef = Arc::new(int32_array);
630        let block = DataBlock::from_array(array_ref);
631
632        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
633        let actual_bit_width = block.expect_stat(Stat::BitWidth);
634        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
635
636        let int32_array = Int32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
637        let array_ref: ArrayRef = Arc::new(int32_array);
638        let block = DataBlock::from_array(array_ref);
639
640        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
641        let actual_bit_width = block.expect_stat(Stat::BitWidth);
642        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
643
644        let int32_array = Int32Array::from(vec![-1, 2, 3]);
645        let array_ref: ArrayRef = Arc::new(int32_array);
646        let block = DataBlock::from_array(array_ref);
647
648        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
649        let actual_bit_width = block.expect_stat(Stat::BitWidth);
650        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
651
652        let int32_array = Int32Array::from(vec![-1, 2, 3, -88]);
653        let array_ref: ArrayRef = Arc::new(int32_array);
654        let block = DataBlock::from_array(array_ref);
655
656        let expected_bit_width = Arc::new(UInt64Array::from(vec![32])) as ArrayRef;
657        let actual_bit_width = block.expect_stat(Stat::BitWidth);
658        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
659
660        let int64_array = Int64Array::from(vec![1, 2, 3]);
661        let array_ref: ArrayRef = Arc::new(int64_array);
662        let block = DataBlock::from_array(array_ref);
663
664        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
665        let actual_bit_width = block.expect_stat(Stat::BitWidth);
666        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
667
668        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
669        let array_ref: ArrayRef = Arc::new(int64_array);
670        let block = DataBlock::from_array(array_ref);
671
672        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
673        let actual_bit_width = block.expect_stat(Stat::BitWidth);
674        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
675
676        let int64_array = Int64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
677        let array_ref: ArrayRef = Arc::new(int64_array);
678        let block = DataBlock::from_array(array_ref);
679
680        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
681        let actual_bit_width = block.expect_stat(Stat::BitWidth);
682        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
683
684        let int64_array = Int64Array::from(vec![-1, 2, 3]);
685        let array_ref: ArrayRef = Arc::new(int64_array);
686        let block = DataBlock::from_array(array_ref);
687
688        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
689        let actual_bit_width = block.expect_stat(Stat::BitWidth);
690        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
691
692        let int64_array = Int64Array::from(vec![-1, 2, 3, -88]);
693        let array_ref: ArrayRef = Arc::new(int64_array);
694        let block = DataBlock::from_array(array_ref);
695
696        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
697        let actual_bit_width = block.expect_stat(Stat::BitWidth);
698        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
699
700        let uint8_array = UInt8Array::from(vec![1, 2, 3]);
701        let array_ref: ArrayRef = Arc::new(uint8_array);
702        let block = DataBlock::from_array(array_ref);
703
704        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
705        let actual_bit_width = block.expect_stat(Stat::BitWidth);
706        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
707
708        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
709        let array_ref: ArrayRef = Arc::new(uint8_array);
710        let block = DataBlock::from_array(array_ref);
711
712        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
713        let actual_bit_width = block.expect_stat(Stat::BitWidth);
714        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
715
716        let uint8_array = UInt8Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
717        let array_ref: ArrayRef = Arc::new(uint8_array);
718        let block = DataBlock::from_array(array_ref);
719
720        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
721        let actual_bit_width = block.expect_stat(Stat::BitWidth);
722        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
723
724        let uint8_array = UInt8Array::from(vec![1, 2, 3, 0xF]);
725        let array_ref: ArrayRef = Arc::new(uint8_array);
726        let block = DataBlock::from_array(array_ref);
727
728        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
729        let actual_bit_width = block.expect_stat(Stat::BitWidth);
730        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
731
732        let uint16_array = UInt16Array::from(vec![1, 2, 3]);
733        let array_ref: ArrayRef = Arc::new(uint16_array);
734        let block = DataBlock::from_array(array_ref);
735
736        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
737        let actual_bit_width = block.expect_stat(Stat::BitWidth);
738        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
739
740        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x7F]);
741        let array_ref: ArrayRef = Arc::new(uint16_array);
742        let block = DataBlock::from_array(array_ref);
743
744        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
745        let actual_bit_width = block.expect_stat(Stat::BitWidth);
746        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
747
748        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
749        let array_ref: ArrayRef = Arc::new(uint16_array);
750        let block = DataBlock::from_array(array_ref);
751
752        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
753        let actual_bit_width = block.expect_stat(Stat::BitWidth);
754        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
755
756        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0x1FF]);
757        let array_ref: ArrayRef = Arc::new(uint16_array);
758        let block = DataBlock::from_array(array_ref);
759
760        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
761        let actual_bit_width = block.expect_stat(Stat::BitWidth);
762        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
763
764        let uint16_array = UInt16Array::from(vec![0x1, 0x2, 0x3, 0xF, 0x1F]);
765        let array_ref: ArrayRef = Arc::new(uint16_array);
766        let block = DataBlock::from_array(array_ref);
767
768        let expected_bit_width = Arc::new(UInt64Array::from(vec![5])) as ArrayRef;
769        let actual_bit_width = block.expect_stat(Stat::BitWidth);
770        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
771
772        let uint16_array = UInt16Array::from(vec![1, 2, 3, 0xFFFF]);
773        let array_ref: ArrayRef = Arc::new(uint16_array);
774        let block = DataBlock::from_array(array_ref);
775
776        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
777        let actual_bit_width = block.expect_stat(Stat::BitWidth);
778        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
779
780        let uint32_array = UInt32Array::from(vec![1, 2, 3]);
781        let array_ref: ArrayRef = Arc::new(uint32_array);
782        let block = DataBlock::from_array(array_ref);
783
784        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
785        let actual_bit_width = block.expect_stat(Stat::BitWidth);
786        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
787
788        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
789        let array_ref: ArrayRef = Arc::new(uint32_array);
790        let block = DataBlock::from_array(array_ref);
791
792        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
793        let actual_bit_width = block.expect_stat(Stat::BitWidth);
794        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref(),);
795
796        let uint32_array = UInt32Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
797        let array_ref: ArrayRef = Arc::new(uint32_array);
798        let block = DataBlock::from_array(array_ref);
799
800        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
801        let actual_bit_width = block.expect_stat(Stat::BitWidth);
802        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
803
804        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0xF]);
805        let array_ref: ArrayRef = Arc::new(uint32_array);
806        let block = DataBlock::from_array(array_ref);
807
808        let expected_bit_width = Arc::new(UInt64Array::from(vec![4])) as ArrayRef;
809        let actual_bit_width = block.expect_stat(Stat::BitWidth);
810        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
811
812        let uint32_array = UInt32Array::from(vec![1, 2, 3, 0x77]);
813        let array_ref: ArrayRef = Arc::new(uint32_array);
814        let block = DataBlock::from_array(array_ref);
815
816        let expected_bit_width = Arc::new(UInt64Array::from(vec![7])) as ArrayRef;
817        let actual_bit_width = block.expect_stat(Stat::BitWidth);
818        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
819
820        let uint64_array = UInt64Array::from(vec![1, 2, 3]);
821        let array_ref: ArrayRef = Arc::new(uint64_array);
822        let block = DataBlock::from_array(array_ref);
823
824        let expected_bit_width = Arc::new(UInt64Array::from(vec![2])) as ArrayRef;
825        let actual_bit_width = block.expect_stat(Stat::BitWidth);
826        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
827
828        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF]);
829        let array_ref: ArrayRef = Arc::new(uint64_array);
830        let block = DataBlock::from_array(array_ref);
831
832        let expected_bit_width = Arc::new(UInt64Array::from(vec![8])) as ArrayRef;
833        let actual_bit_width = block.expect_stat(Stat::BitWidth);
834        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
835
836        let uint64_array = UInt64Array::from(vec![0x1, 0x2, 0x3, 0xFF, 0x1FF]);
837        let array_ref: ArrayRef = Arc::new(uint64_array);
838        let block = DataBlock::from_array(array_ref);
839
840        let expected_bit_width = Arc::new(UInt64Array::from(vec![9])) as ArrayRef;
841        let actual_bit_width = block.expect_stat(Stat::BitWidth);
842        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
843
844        let uint64_array = UInt64Array::from(vec![0, 2, 3, 0xFFFF]);
845        let array_ref: ArrayRef = Arc::new(uint64_array);
846        let block = DataBlock::from_array(array_ref);
847
848        let expected_bit_width = Arc::new(UInt64Array::from(vec![16])) as ArrayRef;
849        let actual_bit_width = block.expect_stat(Stat::BitWidth);
850        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
851
852        let uint64_array = UInt64Array::from(vec![1, 2, 3, 0xFFFF_FFFF_FFFF_FFFF]);
853        let array_ref: ArrayRef = Arc::new(uint64_array);
854        let block = DataBlock::from_array(array_ref);
855
856        let expected_bit_width = Arc::new(UInt64Array::from(vec![64])) as ArrayRef;
857        let actual_bit_width = block.expect_stat(Stat::BitWidth);
858        assert_eq!(actual_bit_width.as_ref(), expected_bit_width.as_ref());
859    }
860
861    #[test]
862    fn test_bit_width_stat_more_than_1024() {
863        for data_type in [
864            DataType::Int8,
865            DataType::Int16,
866            DataType::Int32,
867            DataType::Int64,
868        ] {
869            let array1 = Int64Array::from(vec![3; 1024]);
870            let array2 = Int64Array::from(vec![8; 1024]);
871            let array3 = Int64Array::from(vec![-1; 10]);
872            let array1 = arrow_cast::cast(&array1, &data_type).unwrap();
873            let array2 = arrow_cast::cast(&array2, &data_type).unwrap();
874            let array3 = arrow_cast::cast(&array3, &data_type).unwrap();
875
876            let arrays: Vec<&dyn arrow::array::Array> =
877                vec![array1.as_ref(), array2.as_ref(), array3.as_ref()];
878            let concatenated = concat(&arrays).unwrap();
879            let block = DataBlock::from_array(concatenated.clone());
880
881            let expected_bit_width = Arc::new(UInt64Array::from(vec![
882                2,
883                4,
884                (data_type.byte_width() * 8) as u64,
885            ])) as ArrayRef;
886            let actual_bit_widths = block.expect_stat(Stat::BitWidth);
887            assert_eq!(actual_bit_widths.as_ref(), expected_bit_width.as_ref(),);
888        }
889    }
890
891    #[test]
892    fn test_bit_width_when_none() {
893        let mut rng = rand_xoshiro::Xoshiro256PlusPlus::seed_from_u64(DEFAULT_SEED.0);
894        let mut gen = lance_datagen::array::rand_type(&DataType::Binary);
895        let arr = gen.generate(RowCount::from(3), &mut rng).unwrap();
896        let block = DataBlock::from_array(arr.clone());
897        assert!(block.get_stat(Stat::BitWidth).is_none(),);
898    }
899
900    #[test]
901    fn test_cardinality_variable_width_datablock() {
902        let string_array = StringArray::from(vec![Some("hello"), Some("world")]);
903        let block = DataBlock::from_array(string_array);
904        let expected_cardinality = 2;
905        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
906        assert_eq!(actual_cardinality, expected_cardinality,);
907
908        let string_array = StringArray::from(vec![
909            Some("to be named by variables"),
910            Some("to be passed as arguments to procedures"),
911            Some("to be returned as values of procedures"),
912        ]);
913        let block = DataBlock::from_array(string_array);
914        let expected_cardinality = 3;
915        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
916
917        assert_eq!(actual_cardinality, expected_cardinality,);
918
919        let string_array = StringArray::from(vec![
920            Some("Samuel Eilenberg"),
921            Some("Saunders Mac Lane"),
922            Some("Samuel Eilenberg"),
923        ]);
924        let block = DataBlock::from_array(string_array);
925        let expected_cardinality = 2;
926        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
927        assert_eq!(actual_cardinality, expected_cardinality,);
928
929        let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]);
930        let block = DataBlock::from_array(string_array);
931        let expected_cardinality = 2;
932        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
933        assert_eq!(actual_cardinality, expected_cardinality,);
934
935        let string_array = LargeStringArray::from(vec![
936            Some("to be named by variables"),
937            Some("to be passed as arguments to procedures"),
938            Some("to be returned as values of procedures"),
939        ]);
940        let block = DataBlock::from_array(string_array);
941        let expected_cardinality = 3;
942        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
943        assert_eq!(actual_cardinality, expected_cardinality,);
944
945        let string_array = LargeStringArray::from(vec![
946            Some("Samuel Eilenberg"),
947            Some("Saunders Mac Lane"),
948            Some("Samuel Eilenberg"),
949        ]);
950        let block = DataBlock::from_array(string_array);
951        let expected_cardinality = 2;
952        let actual_cardinality = block.expect_single_stat::<UInt64Type>(Stat::Cardinality);
953        assert_eq!(actual_cardinality, expected_cardinality,);
954    }
955
956    #[test]
957    fn test_max_length_variable_width_datablock() {
958        let string_array = StringArray::from(vec![Some("hello"), Some("world")]);
959        let block = DataBlock::from_array(string_array.clone());
960        let expected_max_length = string_array.value_length(0) as u64;
961        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
962        assert_eq!(actual_max_length, expected_max_length);
963
964        let string_array = StringArray::from(vec![
965            Some("to be named by variables"),
966            Some("to be passed as arguments to procedures"), // string that has max length
967            Some("to be returned as values of procedures"),
968        ]);
969        let block = DataBlock::from_array(string_array.clone());
970        let expected_max_length = string_array.value_length(1) as u64;
971        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
972        assert_eq!(actual_max_length, expected_max_length);
973
974        let string_array = StringArray::from(vec![
975            Some("Samuel Eilenberg"),
976            Some("Saunders Mac Lane"), // string that has max length
977            Some("Samuel Eilenberg"),
978        ]);
979        let block = DataBlock::from_array(string_array.clone());
980        let expected_max_length = string_array.value_length(1) as u64;
981        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
982        assert_eq!(actual_max_length, expected_max_length);
983
984        let string_array = LargeStringArray::from(vec![Some("hello"), Some("world")]);
985        let block = DataBlock::from_array(string_array.clone());
986        let expected_max_length = string_array.value_length(1) as u64;
987        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
988        assert_eq!(actual_max_length, expected_max_length);
989
990        let string_array = LargeStringArray::from(vec![
991            Some("to be named by variables"),
992            Some("to be passed as arguments to procedures"), // string that has max length
993            Some("to be returned as values of procedures"),
994        ]);
995        let block = DataBlock::from_array(string_array.clone());
996        let expected_max_length = string_array.value(1).len() as u64;
997        let actual_max_length = block.expect_single_stat::<UInt64Type>(Stat::MaxLength);
998
999        assert_eq!(actual_max_length, expected_max_length);
1000    }
1001}