lance_encoding/encodings/physical/bitpack_fastlanes.rs
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use std::sync::Arc;

use arrow::array::ArrayRef;
use arrow::datatypes::{
    Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type,
};
use arrow_array::{Array, PrimitiveArray};
use arrow_schema::DataType;
use bytemuck::cast_slice;
use byteorder::{ByteOrder, LittleEndian};
use bytes::Bytes;
use futures::future::{BoxFuture, FutureExt};
use log::trace;
use snafu::location;

use lance_arrow::DataTypeExt;
use lance_core::{Error, Result};

use crate::buffer::LanceBuffer;
use crate::compression_algo::fastlanes::BitPacking;
use crate::data::BlockInfo;
use crate::data::{DataBlock, FixedWidthDataBlock, NullableDataBlock};
use crate::decoder::{MiniBlockDecompressor, PageScheduler, PrimitivePageDecoder};
use crate::encoder::{
    ArrayEncoder, EncodedArray, MiniBlockChunk, MiniBlockCompressed, MiniBlockCompressor,
};
use crate::format::{pb, ProtobufUtils};
use crate::statistics::{GetStat, Stat};

const LOG_ELEMS_PER_CHUNK: u8 = 10;
const ELEMS_PER_CHUNK: u64 = 1 << LOG_ELEMS_PER_CHUNK;

// Compute the compressed bit width for a given array of non-negative integers.
// todo: compute all statistics before encoding

// Helper: the bit-or of all values in an array has the same highest set bit
// as the maximum value, so the width of the accumulated bit-or equals the
// width of the widest value.
macro_rules! max_bit_width {
    ($arrays:expr, $arrow_type:ty) => {{
        let uncompressed_bits = $arrays[0].data_type().byte_width() as u64 * 8;
        let mut global_max_width: u64 = 0;
        for array in $arrays {
            let primitive_array = array
                .as_any()
                .downcast_ref::<PrimitiveArray<$arrow_type>>()
                .unwrap();
            let array_or = arrow::compute::bit_or(primitive_array).unwrap_or(0);
            global_max_width =
                global_max_width.max(uncompressed_bits - array_or.leading_zeros() as u64);
        }
        // A width of zero means every value is zero; emit one bit per value
        // until a dedicated constant encoding exists.
        global_max_width.max(1)
    }};
}

pub fn compute_compressed_bit_width_for_non_neg(arrays: &[ArrayRef]) -> u64 {
    debug_assert!(!arrays.is_empty());

    match arrays[0].data_type() {
        DataType::UInt8 => max_bit_width!(arrays, UInt8Type),
        DataType::Int8 => max_bit_width!(arrays, Int8Type),
        DataType::UInt16 => max_bit_width!(arrays, UInt16Type),
        DataType::Int16 => max_bit_width!(arrays, Int16Type),
        DataType::UInt32 => max_bit_width!(arrays, UInt32Type),
        DataType::Int32 => max_bit_width!(arrays, Int32Type),
        DataType::UInt64 => max_bit_width!(arrays, UInt64Type),
        DataType::Int64 => max_bit_width!(arrays, Int64Type),
        _ => panic!("BitpackedForNonNegArrayEncoder only supports data types of UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64"),
    }
}
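
// Illustrative sketch (not part of the original encoder): the bit-or of
// non-negative values has the same highest set bit as their maximum, so
// `[1u32, 5, 7]` needs `32 - 7u32.leading_zeros() = 3` bits.
#[cfg(test)]
mod compressed_bit_width_example {
    use super::*;
    use arrow_array::UInt32Array;

    #[test]
    fn three_bit_values() {
        let arr: ArrayRef = Arc::new(UInt32Array::from(vec![1u32, 5, 7]));
        assert_eq!(compute_compressed_bit_width_for_non_neg(&[arr]), 3);
    }
}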

// Bitpack integers using the FastLanes algorithm. The input is sliced into
// chunks of 1024 integers and bitpacked chunk by chunk. When the input is not
// a multiple of 1024, the last chunk is padded with zeros; this is fine
// because we also know how many rows we have.
// Here `$self` is a borrow of `BitpackedForNonNegArrayEncoder`, `$unpacked`
// is a mutable borrow of `FixedWidthDataBlock`, and `$data_type` is one of
// u8, u16, u32, or u64. `$buffer_index` is a mutable borrow of a u32
// indicating the buffer index of the output `EncodedArray`.
// The macro evaluates to a FastLanes-bitpacked `EncodedArray`.
macro_rules! encode_fixed_width {
    ($self:expr, $unpacked:expr, $data_type:ty, $buffer_index:expr) => {{
        let num_chunks = $unpacked.num_values.div_ceil(ELEMS_PER_CHUNK);
        let num_full_chunks = $unpacked.num_values / ELEMS_PER_CHUNK;
        let uncompressed_bit_width = std::mem::size_of::<$data_type>() as u64 * 8;

        // The output vector has the same element type as the input; for
        // example, u16 input is packed into a Vec<u16>.
        let packed_chunk_size =
            1024 * $self.compressed_bit_width as usize / uncompressed_bit_width as usize;

        let input_slice = $unpacked.data.borrow_to_typed_slice::<$data_type>();
        let input = input_slice.as_ref();

        let mut output = Vec::with_capacity(num_chunks as usize * packed_chunk_size);

        // Loop over all the full chunks.
        (0..num_full_chunks).for_each(|i| {
            let start_elem = (i * ELEMS_PER_CHUNK) as usize;

            let output_len = output.len();
            unsafe {
                output.set_len(output_len + packed_chunk_size);
                BitPacking::unchecked_pack(
                    $self.compressed_bit_width,
                    &input[start_elem..][..ELEMS_PER_CHUNK as usize],
                    &mut output[output_len..][..packed_chunk_size],
                );
            }
        });

        // Pack the final partial chunk, if any, padded with zeros.
        if num_chunks != num_full_chunks {
            let last_chunk_elem_num = $unpacked.num_values % ELEMS_PER_CHUNK;
            let mut last_chunk = vec![0 as $data_type; ELEMS_PER_CHUNK as usize];
            last_chunk[..last_chunk_elem_num as usize].clone_from_slice(
                &input[$unpacked.num_values as usize - last_chunk_elem_num as usize..],
            );

            let output_len = output.len();
            unsafe {
                output.set_len(output_len + packed_chunk_size);
                BitPacking::unchecked_pack(
                    $self.compressed_bit_width,
                    &last_chunk,
                    &mut output[output_len..][..packed_chunk_size],
                );
            }
        }

        let bitpacked_for_non_neg_buffer_index = *$buffer_index;
        *$buffer_index += 1;

        let encoding = ProtobufUtils::bitpacked_for_non_neg_encoding(
            $self.compressed_bit_width as u64,
            uncompressed_bit_width,
            bitpacked_for_non_neg_buffer_index,
        );
        let packed = DataBlock::FixedWidth(FixedWidthDataBlock {
            bits_per_value: $self.compressed_bit_width as u64,
            data: LanceBuffer::reinterpret_vec(output),
            num_values: $unpacked.num_values,
            block_info: BlockInfo::new(),
        });

        Result::Ok(EncodedArray {
            data: packed,
            encoding,
        })
    }};
}
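
// Worked example (for illustration): packing 1024 u16 values at a compressed
// bit width of 5 yields 1024 * 5 / 16 = 320 output words per chunk, i.e. 640
// bytes instead of the 2048 bytes the uncompressed chunk occupies.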

#[derive(Debug)]
pub struct BitpackedForNonNegArrayEncoder {
    pub compressed_bit_width: usize,
    pub original_data_type: DataType,
}

impl BitpackedForNonNegArrayEncoder {
    pub fn new(compressed_bit_width: usize, data_type: DataType) -> Self {
        Self {
            compressed_bit_width,
            original_data_type: data_type,
        }
    }
}

impl ArrayEncoder for BitpackedForNonNegArrayEncoder {
    fn encode(
        &self,
        data: DataBlock,
        data_type: &DataType,
        buffer_index: &mut u32,
    ) -> Result<EncodedArray> {
        match data {
            DataBlock::AllNull(_) => {
                let encoding = ProtobufUtils::basic_all_null_encoding();
                Ok(EncodedArray { data, encoding })
            }
            DataBlock::FixedWidth(mut unpacked) => {
                match data_type {
                    DataType::UInt8 | DataType::Int8 => encode_fixed_width!(self, unpacked, u8, buffer_index),
                    DataType::UInt16 | DataType::Int16 => encode_fixed_width!(self, unpacked, u16, buffer_index),
                    DataType::UInt32 | DataType::Int32 => encode_fixed_width!(self, unpacked, u32, buffer_index),
                    DataType::UInt64 | DataType::Int64 => encode_fixed_width!(self, unpacked, u64, buffer_index),
                    _ => unreachable!("BitpackedForNonNegArrayEncoder only supports data types of UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64"),
                }
            }
            DataBlock::Nullable(nullable) => {
                let validity_buffer_index = *buffer_index;
                *buffer_index += 1;

                let validity_desc = ProtobufUtils::flat_encoding(
                    1,
                    validity_buffer_index,
                    /*compression=*/ None,
                );
                let encoded_values: EncodedArray;
                match *nullable.data {
                    DataBlock::FixedWidth(mut unpacked) => {
                        match data_type {
                            DataType::UInt8 | DataType::Int8 => encoded_values = encode_fixed_width!(self, unpacked, u8, buffer_index)?,
                            DataType::UInt16 | DataType::Int16 => encoded_values = encode_fixed_width!(self, unpacked, u16, buffer_index)?,
                            DataType::UInt32 | DataType::Int32 => encoded_values = encode_fixed_width!(self, unpacked, u32, buffer_index)?,
                            DataType::UInt64 | DataType::Int64 => encoded_values = encode_fixed_width!(self, unpacked, u64, buffer_index)?,
                            _ => unreachable!("BitpackedForNonNegArrayEncoder only supports data types of UInt8, Int8, UInt16, Int16, UInt32, Int32, UInt64, Int64"),
                        }
                    }
                    _ => {
                        return Err(Error::InvalidInput {
                            source: "Bitpacking only supports fixed-width data blocks, a nullable data block with a fixed-width data block inside, or an all-null data block".into(),
                            location: location!(),
                        });
                    }
                }
                let encoding =
                    ProtobufUtils::basic_some_null_encoding(validity_desc, encoded_values.encoding);
                let encoded = DataBlock::Nullable(NullableDataBlock {
                    data: Box::new(encoded_values.data),
                    nulls: nullable.nulls,
                    block_info: BlockInfo::new(),
                });
                Ok(EncodedArray {
                    data: encoded,
                    encoding,
                })
            }
            _ => Err(Error::InvalidInput {
                source: "Bitpacking only supports fixed-width data blocks, a nullable data block with a fixed-width data block inside, or an all-null data block".into(),
                location: location!(),
            }),
        }
    }
}

#[derive(Debug)]
pub struct BitpackedForNonNegScheduler {
    compressed_bit_width: u64,
    uncompressed_bits_per_value: u64,
    buffer_offset: u64,
}

impl BitpackedForNonNegScheduler {
    pub fn new(
        compressed_bit_width: u64,
        uncompressed_bits_per_value: u64,
        buffer_offset: u64,
    ) -> Self {
        Self {
            compressed_bit_width,
            uncompressed_bits_per_value,
            buffer_offset,
        }
    }

    fn locate_chunk_start(&self, relative_row_num: u64) -> u64 {
        let chunk_size = ELEMS_PER_CHUNK * self.compressed_bit_width / 8;
        self.buffer_offset + (relative_row_num / ELEMS_PER_CHUNK * chunk_size)
    }

    fn locate_chunk_end(&self, relative_row_num: u64) -> u64 {
        let chunk_size = ELEMS_PER_CHUNK * self.compressed_bit_width / 8;
        self.buffer_offset + (relative_row_num / ELEMS_PER_CHUNK * chunk_size) + chunk_size
    }
}
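
// Example of the chunk arithmetic above (illustrative, assuming a compressed
// bit width of 5): each chunk occupies 1024 * 5 / 8 = 640 bytes, so relative
// row 2000 falls in the second chunk (rows 1024..2048), which starts at
// `buffer_offset + 640` and ends at `buffer_offset + 1280`.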

impl PageScheduler for BitpackedForNonNegScheduler {
    fn schedule_ranges(
        &self,
        ranges: &[std::ops::Range<u64>],
        scheduler: &Arc<dyn crate::EncodingsIo>,
        top_level_row: u64,
    ) -> BoxFuture<'static, Result<Box<dyn PrimitivePageDecoder>>> {
        assert!(!ranges.is_empty());

        let mut byte_ranges = vec![];

        // Map each byte range to the row ranges it covers; every byte range
        // has at least one row range corresponding to it.
        let mut bytes_idx_to_range_indices = vec![];
        let first_byte_range = std::ops::Range {
            start: self.locate_chunk_start(ranges[0].start),
            end: self.locate_chunk_end(ranges[0].end - 1),
        }; // the ranges are half-open
        byte_ranges.push(first_byte_range);
        bytes_idx_to_range_indices.push(vec![ranges[0].clone()]);

        for (i, range) in ranges.iter().enumerate().skip(1) {
            let this_start = self.locate_chunk_start(range.start);
            let this_end = self.locate_chunk_end(range.end - 1);

            // When the current range starts in the same chunk as the previous
            // range's end, we coalesce the two byte ranges; otherwise we start
            // a new byte range.
            if this_start == self.locate_chunk_start(ranges[i - 1].end - 1) {
                byte_ranges.last_mut().unwrap().end = this_end;
                bytes_idx_to_range_indices
                    .last_mut()
                    .unwrap()
                    .push(range.clone());
            } else {
                byte_ranges.push(this_start..this_end);
                bytes_idx_to_range_indices.push(vec![range.clone()]);
            }
        }

        trace!(
            "Scheduling I/O for {} ranges spread across byte range {}..{}",
            byte_ranges.len(),
            byte_ranges[0].start,
            byte_ranges.last().unwrap().end
        );

        let bytes = scheduler.submit_request(byte_ranges.clone(), top_level_row);

        // Copy the necessary data out of `self` to move into the async block.
        let compressed_bit_width = self.compressed_bit_width;
        let uncompressed_bits_per_value = self.uncompressed_bits_per_value;
        let num_rows = ranges.iter().map(|range| range.end - range.start).sum();

        async move {
            let bytes = bytes.await?;
            let decompressed_output = bitpacked_for_non_neg_decode(
                compressed_bit_width,
                uncompressed_bits_per_value,
                &bytes,
                &bytes_idx_to_range_indices,
                num_rows,
            );
            Ok(Box::new(BitpackedForNonNegPageDecoder {
                uncompressed_bits_per_value,
                decompressed_buf: decompressed_output,
            }) as Box<dyn PrimitivePageDecoder>)
        }
        .boxed()
    }
}
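
// Coalescing example (illustrative, 640-byte chunks and buffer_offset = 0):
// row ranges 10..20 and 30..40 both live in chunk 0, so they share the single
// byte range 0..640; a row range 1030..1040 starts in chunk 1 while the
// previous range ended in chunk 0, so a separate byte range 640..1280 is
// submitted for it.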

#[derive(Debug)]
struct BitpackedForNonNegPageDecoder {
    // Number of bits in the uncompressed value, e.g. 32 for DataType::UInt32.
    uncompressed_bits_per_value: u64,

    decompressed_buf: LanceBuffer,
}

impl PrimitivePageDecoder for BitpackedForNonNegPageDecoder {
    fn decode(&self, rows_to_skip: u64, num_rows: u64) -> Result<DataBlock> {
        if ![8, 16, 32, 64].contains(&self.uncompressed_bits_per_value) {
            return Err(Error::InvalidInput {
                source: "BitpackedForNonNegPageDecoder should only have an uncompressed_bits_per_value of 8, 16, 32, or 64".into(),
                location: location!(),
            });
        }

        let elem_size_in_bytes = self.uncompressed_bits_per_value / 8;

        Ok(DataBlock::FixedWidth(FixedWidthDataBlock {
            data: self.decompressed_buf.slice_with_length(
                (rows_to_skip * elem_size_in_bytes) as usize,
                (num_rows * elem_size_in_bytes) as usize,
            ),
            bits_per_value: self.uncompressed_bits_per_value,
            num_values: num_rows,
            block_info: BlockInfo::new(),
        }))
    }
}
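
// For instance (illustrative): with u32 values, decode(rows_to_skip = 3,
// num_rows = 10) returns bytes 12..52 of the decompressed buffer.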

macro_rules! bitpacked_decode {
    ($uncompressed_type:ty, $compressed_bit_width:expr, $data:expr, $bytes_idx_to_range_indices:expr, $num_rows:expr) => {{
        let mut decompressed: Vec<$uncompressed_type> = Vec::with_capacity($num_rows as usize);
        let packed_chunk_size_in_byte: usize = (ELEMS_PER_CHUNK * $compressed_bit_width) as usize / 8;
        let mut decompress_chunk_buf = vec![0 as $uncompressed_type; ELEMS_PER_CHUNK as usize];

        for (i, bytes) in $data.iter().enumerate() {
            let mut ranges_idx = 0;
            let mut curr_range_start = $bytes_idx_to_range_indices[i][0].start;
            let mut chunk_num = 0;

            while chunk_num * packed_chunk_size_in_byte < bytes.len() {
                // Copy for memory alignment
                let chunk_in_u8: Vec<u8> = bytes[chunk_num * packed_chunk_size_in_byte..]
                    [..packed_chunk_size_in_byte]
                    .to_vec();
                chunk_num += 1;
                let chunk = cast_slice(&chunk_in_u8);
                unsafe {
                    BitPacking::unchecked_unpack(
                        $compressed_bit_width as usize,
                        chunk,
                        &mut decompress_chunk_buf,
                    );
                }

                loop {
                    // Case 1: all the elements after (curr_range_start % ELEMS_PER_CHUNK) inside this chunk are needed.
                    let elems_after_curr_range_start_in_this_chunk =
                        ELEMS_PER_CHUNK - curr_range_start % ELEMS_PER_CHUNK;
                    if curr_range_start + elems_after_curr_range_start_in_this_chunk
                        <= $bytes_idx_to_range_indices[i][ranges_idx].end
                    {
                        decompressed.extend_from_slice(
                            &decompress_chunk_buf[(curr_range_start % ELEMS_PER_CHUNK) as usize..],
                        );
                        curr_range_start += elems_after_curr_range_start_in_this_chunk;
                        break;
                    } else {
                        // Case 2: only part of the elements after (curr_range_start % ELEMS_PER_CHUNK) inside this chunk are needed.
                        let elems_this_range_needed_in_this_chunk =
                            ($bytes_idx_to_range_indices[i][ranges_idx].end - curr_range_start)
                                .min(ELEMS_PER_CHUNK - curr_range_start % ELEMS_PER_CHUNK);
                        decompressed.extend_from_slice(
                            &decompress_chunk_buf[(curr_range_start % ELEMS_PER_CHUNK) as usize..]
                                [..elems_this_range_needed_in_this_chunk as usize],
                        );
                        if curr_range_start + elems_this_range_needed_in_this_chunk
                            == $bytes_idx_to_range_indices[i][ranges_idx].end
                        {
                            ranges_idx += 1;
                            if ranges_idx == $bytes_idx_to_range_indices[i].len() {
                                break;
                            }
                            curr_range_start = $bytes_idx_to_range_indices[i][ranges_idx].start;
                        } else {
                            curr_range_start += elems_this_range_needed_in_this_chunk;
                        }
                    }
                }
            }
        }

        LanceBuffer::reinterpret_vec(decompressed)
    }};
}
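
// Sketch of the two cases above (illustrative): for a single requested range
// 1000..3000, chunks 0 and 1 take case 1 (elements 1000..1024 and all of
// 1024..2048 are copied), while chunk 2 takes case 2 and copies only elements
// 2048..3000 before the range is exhausted.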

fn bitpacked_for_non_neg_decode(
    compressed_bit_width: u64,
    uncompressed_bits_per_value: u64,
    data: &[Bytes],
    bytes_idx_to_range_indices: &[Vec<std::ops::Range<u64>>],
    num_rows: u64,
) -> LanceBuffer {
    match uncompressed_bits_per_value {
        8 => bitpacked_decode!(
            u8,
            compressed_bit_width,
            data,
            bytes_idx_to_range_indices,
            num_rows
        ),
        16 => bitpacked_decode!(
            u16,
            compressed_bit_width,
            data,
            bytes_idx_to_range_indices,
            num_rows
        ),
        32 => bitpacked_decode!(
            u32,
            compressed_bit_width,
            data,
            bytes_idx_to_range_indices,
            num_rows
        ),
        64 => bitpacked_decode!(
            u64,
            compressed_bit_width,
            data,
            bytes_idx_to_range_indices,
            num_rows
        ),
        _ => unreachable!(
            "bitpacked_for_non_neg_decode only supports 8, 16, 32, 64 uncompressed_bits_per_value"
        ),
    }
}

#[cfg(test)]
mod tests {
    // use super::*;
    // use arrow::array::{
    //     Int16Array, Int32Array, Int64Array, Int8Array, UInt16Array, UInt32Array, UInt64Array,
    //     UInt8Array,
    // };
    // use arrow::datatypes::DataType;

    // #[test_log::test(tokio::test)]
    // async fn test_compute_compressed_bit_width_for_non_neg() {}

    // use std::collections::HashMap;

    // use lance_datagen::RowCount;

    // use crate::testing::{check_round_trip_encoding_of_data, TestCases};
    // use crate::version::LanceFileVersion;

    // async fn check_round_trip_bitpacked(array: Arc<dyn Array>) {
    //     let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1);
    //     check_round_trip_encoding_of_data(vec![array], &test_cases, HashMap::new()).await;
    // }

    // // Round-trips a randomly generated column of `data_type` with `num_rows` rows.
    // async fn check_round_trip_random(data_type: DataType, num_rows: u64) {
    //     let arr = lance_datagen::gen()
    //         .anon_col(lance_datagen::array::rand_type(&data_type))
    //         .into_batch_rows(RowCount::from(num_rows))
    //         .unwrap()
    //         .column(0)
    //         .clone();
    //     check_round_trip_bitpacked(arr).await;
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_u8() {
    //     for (value, len) in [
    //         (5u8, 1024),
    //         (66, 1000),
    //         (77, 2000),
    //         (0, 10000),
    //         (88, 10000),
    //     ] {
    //         let arr = Arc::new(UInt8Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::UInt8, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_u16() {
    //     for (value, len) in [
    //         (5u16, 1024),
    //         (66, 1000),
    //         (77, 2000),
    //         (0, 10000),
    //         (88, 10000),
    //         (300, 100),
    //         (800, 100),
    //     ] {
    //         let arr = Arc::new(UInt16Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::UInt16, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_u32() {
    //     for (value, len) in [
    //         (5u32, 1024),
    //         (7, 2000),
    //         (66, 1000),
    //         (666, 1000),
    //         (77, 2000),
    //         (0, 10000),
    //         (1, 10000),
    //         (88, 10000),
    //         (300, 100),
    //         (3000, 100),
    //         (800, 100),
    //         (8000, 100),
    //         (65536, 100),
    //         (655360, 100),
    //     ] {
    //         let arr = Arc::new(UInt32Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::UInt32, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_u64() {
    //     for (value, len) in [
    //         (5u64, 1024),
    //         (7, 2000),
    //         (66, 1000),
    //         (666, 1000),
    //         (77, 2000),
    //         (0, 10000),
    //         (1, 10000),
    //         (88, 10000),
    //         (300, 100),
    //         (3000, 100),
    //         (800, 100),
    //         (8000, 100),
    //         (65536, 100),
    //         (655360, 100),
    //     ] {
    //         let arr = Arc::new(UInt64Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::UInt64, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_i8() {
    //     for (value, len) in [
    //         (-5i8, 1024),
    //         (66, 1000),
    //         (77, 2000),
    //         (0, 10000),
    //         (88, 10000),
    //         (-88, 10000),
    //     ] {
    //         let arr = Arc::new(Int8Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::Int8, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_i16() {
    //     for (value, len) in [
    //         (-5i16, 1024),
    //         (66, 1000),
    //         (77, 2000),
    //         (0, 10000),
    //         (88, 10000),
    //         (300, 100),
    //         (800, 100),
    //     ] {
    //         let arr = Arc::new(Int16Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::Int16, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_i32() {
    //     for (value, len) in [
    //         (-5i32, 1024),
    //         (66, 1000),
    //         (-66, 1000),
    //         (77, 2000),
    //         (-77, 2000),
    //         (0, 10000),
    //         (88, 10000),
    //         (-88, 10000),
    //         (300, 100),
    //         (-300, 100),
    //         (800, 100),
    //         (-800, 100),
    //         (65536, 100),
    //         (-65536, 100),
    //     ] {
    //         let arr = Arc::new(Int32Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::Int32, num_rows).await;
    //     }
    // }

    // #[test_log::test(tokio::test)]
    // async fn test_bitpack_fastlanes_i64() {
    //     for (value, len) in [
    //         (-5i64, 1024),
    //         (66, 1000),
    //         (-66, 1000),
    //         (77, 2000),
    //         (-77, 2000),
    //         (0, 10000),
    //         (88, 10000),
    //         (-88, 10000),
    //         (300, 100),
    //         (-300, 100),
    //         (800, 100),
    //         (-800, 100),
    //         (65536, 100),
    //         (-65536, 100),
    //     ] {
    //         let arr = Arc::new(Int64Array::from(vec![value; len])) as ArrayRef;
    //         check_round_trip_bitpacked(arr).await;
    //     }

    //     for num_rows in [1, 20, 50, 100, 1000, 1024, 2000, 3000] {
    //         check_round_trip_random(DataType::Int64, num_rows).await;
    //     }
    // }
}

// This macro chunks a FixedWidth DataBlock and bitpacks it 1024 values at a
// time. It puts the bit-width parameter in front of each chunk, and the
// bit-width parameter has the same width as the uncompressed values. For
// example, if the input DataBlock has a `bits_per_value` of 16, there will be
// 2 bytes (16 bits) in front of each chunk storing the bit-width parameter.
macro_rules! chunk_data_impl {
    ($data:expr, $data_type:ty) => {{
        let data_buffer = $data.data.borrow_to_typed_slice::<$data_type>();
        let data_buffer = data_buffer.as_ref();

        let bit_widths = $data.expect_stat(Stat::BitWidth);
        let bit_widths_array = bit_widths
            .as_any()
            .downcast_ref::<PrimitiveArray<UInt64Type>>()
            .unwrap();

        let (packed_chunk_sizes, total_size) = bit_widths_array
            .values()
            .iter()
            .map(|&bit_width| {
                let chunk_size = ((1024 * bit_width) / $data.bits_per_value) as usize;
                (chunk_size, chunk_size + 1)
            })
            .fold(
                (Vec::with_capacity(bit_widths_array.len()), 0),
                |(mut sizes, total), (size, inc)| {
                    sizes.push(size);
                    (sizes, total + inc)
                },
            );

        let mut output: Vec<$data_type> = Vec::with_capacity(total_size);
        let mut chunks = Vec::with_capacity(bit_widths_array.len());

        for i in 0..bit_widths_array.len() - 1 {
            let start_elem = i * ELEMS_PER_CHUNK as usize;
            let bit_width = bit_widths_array.value(i) as $data_type;
            output.push(bit_width);
            let output_len = output.len();
            unsafe {
                output.set_len(output_len + packed_chunk_sizes[i]);
                BitPacking::unchecked_pack(
                    bit_width as usize,
                    &data_buffer[start_elem..][..ELEMS_PER_CHUNK as usize],
                    &mut output[output_len..][..packed_chunk_sizes[i]],
                );
            }
            chunks.push(MiniBlockChunk {
                num_bytes: ((1 + packed_chunk_sizes[i]) * std::mem::size_of::<$data_type>()) as u16,
                log_num_values: LOG_ELEMS_PER_CHUNK,
            });
        }

        // Handle the last chunk, which may be partial and is padded with zeros.
        let last_chunk_elem_num = if $data.num_values % ELEMS_PER_CHUNK == 0 {
            1024
        } else {
            $data.num_values % ELEMS_PER_CHUNK
        };
        let mut last_chunk = vec![0; ELEMS_PER_CHUNK as usize];
        last_chunk[..last_chunk_elem_num as usize].clone_from_slice(
            &data_buffer[$data.num_values as usize - last_chunk_elem_num as usize..],
        );
        let bit_width = bit_widths_array.value(bit_widths_array.len() - 1) as $data_type;
        output.push(bit_width);
        let output_len = output.len();
        unsafe {
            output.set_len(output_len + packed_chunk_sizes[bit_widths_array.len() - 1]);
            BitPacking::unchecked_pack(
                bit_width as usize,
                &last_chunk,
                &mut output[output_len..][..packed_chunk_sizes[bit_widths_array.len() - 1]],
            );
        }
        chunks.push(MiniBlockChunk {
            num_bytes: ((1 + packed_chunk_sizes[bit_widths_array.len() - 1])
                * std::mem::size_of::<$data_type>()) as u16,
            log_num_values: 0,
        });

        (
            MiniBlockCompressed {
                data: LanceBuffer::reinterpret_vec(output),
                chunks,
                num_values: $data.num_values,
            },
            ProtobufUtils::bitpack2($data.bits_per_value),
        )
    }};
}
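
// Worked example (illustrative): a u32 chunk whose values need 7 bits packs
// into 1024 * 7 / 32 = 224 words, plus one u32 holding the bit width itself,
// so the corresponding MiniBlockChunk reports (1 + 224) * 4 = 900 bytes.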

#[derive(Debug, Default)]
pub struct BitpackMiniBlockEncoder {}

impl BitpackMiniBlockEncoder {
    fn chunk_data(
        &self,
        mut data: FixedWidthDataBlock,
    ) -> (MiniBlockCompressed, crate::format::pb::ArrayEncoding) {
        assert!(data.bits_per_value % 8 == 0);
        match data.bits_per_value {
            8 => chunk_data_impl!(data, u8),
            16 => chunk_data_impl!(data, u16),
            32 => chunk_data_impl!(data, u32),
            64 => chunk_data_impl!(data, u64),
            _ => unreachable!(),
        }
    }
}

impl MiniBlockCompressor for BitpackMiniBlockEncoder {
    fn compress(
        &self,
        chunk: DataBlock,
    ) -> Result<(MiniBlockCompressed, crate::format::pb::ArrayEncoding)> {
        match chunk {
            DataBlock::FixedWidth(fixed_width) => Ok(self.chunk_data(fixed_width)),
            _ => Err(Error::InvalidInput {
                source: format!(
                    "Cannot compress a data block of type {} with BitpackMiniBlockEncoder",
                    chunk.name()
                )
                .into(),
                location: location!(),
            }),
        }
    }
}

/// A decompressor for mini-block chunks that were bit-packed by
/// [`BitpackMiniBlockEncoder`]
#[derive(Debug)]
pub struct BitpackMiniBlockDecompressor {
    uncompressed_bit_width: u64,
}

impl BitpackMiniBlockDecompressor {
    pub fn new(description: &pb::Bitpack2) -> Self {
        Self {
            uncompressed_bit_width: description.uncompressed_bits_per_value,
        }
    }
}

impl MiniBlockDecompressor for BitpackMiniBlockDecompressor {
    fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result<DataBlock> {
        assert!(data.len() >= 8);
        assert!(num_values <= ELEMS_PER_CHUNK);

        // This macro decompresses a chunk (1024 values) of bitpacked values.
        macro_rules! decompress_impl {
            ($type:ty) => {{
                let uncompressed_bit_width = std::mem::size_of::<$type>() * 8;
                let mut decompressed = vec![0 as $type; ELEMS_PER_CHUNK as usize];

                // Copy for memory alignment
                let chunk_in_u8: Vec<u8> = data.to_vec();
                let bit_width_bytes = &chunk_in_u8[..std::mem::size_of::<$type>()];
                let bit_width_value =
                    LittleEndian::read_uint(bit_width_bytes, std::mem::size_of::<$type>());
                let chunk = cast_slice(&chunk_in_u8[std::mem::size_of::<$type>()..]);

                // The bit-packed chunk should occupy
                // (bit_width_value * ELEMS_PER_CHUNK / 8) bytes.
                assert!(
                    chunk.len() * std::mem::size_of::<$type>()
                        == (bit_width_value * ELEMS_PER_CHUNK as u64) as usize / 8
                );

                unsafe {
                    BitPacking::unchecked_unpack(
                        bit_width_value as usize,
                        chunk,
                        &mut decompressed,
                    );
                }

                // The last chunk may hold fewer than 1024 values; drop the padding.
                decompressed.truncate(num_values as usize);
                Ok(DataBlock::FixedWidth(FixedWidthDataBlock {
                    data: LanceBuffer::reinterpret_vec(decompressed),
                    bits_per_value: uncompressed_bit_width as u64,
                    num_values,
                    block_info: BlockInfo::new(),
                }))
            }};
        }

        match self.uncompressed_bit_width {
            8 => decompress_impl!(u8),
            16 => decompress_impl!(u16),
            32 => decompress_impl!(u32),
            64 => decompress_impl!(u64),
            _ => todo!(),
        }
    }
}
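
// Chunk layout consumed above (illustrative): [bit width as $type][packed
// values], so a u16 chunk with bit width 3 is 2 + 1024 * 3 / 8 = 386 bytes.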

#[cfg(test)]
mod test {
    use std::{collections::HashMap, sync::Arc};

    use arrow_array::{Array, Int64Array, Int8Array};
    use arrow_schema::DataType;

    use crate::{
        testing::{check_round_trip_encoding_of_data, TestCases},
        version::LanceFileVersion,
    };

    #[test_log::test(tokio::test)]
    async fn test_miniblock_bitpack() {
        let test_cases = TestCases::default().with_file_version(LanceFileVersion::V2_1);

        let arrays = vec![
            Arc::new(Int8Array::from(vec![100; 1024])) as Arc<dyn Array>,
            Arc::new(Int8Array::from(vec![1; 1024])) as Arc<dyn Array>,
            Arc::new(Int8Array::from(vec![16; 1024])) as Arc<dyn Array>,
            Arc::new(Int8Array::from(vec![-1; 1024])) as Arc<dyn Array>,
            Arc::new(Int8Array::from(vec![5; 1])) as Arc<dyn Array>,
        ];
        check_round_trip_encoding_of_data(arrays, &test_cases, HashMap::new()).await;

        for data_type in [DataType::Int16, DataType::Int32, DataType::Int64] {
            let int64_arrays = vec![
                Int64Array::from(vec![3; 1024]),
                Int64Array::from(vec![8; 1024]),
                Int64Array::from(vec![16; 1024]),
                Int64Array::from(vec![100; 1024]),
                Int64Array::from(vec![512; 1024]),
                Int64Array::from(vec![1000; 1024]),
                Int64Array::from(vec![2000; 1024]),
                Int64Array::from(vec![-1; 10]),
            ];

            let mut arrays = vec![];
            for int64_array in int64_arrays {
                arrays.push(arrow_cast::cast(&int64_array, &data_type).unwrap());
            }
            check_round_trip_encoding_of_data(arrays, &test_cases, HashMap::new()).await;
        }
    }
}