lance_encoding/
format.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
/// Protobuf definitions for encodings
///
/// The items of this module are not written here: they are pulled in with `include!`
/// from the file `lance.encodings.rs` located in the directory named by the `OUT_DIR`
/// environment variable at compile time (presumably produced by the crate's build
/// script from the protobuf schema — confirm in `build.rs`).
pub mod pb {
    // The lints below are disabled for the whole module because its contents come from
    // the included file rather than from source maintained in this file.
    #![allow(clippy::all)]
    #![allow(non_upper_case_globals)]
    #![allow(non_camel_case_types)]
    #![allow(non_snake_case)]
    #![allow(unused)]
    #![allow(improper_ctypes)]
    #![allow(clippy::upper_case_acronyms)]
    #![allow(clippy::use_self)]
    // Textually include the file at `$OUT_DIR/lance.encodings.rs`.
    include!(concat!(env!("OUT_DIR"), "/lance.encodings.rs"));
}
16
17use pb::{
18    array_encoding::ArrayEncoding as ArrayEncodingEnum,
19    buffer::BufferType,
20    full_zip_layout,
21    nullable::{AllNull, NoNull, Nullability, SomeNull},
22    page_layout::Layout,
23    AllNullLayout, ArrayEncoding, Binary, Bitpack2, Bitpacked, BitpackedForNonNeg, Dictionary,
24    FixedSizeBinary, FixedSizeList, Flat, Fsst, MiniBlockLayout, Nullable, PackedStruct,
25    PackedStructFixedWidthMiniBlock, PageLayout, RepDefLayer, Variable,
26};
27
28use crate::{
29    encodings::physical::block_compress::CompressionConfig, repdef::DefinitionInterpretation,
30};
31
32use self::pb::Constant;
33
/// Utility functions for creating complex protobuf objects.
///
/// The type has no fields; it exists only as a namespace for the associated
/// functions defined in its `impl` block.
#[derive(Debug, Clone, Copy, Default)]
pub struct ProtobufUtils {}
36
37impl ProtobufUtils {
38    pub fn constant(value: Vec<u8>, num_values: u64) -> ArrayEncoding {
39        ArrayEncoding {
40            array_encoding: Some(ArrayEncodingEnum::Constant(Constant {
41                value: value.into(),
42                num_values,
43            })),
44        }
45    }
46
47    pub fn basic_all_null_encoding() -> ArrayEncoding {
48        ArrayEncoding {
49            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
50                nullability: Some(Nullability::AllNulls(AllNull {})),
51            }))),
52        }
53    }
54
55    pub fn basic_some_null_encoding(
56        validity: ArrayEncoding,
57        values: ArrayEncoding,
58    ) -> ArrayEncoding {
59        ArrayEncoding {
60            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
61                nullability: Some(Nullability::SomeNulls(Box::new(SomeNull {
62                    validity: Some(Box::new(validity)),
63                    values: Some(Box::new(values)),
64                }))),
65            }))),
66        }
67    }
68
69    pub fn basic_no_null_encoding(values: ArrayEncoding) -> ArrayEncoding {
70        ArrayEncoding {
71            array_encoding: Some(ArrayEncodingEnum::Nullable(Box::new(Nullable {
72                nullability: Some(Nullability::NoNulls(Box::new(NoNull {
73                    values: Some(Box::new(values)),
74                }))),
75            }))),
76        }
77    }
78
79    pub fn flat_encoding(
80        bits_per_value: u64,
81        buffer_index: u32,
82        compression: Option<CompressionConfig>,
83    ) -> ArrayEncoding {
84        ArrayEncoding {
85            array_encoding: Some(ArrayEncodingEnum::Flat(Flat {
86                bits_per_value,
87                buffer: Some(pb::Buffer {
88                    buffer_index,
89                    buffer_type: BufferType::Page as i32,
90                }),
91                compression: compression.map(|compression_config| pb::Compression {
92                    scheme: compression_config.scheme.to_string(),
93                    level: compression_config.level,
94                }),
95            })),
96        }
97    }
98
99    pub fn fsl_encoding(dimension: u64, items: ArrayEncoding) -> ArrayEncoding {
100        ArrayEncoding {
101            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
102                dimension: dimension.try_into().unwrap(),
103                items: Some(Box::new(items)),
104            }))),
105        }
106    }
107
108    pub fn bitpacked_encoding(
109        compressed_bits_per_value: u64,
110        uncompressed_bits_per_value: u64,
111        buffer_index: u32,
112        signed: bool,
113    ) -> ArrayEncoding {
114        ArrayEncoding {
115            array_encoding: Some(ArrayEncodingEnum::Bitpacked(Bitpacked {
116                compressed_bits_per_value,
117                buffer: Some(pb::Buffer {
118                    buffer_index,
119                    buffer_type: BufferType::Page as i32,
120                }),
121                uncompressed_bits_per_value,
122                signed,
123            })),
124        }
125    }
126
127    pub fn bitpacked_for_non_neg_encoding(
128        compressed_bits_per_value: u64,
129        uncompressed_bits_per_value: u64,
130        buffer_index: u32,
131    ) -> ArrayEncoding {
132        ArrayEncoding {
133            array_encoding: Some(ArrayEncodingEnum::BitpackedForNonNeg(BitpackedForNonNeg {
134                compressed_bits_per_value,
135                buffer: Some(pb::Buffer {
136                    buffer_index,
137                    buffer_type: BufferType::Page as i32,
138                }),
139                uncompressed_bits_per_value,
140            })),
141        }
142    }
143    pub fn bitpack2(uncompressed_bits_per_value: u64) -> ArrayEncoding {
144        ArrayEncoding {
145            array_encoding: Some(ArrayEncodingEnum::Bitpack2(Bitpack2 {
146                uncompressed_bits_per_value,
147            })),
148        }
149    }
150
151    pub fn variable(bits_per_offset: u8) -> ArrayEncoding {
152        ArrayEncoding {
153            array_encoding: Some(ArrayEncodingEnum::Variable(Variable {
154                bits_per_offset: bits_per_offset as u32,
155            })),
156        }
157    }
158
159    // Construct a `FsstMiniBlock` ArrayEncoding, the inner `binary_mini_block` encoding is actually
160    // not used and `FsstMiniBlockDecompressor` constructs a `binary_mini_block` in a `hard-coded` fashion.
161    // This can be an optimization later.
162    pub fn fsst(data: ArrayEncoding, symbol_table: Vec<u8>) -> ArrayEncoding {
163        ArrayEncoding {
164            array_encoding: Some(ArrayEncodingEnum::Fsst(Box::new(Fsst {
165                binary: Some(Box::new(data)),
166                symbol_table: symbol_table.into(),
167            }))),
168        }
169    }
170
171    pub fn packed_struct(
172        child_encodings: Vec<ArrayEncoding>,
173        packed_buffer_index: u32,
174    ) -> ArrayEncoding {
175        ArrayEncoding {
176            array_encoding: Some(ArrayEncodingEnum::PackedStruct(PackedStruct {
177                inner: child_encodings,
178                buffer: Some(pb::Buffer {
179                    buffer_index: packed_buffer_index,
180                    buffer_type: BufferType::Page as i32,
181                }),
182            })),
183        }
184    }
185
186    pub fn packed_struct_fixed_width_mini_block(
187        data: ArrayEncoding,
188        bits_per_values: Vec<u32>,
189    ) -> ArrayEncoding {
190        ArrayEncoding {
191            array_encoding: Some(ArrayEncodingEnum::PackedStructFixedWidthMiniBlock(
192                Box::new(PackedStructFixedWidthMiniBlock {
193                    flat: Some(Box::new(data)),
194                    bits_per_values,
195                }),
196            )),
197        }
198    }
199
200    pub fn binary(
201        indices_encoding: ArrayEncoding,
202        bytes_encoding: ArrayEncoding,
203        null_adjustment: u64,
204    ) -> ArrayEncoding {
205        ArrayEncoding {
206            array_encoding: Some(ArrayEncodingEnum::Binary(Box::new(Binary {
207                bytes: Some(Box::new(bytes_encoding)),
208                indices: Some(Box::new(indices_encoding)),
209                null_adjustment,
210            }))),
211        }
212    }
213
214    pub fn dict_encoding(
215        indices: ArrayEncoding,
216        items: ArrayEncoding,
217        num_items: u32,
218    ) -> ArrayEncoding {
219        ArrayEncoding {
220            array_encoding: Some(ArrayEncodingEnum::Dictionary(Box::new(Dictionary {
221                indices: Some(Box::new(indices)),
222                items: Some(Box::new(items)),
223                num_dictionary_items: num_items,
224            }))),
225        }
226    }
227
228    pub fn fixed_size_binary(data: ArrayEncoding, byte_width: u32) -> ArrayEncoding {
229        ArrayEncoding {
230            array_encoding: Some(ArrayEncodingEnum::FixedSizeBinary(Box::new(
231                FixedSizeBinary {
232                    bytes: Some(Box::new(data)),
233                    byte_width,
234                },
235            ))),
236        }
237    }
238
239    pub fn fixed_size_list(data: ArrayEncoding, dimension: u64) -> ArrayEncoding {
240        ArrayEncoding {
241            array_encoding: Some(ArrayEncodingEnum::FixedSizeList(Box::new(FixedSizeList {
242                dimension: dimension.try_into().unwrap(),
243                items: Some(Box::new(data)),
244            }))),
245        }
246    }
247
248    fn def_inter_to_repdef_layer(def: DefinitionInterpretation) -> i32 {
249        match def {
250            DefinitionInterpretation::AllValidItem => RepDefLayer::RepdefAllValidItem as i32,
251            DefinitionInterpretation::AllValidList => RepDefLayer::RepdefAllValidList as i32,
252            DefinitionInterpretation::NullableItem => RepDefLayer::RepdefNullableItem as i32,
253            DefinitionInterpretation::NullableList => RepDefLayer::RepdefNullableList as i32,
254            DefinitionInterpretation::EmptyableList => RepDefLayer::RepdefEmptyableList as i32,
255            DefinitionInterpretation::NullableAndEmptyableList => {
256                RepDefLayer::RepdefNullAndEmptyList as i32
257            }
258        }
259    }
260
261    pub fn repdef_layer_to_def_interp(layer: i32) -> DefinitionInterpretation {
262        let layer = RepDefLayer::try_from(layer).unwrap();
263        match layer {
264            RepDefLayer::RepdefAllValidItem => DefinitionInterpretation::AllValidItem,
265            RepDefLayer::RepdefAllValidList => DefinitionInterpretation::AllValidList,
266            RepDefLayer::RepdefNullableItem => DefinitionInterpretation::NullableItem,
267            RepDefLayer::RepdefNullableList => DefinitionInterpretation::NullableList,
268            RepDefLayer::RepdefEmptyableList => DefinitionInterpretation::EmptyableList,
269            RepDefLayer::RepdefNullAndEmptyList => {
270                DefinitionInterpretation::NullableAndEmptyableList
271            }
272            RepDefLayer::RepdefUnspecified => panic!("Unspecified repdef layer"),
273        }
274    }
275
276    pub fn miniblock_layout(
277        rep_encoding: ArrayEncoding,
278        def_encoding: ArrayEncoding,
279        value_encoding: ArrayEncoding,
280        repetition_index_depth: u32,
281        dictionary_encoding: Option<ArrayEncoding>,
282        def_meaning: &[DefinitionInterpretation],
283        num_items: u64,
284    ) -> PageLayout {
285        assert!(!def_meaning.is_empty());
286        PageLayout {
287            layout: Some(Layout::MiniBlockLayout(MiniBlockLayout {
288                def_compression: Some(def_encoding),
289                rep_compression: Some(rep_encoding),
290                value_compression: Some(value_encoding),
291                repetition_index_depth,
292                dictionary: dictionary_encoding,
293                layers: def_meaning
294                    .iter()
295                    .map(|&def| Self::def_inter_to_repdef_layer(def))
296                    .collect(),
297                num_items,
298            })),
299        }
300    }
301
302    fn full_zip_layout(
303        bits_rep: u8,
304        bits_def: u8,
305        details: full_zip_layout::Details,
306        value_encoding: ArrayEncoding,
307        def_meaning: &[DefinitionInterpretation],
308        num_items: u32,
309        num_visible_items: u32,
310    ) -> PageLayout {
311        PageLayout {
312            layout: Some(Layout::FullZipLayout(pb::FullZipLayout {
313                bits_rep: bits_rep as u32,
314                bits_def: bits_def as u32,
315                details: Some(details),
316                value_compression: Some(value_encoding),
317                num_items,
318                num_visible_items,
319                layers: def_meaning
320                    .iter()
321                    .map(|&def| Self::def_inter_to_repdef_layer(def))
322                    .collect(),
323            })),
324        }
325    }
326
327    pub fn fixed_full_zip_layout(
328        bits_rep: u8,
329        bits_def: u8,
330        bits_per_value: u32,
331        value_encoding: ArrayEncoding,
332        def_meaning: &[DefinitionInterpretation],
333        num_items: u32,
334        num_visible_items: u32,
335    ) -> PageLayout {
336        Self::full_zip_layout(
337            bits_rep,
338            bits_def,
339            full_zip_layout::Details::BitsPerValue(bits_per_value),
340            value_encoding,
341            def_meaning,
342            num_items,
343            num_visible_items,
344        )
345    }
346
347    pub fn variable_full_zip_layout(
348        bits_rep: u8,
349        bits_def: u8,
350        bits_per_offset: u32,
351        value_encoding: ArrayEncoding,
352        def_meaning: &[DefinitionInterpretation],
353        num_items: u32,
354        num_visible_items: u32,
355    ) -> PageLayout {
356        Self::full_zip_layout(
357            bits_rep,
358            bits_def,
359            full_zip_layout::Details::BitsPerOffset(bits_per_offset),
360            value_encoding,
361            def_meaning,
362            num_items,
363            num_visible_items,
364        )
365    }
366
367    pub fn all_null_layout(def_meaning: &[DefinitionInterpretation]) -> PageLayout {
368        PageLayout {
369            layout: Some(Layout::AllNullLayout(AllNullLayout {
370                layers: def_meaning
371                    .iter()
372                    .map(|&def| Self::def_inter_to_repdef_layer(def))
373                    .collect(),
374            })),
375        }
376    }
377
378    pub fn simple_all_null_layout() -> PageLayout {
379        Self::all_null_layout(&[DefinitionInterpretation::NullableItem])
380    }
381}