lance_encoding/utils/
bytepack.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4//! Utilities for byte (not bit) packing for situations where saving a few
5//! bits is less important than simplicity and speed.
6
7pub struct U8BytePacker {
8    data: Vec<u8>,
9}
10
11impl U8BytePacker {
12    fn with_capacity(capacity: usize) -> Self {
13        Self {
14            data: Vec::with_capacity(capacity),
15        }
16    }
17
18    fn append(&mut self, value: u64) {
19        self.data.push(value as u8);
20    }
21}
22
23pub struct U16BytePacker {
24    data: Vec<u8>,
25}
26
27impl U16BytePacker {
28    fn with_capacity(capacity: usize) -> Self {
29        Self {
30            data: Vec::with_capacity(capacity * 2),
31        }
32    }
33
34    fn append(&mut self, value: u64) {
35        self.data.extend_from_slice(&(value as u16).to_le_bytes());
36    }
37}
38
39pub struct U32BytePacker {
40    data: Vec<u8>,
41}
42
43impl U32BytePacker {
44    fn with_capacity(capacity: usize) -> Self {
45        Self {
46            data: Vec::with_capacity(capacity * 4),
47        }
48    }
49
50    fn append(&mut self, value: u64) {
51        self.data.extend_from_slice(&(value as u32).to_le_bytes());
52    }
53}
54
55pub struct U64BytePacker {
56    data: Vec<u8>,
57}
58
59impl U64BytePacker {
60    fn with_capacity(capacity: usize) -> Self {
61        Self {
62            data: Vec::with_capacity(capacity * 8),
63        }
64    }
65
66    fn append(&mut self, value: u64) {
67        self.data.extend_from_slice(&value.to_le_bytes());
68    }
69}
70
71/// A bytepacked integer encoder that automatically chooses the smallest
72/// possible integer type to store the given values.
73///
74/// This is byte packing (not bit packing).  Not even that, we only fit things into
75/// sizes of 1,2,4,8 bytes.  It's simple, fast, and easy but doesn't provide the
76/// maximum possible compression.
77///
78/// Still, it's useful for things like offsets which are often small and fit into a
79/// u16 or u32 but sometimes might need the full u64 range.
80///
81/// In the future we can investigate replacing this with something more sophisticated.
82pub enum BytepackedIntegerEncoder {
83    U8(U8BytePacker),
84    U16(U16BytePacker),
85    U32(U32BytePacker),
86    U64(U64BytePacker),
87    Zero,
88}
89
90impl BytepackedIntegerEncoder {
91    /// Create a new encoder with the given capacity and maximum value.
92    pub fn with_capacity(capacity: usize, max_value: u64) -> Self {
93        if max_value == 0 {
94            Self::Zero
95        } else if max_value <= u8::MAX as u64 {
96            Self::U8(U8BytePacker::with_capacity(capacity))
97        } else if max_value <= u16::MAX as u64 {
98            Self::U16(U16BytePacker::with_capacity(capacity))
99        } else if max_value <= u32::MAX as u64 {
100            Self::U32(U32BytePacker::with_capacity(capacity))
101        } else {
102            Self::U64(U64BytePacker::with_capacity(capacity))
103        }
104    }
105
106    /// Append a value to the encoder.
107    ///
108    /// # Safety
109    ///
110    /// This function is unsafe because it doesn't check for overflow.  If the
111    /// value is too large to fit in the chosen integer type, it will be silently
112    /// truncated.
113    pub unsafe fn append(&mut self, value: u64) {
114        match self {
115            Self::U8(packer) => packer.append(value),
116            Self::U16(packer) => packer.append(value),
117            Self::U32(packer) => packer.append(value),
118            Self::U64(packer) => packer.append(value),
119            Self::Zero => {}
120        }
121    }
122
123    /// Convert the encoder into a vector of bytes.
124    pub fn into_data(self) -> Vec<u8> {
125        match self {
126            Self::U8(packer) => packer.data,
127            Self::U16(packer) => packer.data,
128            Self::U32(packer) => packer.data,
129            Self::U64(packer) => packer.data,
130            Self::Zero => Vec::new(),
131        }
132    }
133}
134
135/// An iterator that unpacks bytes into integers (currently only u64)
136pub enum ByteUnpacker<I: Iterator<Item = u8>> {
137    U8(I),
138    U16(I),
139    U32(I),
140    U64(I),
141}
142
143impl<T: Iterator<Item = u8>> ByteUnpacker<T> {
144    #[allow(clippy::new_ret_no_self)]
145    pub fn new<I: IntoIterator<IntoIter = T>>(data: I, size: usize) -> impl Iterator<Item = u64> {
146        match size {
147            1 => Self::U8(data.into_iter()),
148            2 => Self::U16(data.into_iter()),
149            4 => Self::U32(data.into_iter()),
150            8 => Self::U64(data.into_iter()),
151            _ => panic!("Invalid size"),
152        }
153    }
154}
155
156impl<I: Iterator<Item = u8>> Iterator for ByteUnpacker<I> {
157    type Item = u64;
158
159    fn next(&mut self) -> Option<Self::Item> {
160        match self {
161            Self::U8(iter) => iter.next().map(|v| v as u64),
162            Self::U16(iter) => {
163                let first_byte = iter.next()?;
164                Some(u16::from_le_bytes([first_byte, iter.next().unwrap()]) as u64)
165            }
166            Self::U32(iter) => {
167                let first_byte = iter.next()?;
168                Some(u32::from_le_bytes([
169                    first_byte,
170                    iter.next().unwrap(),
171                    iter.next().unwrap(),
172                    iter.next().unwrap(),
173                ]) as u64)
174            }
175            Self::U64(iter) => {
176                let first_byte = iter.next()?;
177                Some(u64::from_le_bytes([
178                    first_byte,
179                    iter.next().unwrap(),
180                    iter.next().unwrap(),
181                    iter.next().unwrap(),
182                    iter.next().unwrap(),
183                    iter.next().unwrap(),
184                    iter.next().unwrap(),
185                    iter.next().unwrap(),
186                ]))
187            }
188        }
189    }
190}
191
192#[cfg(test)]
193mod tests {
194    use super::*;
195
196    #[test]
197    fn test_bytepacked_integer_encoder() {
198        // Fits in u8
199        let mut encoder = BytepackedIntegerEncoder::with_capacity(10, 100);
200        unsafe {
201            encoder.append(50);
202            encoder.append(20);
203            encoder.append(30);
204        }
205        let data = encoder.into_data();
206        assert_eq!(data, vec![50, 20, 30]);
207
208        assert_eq!(
209            ByteUnpacker::new(data, 1).collect::<Vec<_>>(),
210            vec![50, 20, 30]
211        );
212
213        // Requires u16
214        let mut encoder = BytepackedIntegerEncoder::with_capacity(10, 1000);
215        unsafe {
216            encoder.append(500);
217            encoder.append(200);
218            encoder.append(300);
219        }
220        let data = encoder.into_data();
221        assert_eq!(data, vec![244, 1, 200, 0, 44, 1]);
222
223        assert_eq!(
224            ByteUnpacker::new(data, 2).collect::<Vec<_>>(),
225            vec![500, 200, 300]
226        );
227
228        // Requires u32
229        let mut encoder = BytepackedIntegerEncoder::with_capacity(10, 1000000);
230        unsafe {
231            encoder.append(500000);
232            encoder.append(200000);
233            encoder.append(300000);
234        }
235        let data = encoder.into_data();
236        assert_eq!(data, vec![32, 161, 7, 0, 64, 13, 3, 0, 224, 147, 4, 0]);
237
238        assert_eq!(
239            ByteUnpacker::new(data, 4).collect::<Vec<_>>(),
240            vec![500000, 200000, 300000]
241        );
242
243        // Requires u64
244        let mut encoder = BytepackedIntegerEncoder::with_capacity(10, 0x10000000000);
245        unsafe {
246            encoder.append(0x5000000000);
247            encoder.append(0x2000000000);
248            encoder.append(0x3000000000);
249        }
250        let data = encoder.into_data();
251        assert_eq!(
252            data,
253            vec![0, 0, 0, 0, 80, 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 48, 0, 0, 0]
254        );
255
256        assert_eq!(
257            ByteUnpacker::new(data, 8).collect::<Vec<_>>(),
258            vec![0x5000000000, 0x2000000000, 0x3000000000]
259        );
260    }
261}