lance_encoding/compression_algo/
fastlanes.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright The Lance Authors
3
4// NOTICE:
5// This file is a modification of the `fastlanes` crate: https://github.com/spiraldb/fastlanes
6// It is modified to allow a rust stable build
7//
8// The original code can be accessed at
9//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/bitpacking.rs
10//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/lib.rs
11//      https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/src/macros.rs
12//
13// The original code is licensed under the Apache Software License:
14// https://github.com/spiraldb/fastlanes/blob/8e0ff374f815d919d0c0ebdccf5ffd9e6dc7d663/LICENSE
15
16use arrayref::{array_mut_ref, array_ref};
17use core::mem::size_of;
18use paste::paste;
19
/// Row-group permutation used by the `index` helper inside the `pack!` and `unpack!` macros.
///
/// For a given `row`, the entry at position `row / 8` is multiplied by 16 and forms one
/// component of that element's position within the 1024-element block.
pub const FL_ORDER: [usize; 8] = [0, 4, 2, 6, 1, 5, 3, 7];
21
/// Element types that can be laid out in a 1024-bit wide virtual word.
///
/// Both constants have defaults derived from the size of the implementing type, so an
/// implementation is normally just an empty `impl` block.
pub trait FastLanes: Sized + Copy {
    /// Width of the implementing type, in bits.
    const T: usize = 8 * size_of::<Self>();
    /// How many `T`-bit values fit side by side in 1024 bits.
    const LANES: usize = 1024 / Self::T;
}
26
// Implement the trait for the basic unsigned integer types. The impl bodies are empty, so
// each type uses the default `T` (its size in bits) and `LANES` (1024 / T) from the trait.
impl FastLanes for u8 {}
impl FastLanes for u16 {}
impl FastLanes for u32 {}
impl FastLanes for u64 {}
32
// Packs the values of ONE lane of a 1024-element block into `$W` bits each.
//
// Arguments:
//   $T      - the element type (`u8`, `u16`, `u32` or `u64`, the types `seq_t!` accepts).
//   $W      - the packed bit width; compared against 0 and against the bit width of `$T`.
//   $packed - the indexable output that receives the packed words.
//   $lane   - the lane being packed.
//   |$idx| body - a closure-like "kernel": `body` is evaluated with `$idx` bound to the
//             position of the element to read and must yield that element.
//
// `$_1` matches a single token in front of the kernel's parameter name. It is replayed in
// front of `$idx:ident` when the nested `__kernel__` macro is defined, which is how the nested
// macro gets a metavariable of its own (callers are presumably expected to pass a literal `$`
// there -- confirm against the call sites).
//
// Packed word `k` of lane `l` is stored at `$packed[LANES * k + l]`.
macro_rules! pack {
    ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident | $($body:tt)*) => {
        // Turn the caller's kernel body into a macro that can be invoked once per row.
        macro_rules! __kernel__ {( $_1 $idx:ident ) => ( $($body)* )}
        {
            use paste::paste;

            // The number of bits of T.
            const T: usize = <$T>::T;

            // Position, within the 1024-element block, of the element at (`row`, `lane`):
            // the row's group of eight (`row / 8`) is permuted through `FL_ORDER`, and the
            // position inside the group (`row % 8`) advances in steps of 128.
            #[inline(always)]
            fn index(row: usize, lane: usize) -> usize {
                let o = row / 8;
                let s = row % 8;
                (FL_ORDER[o] * 16) + (s * 128) + lane
            }

            if $W == 0 {
                // Nothing to do if W is 0, since the packed array is zero bytes.
            } else if $W == T {
                // Special case for W=T, we can just copy the input value directly to the packed value.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    $packed[<$T>::LANES * row + $lane] = __kernel__!(idx);
                }));
            } else {
                // A mask of W bits.
                let mask: $T = (1 << $W) - 1;

                // Accumulator for the packed word currently being assembled for this lane.
                let mut tmp: $T = 0;

                // Loop over each of the rows of the lane.
                // Inlining this loop means all branches are known at compile time and
                // the code is auto-vectorized for SIMD execution.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    let src = __kernel__!(idx);
                    // Discard any bits above the packed width.
                    let src = src & mask;

                    // Shift the src bits into their position in the tmp output variable.
                    if row == 0 {
                        tmp = src;
                    } else {
                        // `%` binds tighter than `<<`: the shift amount is `(row * W) % T`.
                        tmp |= src << (row * $W) % T;
                    }

                    // If the next packed position is after our current one, then we have filled
                    // the current output and we can write the packed value.
                    let curr_word: usize = (row * $W) / T;
                    let next_word: usize = ((row + 1) * $W) / T;

                    // On the last row the assignment to `tmp` below is never read again,
                    // hence the lint suppression.
                    #[allow(unused_assignments)]
                    if next_word > curr_word {
                        $packed[<$T>::LANES * curr_word + $lane] = tmp;
                        let remaining_bits: usize = ((row + 1) * $W) % T;
                        // Keep the remaining bits for the next packed value.
                        // `-` binds tighter than `>>`: the shift amount is `W - remaining_bits`,
                        // i.e. the number of bits of `src` that went into the word just written.
                        tmp = src >> $W - remaining_bits;
                    }
                }));
            }
        }
    };
}
96
// Unpacks the values of ONE lane of a 1024-element block from `$W` bits each.
//
// Arguments:
//   $T      - the element type (`u8`, `u16`, `u32` or `u64`, the types `seq_t!` accepts).
//   $W      - the packed bit width; compared against 0 and against the bit width of `$T`.
//   $packed - the indexable input holding the packed words.
//   $lane   - the lane being unpacked.
//   |$idx, $elem| body - a closure-like "kernel": `body` is evaluated once per row with
//             `$idx` bound to the element's position in the block and `$elem` bound to
//             the unpacked value.
//
// `$_1` and `$_2` each match a single token in front of a kernel parameter name. They are
// replayed when the nested `__kernel__` macro is defined so that it gets metavariables of its
// own (callers are presumably expected to pass a literal `$` for both -- confirm against the
// call sites).
//
// Packed word `k` of lane `l` is read from `$packed[LANES * k + l]`.
macro_rules! unpack {
    ($T:ty, $W:expr, $packed:expr, $lane:expr, | $_1:tt $idx:ident, $_2:tt $elem:ident | $($body:tt)*) => {
        // Turn the caller's kernel body into a macro that can be invoked once per row.
        macro_rules! __kernel__ {( $_1 $idx:ident, $_2 $elem:ident ) => ( $($body)* )}
        {
            use paste::paste;

            // The number of bits of T.
            const T: usize = <$T>::T;

            // Position, within the 1024-element block, of the element at (`row`, `lane`):
            // the row's group of eight (`row / 8`) is permuted through `FL_ORDER`, and the
            // position inside the group (`row % 8`) advances in steps of 128.
            #[inline(always)]
            fn index(row: usize, lane: usize) -> usize {
                let o = row / 8;
                let s = row % 8;
                (FL_ORDER[o] * 16) + (s * 128) + lane
            }

            if $W == 0 {
                // Special case for W=0, we just need to zero the output.
                // We'll still respect the iteration order in case the kernel has side effects.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    let zero: $T = 0;
                    __kernel__!(idx, zero);
                }));
            } else if $W == T {
                // Special case for W=T, we can just copy the packed value directly to the output.
                paste!(seq_t!(row in $T {
                    let idx = index(row, $lane);
                    let src = $packed[<$T>::LANES * row + $lane];
                    __kernel__!(idx, src);
                }));
            } else {
                // Value with the low `width` bits set. `width == T` is handled separately and
                // `width % T` keeps the shift amount below the bit width of the type.
                #[inline]
                fn mask(width: usize) -> $T {
                    if width == T { <$T>::MAX } else { (1 << (width % T)) - 1 }
                }

                // `src` is the packed word currently being consumed; start with the lane's
                // first word. `tmp` receives the unpacked value of each row.
                let mut src: $T = $packed[$lane];
                let mut tmp: $T;

                paste!(seq_t!(row in $T {
                    // Figure out the packed positions
                    let curr_word: usize = (row * $W) / T;
                    let next_word = ((row + 1) * $W) / T;

                    // Bit offset of this row's value inside the current packed word.
                    let shift = (row * $W) % T;

                    if next_word > curr_word {
                        // Consume some bits from the curr packed input, the remainder are in the next
                        // packed input value
                        let remaining_bits = ((row + 1) * $W) % T;
                        let current_bits = $W - remaining_bits;
                        tmp = (src >> shift) & mask(current_bits);

                        // On the final row `next_word` equals W; no further word is loaded then.
                        if next_word < $W {
                            // Load the next packed value
                            src = $packed[<$T>::LANES * next_word + $lane];
                            // Consume the remaining bits from the next input value.
                            tmp |= (src & mask(remaining_bits)) << current_bits;
                        }
                    } else {
                        // Otherwise, just grab W bits from the src value
                        tmp = (src >> shift) & mask($W);
                    }

                    // Write out the unpacked value
                    let idx = index(row, $lane);
                    __kernel__!(idx, tmp);
                }));
            }
        }
    };
}
170
// Macro for repeating a code block bit_size_of::<T> times.
//
// `seq_t!(name in TYPE { ... })` forwards to `seq_macro::seq!` with the range `0..N`, where
// `N` is 8, 16, 32 or 64 for the type tokens `u8`, `u16`, `u32` and `u64` respectively; `name`
// is the identifier handed to `seq!` as the loop variable. The type is matched as a literal
// token, so only exactly those four spellings are accepted.
macro_rules! seq_t {
    ($ident:ident in u8 $body:tt) => {seq_macro::seq!($ident in 0..8 $body)};
    ($ident:ident in u16 $body:tt) => {seq_macro::seq!($ident in 0..16 $body)};
    ($ident:ident in u32 $body:tt) => {seq_macro::seq!($ident in 0..32 $body)};
    ($ident:ident in u64 $body:tt) => {seq_macro::seq!($ident in 0..64 $body)};
}
178
/// Bit-packing of 1024-element blocks where the packed bit width is chosen at run time.
///
/// Both methods receive the width as an ordinary `usize` argument and operate on plain
/// slices; the length requirements on those slices are the caller's responsibility (see the
/// `# Safety` section of each method).
pub trait BitPacking: FastLanes {
    /// Packs 1024 elements into `W` bits each, where `W` is runtime-known instead of
    /// compile-time known.
    ///
    /// * `width` - the packed width `W`, in bits.
    /// * `input` - the 1024 values to pack.
    /// * `output` - the buffer that receives the packed words.
    ///
    /// # Safety
    /// The input slice must be of exactly length 1024. The output slice must be of length
    /// `1024 * W / T`, where `T` is the bit-width of Self and `W` is the packed width.
    /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]);

    /// Unpacks 1024 elements from `W` bits each, where `W` is runtime-known instead of
    /// compile-time known.
    ///
    /// * `width` - the packed width `W`, in bits.
    /// * `input` - the packed words to decode.
    /// * `output` - the buffer that receives the 1024 unpacked values.
    ///
    /// # Safety
    /// The input slice must be of length `1024 * W / T`, where `T` is the bit-width of Self and `W`
    /// is the packed width. The output slice must be of exactly length 1024.
    /// These lengths are checked only with `debug_assert` (i.e., not checked on release builds).
    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]);
}
199
200impl BitPacking for u8 {
201    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
202        let packed_len = 128 * width / size_of::<Self>();
203        debug_assert_eq!(
204            output.len(),
205            packed_len,
206            "Output buffer must be of size 1024 * W / T"
207        );
208        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
209        debug_assert!(
210            width <= Self::T,
211            "Width must be less than or equal to {}",
212            Self::T
213        );
214
215        match width {
216            1 => pack_8_1(
217                array_ref![input, 0, 1024],
218                array_mut_ref![output, 0, 1024 / 8],
219            ),
220            2 => pack_8_2(
221                array_ref![input, 0, 1024],
222                array_mut_ref![output, 0, 1024 * 2 / 8],
223            ),
224            3 => pack_8_3(
225                array_ref![input, 0, 1024],
226                array_mut_ref![output, 0, 1024 * 3 / 8],
227            ),
228            4 => pack_8_4(
229                array_ref![input, 0, 1024],
230                array_mut_ref![output, 0, 1024 * 4 / 8],
231            ),
232            5 => pack_8_5(
233                array_ref![input, 0, 1024],
234                array_mut_ref![output, 0, 1024 * 5 / 8],
235            ),
236            6 => pack_8_6(
237                array_ref![input, 0, 1024],
238                array_mut_ref![output, 0, 1024 * 6 / 8],
239            ),
240            7 => pack_8_7(
241                array_ref![input, 0, 1024],
242                array_mut_ref![output, 0, 1024 * 7 / 8],
243            ),
244            8 => pack_8_8(
245                array_ref![input, 0, 1024],
246                array_mut_ref![output, 0, 1024 * 8 / 8],
247            ),
248
249            _ => unreachable!("Unsupported width: {}", width),
250        }
251    }
252
253    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
254        let packed_len = 128 * width / size_of::<Self>();
255        debug_assert_eq!(
256            input.len(),
257            packed_len,
258            "Input buffer must be of size 1024 * W / T"
259        );
260        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
261        debug_assert!(
262            width <= Self::T,
263            "Width must be less than or equal to {}",
264            Self::T
265        );
266
267        match width {
268            1 => unpack_8_1(
269                array_ref![input, 0, 1024 / 8],
270                array_mut_ref![output, 0, 1024],
271            ),
272            2 => unpack_8_2(
273                array_ref![input, 0, 1024 * 2 / 8],
274                array_mut_ref![output, 0, 1024],
275            ),
276            3 => unpack_8_3(
277                array_ref![input, 0, 1024 * 3 / 8],
278                array_mut_ref![output, 0, 1024],
279            ),
280            4 => unpack_8_4(
281                array_ref![input, 0, 1024 * 4 / 8],
282                array_mut_ref![output, 0, 1024],
283            ),
284            5 => unpack_8_5(
285                array_ref![input, 0, 1024 * 5 / 8],
286                array_mut_ref![output, 0, 1024],
287            ),
288            6 => unpack_8_6(
289                array_ref![input, 0, 1024 * 6 / 8],
290                array_mut_ref![output, 0, 1024],
291            ),
292            7 => unpack_8_7(
293                array_ref![input, 0, 1024 * 7 / 8],
294                array_mut_ref![output, 0, 1024],
295            ),
296            8 => unpack_8_8(
297                array_ref![input, 0, 1024 * 8 / 8],
298                array_mut_ref![output, 0, 1024],
299            ),
300
301            _ => unreachable!("Unsupported width: {}", width),
302        }
303    }
304}
305
306impl BitPacking for u16 {
307    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
308        let packed_len = 128 * width / size_of::<Self>();
309        debug_assert_eq!(
310            output.len(),
311            packed_len,
312            "Output buffer must be of size 1024 * W / T"
313        );
314        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
315        debug_assert!(
316            width <= Self::T,
317            "Width must be less than or equal to {}",
318            Self::T
319        );
320
321        match width {
322            1 => pack_16_1(
323                array_ref![input, 0, 1024],
324                array_mut_ref![output, 0, 1024 / 16],
325            ),
326            2 => pack_16_2(
327                array_ref![input, 0, 1024],
328                array_mut_ref![output, 0, 1024 * 2 / 16],
329            ),
330            3 => pack_16_3(
331                array_ref![input, 0, 1024],
332                array_mut_ref![output, 0, 1024 * 3 / 16],
333            ),
334            4 => pack_16_4(
335                array_ref![input, 0, 1024],
336                array_mut_ref![output, 0, 1024 * 4 / 16],
337            ),
338            5 => pack_16_5(
339                array_ref![input, 0, 1024],
340                array_mut_ref![output, 0, 1024 * 5 / 16],
341            ),
342            6 => pack_16_6(
343                array_ref![input, 0, 1024],
344                array_mut_ref![output, 0, 1024 * 6 / 16],
345            ),
346            7 => pack_16_7(
347                array_ref![input, 0, 1024],
348                array_mut_ref![output, 0, 1024 * 7 / 16],
349            ),
350            8 => pack_16_8(
351                array_ref![input, 0, 1024],
352                array_mut_ref![output, 0, 1024 * 8 / 16],
353            ),
354            9 => pack_16_9(
355                array_ref![input, 0, 1024],
356                array_mut_ref![output, 0, 1024 * 9 / 16],
357            ),
358
359            10 => pack_16_10(
360                array_ref![input, 0, 1024],
361                array_mut_ref![output, 0, 1024 * 10 / 16],
362            ),
363            11 => pack_16_11(
364                array_ref![input, 0, 1024],
365                array_mut_ref![output, 0, 1024 * 11 / 16],
366            ),
367            12 => pack_16_12(
368                array_ref![input, 0, 1024],
369                array_mut_ref![output, 0, 1024 * 12 / 16],
370            ),
371            13 => pack_16_13(
372                array_ref![input, 0, 1024],
373                array_mut_ref![output, 0, 1024 * 13 / 16],
374            ),
375            14 => pack_16_14(
376                array_ref![input, 0, 1024],
377                array_mut_ref![output, 0, 1024 * 14 / 16],
378            ),
379            15 => pack_16_15(
380                array_ref![input, 0, 1024],
381                array_mut_ref![output, 0, 1024 * 15 / 16],
382            ),
383            16 => pack_16_16(
384                array_ref![input, 0, 1024],
385                array_mut_ref![output, 0, 1024 * 16 / 16],
386            ),
387
388            _ => unreachable!("Unsupported width: {}", width),
389        }
390    }
391
392    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
393        let packed_len = 128 * width / size_of::<Self>();
394        debug_assert_eq!(
395            input.len(),
396            packed_len,
397            "Input buffer must be of size 1024 * W / T"
398        );
399        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
400        debug_assert!(
401            width <= Self::T,
402            "Width must be less than or equal to {}",
403            Self::T
404        );
405
406        match width {
407            1 => unpack_16_1(
408                array_ref![input, 0, 1024 / 16],
409                array_mut_ref![output, 0, 1024],
410            ),
411            2 => unpack_16_2(
412                array_ref![input, 0, 1024 * 2 / 16],
413                array_mut_ref![output, 0, 1024],
414            ),
415            3 => unpack_16_3(
416                array_ref![input, 0, 1024 * 3 / 16],
417                array_mut_ref![output, 0, 1024],
418            ),
419            4 => unpack_16_4(
420                array_ref![input, 0, 1024 * 4 / 16],
421                array_mut_ref![output, 0, 1024],
422            ),
423            5 => unpack_16_5(
424                array_ref![input, 0, 1024 * 5 / 16],
425                array_mut_ref![output, 0, 1024],
426            ),
427            6 => unpack_16_6(
428                array_ref![input, 0, 1024 * 6 / 16],
429                array_mut_ref![output, 0, 1024],
430            ),
431            7 => unpack_16_7(
432                array_ref![input, 0, 1024 * 7 / 16],
433                array_mut_ref![output, 0, 1024],
434            ),
435            8 => unpack_16_8(
436                array_ref![input, 0, 1024 * 8 / 16],
437                array_mut_ref![output, 0, 1024],
438            ),
439            9 => unpack_16_9(
440                array_ref![input, 0, 1024 * 9 / 16],
441                array_mut_ref![output, 0, 1024],
442            ),
443
444            10 => unpack_16_10(
445                array_ref![input, 0, 1024 * 10 / 16],
446                array_mut_ref![output, 0, 1024],
447            ),
448            11 => unpack_16_11(
449                array_ref![input, 0, 1024 * 11 / 16],
450                array_mut_ref![output, 0, 1024],
451            ),
452            12 => unpack_16_12(
453                array_ref![input, 0, 1024 * 12 / 16],
454                array_mut_ref![output, 0, 1024],
455            ),
456            13 => unpack_16_13(
457                array_ref![input, 0, 1024 * 13 / 16],
458                array_mut_ref![output, 0, 1024],
459            ),
460            14 => unpack_16_14(
461                array_ref![input, 0, 1024 * 14 / 16],
462                array_mut_ref![output, 0, 1024],
463            ),
464            15 => unpack_16_15(
465                array_ref![input, 0, 1024 * 15 / 16],
466                array_mut_ref![output, 0, 1024],
467            ),
468            16 => unpack_16_16(
469                array_ref![input, 0, 1024 * 16 / 16],
470                array_mut_ref![output, 0, 1024],
471            ),
472
473            _ => unreachable!("Unsupported width: {}", width),
474        }
475    }
476}
477
478impl BitPacking for u32 {
479    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
480        let packed_len = 128 * width / size_of::<Self>();
481        debug_assert_eq!(
482            output.len(),
483            packed_len,
484            "Output buffer must be of size 1024 * W / T"
485        );
486        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
487        debug_assert!(
488            width <= Self::T,
489            "Width must be less than or equal to {}",
490            Self::T
491        );
492
493        match width {
494            1 => pack_32_1(
495                array_ref![input, 0, 1024],
496                array_mut_ref![output, 0, 1024 / 32],
497            ),
498            2 => pack_32_2(
499                array_ref![input, 0, 1024],
500                array_mut_ref![output, 0, 1024 * 2 / 32],
501            ),
502            3 => pack_32_3(
503                array_ref![input, 0, 1024],
504                array_mut_ref![output, 0, 1024 * 3 / 32],
505            ),
506            4 => pack_32_4(
507                array_ref![input, 0, 1024],
508                array_mut_ref![output, 0, 1024 * 4 / 32],
509            ),
510            5 => pack_32_5(
511                array_ref![input, 0, 1024],
512                array_mut_ref![output, 0, 1024 * 5 / 32],
513            ),
514            6 => pack_32_6(
515                array_ref![input, 0, 1024],
516                array_mut_ref![output, 0, 1024 * 6 / 32],
517            ),
518            7 => pack_32_7(
519                array_ref![input, 0, 1024],
520                array_mut_ref![output, 0, 1024 * 7 / 32],
521            ),
522            8 => pack_32_8(
523                array_ref![input, 0, 1024],
524                array_mut_ref![output, 0, 1024 * 8 / 32],
525            ),
526            9 => pack_32_9(
527                array_ref![input, 0, 1024],
528                array_mut_ref![output, 0, 1024 * 9 / 32],
529            ),
530
531            10 => pack_32_10(
532                array_ref![input, 0, 1024],
533                array_mut_ref![output, 0, 1024 * 10 / 32],
534            ),
535            11 => pack_32_11(
536                array_ref![input, 0, 1024],
537                array_mut_ref![output, 0, 1024 * 11 / 32],
538            ),
539            12 => pack_32_12(
540                array_ref![input, 0, 1024],
541                array_mut_ref![output, 0, 1024 * 12 / 32],
542            ),
543            13 => pack_32_13(
544                array_ref![input, 0, 1024],
545                array_mut_ref![output, 0, 1024 * 13 / 32],
546            ),
547            14 => pack_32_14(
548                array_ref![input, 0, 1024],
549                array_mut_ref![output, 0, 1024 * 14 / 32],
550            ),
551            15 => pack_32_15(
552                array_ref![input, 0, 1024],
553                array_mut_ref![output, 0, 1024 * 15 / 32],
554            ),
555            16 => pack_32_16(
556                array_ref![input, 0, 1024],
557                array_mut_ref![output, 0, 1024 * 16 / 32],
558            ),
559            17 => pack_32_17(
560                array_ref![input, 0, 1024],
561                array_mut_ref![output, 0, 1024 * 17 / 32],
562            ),
563            18 => pack_32_18(
564                array_ref![input, 0, 1024],
565                array_mut_ref![output, 0, 1024 * 18 / 32],
566            ),
567            19 => pack_32_19(
568                array_ref![input, 0, 1024],
569                array_mut_ref![output, 0, 1024 * 19 / 32],
570            ),
571
572            20 => pack_32_20(
573                array_ref![input, 0, 1024],
574                array_mut_ref![output, 0, 1024 * 20 / 32],
575            ),
576            21 => pack_32_21(
577                array_ref![input, 0, 1024],
578                array_mut_ref![output, 0, 1024 * 21 / 32],
579            ),
580            22 => pack_32_22(
581                array_ref![input, 0, 1024],
582                array_mut_ref![output, 0, 1024 * 22 / 32],
583            ),
584            23 => pack_32_23(
585                array_ref![input, 0, 1024],
586                array_mut_ref![output, 0, 1024 * 23 / 32],
587            ),
588            24 => pack_32_24(
589                array_ref![input, 0, 1024],
590                array_mut_ref![output, 0, 1024 * 24 / 32],
591            ),
592            25 => pack_32_25(
593                array_ref![input, 0, 1024],
594                array_mut_ref![output, 0, 1024 * 25 / 32],
595            ),
596            26 => pack_32_26(
597                array_ref![input, 0, 1024],
598                array_mut_ref![output, 0, 1024 * 26 / 32],
599            ),
600            27 => pack_32_27(
601                array_ref![input, 0, 1024],
602                array_mut_ref![output, 0, 1024 * 27 / 32],
603            ),
604            28 => pack_32_28(
605                array_ref![input, 0, 1024],
606                array_mut_ref![output, 0, 1024 * 28 / 32],
607            ),
608            29 => pack_32_29(
609                array_ref![input, 0, 1024],
610                array_mut_ref![output, 0, 1024 * 29 / 32],
611            ),
612
613            30 => pack_32_30(
614                array_ref![input, 0, 1024],
615                array_mut_ref![output, 0, 1024 * 30 / 32],
616            ),
617            31 => pack_32_31(
618                array_ref![input, 0, 1024],
619                array_mut_ref![output, 0, 1024 * 31 / 32],
620            ),
621            32 => pack_32_32(
622                array_ref![input, 0, 1024],
623                array_mut_ref![output, 0, 1024 * 32 / 32],
624            ),
625
626            _ => unreachable!("Unsupported width: {}", width),
627        }
628    }
629
630    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
631        let packed_len = 128 * width / size_of::<Self>();
632        debug_assert_eq!(
633            input.len(),
634            packed_len,
635            "Input buffer must be of size 1024 * W / T"
636        );
637        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
638        debug_assert!(
639            width <= Self::T,
640            "Width must be less than or equal to {}",
641            Self::T
642        );
643
644        match width {
645            1 => unpack_32_1(
646                array_ref![input, 0, 1024 / 32],
647                array_mut_ref![output, 0, 1024],
648            ),
649            2 => unpack_32_2(
650                array_ref![input, 0, 1024 * 2 / 32],
651                array_mut_ref![output, 0, 1024],
652            ),
653            3 => unpack_32_3(
654                array_ref![input, 0, 1024 * 3 / 32],
655                array_mut_ref![output, 0, 1024],
656            ),
657            4 => unpack_32_4(
658                array_ref![input, 0, 1024 * 4 / 32],
659                array_mut_ref![output, 0, 1024],
660            ),
661            5 => unpack_32_5(
662                array_ref![input, 0, 1024 * 5 / 32],
663                array_mut_ref![output, 0, 1024],
664            ),
665            6 => unpack_32_6(
666                array_ref![input, 0, 1024 * 6 / 32],
667                array_mut_ref![output, 0, 1024],
668            ),
669            7 => unpack_32_7(
670                array_ref![input, 0, 1024 * 7 / 32],
671                array_mut_ref![output, 0, 1024],
672            ),
673            8 => unpack_32_8(
674                array_ref![input, 0, 1024 * 8 / 32],
675                array_mut_ref![output, 0, 1024],
676            ),
677            9 => unpack_32_9(
678                array_ref![input, 0, 1024 * 9 / 32],
679                array_mut_ref![output, 0, 1024],
680            ),
681
682            10 => unpack_32_10(
683                array_ref![input, 0, 1024 * 10 / 32],
684                array_mut_ref![output, 0, 1024],
685            ),
686            11 => unpack_32_11(
687                array_ref![input, 0, 1024 * 11 / 32],
688                array_mut_ref![output, 0, 1024],
689            ),
690            12 => unpack_32_12(
691                array_ref![input, 0, 1024 * 12 / 32],
692                array_mut_ref![output, 0, 1024],
693            ),
694            13 => unpack_32_13(
695                array_ref![input, 0, 1024 * 13 / 32],
696                array_mut_ref![output, 0, 1024],
697            ),
698            14 => unpack_32_14(
699                array_ref![input, 0, 1024 * 14 / 32],
700                array_mut_ref![output, 0, 1024],
701            ),
702            15 => unpack_32_15(
703                array_ref![input, 0, 1024 * 15 / 32],
704                array_mut_ref![output, 0, 1024],
705            ),
706            16 => unpack_32_16(
707                array_ref![input, 0, 1024 * 16 / 32],
708                array_mut_ref![output, 0, 1024],
709            ),
710            17 => unpack_32_17(
711                array_ref![input, 0, 1024 * 17 / 32],
712                array_mut_ref![output, 0, 1024],
713            ),
714            18 => unpack_32_18(
715                array_ref![input, 0, 1024 * 18 / 32],
716                array_mut_ref![output, 0, 1024],
717            ),
718            19 => unpack_32_19(
719                array_ref![input, 0, 1024 * 19 / 32],
720                array_mut_ref![output, 0, 1024],
721            ),
722
723            20 => unpack_32_20(
724                array_ref![input, 0, 1024 * 20 / 32],
725                array_mut_ref![output, 0, 1024],
726            ),
727            21 => unpack_32_21(
728                array_ref![input, 0, 1024 * 21 / 32],
729                array_mut_ref![output, 0, 1024],
730            ),
731            22 => unpack_32_22(
732                array_ref![input, 0, 1024 * 22 / 32],
733                array_mut_ref![output, 0, 1024],
734            ),
735            23 => unpack_32_23(
736                array_ref![input, 0, 1024 * 23 / 32],
737                array_mut_ref![output, 0, 1024],
738            ),
739            24 => unpack_32_24(
740                array_ref![input, 0, 1024 * 24 / 32],
741                array_mut_ref![output, 0, 1024],
742            ),
743            25 => unpack_32_25(
744                array_ref![input, 0, 1024 * 25 / 32],
745                array_mut_ref![output, 0, 1024],
746            ),
747            26 => unpack_32_26(
748                array_ref![input, 0, 1024 * 26 / 32],
749                array_mut_ref![output, 0, 1024],
750            ),
751            27 => unpack_32_27(
752                array_ref![input, 0, 1024 * 27 / 32],
753                array_mut_ref![output, 0, 1024],
754            ),
755            28 => unpack_32_28(
756                array_ref![input, 0, 1024 * 28 / 32],
757                array_mut_ref![output, 0, 1024],
758            ),
759            29 => unpack_32_29(
760                array_ref![input, 0, 1024 * 29 / 32],
761                array_mut_ref![output, 0, 1024],
762            ),
763
764            30 => unpack_32_30(
765                array_ref![input, 0, 1024 * 30 / 32],
766                array_mut_ref![output, 0, 1024],
767            ),
768            31 => unpack_32_31(
769                array_ref![input, 0, 1024 * 31 / 32],
770                array_mut_ref![output, 0, 1024],
771            ),
772            32 => unpack_32_32(
773                array_ref![input, 0, 1024 * 32 / 32],
774                array_mut_ref![output, 0, 1024],
775            ),
776
777            _ => unreachable!("Unsupported width: {}", width),
778        }
779    }
780}
781
782impl BitPacking for u64 {
783    unsafe fn unchecked_pack(width: usize, input: &[Self], output: &mut [Self]) {
784        let packed_len = 128 * width / size_of::<Self>();
785        debug_assert_eq!(
786            output.len(),
787            packed_len,
788            "Output buffer must be of size 1024 * W / T"
789        );
790        debug_assert_eq!(input.len(), 1024, "Input buffer must be of size 1024");
791        debug_assert!(
792            width <= Self::T,
793            "Width must be less than or equal to {}",
794            Self::T
795        );
796
797        match width {
798            1 => pack_64_1(
799                array_ref![input, 0, 1024],
800                array_mut_ref![output, 0, 1024 / 64],
801            ),
802            2 => pack_64_2(
803                array_ref![input, 0, 1024],
804                array_mut_ref![output, 0, 1024 * 2 / 64],
805            ),
806            3 => pack_64_3(
807                array_ref![input, 0, 1024],
808                array_mut_ref![output, 0, 1024 * 3 / 64],
809            ),
810            4 => pack_64_4(
811                array_ref![input, 0, 1024],
812                array_mut_ref![output, 0, 1024 * 4 / 64],
813            ),
814            5 => pack_64_5(
815                array_ref![input, 0, 1024],
816                array_mut_ref![output, 0, 1024 * 5 / 64],
817            ),
818            6 => pack_64_6(
819                array_ref![input, 0, 1024],
820                array_mut_ref![output, 0, 1024 * 6 / 64],
821            ),
822            7 => pack_64_7(
823                array_ref![input, 0, 1024],
824                array_mut_ref![output, 0, 1024 * 7 / 64],
825            ),
826            8 => pack_64_8(
827                array_ref![input, 0, 1024],
828                array_mut_ref![output, 0, 1024 * 8 / 64],
829            ),
830            9 => pack_64_9(
831                array_ref![input, 0, 1024],
832                array_mut_ref![output, 0, 1024 * 9 / 64],
833            ),
834
835            10 => pack_64_10(
836                array_ref![input, 0, 1024],
837                array_mut_ref![output, 0, 1024 * 10 / 64],
838            ),
839            11 => pack_64_11(
840                array_ref![input, 0, 1024],
841                array_mut_ref![output, 0, 1024 * 11 / 64],
842            ),
843            12 => pack_64_12(
844                array_ref![input, 0, 1024],
845                array_mut_ref![output, 0, 1024 * 12 / 64],
846            ),
847            13 => pack_64_13(
848                array_ref![input, 0, 1024],
849                array_mut_ref![output, 0, 1024 * 13 / 64],
850            ),
851            14 => pack_64_14(
852                array_ref![input, 0, 1024],
853                array_mut_ref![output, 0, 1024 * 14 / 64],
854            ),
855            15 => pack_64_15(
856                array_ref![input, 0, 1024],
857                array_mut_ref![output, 0, 1024 * 15 / 64],
858            ),
859            16 => pack_64_16(
860                array_ref![input, 0, 1024],
861                array_mut_ref![output, 0, 1024 * 16 / 64],
862            ),
863            17 => pack_64_17(
864                array_ref![input, 0, 1024],
865                array_mut_ref![output, 0, 1024 * 17 / 64],
866            ),
867            18 => pack_64_18(
868                array_ref![input, 0, 1024],
869                array_mut_ref![output, 0, 1024 * 18 / 64],
870            ),
871            19 => pack_64_19(
872                array_ref![input, 0, 1024],
873                array_mut_ref![output, 0, 1024 * 19 / 64],
874            ),
875
876            20 => pack_64_20(
877                array_ref![input, 0, 1024],
878                array_mut_ref![output, 0, 1024 * 20 / 64],
879            ),
880            21 => pack_64_21(
881                array_ref![input, 0, 1024],
882                array_mut_ref![output, 0, 1024 * 21 / 64],
883            ),
884            22 => pack_64_22(
885                array_ref![input, 0, 1024],
886                array_mut_ref![output, 0, 1024 * 22 / 64],
887            ),
888            23 => pack_64_23(
889                array_ref![input, 0, 1024],
890                array_mut_ref![output, 0, 1024 * 23 / 64],
891            ),
892            24 => pack_64_24(
893                array_ref![input, 0, 1024],
894                array_mut_ref![output, 0, 1024 * 24 / 64],
895            ),
896            25 => pack_64_25(
897                array_ref![input, 0, 1024],
898                array_mut_ref![output, 0, 1024 * 25 / 64],
899            ),
900            26 => pack_64_26(
901                array_ref![input, 0, 1024],
902                array_mut_ref![output, 0, 1024 * 26 / 64],
903            ),
904            27 => pack_64_27(
905                array_ref![input, 0, 1024],
906                array_mut_ref![output, 0, 1024 * 27 / 64],
907            ),
908            28 => pack_64_28(
909                array_ref![input, 0, 1024],
910                array_mut_ref![output, 0, 1024 * 28 / 64],
911            ),
912            29 => pack_64_29(
913                array_ref![input, 0, 1024],
914                array_mut_ref![output, 0, 1024 * 29 / 64],
915            ),
916
917            30 => pack_64_30(
918                array_ref![input, 0, 1024],
919                array_mut_ref![output, 0, 1024 * 30 / 64],
920            ),
921            31 => pack_64_31(
922                array_ref![input, 0, 1024],
923                array_mut_ref![output, 0, 1024 * 31 / 64],
924            ),
925            32 => pack_64_32(
926                array_ref![input, 0, 1024],
927                array_mut_ref![output, 0, 1024 * 32 / 64],
928            ),
929            33 => pack_64_33(
930                array_ref![input, 0, 1024],
931                array_mut_ref![output, 0, 1024 * 33 / 64],
932            ),
933            34 => pack_64_34(
934                array_ref![input, 0, 1024],
935                array_mut_ref![output, 0, 1024 * 34 / 64],
936            ),
937            35 => pack_64_35(
938                array_ref![input, 0, 1024],
939                array_mut_ref![output, 0, 1024 * 35 / 64],
940            ),
941            36 => pack_64_36(
942                array_ref![input, 0, 1024],
943                array_mut_ref![output, 0, 1024 * 36 / 64],
944            ),
945            37 => pack_64_37(
946                array_ref![input, 0, 1024],
947                array_mut_ref![output, 0, 1024 * 37 / 64],
948            ),
949            38 => pack_64_38(
950                array_ref![input, 0, 1024],
951                array_mut_ref![output, 0, 1024 * 38 / 64],
952            ),
953            39 => pack_64_39(
954                array_ref![input, 0, 1024],
955                array_mut_ref![output, 0, 1024 * 39 / 64],
956            ),
957
958            40 => pack_64_40(
959                array_ref![input, 0, 1024],
960                array_mut_ref![output, 0, 1024 * 40 / 64],
961            ),
962            41 => pack_64_41(
963                array_ref![input, 0, 1024],
964                array_mut_ref![output, 0, 1024 * 41 / 64],
965            ),
966            42 => pack_64_42(
967                array_ref![input, 0, 1024],
968                array_mut_ref![output, 0, 1024 * 42 / 64],
969            ),
970            43 => pack_64_43(
971                array_ref![input, 0, 1024],
972                array_mut_ref![output, 0, 1024 * 43 / 64],
973            ),
974            44 => pack_64_44(
975                array_ref![input, 0, 1024],
976                array_mut_ref![output, 0, 1024 * 44 / 64],
977            ),
978            45 => pack_64_45(
979                array_ref![input, 0, 1024],
980                array_mut_ref![output, 0, 1024 * 45 / 64],
981            ),
982            46 => pack_64_46(
983                array_ref![input, 0, 1024],
984                array_mut_ref![output, 0, 1024 * 46 / 64],
985            ),
986            47 => pack_64_47(
987                array_ref![input, 0, 1024],
988                array_mut_ref![output, 0, 1024 * 47 / 64],
989            ),
990            48 => pack_64_48(
991                array_ref![input, 0, 1024],
992                array_mut_ref![output, 0, 1024 * 48 / 64],
993            ),
994            49 => pack_64_49(
995                array_ref![input, 0, 1024],
996                array_mut_ref![output, 0, 1024 * 49 / 64],
997            ),
998
999            50 => pack_64_50(
1000                array_ref![input, 0, 1024],
1001                array_mut_ref![output, 0, 1024 * 50 / 64],
1002            ),
1003            51 => pack_64_51(
1004                array_ref![input, 0, 1024],
1005                array_mut_ref![output, 0, 1024 * 51 / 64],
1006            ),
1007            52 => pack_64_52(
1008                array_ref![input, 0, 1024],
1009                array_mut_ref![output, 0, 1024 * 52 / 64],
1010            ),
1011            53 => pack_64_53(
1012                array_ref![input, 0, 1024],
1013                array_mut_ref![output, 0, 1024 * 53 / 64],
1014            ),
1015            54 => pack_64_54(
1016                array_ref![input, 0, 1024],
1017                array_mut_ref![output, 0, 1024 * 54 / 64],
1018            ),
1019            55 => pack_64_55(
1020                array_ref![input, 0, 1024],
1021                array_mut_ref![output, 0, 1024 * 55 / 64],
1022            ),
1023            56 => pack_64_56(
1024                array_ref![input, 0, 1024],
1025                array_mut_ref![output, 0, 1024 * 56 / 64],
1026            ),
1027            57 => pack_64_57(
1028                array_ref![input, 0, 1024],
1029                array_mut_ref![output, 0, 1024 * 57 / 64],
1030            ),
1031            58 => pack_64_58(
1032                array_ref![input, 0, 1024],
1033                array_mut_ref![output, 0, 1024 * 58 / 64],
1034            ),
1035            59 => pack_64_59(
1036                array_ref![input, 0, 1024],
1037                array_mut_ref![output, 0, 1024 * 59 / 64],
1038            ),
1039
1040            60 => pack_64_60(
1041                array_ref![input, 0, 1024],
1042                array_mut_ref![output, 0, 1024 * 60 / 64],
1043            ),
1044            61 => pack_64_61(
1045                array_ref![input, 0, 1024],
1046                array_mut_ref![output, 0, 1024 * 61 / 64],
1047            ),
1048            62 => pack_64_62(
1049                array_ref![input, 0, 1024],
1050                array_mut_ref![output, 0, 1024 * 62 / 64],
1051            ),
1052            63 => pack_64_63(
1053                array_ref![input, 0, 1024],
1054                array_mut_ref![output, 0, 1024 * 63 / 64],
1055            ),
1056            64 => pack_64_64(
1057                array_ref![input, 0, 1024],
1058                array_mut_ref![output, 0, 1024 * 64 / 64],
1059            ),
1060
1061            _ => unreachable!("Unsupported width: {}", width),
1062        }
1063    }
1064
1065    unsafe fn unchecked_unpack(width: usize, input: &[Self], output: &mut [Self]) {
1066        let packed_len = 128 * width / size_of::<Self>();
1067        debug_assert_eq!(
1068            input.len(),
1069            packed_len,
1070            "Input buffer must be of size 1024 * W / T"
1071        );
1072        debug_assert_eq!(output.len(), 1024, "Output buffer must be of size 1024");
1073        debug_assert!(
1074            width <= Self::T,
1075            "Width must be less than or equal to {}",
1076            Self::T
1077        );
1078
1079        match width {
1080            1 => unpack_64_1(
1081                array_ref![input, 0, 1024 / 64],
1082                array_mut_ref![output, 0, 1024],
1083            ),
1084            2 => unpack_64_2(
1085                array_ref![input, 0, 1024 * 2 / 64],
1086                array_mut_ref![output, 0, 1024],
1087            ),
1088            3 => unpack_64_3(
1089                array_ref![input, 0, 1024 * 3 / 64],
1090                array_mut_ref![output, 0, 1024],
1091            ),
1092            4 => unpack_64_4(
1093                array_ref![input, 0, 1024 * 4 / 64],
1094                array_mut_ref![output, 0, 1024],
1095            ),
1096            5 => unpack_64_5(
1097                array_ref![input, 0, 1024 * 5 / 64],
1098                array_mut_ref![output, 0, 1024],
1099            ),
1100            6 => unpack_64_6(
1101                array_ref![input, 0, 1024 * 6 / 64],
1102                array_mut_ref![output, 0, 1024],
1103            ),
1104            7 => unpack_64_7(
1105                array_ref![input, 0, 1024 * 7 / 64],
1106                array_mut_ref![output, 0, 1024],
1107            ),
1108            8 => unpack_64_8(
1109                array_ref![input, 0, 1024 * 8 / 64],
1110                array_mut_ref![output, 0, 1024],
1111            ),
1112            9 => unpack_64_9(
1113                array_ref![input, 0, 1024 * 9 / 64],
1114                array_mut_ref![output, 0, 1024],
1115            ),
1116
1117            10 => unpack_64_10(
1118                array_ref![input, 0, 1024 * 10 / 64],
1119                array_mut_ref![output, 0, 1024],
1120            ),
1121            11 => unpack_64_11(
1122                array_ref![input, 0, 1024 * 11 / 64],
1123                array_mut_ref![output, 0, 1024],
1124            ),
1125            12 => unpack_64_12(
1126                array_ref![input, 0, 1024 * 12 / 64],
1127                array_mut_ref![output, 0, 1024],
1128            ),
1129            13 => unpack_64_13(
1130                array_ref![input, 0, 1024 * 13 / 64],
1131                array_mut_ref![output, 0, 1024],
1132            ),
1133            14 => unpack_64_14(
1134                array_ref![input, 0, 1024 * 14 / 64],
1135                array_mut_ref![output, 0, 1024],
1136            ),
1137            15 => unpack_64_15(
1138                array_ref![input, 0, 1024 * 15 / 64],
1139                array_mut_ref![output, 0, 1024],
1140            ),
1141            16 => unpack_64_16(
1142                array_ref![input, 0, 1024 * 16 / 64],
1143                array_mut_ref![output, 0, 1024],
1144            ),
1145            17 => unpack_64_17(
1146                array_ref![input, 0, 1024 * 17 / 64],
1147                array_mut_ref![output, 0, 1024],
1148            ),
1149            18 => unpack_64_18(
1150                array_ref![input, 0, 1024 * 18 / 64],
1151                array_mut_ref![output, 0, 1024],
1152            ),
1153            19 => unpack_64_19(
1154                array_ref![input, 0, 1024 * 19 / 64],
1155                array_mut_ref![output, 0, 1024],
1156            ),
1157
1158            20 => unpack_64_20(
1159                array_ref![input, 0, 1024 * 20 / 64],
1160                array_mut_ref![output, 0, 1024],
1161            ),
1162            21 => unpack_64_21(
1163                array_ref![input, 0, 1024 * 21 / 64],
1164                array_mut_ref![output, 0, 1024],
1165            ),
1166            22 => unpack_64_22(
1167                array_ref![input, 0, 1024 * 22 / 64],
1168                array_mut_ref![output, 0, 1024],
1169            ),
1170            23 => unpack_64_23(
1171                array_ref![input, 0, 1024 * 23 / 64],
1172                array_mut_ref![output, 0, 1024],
1173            ),
1174            24 => unpack_64_24(
1175                array_ref![input, 0, 1024 * 24 / 64],
1176                array_mut_ref![output, 0, 1024],
1177            ),
1178            25 => unpack_64_25(
1179                array_ref![input, 0, 1024 * 25 / 64],
1180                array_mut_ref![output, 0, 1024],
1181            ),
1182            26 => unpack_64_26(
1183                array_ref![input, 0, 1024 * 26 / 64],
1184                array_mut_ref![output, 0, 1024],
1185            ),
1186            27 => unpack_64_27(
1187                array_ref![input, 0, 1024 * 27 / 64],
1188                array_mut_ref![output, 0, 1024],
1189            ),
1190            28 => unpack_64_28(
1191                array_ref![input, 0, 1024 * 28 / 64],
1192                array_mut_ref![output, 0, 1024],
1193            ),
1194            29 => unpack_64_29(
1195                array_ref![input, 0, 1024 * 29 / 64],
1196                array_mut_ref![output, 0, 1024],
1197            ),
1198
1199            30 => unpack_64_30(
1200                array_ref![input, 0, 1024 * 30 / 64],
1201                array_mut_ref![output, 0, 1024],
1202            ),
1203            31 => unpack_64_31(
1204                array_ref![input, 0, 1024 * 31 / 64],
1205                array_mut_ref![output, 0, 1024],
1206            ),
1207            32 => unpack_64_32(
1208                array_ref![input, 0, 1024 * 32 / 64],
1209                array_mut_ref![output, 0, 1024],
1210            ),
1211            33 => unpack_64_33(
1212                array_ref![input, 0, 1024 * 33 / 64],
1213                array_mut_ref![output, 0, 1024],
1214            ),
1215            34 => unpack_64_34(
1216                array_ref![input, 0, 1024 * 34 / 64],
1217                array_mut_ref![output, 0, 1024],
1218            ),
1219            35 => unpack_64_35(
1220                array_ref![input, 0, 1024 * 35 / 64],
1221                array_mut_ref![output, 0, 1024],
1222            ),
1223            36 => unpack_64_36(
1224                array_ref![input, 0, 1024 * 36 / 64],
1225                array_mut_ref![output, 0, 1024],
1226            ),
1227            37 => unpack_64_37(
1228                array_ref![input, 0, 1024 * 37 / 64],
1229                array_mut_ref![output, 0, 1024],
1230            ),
1231            38 => unpack_64_38(
1232                array_ref![input, 0, 1024 * 38 / 64],
1233                array_mut_ref![output, 0, 1024],
1234            ),
1235            39 => unpack_64_39(
1236                array_ref![input, 0, 1024 * 39 / 64],
1237                array_mut_ref![output, 0, 1024],
1238            ),
1239
1240            40 => unpack_64_40(
1241                array_ref![input, 0, 1024 * 40 / 64],
1242                array_mut_ref![output, 0, 1024],
1243            ),
1244            41 => unpack_64_41(
1245                array_ref![input, 0, 1024 * 41 / 64],
1246                array_mut_ref![output, 0, 1024],
1247            ),
1248            42 => unpack_64_42(
1249                array_ref![input, 0, 1024 * 42 / 64],
1250                array_mut_ref![output, 0, 1024],
1251            ),
1252            43 => unpack_64_43(
1253                array_ref![input, 0, 1024 * 43 / 64],
1254                array_mut_ref![output, 0, 1024],
1255            ),
1256            44 => unpack_64_44(
1257                array_ref![input, 0, 1024 * 44 / 64],
1258                array_mut_ref![output, 0, 1024],
1259            ),
1260            45 => unpack_64_45(
1261                array_ref![input, 0, 1024 * 45 / 64],
1262                array_mut_ref![output, 0, 1024],
1263            ),
1264            46 => unpack_64_46(
1265                array_ref![input, 0, 1024 * 46 / 64],
1266                array_mut_ref![output, 0, 1024],
1267            ),
1268            47 => unpack_64_47(
1269                array_ref![input, 0, 1024 * 47 / 64],
1270                array_mut_ref![output, 0, 1024],
1271            ),
1272            48 => unpack_64_48(
1273                array_ref![input, 0, 1024 * 48 / 64],
1274                array_mut_ref![output, 0, 1024],
1275            ),
1276            49 => unpack_64_49(
1277                array_ref![input, 0, 1024 * 49 / 64],
1278                array_mut_ref![output, 0, 1024],
1279            ),
1280
1281            50 => unpack_64_50(
1282                array_ref![input, 0, 1024 * 50 / 64],
1283                array_mut_ref![output, 0, 1024],
1284            ),
1285            51 => unpack_64_51(
1286                array_ref![input, 0, 1024 * 51 / 64],
1287                array_mut_ref![output, 0, 1024],
1288            ),
1289            52 => unpack_64_52(
1290                array_ref![input, 0, 1024 * 52 / 64],
1291                array_mut_ref![output, 0, 1024],
1292            ),
1293            53 => unpack_64_53(
1294                array_ref![input, 0, 1024 * 53 / 64],
1295                array_mut_ref![output, 0, 1024],
1296            ),
1297            54 => unpack_64_54(
1298                array_ref![input, 0, 1024 * 54 / 64],
1299                array_mut_ref![output, 0, 1024],
1300            ),
1301            55 => unpack_64_55(
1302                array_ref![input, 0, 1024 * 55 / 64],
1303                array_mut_ref![output, 0, 1024],
1304            ),
1305            56 => unpack_64_56(
1306                array_ref![input, 0, 1024 * 56 / 64],
1307                array_mut_ref![output, 0, 1024],
1308            ),
1309            57 => unpack_64_57(
1310                array_ref![input, 0, 1024 * 57 / 64],
1311                array_mut_ref![output, 0, 1024],
1312            ),
1313            58 => unpack_64_58(
1314                array_ref![input, 0, 1024 * 58 / 64],
1315                array_mut_ref![output, 0, 1024],
1316            ),
1317            59 => unpack_64_59(
1318                array_ref![input, 0, 1024 * 59 / 64],
1319                array_mut_ref![output, 0, 1024],
1320            ),
1321
1322            60 => unpack_64_60(
1323                array_ref![input, 0, 1024 * 60 / 64],
1324                array_mut_ref![output, 0, 1024],
1325            ),
1326            61 => unpack_64_61(
1327                array_ref![input, 0, 1024 * 61 / 64],
1328                array_mut_ref![output, 0, 1024],
1329            ),
1330            62 => unpack_64_62(
1331                array_ref![input, 0, 1024 * 62 / 64],
1332                array_mut_ref![output, 0, 1024],
1333            ),
1334            63 => unpack_64_63(
1335                array_ref![input, 0, 1024 * 63 / 64],
1336                array_mut_ref![output, 0, 1024],
1337            ),
1338            64 => unpack_64_64(
1339                array_ref![input, 0, 1024 * 64 / 64],
1340                array_mut_ref![output, 0, 1024],
1341            ),
1342
1343            _ => unreachable!("Unsupported width: {}", width),
1344        }
1345    }
1346}
1347
1348macro_rules! unpack_8 {
1349    ($name:ident, $bits:expr) => {
1350        fn $name(input: &[u8; 1024 * $bits / u8::T], output: &mut [u8; 1024]) {
1351            for lane in 0..u8::LANES {
1352                unpack!(u8, $bits, input, lane, |$idx, $elem| {
1353                    output[$idx] = $elem;
1354                });
1355            }
1356        }
1357    };
1358}
1359
1360unpack_8!(unpack_8_1, 1);
1361unpack_8!(unpack_8_2, 2);
1362unpack_8!(unpack_8_3, 3);
1363unpack_8!(unpack_8_4, 4);
1364unpack_8!(unpack_8_5, 5);
1365unpack_8!(unpack_8_6, 6);
1366unpack_8!(unpack_8_7, 7);
1367unpack_8!(unpack_8_8, 8);
1368
1369macro_rules! pack_8 {
1370    ($name:ident, $bits:expr) => {
1371        fn $name(input: &[u8; 1024], output: &mut [u8; 1024 * $bits / u8::T]) {
1372            for lane in 0..u8::LANES {
1373                pack!(u8, $bits, output, lane, |$idx| { input[$idx] });
1374            }
1375        }
1376    };
1377}
1378pack_8!(pack_8_1, 1);
1379pack_8!(pack_8_2, 2);
1380pack_8!(pack_8_3, 3);
1381pack_8!(pack_8_4, 4);
1382pack_8!(pack_8_5, 5);
1383pack_8!(pack_8_6, 6);
1384pack_8!(pack_8_7, 7);
1385pack_8!(pack_8_8, 8);
1386
1387macro_rules! unpack_16 {
1388    ($name:ident, $bits:expr) => {
1389        fn $name(input: &[u16; 1024 * $bits / u16::T], output: &mut [u16; 1024]) {
1390            for lane in 0..u16::LANES {
1391                unpack!(u16, $bits, input, lane, |$idx, $elem| {
1392                    output[$idx] = $elem;
1393                });
1394            }
1395        }
1396    };
1397}
1398
1399unpack_16!(unpack_16_1, 1);
1400unpack_16!(unpack_16_2, 2);
1401unpack_16!(unpack_16_3, 3);
1402unpack_16!(unpack_16_4, 4);
1403unpack_16!(unpack_16_5, 5);
1404unpack_16!(unpack_16_6, 6);
1405unpack_16!(unpack_16_7, 7);
1406unpack_16!(unpack_16_8, 8);
1407unpack_16!(unpack_16_9, 9);
1408unpack_16!(unpack_16_10, 10);
1409unpack_16!(unpack_16_11, 11);
1410unpack_16!(unpack_16_12, 12);
1411unpack_16!(unpack_16_13, 13);
1412unpack_16!(unpack_16_14, 14);
1413unpack_16!(unpack_16_15, 15);
1414unpack_16!(unpack_16_16, 16);
1415
1416macro_rules! pack_16 {
1417    ($name:ident, $bits:expr) => {
1418        fn $name(input: &[u16; 1024], output: &mut [u16; 1024 * $bits / u16::T]) {
1419            for lane in 0..u16::LANES {
1420                pack!(u16, $bits, output, lane, |$idx| { input[$idx] });
1421            }
1422        }
1423    };
1424}
1425
1426pack_16!(pack_16_1, 1);
1427pack_16!(pack_16_2, 2);
1428pack_16!(pack_16_3, 3);
1429pack_16!(pack_16_4, 4);
1430pack_16!(pack_16_5, 5);
1431pack_16!(pack_16_6, 6);
1432pack_16!(pack_16_7, 7);
1433pack_16!(pack_16_8, 8);
1434pack_16!(pack_16_9, 9);
1435pack_16!(pack_16_10, 10);
1436pack_16!(pack_16_11, 11);
1437pack_16!(pack_16_12, 12);
1438pack_16!(pack_16_13, 13);
1439pack_16!(pack_16_14, 14);
1440pack_16!(pack_16_15, 15);
1441pack_16!(pack_16_16, 16);
1442
1443macro_rules! unpack_32 {
1444    ($name:ident, $bit_width:expr) => {
1445        fn $name(input: &[u32; 1024 * $bit_width / u32::T], output: &mut [u32; 1024]) {
1446            for lane in 0..u32::LANES {
1447                unpack!(u32, $bit_width, input, lane, |$idx, $elem| {
1448                    output[$idx] = $elem
1449                });
1450            }
1451        }
1452    };
1453}
1454
1455unpack_32!(unpack_32_1, 1);
1456unpack_32!(unpack_32_2, 2);
1457unpack_32!(unpack_32_3, 3);
1458unpack_32!(unpack_32_4, 4);
1459unpack_32!(unpack_32_5, 5);
1460unpack_32!(unpack_32_6, 6);
1461unpack_32!(unpack_32_7, 7);
1462unpack_32!(unpack_32_8, 8);
1463unpack_32!(unpack_32_9, 9);
1464unpack_32!(unpack_32_10, 10);
1465unpack_32!(unpack_32_11, 11);
1466unpack_32!(unpack_32_12, 12);
1467unpack_32!(unpack_32_13, 13);
1468unpack_32!(unpack_32_14, 14);
1469unpack_32!(unpack_32_15, 15);
1470unpack_32!(unpack_32_16, 16);
1471unpack_32!(unpack_32_17, 17);
1472unpack_32!(unpack_32_18, 18);
1473unpack_32!(unpack_32_19, 19);
1474unpack_32!(unpack_32_20, 20);
1475unpack_32!(unpack_32_21, 21);
1476unpack_32!(unpack_32_22, 22);
1477unpack_32!(unpack_32_23, 23);
1478unpack_32!(unpack_32_24, 24);
1479unpack_32!(unpack_32_25, 25);
1480unpack_32!(unpack_32_26, 26);
1481unpack_32!(unpack_32_27, 27);
1482unpack_32!(unpack_32_28, 28);
1483unpack_32!(unpack_32_29, 29);
1484unpack_32!(unpack_32_30, 30);
1485unpack_32!(unpack_32_31, 31);
1486unpack_32!(unpack_32_32, 32);
1487
1488macro_rules! pack_32 {
1489    ($name:ident, $bits:expr) => {
1490        fn $name(input: &[u32; 1024], output: &mut [u32; 1024 * $bits / u32::BITS as usize]) {
1491            for lane in 0..u32::LANES {
1492                pack!(u32, $bits, output, lane, |$idx| { input[$idx] });
1493            }
1494        }
1495    };
1496}
1497
1498pack_32!(pack_32_1, 1);
1499pack_32!(pack_32_2, 2);
1500pack_32!(pack_32_3, 3);
1501pack_32!(pack_32_4, 4);
1502pack_32!(pack_32_5, 5);
1503pack_32!(pack_32_6, 6);
1504pack_32!(pack_32_7, 7);
1505pack_32!(pack_32_8, 8);
1506pack_32!(pack_32_9, 9);
1507pack_32!(pack_32_10, 10);
1508pack_32!(pack_32_11, 11);
1509pack_32!(pack_32_12, 12);
1510pack_32!(pack_32_13, 13);
1511pack_32!(pack_32_14, 14);
1512pack_32!(pack_32_15, 15);
1513pack_32!(pack_32_16, 16);
1514pack_32!(pack_32_17, 17);
1515pack_32!(pack_32_18, 18);
1516pack_32!(pack_32_19, 19);
1517pack_32!(pack_32_20, 20);
1518pack_32!(pack_32_21, 21);
1519pack_32!(pack_32_22, 22);
1520pack_32!(pack_32_23, 23);
1521pack_32!(pack_32_24, 24);
1522pack_32!(pack_32_25, 25);
1523pack_32!(pack_32_26, 26);
1524pack_32!(pack_32_27, 27);
1525pack_32!(pack_32_28, 28);
1526pack_32!(pack_32_29, 29);
1527pack_32!(pack_32_30, 30);
1528pack_32!(pack_32_31, 31);
1529pack_32!(pack_32_32, 32);
1530
1531macro_rules! unpack_64 {
1532    ($name:ident, $bit_width:expr) => {
1533        fn $name(input: &[u64; 1024 * $bit_width / u64::T], output: &mut [u64; 1024]) {
1534            for lane in 0..u64::LANES {
1535                unpack!(u64, $bit_width, input, lane, |$idx, $elem| {
1536                    output[$idx] = $elem
1537                });
1538            }
1539        }
1540    };
1541}
1542
1543unpack_64!(unpack_64_1, 1);
1544unpack_64!(unpack_64_2, 2);
1545unpack_64!(unpack_64_3, 3);
1546unpack_64!(unpack_64_4, 4);
1547unpack_64!(unpack_64_5, 5);
1548unpack_64!(unpack_64_6, 6);
1549unpack_64!(unpack_64_7, 7);
1550unpack_64!(unpack_64_8, 8);
1551unpack_64!(unpack_64_9, 9);
1552unpack_64!(unpack_64_10, 10);
1553unpack_64!(unpack_64_11, 11);
1554unpack_64!(unpack_64_12, 12);
1555unpack_64!(unpack_64_13, 13);
1556unpack_64!(unpack_64_14, 14);
1557unpack_64!(unpack_64_15, 15);
1558unpack_64!(unpack_64_16, 16);
1559unpack_64!(unpack_64_17, 17);
1560unpack_64!(unpack_64_18, 18);
1561unpack_64!(unpack_64_19, 19);
1562unpack_64!(unpack_64_20, 20);
1563unpack_64!(unpack_64_21, 21);
1564unpack_64!(unpack_64_22, 22);
1565unpack_64!(unpack_64_23, 23);
1566unpack_64!(unpack_64_24, 24);
1567unpack_64!(unpack_64_25, 25);
1568unpack_64!(unpack_64_26, 26);
1569unpack_64!(unpack_64_27, 27);
1570unpack_64!(unpack_64_28, 28);
1571unpack_64!(unpack_64_29, 29);
1572unpack_64!(unpack_64_30, 30);
1573unpack_64!(unpack_64_31, 31);
1574unpack_64!(unpack_64_32, 32);
1575
1576unpack_64!(unpack_64_33, 33);
1577unpack_64!(unpack_64_34, 34);
1578unpack_64!(unpack_64_35, 35);
1579unpack_64!(unpack_64_36, 36);
1580unpack_64!(unpack_64_37, 37);
1581unpack_64!(unpack_64_38, 38);
1582unpack_64!(unpack_64_39, 39);
1583unpack_64!(unpack_64_40, 40);
1584unpack_64!(unpack_64_41, 41);
1585unpack_64!(unpack_64_42, 42);
1586unpack_64!(unpack_64_43, 43);
1587unpack_64!(unpack_64_44, 44);
1588unpack_64!(unpack_64_45, 45);
1589unpack_64!(unpack_64_46, 46);
1590unpack_64!(unpack_64_47, 47);
1591unpack_64!(unpack_64_48, 48);
1592unpack_64!(unpack_64_49, 49);
1593unpack_64!(unpack_64_50, 50);
1594unpack_64!(unpack_64_51, 51);
1595unpack_64!(unpack_64_52, 52);
1596unpack_64!(unpack_64_53, 53);
1597unpack_64!(unpack_64_54, 54);
1598unpack_64!(unpack_64_55, 55);
1599unpack_64!(unpack_64_56, 56);
1600unpack_64!(unpack_64_57, 57);
1601unpack_64!(unpack_64_58, 58);
1602unpack_64!(unpack_64_59, 59);
1603unpack_64!(unpack_64_60, 60);
1604unpack_64!(unpack_64_61, 61);
1605unpack_64!(unpack_64_62, 62);
1606unpack_64!(unpack_64_63, 63);
1607unpack_64!(unpack_64_64, 64);
1608
1609macro_rules! pack_64 {
1610    ($name:ident, $bits:expr) => {
1611        fn $name(input: &[u64; 1024], output: &mut [u64; 1024 * $bits / u64::BITS as usize]) {
1612            for lane in 0..u64::LANES {
1613                pack!(u64, $bits, output, lane, |$idx| { input[$idx] });
1614            }
1615        }
1616    };
1617}
1618
1619pack_64!(pack_64_1, 1);
1620pack_64!(pack_64_2, 2);
1621pack_64!(pack_64_3, 3);
1622pack_64!(pack_64_4, 4);
1623pack_64!(pack_64_5, 5);
1624pack_64!(pack_64_6, 6);
1625pack_64!(pack_64_7, 7);
1626pack_64!(pack_64_8, 8);
1627pack_64!(pack_64_9, 9);
1628pack_64!(pack_64_10, 10);
1629pack_64!(pack_64_11, 11);
1630pack_64!(pack_64_12, 12);
1631pack_64!(pack_64_13, 13);
1632pack_64!(pack_64_14, 14);
1633pack_64!(pack_64_15, 15);
1634pack_64!(pack_64_16, 16);
1635pack_64!(pack_64_17, 17);
1636pack_64!(pack_64_18, 18);
1637pack_64!(pack_64_19, 19);
1638pack_64!(pack_64_20, 20);
1639pack_64!(pack_64_21, 21);
1640pack_64!(pack_64_22, 22);
1641pack_64!(pack_64_23, 23);
1642pack_64!(pack_64_24, 24);
1643pack_64!(pack_64_25, 25);
1644pack_64!(pack_64_26, 26);
1645pack_64!(pack_64_27, 27);
1646pack_64!(pack_64_28, 28);
1647pack_64!(pack_64_29, 29);
1648pack_64!(pack_64_30, 30);
1649pack_64!(pack_64_31, 31);
1650pack_64!(pack_64_32, 32);
1651
1652pack_64!(pack_64_33, 33);
1653pack_64!(pack_64_34, 34);
1654pack_64!(pack_64_35, 35);
1655pack_64!(pack_64_36, 36);
1656pack_64!(pack_64_37, 37);
1657pack_64!(pack_64_38, 38);
1658pack_64!(pack_64_39, 39);
1659pack_64!(pack_64_40, 40);
1660pack_64!(pack_64_41, 41);
1661pack_64!(pack_64_42, 42);
1662pack_64!(pack_64_43, 43);
1663pack_64!(pack_64_44, 44);
1664pack_64!(pack_64_45, 45);
1665pack_64!(pack_64_46, 46);
1666pack_64!(pack_64_47, 47);
1667pack_64!(pack_64_48, 48);
1668pack_64!(pack_64_49, 49);
1669pack_64!(pack_64_50, 50);
1670pack_64!(pack_64_51, 51);
1671pack_64!(pack_64_52, 52);
1672pack_64!(pack_64_53, 53);
1673pack_64!(pack_64_54, 54);
1674pack_64!(pack_64_55, 55);
1675pack_64!(pack_64_56, 56);
1676pack_64!(pack_64_57, 57);
1677pack_64!(pack_64_58, 58);
1678pack_64!(pack_64_59, 59);
1679pack_64!(pack_64_60, 60);
1680pack_64!(pack_64_61, 61);
1681pack_64!(pack_64_62, 62);
1682pack_64!(pack_64_63, 63);
1683pack_64!(pack_64_64, 64);
1684
#[cfg(test)]
mod test {
    use super::*;
    use core::array;

    /// Seed shared by every round-trip helper so that a failure is reproducible.
    const SEED: u64 = 123456789;

    /// A small, fast xorshift pseudo-random number generator (shifts 13, 7, 17).
    ///
    /// Only intended for generating test data. A seed of `0` is a fixed point:
    /// every shift/xor step maps zero to zero, so the generator would emit
    /// zeros forever. All callers in this module use the non-zero [`SEED`].
    pub struct XorShift {
        state: u64,
    }

    impl XorShift {
        /// Creates a generator whose first output is derived from `seed`.
        pub fn new(seed: u64) -> Self {
            Self { state: seed }
        }

        /// Advances the generator and returns the new 64-bit state.
        pub fn next(&mut self) -> u64 {
            let mut x = self.state;
            x ^= x << 13;
            x ^= x >> 7;
            x ^= x << 17;
            self.state = x;
            x
        }
    }

    /// Produces 1024 deterministic pseudo-random values that each fit in
    /// `bit_width` bits.
    ///
    /// For `bit_width < 64` every value is reduced modulo `2^bit_width`; for
    /// `bit_width == 64` the raw generator output is used, because `1 << 64`
    /// would overflow the shift. Callers narrow the result to their element
    /// type, which is lossless as long as `bit_width` does not exceed that
    /// type's width.
    fn random_values(bit_width: usize) -> [u64; 1024] {
        let mut rng = XorShift::new(SEED);
        let mut values = [0u64; 1024];
        for value in &mut values {
            let raw = rng.next();
            *value = if bit_width < u64::T {
                raw % (1u64 << bit_width)
            } else {
                raw
            };
        }
        values
    }

    // The round-trip helpers below are written out once per integer type on
    // purpose: a single macro generalised over u8, u16, u32 and u64 makes the
    // test build take a very long time.

    /// Round-trips 1024 random `u8` values through the `pack!` / `unpack!`
    /// macros at the given bit width and checks that they come back unchanged.
    fn pack_unpack_u8(bit_width: usize) {
        let values: [u8; 1024] = random_values(bit_width).map(|v| v as u8);

        let mut packed = vec![0; 1024 * bit_width / u8::T];
        for lane in 0..u8::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            pack!(u8, bit_width, packed, lane, |$pos| {
                values[$pos]
            });
        }

        let mut unpacked: [u8; 1024] = [0; 1024];
        for lane in 0..u8::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            unpack!(u8, bit_width, packed, lane, |$idx, $elem| {
                unpacked[$idx] = $elem;
            });
        }

        assert_eq!(
            values, unpacked,
            "u8 pack!/unpack! round trip failed for bit width {}",
            bit_width
        );
    }

    /// Round-trips 1024 random `u16` values through the `pack!` / `unpack!`
    /// macros at the given bit width and checks that they come back unchanged.
    fn pack_unpack_u16(bit_width: usize) {
        let values: [u16; 1024] = random_values(bit_width).map(|v| v as u16);

        let mut packed = vec![0; 1024 * bit_width / u16::T];
        for lane in 0..u16::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            pack!(u16, bit_width, packed, lane, |$pos| {
                values[$pos]
            });
        }

        let mut unpacked: [u16; 1024] = [0; 1024];
        for lane in 0..u16::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            unpack!(u16, bit_width, packed, lane, |$idx, $elem| {
                unpacked[$idx] = $elem;
            });
        }

        assert_eq!(
            values, unpacked,
            "u16 pack!/unpack! round trip failed for bit width {}",
            bit_width
        );
    }

    /// Round-trips 1024 random `u32` values through the `pack!` / `unpack!`
    /// macros at the given bit width and checks that they come back unchanged.
    fn pack_unpack_u32(bit_width: usize) {
        let values: [u32; 1024] = random_values(bit_width).map(|v| v as u32);

        let mut packed = vec![0; 1024 * bit_width / u32::T];
        for lane in 0..u32::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            pack!(u32, bit_width, packed, lane, |$pos| {
                values[$pos]
            });
        }

        let mut unpacked: [u32; 1024] = [0; 1024];
        for lane in 0..u32::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            unpack!(u32, bit_width, packed, lane, |$idx, $elem| {
                unpacked[$idx] = $elem;
            });
        }

        assert_eq!(
            values, unpacked,
            "u32 pack!/unpack! round trip failed for bit width {}",
            bit_width
        );
    }

    /// Round-trips 1024 random `u64` values through the `pack!` / `unpack!`
    /// macros at the given bit width and checks that they come back unchanged.
    fn pack_unpack_u64(bit_width: usize) {
        let values: [u64; 1024] = random_values(bit_width);

        let mut packed = vec![0; 1024 * bit_width / u64::T];
        for lane in 0..u64::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            pack!(u64, bit_width, packed, lane, |$pos| {
                values[$pos]
            });
        }

        let mut unpacked: [u64; 1024] = [0; 1024];
        for lane in 0..u64::LANES {
            // Always loop over lanes first. This is what the compiler vectorizes.
            unpack!(u64, bit_width, packed, lane, |$idx, $elem| {
                unpacked[$idx] = $elem;
            });
        }

        assert_eq!(
            values, unpacked,
            "u64 pack!/unpack! round trip failed for bit width {}",
            bit_width
        );
    }

    /// Exercises the `pack!` / `unpack!` macros for every bit width from 0 up
    /// to and including the full width of each supported integer type.
    #[test]
    fn test_pack() {
        for bit_width in 0..=u8::T {
            pack_unpack_u8(bit_width);
        }
        for bit_width in 0..=u16::T {
            pack_unpack_u16(bit_width);
        }
        for bit_width in 0..=u32::T {
            pack_unpack_u32(bit_width);
        }
        for bit_width in 0..=u64::T {
            pack_unpack_u64(bit_width);
        }
    }

    /// Round-trips 1024 random `u8` values through
    /// `BitPacking::unchecked_pack` / `unchecked_unpack` at the given width.
    fn unchecked_pack_unpack_u8(bit_width: usize) {
        let values: [u8; 1024] = random_values(bit_width).map(|v| v as u8);
        let mut packed = vec![0; 1024 * bit_width / u8::T];
        // SAFETY (assumed contract, confirm against `BitPacking`): 1024 input
        // elements, `1024 * bit_width / T` packed elements, `bit_width <= T`.
        unsafe {
            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
        }
        let mut output = [0; 1024];
        // SAFETY: same buffer sizes and width as the pack call above.
        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
        assert_eq!(
            values, output,
            "u8 unchecked round trip failed for bit width {}",
            bit_width
        );
    }

    /// Round-trips 1024 random `u16` values through
    /// `BitPacking::unchecked_pack` / `unchecked_unpack` at the given width.
    fn unchecked_pack_unpack_u16(bit_width: usize) {
        let values: [u16; 1024] = random_values(bit_width).map(|v| v as u16);
        let mut packed = vec![0; 1024 * bit_width / u16::T];
        // SAFETY (assumed contract, confirm against `BitPacking`): 1024 input
        // elements, `1024 * bit_width / T` packed elements, `bit_width <= T`.
        unsafe {
            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
        }
        let mut output = [0; 1024];
        // SAFETY: same buffer sizes and width as the pack call above.
        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
        assert_eq!(
            values, output,
            "u16 unchecked round trip failed for bit width {}",
            bit_width
        );
    }

    /// Round-trips 1024 random `u32` values through
    /// `BitPacking::unchecked_pack` / `unchecked_unpack` at the given width.
    fn unchecked_pack_unpack_u32(bit_width: usize) {
        let values: [u32; 1024] = random_values(bit_width).map(|v| v as u32);
        let mut packed = vec![0; 1024 * bit_width / u32::T];
        // SAFETY (assumed contract, confirm against `BitPacking`): 1024 input
        // elements, `1024 * bit_width / T` packed elements, `bit_width <= T`.
        unsafe {
            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
        }
        let mut output = [0; 1024];
        // SAFETY: same buffer sizes and width as the pack call above.
        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
        assert_eq!(
            values, output,
            "u32 unchecked round trip failed for bit width {}",
            bit_width
        );
    }

    /// Round-trips 1024 random `u64` values through
    /// `BitPacking::unchecked_pack` / `unchecked_unpack` at the given width.
    ///
    /// The input is random for every width, not only for 64, so that widths
    /// below 64 are checked against real data rather than an all-zero array.
    fn unchecked_pack_unpack_u64(bit_width: usize) {
        let values: [u64; 1024] = random_values(bit_width);
        let mut packed = vec![0; 1024 * bit_width / u64::T];
        // SAFETY (assumed contract, confirm against `BitPacking`): 1024 input
        // elements, `1024 * bit_width / T` packed elements, `bit_width <= T`.
        unsafe {
            BitPacking::unchecked_pack(bit_width, &values, &mut packed);
        }
        let mut output = [0; 1024];
        // SAFETY: same buffer sizes and width as the pack call above.
        unsafe { BitPacking::unchecked_unpack(bit_width, &packed, &mut output) };
        assert_eq!(
            values, output,
            "u64 unchecked round trip failed for bit width {}",
            bit_width
        );
    }

    /// Exercises `BitPacking::unchecked_pack` / `unchecked_unpack` for every
    /// bit width from 1 up to the full width of each supported integer type.
    #[test]
    fn test_unchecked_pack() {
        // Fixed sanity case: the values 0..1024 fit in 10 bits, and 1024 u32
        // values packed at 10 bits occupy 1024 * 10 / 32 = 320 u32 words.
        let input = array::from_fn(|i| i as u32);
        let mut packed = [0; 320];
        // SAFETY (assumed contract, confirm against `BitPacking`): 1024 input
        // elements, 320 packed elements, width 10 <= 32.
        unsafe { BitPacking::unchecked_pack(10, &input, &mut packed) };
        let mut output = [0; 1024];
        // SAFETY: same buffer sizes and width as the pack call above.
        unsafe { BitPacking::unchecked_unpack(10, &packed, &mut output) };
        assert_eq!(input, output);

        // Width 0 is deliberately left out of the loops below.
        // NOTE(review): confirm whether the unchecked entry points accept
        // width 0 before adding it.
        for bit_width in 1..=u8::T {
            unchecked_pack_unpack_u8(bit_width);
        }
        for bit_width in 1..=u16::T {
            unchecked_pack_unpack_u16(bit_width);
        }
        for bit_width in 1..=u32::T {
            unchecked_pack_unpack_u32(bit_width);
        }
        for bit_width in 1..=u64::T {
            unchecked_pack_unpack_u64(bit_width);
        }
    }
}