arrow_buffer/util/
bit_iterator.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Types for iterating over packed bitmasks

use crate::bit_chunk_iterator::{UnalignedBitChunk, UnalignedBitChunkIterator};
use crate::bit_util::{ceil, get_bit_raw};

/// Iterator over the bits within a packed bitmask
///
/// To efficiently iterate over just the set bits see [`BitIndexIterator`] and [`BitSliceIterator`]
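///
/// # Example
///
/// A minimal usage sketch; the import path below assumes this module is
/// re-exported as `arrow_buffer::bit_iterator`:
///
/// ```
/// // NOTE: import path is an assumption, adjust to however the crate exposes this module
/// use arrow_buffer::bit_iterator::BitIterator;
///
/// // Bits are read least-significant-bit first: 0b00000101 has bits 0 and 2 set
/// let mask = &[0b00000101u8];
/// let bits: Vec<bool> = BitIterator::new(mask, 0, 4).collect();
/// assert_eq!(bits, vec![true, false, true, false]);
/// ```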
pub struct BitIterator<'a> {
    buffer: &'a [u8],
    current_offset: usize,
    end_offset: usize,
}

impl<'a> BitIterator<'a> {
    /// Create a new [`BitIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    ///
    /// # Panics
    ///
    /// Panics if `buffer` is too short for the provided offset and length
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let end_offset = offset.checked_add(len).unwrap();
        let required_len = ceil(end_offset, 8);
        assert!(
            buffer.len() >= required_len,
            "BitIterator buffer too small, expected {required_len} got {}",
            buffer.len()
        );

        Self {
            buffer,
            current_offset: offset,
            end_offset,
        }
    }
}

impl Iterator for BitIterator<'_> {
    type Item = bool;

    fn next(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        // Safety:
        // offsets in bounds
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.current_offset) };
        self.current_offset += 1;
        Some(v)
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        let remaining_bits = self.end_offset - self.current_offset;
        (remaining_bits, Some(remaining_bits))
    }
}

impl ExactSizeIterator for BitIterator<'_> {}

impl DoubleEndedIterator for BitIterator<'_> {
    fn next_back(&mut self) -> Option<Self::Item> {
        if self.current_offset == self.end_offset {
            return None;
        }
        self.end_offset -= 1;
        // Safety:
        // offsets in bounds
        let v = unsafe { get_bit_raw(self.buffer.as_ptr(), self.end_offset) };
        Some(v)
    }
}

/// Iterator of contiguous ranges of set bits within a provided packed bitmask
///
/// Returns `(usize, usize)` pairs, each representing an interval where the corresponding
/// bits in the provided mask are set
///
/// The first value is the start of the range (inclusive) and the second value is the end of the range (exclusive)
///
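/// # Example
///
/// A minimal usage sketch; the import path below assumes this module is
/// re-exported as `arrow_buffer::bit_iterator`:
///
/// ```
/// // NOTE: import path is an assumption, adjust to however the crate exposes this module
/// use arrow_buffer::bit_iterator::BitSliceIterator;
///
/// // 0b00011100 has bits 2, 3 and 4 set, i.e. a single run covering [2, 5)
/// let mask = &[0b00011100u8];
/// let ranges: Vec<(usize, usize)> = BitSliceIterator::new(mask, 0, 8).collect();
/// assert_eq!(ranges, vec![(2, 5)]);
/// ```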
#[derive(Debug)]
pub struct BitSliceIterator<'a> {
    iter: UnalignedBitChunkIterator<'a>,
    len: usize,
    current_offset: i64,
    current_chunk: u64,
}

impl<'a> BitSliceIterator<'a> {
    /// Create a new [`BitSliceIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunk = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunk.iter();

        let current_offset = -(chunk.lead_padding() as i64);
        let current_chunk = iter.next().unwrap_or(0);

        Self {
            iter,
            len,
            current_offset,
            current_chunk,
        }
    }

    /// Returns `Some((chunk_offset, bit_offset))` for the next chunk that has at
    /// least one bit set, or `None` if there is no such chunk.
    ///
    /// Where `chunk_offset` is the bit offset to the current `u64` chunk
    /// and `bit_offset` is the offset of the first `1` bit in that chunk
    fn advance_to_set_bit(&mut self) -> Option<(i64, u32)> {
        loop {
            if self.current_chunk != 0 {
                // Find the index of the first 1
                let bit_pos = self.current_chunk.trailing_zeros();
                return Some((self.current_offset, bit_pos));
            }

            self.current_chunk = self.iter.next()?;
            self.current_offset += 64;
        }
    }
}

impl Iterator for BitSliceIterator<'_> {
    type Item = (usize, usize);

    fn next(&mut self) -> Option<Self::Item> {
        // Used as termination condition
        if self.len == 0 {
            return None;
        }

        let (start_chunk, start_bit) = self.advance_to_set_bit()?;

        // Set the bits below `start_bit`, so the search below for the first 0 finds the end of this run
        self.current_chunk |= (1 << start_bit) - 1;

        loop {
            if self.current_chunk != u64::MAX {
                // Find the index of the first 0
                let end_bit = self.current_chunk.trailing_ones();

                // Zero out up to end_bit
                self.current_chunk &= !((1 << end_bit) - 1);

                return Some((
                    (start_chunk + start_bit as i64) as usize,
                    (self.current_offset + end_bit as i64) as usize,
                ));
            }

            match self.iter.next() {
                Some(next) => {
                    self.current_chunk = next;
                    self.current_offset += 64;
                }
                None => {
                    return Some((
                        (start_chunk + start_bit as i64) as usize,
                        std::mem::replace(&mut self.len, 0),
                    ));
                }
            }
        }
    }
}

/// An iterator of `usize` yielding the indices of the set bits in a provided packed bitmask
///
/// This provides the best performance on most masks, apart from those which contain
/// large runs and therefore favour [`BitSliceIterator`]
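///
/// # Example
///
/// A minimal usage sketch; the import path below assumes this module is
/// re-exported as `arrow_buffer::bit_iterator`:
///
/// ```
/// // NOTE: import path is an assumption, adjust to however the crate exposes this module
/// use arrow_buffer::bit_iterator::BitIndexIterator;
///
/// // 0b00010110 has bits 1, 2 and 4 set
/// let mask = &[0b00010110u8];
/// let indices: Vec<usize> = BitIndexIterator::new(mask, 0, 8).collect();
/// assert_eq!(indices, vec![1, 2, 4]);
/// ```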
#[derive(Debug)]
pub struct BitIndexIterator<'a> {
    current_chunk: u64,
    chunk_offset: i64,
    iter: UnalignedBitChunkIterator<'a>,
}

impl<'a> BitIndexIterator<'a> {
    /// Create a new [`BitIndexIterator`] from the provided `buffer`,
    /// and `offset` and `len` in bits
    pub fn new(buffer: &'a [u8], offset: usize, len: usize) -> Self {
        let chunks = UnalignedBitChunk::new(buffer, offset, len);
        let mut iter = chunks.iter();

        let current_chunk = iter.next().unwrap_or(0);
        let chunk_offset = -(chunks.lead_padding() as i64);

        Self {
            current_chunk,
            chunk_offset,
            iter,
        }
    }
}

impl Iterator for BitIndexIterator<'_> {
    type Item = usize;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            if self.current_chunk != 0 {
                let bit_pos = self.current_chunk.trailing_zeros();
                self.current_chunk ^= 1 << bit_pos;
                return Some((self.chunk_offset + bit_pos as i64) as usize);
            }

            self.current_chunk = self.iter.next()?;
            self.chunk_offset += 64;
        }
    }
}

/// Calls the provided closure for each index that is set in the provided null mask
/// (i.e. each valid index), using an adaptive strategy based on the null count
///
/// Ideally this would be encapsulated in an [`Iterator`] that would determine the optimal
/// strategy up front, and then yield indexes based on this.
///
/// Unfortunately, external iteration based on the resulting [`Iterator`] would match the strategy
/// variant on each call to [`Iterator::next`], and LLVM generally cannot eliminate this.
///
/// One solution to this might be internal iteration, e.g. [`Iterator::try_fold`], however,
/// it is currently [not possible] to override this for custom iterators in stable Rust.
///
/// As such this is the next best option
///
/// [not possible]: https://github.com/rust-lang/rust/issues/69595
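///
/// # Example
///
/// A minimal usage sketch; the import path below assumes this function is
/// re-exported via `arrow_buffer::bit_iterator`:
///
/// ```
/// // NOTE: import path is an assumption, adjust to however the crate exposes this module
/// use arrow_buffer::bit_iterator::try_for_each_valid_idx;
///
/// // Validity mask 0b00001011: indices 0, 1 and 3 are valid, index 2 is null
/// let nulls: &[u8] = &[0b00001011];
/// let mut valid = vec![];
/// try_for_each_valid_idx::<(), _>(4, 0, 1, Some(nulls), |idx| {
///     valid.push(idx);
///     Ok(())
/// })
/// .unwrap();
/// assert_eq!(valid, vec![0, 1, 3]);
/// ```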
#[inline]
pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
    len: usize,
    offset: usize,
    null_count: usize,
    nulls: Option<&[u8]>,
    f: F,
) -> Result<(), E> {
    let valid_count = len - null_count;

    if valid_count == len {
        (0..len).try_for_each(f)
    } else if null_count != len {
        BitIndexIterator::new(nulls.unwrap(), offset, len).try_for_each(f)
    } else {
        Ok(())
    }
}

// Note: further tests located in arrow_select::filter module

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bit_iterator_size_hint() {
        let mut b = BitIterator::new(&[0b00000011], 0, 2);
        assert_eq!(
            b.size_hint(),
            (2, Some(2)),
            "Expected size_hint to be (2, Some(2))"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (1, Some(1)),
            "Expected size_hint to be (1, Some(1)) after one bit consumed"
        );

        b.next();
        assert_eq!(
            b.size_hint(),
            (0, Some(0)),
            "Expected size_hint to be (0, Some(0)) after all bits consumed"
        );
    }

    #[test]
    fn test_bit_iterator() {
        let mask = &[0b00010010, 0b00100011, 0b00000101, 0b00010001, 0b10010011];
        let actual: Vec<_> = BitIterator::new(mask, 0, 5).collect();
        assert_eq!(actual, &[false, true, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 4, 5).collect();
        assert_eq!(actual, &[true, false, false, false, true]);

        let actual: Vec<_> = BitIterator::new(mask, 12, 14).collect();
        assert_eq!(
            actual,
            &[
                false, true, false, false, true, false, true, false, false, false, false, false,
                true, false
            ]
        );

        assert_eq!(BitIterator::new(mask, 0, 0).count(), 0);
        assert_eq!(BitIterator::new(mask, 40, 0).count(), 0);
    }

    #[test]
    #[should_panic(expected = "BitIterator buffer too small, expected 3 got 2")]
    fn test_bit_iterator_bounds() {
        let mask = &[223, 23];
        BitIterator::new(mask, 17, 0);
    }
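
    // Additional sketch: BitIterator also supports reverse iteration via
    // DoubleEndedIterator; the expected values follow from reading 0b00000011
    // least-significant-bit first.
    #[test]
    fn test_bit_iterator_rev() {
        let mask = &[0b00000011];
        let actual: Vec<_> = BitIterator::new(mask, 0, 4).rev().collect();
        assert_eq!(actual, &[false, false, true, true]);
    }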
}