arrow_buffer/buffer/null.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::bit_iterator::{BitIndexIterator, BitIterator, BitSliceIterator};
19use crate::buffer::BooleanBuffer;
20use crate::{Buffer, MutableBuffer};
21
22/// A [`BooleanBuffer`] used to encode validity for Arrow arrays
23///
24/// In the [Arrow specification], array validity is encoded in a packed bitmask with a
25/// `true` value indicating the corresponding slot is not null, and `false` indicating
26/// that it is null.
27///
28/// `NullBuffer`s can be creating using [`NullBufferBuilder`]
29///
30/// [Arrow specification]: https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps
31/// [`NullBufferBuilder`]: crate::NullBufferBuilder
32#[derive(Debug, Clone, Eq, PartialEq)]
33pub struct NullBuffer {
34 buffer: BooleanBuffer,
35 null_count: usize,
36}
37
38impl NullBuffer {
39 /// Create a new [`NullBuffer`] computing the null count
40 pub fn new(buffer: BooleanBuffer) -> Self {
41 let null_count = buffer.len() - buffer.count_set_bits();
42 Self { buffer, null_count }
43 }
44
45 /// Create a new [`NullBuffer`] of length `len` where all values are null
46 pub fn new_null(len: usize) -> Self {
47 Self {
48 buffer: BooleanBuffer::new_unset(len),
49 null_count: len,
50 }
51 }
52
53 /// Create a new [`NullBuffer`] of length `len` where all values are valid
54 ///
55 /// Note: it is more efficient to not set the null buffer if it is known to
56 /// be all valid (aka all values are not null)
57 pub fn new_valid(len: usize) -> Self {
58 Self {
59 buffer: BooleanBuffer::new_set(len),
60 null_count: 0,
61 }
62 }
63
64 /// Create a new [`NullBuffer`] with the provided `buffer` and `null_count`
65 ///
66 /// # Safety
67 ///
68 /// `buffer` must contain `null_count` `0` bits
69 pub unsafe fn new_unchecked(buffer: BooleanBuffer, null_count: usize) -> Self {
70 Self { buffer, null_count }
71 }
72
73 /// Computes the union of the nulls in two optional [`NullBuffer`]
74 ///
75 /// This is commonly used by binary operations where the result is NULL if either
76 /// of the input values is NULL. Handling the null mask separately in this way
77 /// can yield significant performance improvements over an iterator approach
78 pub fn union(lhs: Option<&NullBuffer>, rhs: Option<&NullBuffer>) -> Option<NullBuffer> {
79 match (lhs, rhs) {
80 (Some(lhs), Some(rhs)) => Some(Self::new(lhs.inner() & rhs.inner())),
81 (Some(n), None) | (None, Some(n)) => Some(n.clone()),
82 (None, None) => None,
83 }
84 }
85
86 /// Returns true if all nulls in `other` also exist in self
87 pub fn contains(&self, other: &NullBuffer) -> bool {
88 if other.null_count == 0 {
89 return true;
90 }
91 let lhs = self.inner().bit_chunks().iter_padded();
92 let rhs = other.inner().bit_chunks().iter_padded();
93 lhs.zip(rhs).all(|(l, r)| (l & !r) == 0)
94 }
95
96 /// Returns a new [`NullBuffer`] where each bit in the current null buffer
97 /// is repeated `count` times. This is useful for masking the nulls of
98 /// the child of a FixedSizeListArray based on its parent
99 pub fn expand(&self, count: usize) -> Self {
100 let capacity = self.buffer.len().checked_mul(count).unwrap();
101 let mut buffer = MutableBuffer::new_null(capacity);
102
103 // Expand each bit within `null_mask` into `element_len`
104 // bits, constructing the implicit mask of the child elements
105 for i in 0..self.buffer.len() {
106 if self.is_null(i) {
107 continue;
108 }
109 for j in 0..count {
110 crate::bit_util::set_bit(buffer.as_mut(), i * count + j)
111 }
112 }
113 Self {
114 buffer: BooleanBuffer::new(buffer.into(), 0, capacity),
115 null_count: self.null_count * count,
116 }
117 }
118
119 /// Returns the length of this [`NullBuffer`]
120 #[inline]
121 pub fn len(&self) -> usize {
122 self.buffer.len()
123 }
124
125 /// Returns the offset of this [`NullBuffer`] in bits
126 #[inline]
127 pub fn offset(&self) -> usize {
128 self.buffer.offset()
129 }
130
131 /// Returns true if this [`NullBuffer`] is empty
132 #[inline]
133 pub fn is_empty(&self) -> bool {
134 self.buffer.is_empty()
135 }
136
137 /// Free up unused memory.
138 pub fn shrink_to_fit(&mut self) {
139 self.buffer.shrink_to_fit();
140 }
141
142 /// Returns the null count for this [`NullBuffer`]
143 #[inline]
144 pub fn null_count(&self) -> usize {
145 self.null_count
146 }
147
148 /// Returns `true` if the value at `idx` is not null
149 #[inline]
150 pub fn is_valid(&self, idx: usize) -> bool {
151 self.buffer.value(idx)
152 }
153
154 /// Returns `true` if the value at `idx` is null
155 #[inline]
156 pub fn is_null(&self, idx: usize) -> bool {
157 !self.is_valid(idx)
158 }
159
160 /// Returns the packed validity of this [`NullBuffer`] not including any offset
161 #[inline]
162 pub fn validity(&self) -> &[u8] {
163 self.buffer.values()
164 }
165
166 /// Slices this [`NullBuffer`] by the provided `offset` and `length`
167 pub fn slice(&self, offset: usize, len: usize) -> Self {
168 Self::new(self.buffer.slice(offset, len))
169 }
170
171 /// Returns an iterator over the bits in this [`NullBuffer`]
172 ///
173 /// * `true` indicates that the corresponding value is not NULL
174 /// * `false` indicates that the corresponding value is NULL
175 ///
176 /// Note: [`Self::valid_indices`] will be significantly faster for most use-cases
177 pub fn iter(&self) -> BitIterator<'_> {
178 self.buffer.iter()
179 }
180
181 /// Returns a [`BitIndexIterator`] over the valid indices in this [`NullBuffer`]
182 ///
183 /// Valid indices indicate the corresponding value is not NULL
184 pub fn valid_indices(&self) -> BitIndexIterator<'_> {
185 self.buffer.set_indices()
186 }
187
188 /// Returns a [`BitSliceIterator`] yielding contiguous ranges of valid indices
189 ///
190 /// Valid indices indicate the corresponding value is not NULL
191 pub fn valid_slices(&self) -> BitSliceIterator<'_> {
192 self.buffer.set_slices()
193 }
194
195 /// Calls the provided closure for each index in this null mask that is set
196 #[inline]
197 pub fn try_for_each_valid_idx<E, F: FnMut(usize) -> Result<(), E>>(
198 &self,
199 f: F,
200 ) -> Result<(), E> {
201 if self.null_count == self.len() {
202 return Ok(());
203 }
204 self.valid_indices().try_for_each(f)
205 }
206
207 /// Returns the inner [`BooleanBuffer`]
208 #[inline]
209 pub fn inner(&self) -> &BooleanBuffer {
210 &self.buffer
211 }
212
213 /// Returns the inner [`BooleanBuffer`]
214 #[inline]
215 pub fn into_inner(self) -> BooleanBuffer {
216 self.buffer
217 }
218
219 /// Returns the underlying [`Buffer`]
220 #[inline]
221 pub fn buffer(&self) -> &Buffer {
222 self.buffer.inner()
223 }
224}
225
226impl<'a> IntoIterator for &'a NullBuffer {
227 type Item = bool;
228 type IntoIter = BitIterator<'a>;
229
230 fn into_iter(self) -> Self::IntoIter {
231 self.buffer.iter()
232 }
233}
234
235impl From<BooleanBuffer> for NullBuffer {
236 fn from(value: BooleanBuffer) -> Self {
237 Self::new(value)
238 }
239}
240
241impl From<&[bool]> for NullBuffer {
242 fn from(value: &[bool]) -> Self {
243 BooleanBuffer::from(value).into()
244 }
245}
246
247impl<const N: usize> From<&[bool; N]> for NullBuffer {
248 fn from(value: &[bool; N]) -> Self {
249 value[..].into()
250 }
251}
252
253impl From<Vec<bool>> for NullBuffer {
254 fn from(value: Vec<bool>) -> Self {
255 BooleanBuffer::from(value).into()
256 }
257}
258
259impl FromIterator<bool> for NullBuffer {
260 fn from_iter<T: IntoIterator<Item = bool>>(iter: T) -> Self {
261 BooleanBuffer::from_iter(iter).into()
262 }
263}
264
265#[cfg(test)]
266mod tests {
267 use super::*;
268 #[test]
269 fn test_size() {
270 // This tests that the niche optimisation eliminates the overhead of an option
271 assert_eq!(
272 std::mem::size_of::<NullBuffer>(),
273 std::mem::size_of::<Option<NullBuffer>>()
274 );
275 }
276}