arrow_buffer/buffer/offset.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::buffer::ScalarBuffer;
19use crate::{ArrowNativeType, MutableBuffer, OffsetBufferBuilder};
20use std::ops::Deref;
21
22/// A non-empty buffer of monotonically increasing, positive integers.
23///
24/// [`OffsetBuffer`] are used to represent ranges of offsets. An
25/// `OffsetBuffer` of `N+1` items contains `N` such ranges. The start
26/// offset for element `i` is `offsets[i]` and the end offset is
27/// `offsets[i+1]`. Equal offsets represent an empty range.
28///
29/// # Example
30///
31/// This example shows how 5 distinct ranges, are represented using a
32/// 6 entry `OffsetBuffer`. The first entry `(0, 3)` represents the
33/// three offsets `0, 1, 2`. The entry `(3,3)` represent no offsets
34/// (e.g. an empty list).
35///
36/// ```text
37/// ┌───────┐ ┌───┐
38/// │ (0,3) │ │ 0 │
39/// ├───────┤ ├───┤
40/// │ (3,3) │ │ 3 │
41/// ├───────┤ ├───┤
42/// │ (3,4) │ │ 3 │
43/// ├───────┤ ├───┤
44/// │ (4,5) │ │ 4 │
45/// ├───────┤ ├───┤
46/// │ (5,7) │ │ 5 │
47/// └───────┘ ├───┤
48/// │ 7 │
49/// └───┘
50///
51/// Offsets Buffer
52/// Logical
53/// Offsets
54///
55/// (offsets[i],
56/// offsets[i+1])
57/// ```
58#[derive(Debug, Clone)]
59pub struct OffsetBuffer<O: ArrowNativeType>(ScalarBuffer<O>);
60
61impl<O: ArrowNativeType> OffsetBuffer<O> {
62 /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
63 ///
64 /// # Panics
65 ///
66 /// Panics if `buffer` is not a non-empty buffer containing
67 /// monotonically increasing values greater than or equal to zero
68 pub fn new(buffer: ScalarBuffer<O>) -> Self {
69 assert!(!buffer.is_empty(), "offsets cannot be empty");
70 assert!(
71 buffer[0] >= O::usize_as(0),
72 "offsets must be greater than 0"
73 );
74 assert!(
75 buffer.windows(2).all(|w| w[0] <= w[1]),
76 "offsets must be monotonically increasing"
77 );
78 Self(buffer)
79 }
80
81 /// Create a new [`OffsetBuffer`] from the provided [`ScalarBuffer`]
82 ///
83 /// # Safety
84 ///
85 /// `buffer` must be a non-empty buffer containing monotonically increasing
86 /// values greater than or equal to zero
87 pub unsafe fn new_unchecked(buffer: ScalarBuffer<O>) -> Self {
88 Self(buffer)
89 }
90
91 /// Create a new [`OffsetBuffer`] containing a single 0 value
92 pub fn new_empty() -> Self {
93 let buffer = MutableBuffer::from_len_zeroed(std::mem::size_of::<O>());
94 Self(buffer.into_buffer().into())
95 }
96
97 /// Create a new [`OffsetBuffer`] containing `len + 1` `0` values
98 pub fn new_zeroed(len: usize) -> Self {
99 let len_bytes = len
100 .checked_add(1)
101 .and_then(|o| o.checked_mul(std::mem::size_of::<O>()))
102 .expect("overflow");
103 let buffer = MutableBuffer::from_len_zeroed(len_bytes);
104 Self(buffer.into_buffer().into())
105 }
106
107 /// Create a new [`OffsetBuffer`] from the iterator of slice lengths
108 ///
109 /// ```
110 /// # use arrow_buffer::OffsetBuffer;
111 /// let offsets = OffsetBuffer::<i32>::from_lengths([1, 3, 5]);
112 /// assert_eq!(offsets.as_ref(), &[0, 1, 4, 9]);
113 /// ```
114 ///
115 /// # Panics
116 ///
117 /// Panics on overflow
118 pub fn from_lengths<I>(lengths: I) -> Self
119 where
120 I: IntoIterator<Item = usize>,
121 {
122 let iter = lengths.into_iter();
123 let mut out = Vec::with_capacity(iter.size_hint().0 + 1);
124 out.push(O::usize_as(0));
125
126 let mut acc = 0_usize;
127 for length in iter {
128 acc = acc.checked_add(length).expect("usize overflow");
129 out.push(O::usize_as(acc))
130 }
131 // Check for overflow
132 O::from_usize(acc).expect("offset overflow");
133 Self(out.into())
134 }
135
136 /// Get an Iterator over the lengths of this [`OffsetBuffer`]
137 ///
138 /// ```
139 /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
140 /// let offsets = OffsetBuffer::<_>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
141 /// assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
142 /// ```
143 ///
144 /// Empty [`OffsetBuffer`] will return an empty iterator
145 /// ```
146 /// # use arrow_buffer::OffsetBuffer;
147 /// let offsets = OffsetBuffer::<i32>::new_empty();
148 /// assert_eq!(offsets.lengths().count(), 0);
149 /// ```
150 ///
151 /// This can be used to merge multiple [`OffsetBuffer`]s to one
152 /// ```
153 /// # use arrow_buffer::{OffsetBuffer, ScalarBuffer};
154 ///
155 /// let buffer1 = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
156 /// let buffer2 = OffsetBuffer::<i32>::from_lengths([1, 3, 5, 7, 9]);
157 ///
158 /// let merged = OffsetBuffer::<i32>::from_lengths(
159 /// vec![buffer1, buffer2].iter().flat_map(|x| x.lengths())
160 /// );
161 ///
162 /// assert_eq!(merged.lengths().collect::<Vec<_>>(), &[2, 6, 3, 7, 2, 1, 3, 5, 7, 9]);
163 /// ```
164 pub fn lengths(&self) -> impl ExactSizeIterator<Item = usize> + '_ {
165 self.0.windows(2).map(|x| x[1].as_usize() - x[0].as_usize())
166 }
167
168 /// Free up unused memory.
169 pub fn shrink_to_fit(&mut self) {
170 self.0.shrink_to_fit();
171 }
172
173 /// Returns the inner [`ScalarBuffer`]
174 pub fn inner(&self) -> &ScalarBuffer<O> {
175 &self.0
176 }
177
178 /// Returns the inner [`ScalarBuffer`], consuming self
179 pub fn into_inner(self) -> ScalarBuffer<O> {
180 self.0
181 }
182
183 /// Returns a zero-copy slice of this buffer with length `len` and starting at `offset`
184 pub fn slice(&self, offset: usize, len: usize) -> Self {
185 Self(self.0.slice(offset, len.saturating_add(1)))
186 }
187
188 /// Returns true if this [`OffsetBuffer`] is equal to `other`, using pointer comparisons
189 /// to determine buffer equality. This is cheaper than `PartialEq::eq` but may
190 /// return false when the arrays are logically equal
191 #[inline]
192 pub fn ptr_eq(&self, other: &Self) -> bool {
193 self.0.ptr_eq(&other.0)
194 }
195}
196
197impl<T: ArrowNativeType> Deref for OffsetBuffer<T> {
198 type Target = [T];
199
200 #[inline]
201 fn deref(&self) -> &Self::Target {
202 &self.0
203 }
204}
205
206impl<T: ArrowNativeType> AsRef<[T]> for OffsetBuffer<T> {
207 #[inline]
208 fn as_ref(&self) -> &[T] {
209 self
210 }
211}
212
213impl<O: ArrowNativeType> From<OffsetBufferBuilder<O>> for OffsetBuffer<O> {
214 fn from(value: OffsetBufferBuilder<O>) -> Self {
215 value.finish()
216 }
217}
218
219#[cfg(test)]
220mod tests {
221 use super::*;
222
223 #[test]
224 #[should_panic(expected = "offsets cannot be empty")]
225 fn empty_offsets() {
226 OffsetBuffer::new(Vec::<i32>::new().into());
227 }
228
229 #[test]
230 #[should_panic(expected = "offsets must be greater than 0")]
231 fn negative_offsets() {
232 OffsetBuffer::new(vec![-1, 0, 1].into());
233 }
234
235 #[test]
236 fn offsets() {
237 OffsetBuffer::new(vec![0, 1, 2, 3].into());
238
239 let offsets = OffsetBuffer::<i32>::new_zeroed(3);
240 assert_eq!(offsets.as_ref(), &[0; 4]);
241
242 let offsets = OffsetBuffer::<i32>::new_zeroed(0);
243 assert_eq!(offsets.as_ref(), &[0; 1]);
244 }
245
246 #[test]
247 #[should_panic(expected = "overflow")]
248 fn offsets_new_zeroed_overflow() {
249 OffsetBuffer::<i32>::new_zeroed(usize::MAX);
250 }
251
252 #[test]
253 #[should_panic(expected = "offsets must be monotonically increasing")]
254 fn non_monotonic_offsets() {
255 OffsetBuffer::new(vec![1, 2, 0].into());
256 }
257
258 #[test]
259 fn from_lengths() {
260 let buffer = OffsetBuffer::<i32>::from_lengths([2, 6, 3, 7, 2]);
261 assert_eq!(buffer.as_ref(), &[0, 2, 8, 11, 18, 20]);
262
263 let half_max = i32::MAX / 2;
264 let buffer = OffsetBuffer::<i32>::from_lengths([half_max as usize, half_max as usize]);
265 assert_eq!(buffer.as_ref(), &[0, half_max, half_max * 2]);
266 }
267
268 #[test]
269 #[should_panic(expected = "offset overflow")]
270 fn from_lengths_offset_overflow() {
271 OffsetBuffer::<i32>::from_lengths([i32::MAX as usize, 1]);
272 }
273
274 #[test]
275 #[should_panic(expected = "usize overflow")]
276 fn from_lengths_usize_overflow() {
277 OffsetBuffer::<i32>::from_lengths([usize::MAX, 1]);
278 }
279
280 #[test]
281 fn get_lengths() {
282 let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
283 assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![1, 3, 5]);
284 }
285
286 #[test]
287 fn get_lengths_should_be_with_fixed_size() {
288 let offsets = OffsetBuffer::<i32>::new(ScalarBuffer::<i32>::from(vec![0, 1, 4, 9]));
289 let iter = offsets.lengths();
290 assert_eq!(iter.size_hint(), (3, Some(3)));
291 assert_eq!(iter.len(), 3);
292 }
293
294 #[test]
295 fn get_lengths_from_empty_offset_buffer_should_be_empty_iterator() {
296 let offsets = OffsetBuffer::<i32>::new_empty();
297 assert_eq!(offsets.lengths().collect::<Vec<usize>>(), vec![]);
298 }
299}