// polars_arrow/array/binary/mod.rs
use either::Either;
2
3use super::specification::try_check_offsets_bounds;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::bitmap::utils::{BitmapIter, ZipValidity};
7use crate::bitmap::Bitmap;
8use crate::buffer::Buffer;
9use crate::datatypes::ArrowDataType;
10use crate::offset::{Offset, Offsets, OffsetsBuffer};
11use crate::trusted_len::TrustedLen;
12
13mod ffi;
14pub(super) mod fmt;
15mod iterator;
16pub use iterator::*;
17mod from;
18mod mutable_values;
19pub use mutable_values::*;
20mod mutable;
21pub use mutable::*;
22use polars_error::{polars_bail, PolarsResult};
23
24#[derive(Clone)]
55pub struct BinaryArray<O: Offset> {
56 dtype: ArrowDataType,
57 offsets: OffsetsBuffer<O>,
58 values: Buffer<u8>,
59 validity: Option<Bitmap>,
60}
61
62impl<O: Offset> BinaryArray<O> {
63 pub fn try_new(
73 dtype: ArrowDataType,
74 offsets: OffsetsBuffer<O>,
75 values: Buffer<u8>,
76 validity: Option<Bitmap>,
77 ) -> PolarsResult<Self> {
78 try_check_offsets_bounds(&offsets, values.len())?;
79
80 if validity
81 .as_ref()
82 .is_some_and(|validity| validity.len() != offsets.len_proxy())
83 {
84 polars_bail!(ComputeError: "validity mask length must match the number of values")
85 }
86
87 if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
88 polars_bail!(ComputeError: "BinaryArray can only be initialized with DataType::Binary or DataType::LargeBinary")
89 }
90
91 Ok(Self {
92 dtype,
93 offsets,
94 values,
95 validity,
96 })
97 }
98
99 pub unsafe fn new_unchecked(
105 dtype: ArrowDataType,
106 offsets: OffsetsBuffer<O>,
107 values: Buffer<u8>,
108 validity: Option<Bitmap>,
109 ) -> Self {
110 Self {
111 dtype,
112 offsets,
113 values,
114 validity,
115 }
116 }
117
118 pub fn from_slice<T: AsRef<[u8]>, P: AsRef<[T]>>(slice: P) -> Self {
120 Self::from_trusted_len_values_iter(slice.as_ref().iter())
121 }
122
123 pub fn from<T: AsRef<[u8]>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
126 MutableBinaryArray::<O>::from(slice).into()
127 }
128
129 pub fn iter(&self) -> ZipValidity<&[u8], BinaryValueIter<O>, BitmapIter> {
131 ZipValidity::new_with_validity(self.values_iter(), self.validity.as_ref())
132 }
133
134 pub fn values_iter(&self) -> BinaryValueIter<O> {
136 BinaryValueIter::new(self)
137 }
138
139 #[inline]
141 pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, BinaryArray<O>> {
142 NonNullValuesIter::new(self, self.validity())
143 }
144
145 #[inline]
147 pub fn len(&self) -> usize {
148 self.offsets.len_proxy()
149 }
150
151 #[inline]
155 pub fn value(&self, i: usize) -> &[u8] {
156 assert!(i < self.len());
157 unsafe { self.value_unchecked(i) }
158 }
159
160 #[inline]
165 pub unsafe fn value_unchecked(&self, i: usize) -> &[u8] {
166 let (start, end) = self.offsets.start_end_unchecked(i);
168
169 self.values.get_unchecked(start..end)
171 }
172
173 #[inline]
177 pub fn get(&self, i: usize) -> Option<&[u8]> {
178 if !self.is_null(i) {
179 unsafe { Some(self.value_unchecked(i)) }
181 } else {
182 None
183 }
184 }
185
186 #[inline]
188 pub fn dtype(&self) -> &ArrowDataType {
189 &self.dtype
190 }
191
192 #[inline]
194 pub fn values(&self) -> &Buffer<u8> {
195 &self.values
196 }
197
198 #[inline]
200 pub fn offsets(&self) -> &OffsetsBuffer<O> {
201 &self.offsets
202 }
203
204 #[inline]
206 pub fn validity(&self) -> Option<&Bitmap> {
207 self.validity.as_ref()
208 }
209
210 pub fn slice(&mut self, offset: usize, length: usize) {
216 assert!(
217 offset + length <= self.len(),
218 "the offset of the new Buffer cannot exceed the existing length"
219 );
220 unsafe { self.slice_unchecked(offset, length) }
221 }
222
223 pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
230 self.validity = self
231 .validity
232 .take()
233 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
234 .filter(|bitmap| bitmap.unset_bits() > 0);
235 self.offsets.slice_unchecked(offset, length + 1);
236 }
237
238 impl_sliced!();
239 impl_mut_validity!();
240 impl_into_array!();
241
242 #[must_use]
244 pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
245 let Self {
246 dtype,
247 offsets,
248 values,
249 validity,
250 } = self;
251 (dtype, offsets, values, validity)
252 }
253
254 #[must_use]
256 pub fn into_mut(self) -> Either<Self, MutableBinaryArray<O>> {
257 use Either::*;
258 if let Some(bitmap) = self.validity {
259 match bitmap.into_mut() {
260 Left(bitmap) => Left(BinaryArray::new(
262 self.dtype,
263 self.offsets,
264 self.values,
265 Some(bitmap),
266 )),
267 Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
268 (Left(values), Left(offsets)) => Left(BinaryArray::new(
269 self.dtype,
270 offsets,
271 values,
272 Some(mutable_bitmap.into()),
273 )),
274 (Left(values), Right(offsets)) => Left(BinaryArray::new(
275 self.dtype,
276 offsets.into(),
277 values,
278 Some(mutable_bitmap.into()),
279 )),
280 (Right(values), Left(offsets)) => Left(BinaryArray::new(
281 self.dtype,
282 offsets,
283 values.into(),
284 Some(mutable_bitmap.into()),
285 )),
286 (Right(values), Right(offsets)) => Right(
287 MutableBinaryArray::try_new(
288 self.dtype,
289 offsets,
290 values,
291 Some(mutable_bitmap),
292 )
293 .unwrap(),
294 ),
295 },
296 }
297 } else {
298 match (self.values.into_mut(), self.offsets.into_mut()) {
299 (Left(values), Left(offsets)) => {
300 Left(BinaryArray::new(self.dtype, offsets, values, None))
301 },
302 (Left(values), Right(offsets)) => {
303 Left(BinaryArray::new(self.dtype, offsets.into(), values, None))
304 },
305 (Right(values), Left(offsets)) => {
306 Left(BinaryArray::new(self.dtype, offsets, values.into(), None))
307 },
308 (Right(values), Right(offsets)) => {
309 Right(MutableBinaryArray::try_new(self.dtype, offsets, values, None).unwrap())
310 },
311 }
312 }
313 }
314
315 pub fn new_empty(dtype: ArrowDataType) -> Self {
317 Self::new(dtype, OffsetsBuffer::new(), Buffer::new(), None)
318 }
319
320 #[inline]
322 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
323 unsafe {
324 Self::new_unchecked(
325 dtype,
326 Offsets::new_zeroed(length).into(),
327 Buffer::new(),
328 Some(Bitmap::new_zeroed(length)),
329 )
330 }
331 }
332
333 pub fn default_dtype() -> ArrowDataType {
335 if O::IS_LARGE {
336 ArrowDataType::LargeBinary
337 } else {
338 ArrowDataType::Binary
339 }
340 }
341
342 pub fn new(
344 dtype: ArrowDataType,
345 offsets: OffsetsBuffer<O>,
346 values: Buffer<u8>,
347 validity: Option<Bitmap>,
348 ) -> Self {
349 Self::try_new(dtype, offsets, values, validity).unwrap()
350 }
351
352 #[inline]
356 pub fn from_trusted_len_values_iter<T: AsRef<[u8]>, I: TrustedLen<Item = T>>(
357 iterator: I,
358 ) -> Self {
359 MutableBinaryArray::<O>::from_trusted_len_values_iter(iterator).into()
360 }
361
362 pub fn from_iter_values<T: AsRef<[u8]>, I: Iterator<Item = T>>(iterator: I) -> Self {
366 MutableBinaryArray::<O>::from_iter_values(iterator).into()
367 }
368
369 #[inline]
375 pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
376 where
377 P: AsRef<[u8]>,
378 I: Iterator<Item = Option<P>>,
379 {
380 MutableBinaryArray::<O>::from_trusted_len_iter_unchecked(iterator).into()
381 }
382
383 #[inline]
385 pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
386 where
387 P: AsRef<[u8]>,
388 I: TrustedLen<Item = Option<P>>,
389 {
390 unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
392 }
393
394 #[inline]
400 pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(iterator: I) -> Result<Self, E>
401 where
402 P: AsRef<[u8]>,
403 I: IntoIterator<Item = Result<Option<P>, E>>,
404 {
405 MutableBinaryArray::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
406 }
407
408 #[inline]
410 pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> Result<Self, E>
411 where
412 P: AsRef<[u8]>,
413 I: TrustedLen<Item = Result<Option<P>, E>>,
414 {
415 unsafe { Self::try_from_trusted_len_iter_unchecked(iter) }
417 }
418}
419
420impl<O: Offset> Array for BinaryArray<O> {
421 impl_common_array!();
422
423 fn validity(&self) -> Option<&Bitmap> {
424 self.validity.as_ref()
425 }
426
427 #[inline]
428 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
429 Box::new(self.clone().with_validity(validity))
430 }
431}
432
433unsafe impl<O: Offset> GenericBinaryArray<O> for BinaryArray<O> {
434 #[inline]
435 fn values(&self) -> &[u8] {
436 self.values()
437 }
438
439 #[inline]
440 fn offsets(&self) -> &[O] {
441 self.offsets().buffer()
442 }
443}
444
445impl<O: Offset> Splitable for BinaryArray<O> {
446 #[inline(always)]
447 fn check_bound(&self, offset: usize) -> bool {
448 offset <= self.len()
449 }
450
451 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
452 let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
453 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
454
455 (
456 Self {
457 dtype: self.dtype.clone(),
458 offsets: lhs_offsets,
459 values: self.values.clone(),
460 validity: lhs_validity,
461 },
462 Self {
463 dtype: self.dtype.clone(),
464 offsets: rhs_offsets,
465 values: self.values.clone(),
466 validity: rhs_validity,
467 },
468 )
469 }
470}