1use crate::array::{get_offsets, print_long_array};
19use crate::builder::GenericByteBuilder;
20use crate::iterator::ArrayIter;
21use crate::types::bytes::ByteArrayNativeType;
22use crate::types::ByteArrayType;
23use crate::{Array, ArrayAccessor, ArrayRef, OffsetSizeTrait, Scalar};
24use arrow_buffer::{ArrowNativeType, Buffer, MutableBuffer};
25use arrow_buffer::{NullBuffer, OffsetBuffer};
26use arrow_data::{ArrayData, ArrayDataBuilder};
27use arrow_schema::{ArrowError, DataType};
28use std::any::Any;
29use std::sync::Arc;
30
31pub struct GenericByteArray<T: ByteArrayType> {
88 data_type: DataType,
89 value_offsets: OffsetBuffer<T::Offset>,
90 value_data: Buffer,
91 nulls: Option<NullBuffer>,
92}
93
94impl<T: ByteArrayType> Clone for GenericByteArray<T> {
95 fn clone(&self) -> Self {
96 Self {
97 data_type: T::DATA_TYPE,
98 value_offsets: self.value_offsets.clone(),
99 value_data: self.value_data.clone(),
100 nulls: self.nulls.clone(),
101 }
102 }
103}
104
105impl<T: ByteArrayType> GenericByteArray<T> {
106 pub const DATA_TYPE: DataType = T::DATA_TYPE;
108
109 pub fn new(
115 offsets: OffsetBuffer<T::Offset>,
116 values: Buffer,
117 nulls: Option<NullBuffer>,
118 ) -> Self {
119 Self::try_new(offsets, values, nulls).unwrap()
120 }
121
122 pub fn try_new(
129 offsets: OffsetBuffer<T::Offset>,
130 values: Buffer,
131 nulls: Option<NullBuffer>,
132 ) -> Result<Self, ArrowError> {
133 let len = offsets.len() - 1;
134
135 T::validate(&offsets, &values)?;
137
138 if let Some(n) = nulls.as_ref() {
139 if n.len() != len {
140 return Err(ArrowError::InvalidArgumentError(format!(
141 "Incorrect length of null buffer for {}{}Array, expected {len} got {}",
142 T::Offset::PREFIX,
143 T::PREFIX,
144 n.len(),
145 )));
146 }
147 }
148
149 Ok(Self {
150 data_type: T::DATA_TYPE,
151 value_offsets: offsets,
152 value_data: values,
153 nulls,
154 })
155 }
156
157 pub unsafe fn new_unchecked(
163 offsets: OffsetBuffer<T::Offset>,
164 values: Buffer,
165 nulls: Option<NullBuffer>,
166 ) -> Self {
167 Self {
168 data_type: T::DATA_TYPE,
169 value_offsets: offsets,
170 value_data: values,
171 nulls,
172 }
173 }
174
175 pub fn new_null(len: usize) -> Self {
177 Self {
178 data_type: T::DATA_TYPE,
179 value_offsets: OffsetBuffer::new_zeroed(len),
180 value_data: MutableBuffer::new(0).into(),
181 nulls: Some(NullBuffer::new_null(len)),
182 }
183 }
184
185 pub fn new_scalar(value: impl AsRef<T::Native>) -> Scalar<Self> {
187 Scalar::new(Self::from_iter_values(std::iter::once(value)))
188 }
189
190 pub fn from_iter_values<Ptr, I>(iter: I) -> Self
192 where
193 Ptr: AsRef<T::Native>,
194 I: IntoIterator<Item = Ptr>,
195 {
196 let iter = iter.into_iter();
197 let (_, data_len) = iter.size_hint();
198 let data_len = data_len.expect("Iterator must be sized"); let mut offsets = MutableBuffer::new((data_len + 1) * std::mem::size_of::<T::Offset>());
201 offsets.push(T::Offset::usize_as(0));
202
203 let mut values = MutableBuffer::new(0);
204 for s in iter {
205 let s: &[u8] = s.as_ref().as_ref();
206 values.extend_from_slice(s);
207 offsets.push(T::Offset::usize_as(values.len()));
208 }
209
210 T::Offset::from_usize(values.len()).expect("offset overflow");
211 let offsets = Buffer::from(offsets);
212
213 let value_offsets = unsafe { OffsetBuffer::new_unchecked(offsets.into()) };
215
216 Self {
217 data_type: T::DATA_TYPE,
218 value_data: values.into(),
219 value_offsets,
220 nulls: None,
221 }
222 }
223
224 pub fn into_parts(self) -> (OffsetBuffer<T::Offset>, Buffer, Option<NullBuffer>) {
226 (self.value_offsets, self.value_data, self.nulls)
227 }
228
229 #[inline]
233 pub fn value_length(&self, i: usize) -> T::Offset {
234 let offsets = self.value_offsets();
235 offsets[i + 1] - offsets[i]
236 }
237
238 #[inline]
243 pub fn offsets(&self) -> &OffsetBuffer<T::Offset> {
244 &self.value_offsets
245 }
246
247 #[inline]
252 pub fn values(&self) -> &Buffer {
253 &self.value_data
254 }
255
256 pub fn value_data(&self) -> &[u8] {
258 self.value_data.as_slice()
259 }
260
261 pub fn is_ascii(&self) -> bool {
263 let offsets = self.value_offsets();
264 let start = offsets.first().unwrap();
265 let end = offsets.last().unwrap();
266 self.value_data()[start.as_usize()..end.as_usize()].is_ascii()
267 }
268
269 #[inline]
271 pub fn value_offsets(&self) -> &[T::Offset] {
272 &self.value_offsets
273 }
274
275 pub unsafe fn value_unchecked(&self, i: usize) -> &T::Native {
279 let end = *self.value_offsets().get_unchecked(i + 1);
280 let start = *self.value_offsets().get_unchecked(i);
281
282 let b = std::slice::from_raw_parts(
292 self.value_data.as_ptr().offset(start.to_isize().unwrap()),
293 (end - start).to_usize().unwrap(),
294 );
295
296 T::Native::from_bytes_unchecked(b)
299 }
300
301 pub fn value(&self, i: usize) -> &T::Native {
305 assert!(
306 i < self.len(),
307 "Trying to access an element at index {} from a {}{}Array of length {}",
308 i,
309 T::Offset::PREFIX,
310 T::PREFIX,
311 self.len()
312 );
313 unsafe { self.value_unchecked(i) }
316 }
317
318 pub fn iter(&self) -> ArrayIter<&Self> {
320 ArrayIter::new(self)
321 }
322
323 pub fn slice(&self, offset: usize, length: usize) -> Self {
325 Self {
326 data_type: T::DATA_TYPE,
327 value_offsets: self.value_offsets.slice(offset, length),
328 value_data: self.value_data.clone(),
329 nulls: self.nulls.as_ref().map(|n| n.slice(offset, length)),
330 }
331 }
332
333 pub fn into_builder(self) -> Result<GenericByteBuilder<T>, Self> {
336 let len = self.len();
337 let value_len = T::Offset::as_usize(self.value_offsets()[len] - self.value_offsets()[0]);
338
339 let data = self.into_data();
340 let null_bit_buffer = data.nulls().map(|b| b.inner().sliced());
341
342 let element_len = std::mem::size_of::<T::Offset>();
343 let offset_buffer = data.buffers()[0]
344 .slice_with_length(data.offset() * element_len, (len + 1) * element_len);
345
346 let element_len = std::mem::size_of::<u8>();
347 let value_buffer = data.buffers()[1]
348 .slice_with_length(data.offset() * element_len, value_len * element_len);
349
350 drop(data);
351
352 let try_mutable_null_buffer = match null_bit_buffer {
353 None => Ok(None),
354 Some(null_buffer) => {
355 null_buffer.into_mutable().map(Some)
357 }
358 };
359
360 let try_mutable_buffers = match try_mutable_null_buffer {
361 Ok(mutable_null_buffer) => {
362 let try_mutable_offset_buffer = offset_buffer.into_mutable();
364 let try_mutable_value_buffer = value_buffer.into_mutable();
365
366 match (try_mutable_offset_buffer, try_mutable_value_buffer) {
369 (Ok(mutable_offset_buffer), Ok(mutable_value_buffer)) => unsafe {
370 Ok(GenericByteBuilder::<T>::new_from_buffer(
371 mutable_offset_buffer,
372 mutable_value_buffer,
373 mutable_null_buffer,
374 ))
375 },
376 (Ok(mutable_offset_buffer), Err(value_buffer)) => Err((
377 mutable_offset_buffer.into(),
378 value_buffer,
379 mutable_null_buffer.map(|b| b.into()),
380 )),
381 (Err(offset_buffer), Ok(mutable_value_buffer)) => Err((
382 offset_buffer,
383 mutable_value_buffer.into(),
384 mutable_null_buffer.map(|b| b.into()),
385 )),
386 (Err(offset_buffer), Err(value_buffer)) => Err((
387 offset_buffer,
388 value_buffer,
389 mutable_null_buffer.map(|b| b.into()),
390 )),
391 }
392 }
393 Err(mutable_null_buffer) => {
394 Err((offset_buffer, value_buffer, Some(mutable_null_buffer)))
396 }
397 };
398
399 match try_mutable_buffers {
400 Ok(builder) => Ok(builder),
401 Err((offset_buffer, value_buffer, null_bit_buffer)) => {
402 let builder = ArrayData::builder(T::DATA_TYPE)
403 .len(len)
404 .add_buffer(offset_buffer)
405 .add_buffer(value_buffer)
406 .null_bit_buffer(null_bit_buffer);
407
408 let array_data = unsafe { builder.build_unchecked() };
409 let array = GenericByteArray::<T>::from(array_data);
410
411 Err(array)
412 }
413 }
414 }
415}
416
417impl<T: ByteArrayType> std::fmt::Debug for GenericByteArray<T> {
418 fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
419 write!(f, "{}{}Array\n[\n", T::Offset::PREFIX, T::PREFIX)?;
420 print_long_array(self, f, |array, index, f| {
421 std::fmt::Debug::fmt(&array.value(index), f)
422 })?;
423 write!(f, "]")
424 }
425}
426
427impl<T: ByteArrayType> Array for GenericByteArray<T> {
428 fn as_any(&self) -> &dyn Any {
429 self
430 }
431
432 fn to_data(&self) -> ArrayData {
433 self.clone().into()
434 }
435
436 fn into_data(self) -> ArrayData {
437 self.into()
438 }
439
440 fn data_type(&self) -> &DataType {
441 &self.data_type
442 }
443
444 fn slice(&self, offset: usize, length: usize) -> ArrayRef {
445 Arc::new(self.slice(offset, length))
446 }
447
448 fn len(&self) -> usize {
449 self.value_offsets.len() - 1
450 }
451
452 fn is_empty(&self) -> bool {
453 self.value_offsets.len() <= 1
454 }
455
456 fn shrink_to_fit(&mut self) {
457 self.value_offsets.shrink_to_fit();
458 self.value_data.shrink_to_fit();
459 if let Some(nulls) = &mut self.nulls {
460 nulls.shrink_to_fit();
461 }
462 }
463
464 fn offset(&self) -> usize {
465 0
466 }
467
468 fn nulls(&self) -> Option<&NullBuffer> {
469 self.nulls.as_ref()
470 }
471
472 fn logical_null_count(&self) -> usize {
473 self.null_count()
475 }
476
477 fn get_buffer_memory_size(&self) -> usize {
478 let mut sum = self.value_offsets.inner().inner().capacity();
479 sum += self.value_data.capacity();
480 if let Some(x) = &self.nulls {
481 sum += x.buffer().capacity()
482 }
483 sum
484 }
485
486 fn get_array_memory_size(&self) -> usize {
487 std::mem::size_of::<Self>() + self.get_buffer_memory_size()
488 }
489}
490
491impl<'a, T: ByteArrayType> ArrayAccessor for &'a GenericByteArray<T> {
492 type Item = &'a T::Native;
493
494 fn value(&self, index: usize) -> Self::Item {
495 GenericByteArray::value(self, index)
496 }
497
498 unsafe fn value_unchecked(&self, index: usize) -> Self::Item {
499 GenericByteArray::value_unchecked(self, index)
500 }
501}
502
503impl<T: ByteArrayType> From<ArrayData> for GenericByteArray<T> {
504 fn from(data: ArrayData) -> Self {
505 assert_eq!(
506 data.data_type(),
507 &Self::DATA_TYPE,
508 "{}{}Array expects DataType::{}",
509 T::Offset::PREFIX,
510 T::PREFIX,
511 Self::DATA_TYPE
512 );
513 assert_eq!(
514 data.buffers().len(),
515 2,
516 "{}{}Array data should contain 2 buffers only (offsets and values)",
517 T::Offset::PREFIX,
518 T::PREFIX,
519 );
520 let value_offsets = unsafe { get_offsets(&data) };
523 let value_data = data.buffers()[1].clone();
524 Self {
525 value_offsets,
526 value_data,
527 data_type: T::DATA_TYPE,
528 nulls: data.nulls().cloned(),
529 }
530 }
531}
532
533impl<T: ByteArrayType> From<GenericByteArray<T>> for ArrayData {
534 fn from(array: GenericByteArray<T>) -> Self {
535 let len = array.len();
536
537 let offsets = array.value_offsets.into_inner().into_inner();
538 let builder = ArrayDataBuilder::new(array.data_type)
539 .len(len)
540 .buffers(vec![offsets, array.value_data])
541 .nulls(array.nulls);
542
543 unsafe { builder.build_unchecked() }
544 }
545}
546
547impl<'a, T: ByteArrayType> IntoIterator for &'a GenericByteArray<T> {
548 type Item = Option<&'a T::Native>;
549 type IntoIter = ArrayIter<Self>;
550
551 fn into_iter(self) -> Self::IntoIter {
552 ArrayIter::new(self)
553 }
554}
555
556impl<'a, Ptr, T: ByteArrayType> FromIterator<&'a Option<Ptr>> for GenericByteArray<T>
557where
558 Ptr: AsRef<T::Native> + 'a,
559{
560 fn from_iter<I: IntoIterator<Item = &'a Option<Ptr>>>(iter: I) -> Self {
561 iter.into_iter()
562 .map(|o| o.as_ref().map(|p| p.as_ref()))
563 .collect()
564 }
565}
566
567impl<Ptr, T: ByteArrayType> FromIterator<Option<Ptr>> for GenericByteArray<T>
568where
569 Ptr: AsRef<T::Native>,
570{
571 fn from_iter<I: IntoIterator<Item = Option<Ptr>>>(iter: I) -> Self {
572 let iter = iter.into_iter();
573 let mut builder = GenericByteBuilder::with_capacity(iter.size_hint().0, 1024);
574 builder.extend(iter);
575 builder.finish()
576 }
577}
578
#[cfg(test)]
mod tests {
    use crate::{BinaryArray, StringArray};
    use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};

    /// Checks which inputs `new` / `try_new` accept and which error messages
    /// they produce, for both string and binary arrays.
    #[test]
    fn try_new() {
        // Well-formed input: two five-byte elements.
        let values = Buffer::from_slice_ref("helloworld");
        let valid_offsets = OffsetBuffer::new(vec![0, 5, 10].into());
        StringArray::new(valid_offsets.clone(), values.clone(), None);

        // A null buffer with three entries for a two-element array.
        let oversized_nulls = NullBuffer::new_null(3);
        let err = StringArray::try_new(
            valid_offsets.clone(),
            values.clone(),
            Some(oversized_nulls.clone()),
        )
        .unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for StringArray, expected 2 got 3");

        let err = BinaryArray::try_new(valid_offsets.clone(), values.clone(), Some(oversized_nulls))
            .unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Incorrect length of null buffer for BinaryArray, expected 2 got 3");

        // Bytes that are not UTF-8: rejected for strings, accepted for binary.
        let invalid_utf8 = Buffer::from_slice_ref(b"he\xFFloworld");
        let err =
            StringArray::try_new(valid_offsets.clone(), invalid_utf8.clone(), None).unwrap_err();
        assert_eq!(err.to_string(), "Invalid argument error: Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 2");

        BinaryArray::new(valid_offsets, invalid_utf8, None);

        // The last offset points past the end of the ten-byte values buffer.
        let overlong_offsets = OffsetBuffer::new(vec![0, 5, 11].into());
        let err = StringArray::try_new(overlong_offsets.clone(), values.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Offset of 11 exceeds length of values 10"
        );

        let err = BinaryArray::try_new(overlong_offsets.clone(), values, None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Maximum offset of 11 is larger than values of length 10"
        );

        // With a multi-byte character the same offsets are accepted.
        let non_ascii = Buffer::from_slice_ref("heìloworld");
        StringArray::new(overlong_offsets.clone(), non_ascii.clone(), None);
        BinaryArray::new(overlong_offsets, non_ascii.clone(), None);

        // An offset inside the multi-byte character: rejected for strings,
        // accepted for binary.
        let splitting_offsets = OffsetBuffer::new(vec![0, 3, 10].into());
        let err =
            StringArray::try_new(splitting_offsets.clone(), non_ascii.clone(), None).unwrap_err();
        assert_eq!(
            err.to_string(),
            "Invalid argument error: Split UTF-8 codepoint at offset 3"
        );

        BinaryArray::new(splitting_offsets, non_ascii, None);
    }
}