1use either::Either;
2
3use super::specification::try_check_utf8;
4use super::{Array, GenericBinaryArray, Splitable};
5use crate::array::iterator::NonNullValuesIter;
6use crate::array::BinaryArray;
7use crate::bitmap::utils::{BitmapIter, ZipValidity};
8use crate::bitmap::Bitmap;
9use crate::buffer::Buffer;
10use crate::datatypes::ArrowDataType;
11use crate::offset::{Offset, Offsets, OffsetsBuffer};
12use crate::trusted_len::TrustedLen;
13
14mod ffi;
15pub(super) mod fmt;
16mod from;
17mod iterator;
18mod mutable;
19mod mutable_values;
20pub use iterator::*;
21pub use mutable::*;
22pub use mutable_values::MutableUtf8ValuesArray;
23use polars_error::*;
24
/// Wrapper around a string-like value `P` that is viewed as raw bytes through its
/// `AsRef<[u8]>` implementation (available when `P: AsRef<str>`).
25pub(super) struct StrAsBytes<P>(P);
27impl<T: AsRef<str>> AsRef<[u8]> for StrAsBytes<T> {
28 #[inline(always)]
29 fn as_ref(&self) -> &[u8] {
30 self.0.as_ref().as_bytes()
31 }
32}
33
/// An array of strings stored as one contiguous byte buffer plus per-element offsets,
/// with an optional validity bitmap. The offset type `O` selects between the `Utf8` and
/// `LargeUtf8` data types (see `default_dtype`).
34#[derive(Clone)]
65pub struct Utf8Array<O: Offset> {
    // Data type of the array; the constructors compare its physical type with that of
    // `default_dtype()`.
66 dtype: ArrowDataType,
    // Element boundaries: element `i` is read from `values[start..end]`, where
    // `(start, end)` comes from this buffer.
67 offsets: OffsetsBuffer<O>,
    // Bytes of all elements; `try_new` validates them together with `offsets` via
    // `try_check_utf8`.
68 values: Buffer<u8>,
    // Optional bitmap with one entry per element (its length is checked against
    // `offsets.len_proxy()`). Presumably a set bit marks a non-null element — TODO confirm.
69 validity: Option<Bitmap>,
70}
71
72impl<O: Offset> Utf8Array<O> {
74 pub fn try_new(
85 dtype: ArrowDataType,
86 offsets: OffsetsBuffer<O>,
87 values: Buffer<u8>,
88 validity: Option<Bitmap>,
89 ) -> PolarsResult<Self> {
90 try_check_utf8(&offsets, &values)?;
91 if validity
92 .as_ref()
93 .is_some_and(|validity| validity.len() != offsets.len_proxy())
94 {
95 polars_bail!(ComputeError: "validity mask length must match the number of values");
96 }
97
98 if dtype.to_physical_type() != Self::default_dtype().to_physical_type() {
99 polars_bail!(ComputeError: "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
100 }
101
102 Ok(Self {
103 dtype,
104 offsets,
105 values,
106 validity,
107 })
108 }
109
110 pub fn from_slice<T: AsRef<str>, P: AsRef<[T]>>(slice: P) -> Self {
114 Self::from_trusted_len_values_iter(slice.as_ref().iter())
115 }
116
117 pub fn from<T: AsRef<str>, P: AsRef<[Option<T>]>>(slice: P) -> Self {
122 MutableUtf8Array::<O>::from(slice).into()
123 }
124
125 pub fn iter(&self) -> ZipValidity<&str, Utf8ValuesIter<O>, BitmapIter> {
127 ZipValidity::new_with_validity(self.values_iter(), self.validity())
128 }
129
130 pub fn values_iter(&self) -> Utf8ValuesIter<O> {
132 Utf8ValuesIter::new(self)
133 }
134
135 #[inline]
137 pub fn non_null_values_iter(&self) -> NonNullValuesIter<'_, Utf8Array<O>> {
138 NonNullValuesIter::new(self, self.validity())
139 }
140
141 #[inline]
143 pub fn len(&self) -> usize {
144 self.offsets.len_proxy()
145 }
146
147 #[inline]
151 pub fn value(&self, i: usize) -> &str {
152 assert!(i < self.len());
153 unsafe { self.value_unchecked(i) }
154 }
155
156 #[inline]
161 pub unsafe fn value_unchecked(&self, i: usize) -> &str {
162 let (start, end) = self.offsets.start_end_unchecked(i);
164
165 let slice = self.values.get_unchecked(start..end);
167
168 std::str::from_utf8_unchecked(slice)
170 }
171
172 #[inline]
176 pub fn get(&self, i: usize) -> Option<&str> {
177 if !self.is_null(i) {
178 unsafe { Some(self.value_unchecked(i)) }
180 } else {
181 None
182 }
183 }
184
185 #[inline]
187 pub fn dtype(&self) -> &ArrowDataType {
188 &self.dtype
189 }
190
191 #[inline]
193 pub fn values(&self) -> &Buffer<u8> {
194 &self.values
195 }
196
197 #[inline]
199 pub fn offsets(&self) -> &OffsetsBuffer<O> {
200 &self.offsets
201 }
202
203 #[inline]
205 pub fn validity(&self) -> Option<&Bitmap> {
206 self.validity.as_ref()
207 }
208
209 pub fn slice(&mut self, offset: usize, length: usize) {
215 assert!(
216 offset + length <= self.len(),
217 "the offset of the new array cannot exceed the arrays' length"
218 );
219 unsafe { self.slice_unchecked(offset, length) }
220 }
221
222 pub unsafe fn slice_unchecked(&mut self, offset: usize, length: usize) {
229 self.validity = self
230 .validity
231 .take()
232 .map(|bitmap| bitmap.sliced_unchecked(offset, length))
233 .filter(|bitmap| bitmap.unset_bits() > 0);
234 self.offsets.slice_unchecked(offset, length + 1);
235 }
236
// NOTE(review): presumably generates the owning `sliced` / `sliced_unchecked` counterparts
// of `slice` / `slice_unchecked` — confirm against the macro definition.
237 impl_sliced!();
// Presumably provides `with_validity` / `set_validity`: both are called in this file but
// not defined in the visible code — TODO confirm against the macro definition.
238 impl_mut_validity!();
// NOTE(review): looks like this adds conversions of the array into `Array` trait objects —
// confirm against the macro definition.
239 impl_into_array!();
240
241 #[must_use]
243 pub fn into_inner(self) -> (ArrowDataType, OffsetsBuffer<O>, Buffer<u8>, Option<Bitmap>) {
244 let Self {
245 dtype,
246 offsets,
247 values,
248 validity,
249 } = self;
250 (dtype, offsets, values, validity)
251 }
252
253 #[must_use]
255 pub fn into_mut(self) -> Either<Self, MutableUtf8Array<O>> {
256 use Either::*;
257 if let Some(bitmap) = self.validity {
258 match bitmap.into_mut() {
259 Left(bitmap) => Left(unsafe {
261 Utf8Array::new_unchecked(self.dtype, self.offsets, self.values, Some(bitmap))
262 }),
263 Right(mutable_bitmap) => match (self.values.into_mut(), self.offsets.into_mut()) {
264 (Left(values), Left(offsets)) => {
265 Left(unsafe {
267 Utf8Array::new_unchecked(
268 self.dtype,
269 offsets,
270 values,
271 Some(mutable_bitmap.into()),
272 )
273 })
274 },
275 (Left(values), Right(offsets)) => {
276 Left(unsafe {
278 Utf8Array::new_unchecked(
279 self.dtype,
280 offsets.into(),
281 values,
282 Some(mutable_bitmap.into()),
283 )
284 })
285 },
286 (Right(values), Left(offsets)) => {
287 Left(unsafe {
289 Utf8Array::new_unchecked(
290 self.dtype,
291 offsets,
292 values.into(),
293 Some(mutable_bitmap.into()),
294 )
295 })
296 },
297 (Right(values), Right(offsets)) => Right(unsafe {
298 MutableUtf8Array::new_unchecked(
299 self.dtype,
300 offsets,
301 values,
302 Some(mutable_bitmap),
303 )
304 }),
305 },
306 }
307 } else {
308 match (self.values.into_mut(), self.offsets.into_mut()) {
309 (Left(values), Left(offsets)) => {
310 Left(unsafe { Utf8Array::new_unchecked(self.dtype, offsets, values, None) })
311 },
312 (Left(values), Right(offsets)) => Left(unsafe {
313 Utf8Array::new_unchecked(self.dtype, offsets.into(), values, None)
314 }),
315 (Right(values), Left(offsets)) => Left(unsafe {
316 Utf8Array::new_unchecked(self.dtype, offsets, values.into(), None)
317 }),
318 (Right(values), Right(offsets)) => Right(unsafe {
319 MutableUtf8Array::new_unchecked(self.dtype, offsets, values, None)
320 }),
321 }
322 }
323 }
324
325 #[inline]
329 pub fn new_empty(dtype: ArrowDataType) -> Self {
330 unsafe { Self::new_unchecked(dtype, OffsetsBuffer::new(), Buffer::new(), None) }
331 }
332
333 #[inline]
335 pub fn new_null(dtype: ArrowDataType, length: usize) -> Self {
336 Self::new(
337 dtype,
338 Offsets::new_zeroed(length).into(),
339 Buffer::new(),
340 Some(Bitmap::new_zeroed(length)),
341 )
342 }
343
344 pub fn default_dtype() -> ArrowDataType {
346 if O::IS_LARGE {
347 ArrowDataType::LargeUtf8
348 } else {
349 ArrowDataType::Utf8
350 }
351 }
352
353 pub unsafe fn new_unchecked(
367 dtype: ArrowDataType,
368 offsets: OffsetsBuffer<O>,
369 values: Buffer<u8>,
370 validity: Option<Bitmap>,
371 ) -> Self {
372 debug_assert!(
373 offsets.last().to_usize() <= values.len(),
374 "offsets must not exceed the values length"
375 );
376 debug_assert!(
377 validity
378 .as_ref()
379 .is_none_or(|validity| validity.len() == offsets.len_proxy()),
380 "validity mask length must match the number of values"
381 );
382 debug_assert!(
383 dtype.to_physical_type() == Self::default_dtype().to_physical_type(),
384 "Utf8Array can only be initialized with DataType::Utf8 or DataType::LargeUtf8"
385 );
386
387 Self {
388 dtype,
389 offsets,
390 values,
391 validity,
392 }
393 }
394
395 pub fn new(
405 dtype: ArrowDataType,
406 offsets: OffsetsBuffer<O>,
407 values: Buffer<u8>,
408 validity: Option<Bitmap>,
409 ) -> Self {
410 Self::try_new(dtype, offsets, values, validity).unwrap()
411 }
412
413 #[inline]
417 pub fn from_trusted_len_values_iter<T: AsRef<str>, I: TrustedLen<Item = T>>(
418 iterator: I,
419 ) -> Self {
420 MutableUtf8Array::<O>::from_trusted_len_values_iter(iterator).into()
421 }
422
423 pub fn from_iter_values<T: AsRef<str>, I: Iterator<Item = T>>(iterator: I) -> Self {
425 MutableUtf8Array::<O>::from_iter_values(iterator).into()
426 }
427
428 #[inline]
434 pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
435 where
436 P: AsRef<str>,
437 I: Iterator<Item = Option<P>>,
438 {
439 MutableUtf8Array::<O>::from_trusted_len_iter_unchecked(iterator).into()
440 }
441
442 #[inline]
444 pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
445 where
446 P: AsRef<str>,
447 I: TrustedLen<Item = Option<P>>,
448 {
449 MutableUtf8Array::<O>::from_trusted_len_iter(iterator).into()
450 }
451
452 #[inline]
458 pub unsafe fn try_from_trusted_len_iter_unchecked<E, I, P>(
459 iterator: I,
460 ) -> std::result::Result<Self, E>
461 where
462 P: AsRef<str>,
463 I: IntoIterator<Item = std::result::Result<Option<P>, E>>,
464 {
465 MutableUtf8Array::<O>::try_from_trusted_len_iter_unchecked(iterator).map(|x| x.into())
466 }
467
468 #[inline]
470 pub fn try_from_trusted_len_iter<E, I, P>(iter: I) -> std::result::Result<Self, E>
471 where
472 P: AsRef<str>,
473 I: TrustedLen<Item = std::result::Result<Option<P>, E>>,
474 {
475 MutableUtf8Array::<O>::try_from_trusted_len_iter(iter).map(|x| x.into())
476 }
477
478 pub fn apply_validity<F: FnOnce(Bitmap) -> Bitmap>(&mut self, f: F) {
484 if let Some(validity) = std::mem::take(&mut self.validity) {
485 self.set_validity(Some(f(validity)))
486 }
487 }
488
489 pub fn to_binary(&self) -> BinaryArray<O> {
491 unsafe {
492 BinaryArray::new_unchecked(
493 BinaryArray::<O>::default_dtype(),
494 self.offsets.clone(),
495 self.values.clone(),
496 self.validity.clone(),
497 )
498 }
499 }
500}
501
502impl<O: Offset> Splitable for Utf8Array<O> {
503 #[inline(always)]
504 fn check_bound(&self, offset: usize) -> bool {
505 offset <= self.len()
506 }
507
508 unsafe fn _split_at_unchecked(&self, offset: usize) -> (Self, Self) {
509 let (lhs_validity, rhs_validity) = unsafe { self.validity.split_at_unchecked(offset) };
510 let (lhs_offsets, rhs_offsets) = unsafe { self.offsets.split_at_unchecked(offset) };
511
512 (
513 Self {
514 dtype: self.dtype.clone(),
515 offsets: lhs_offsets,
516 values: self.values.clone(),
517 validity: lhs_validity,
518 },
519 Self {
520 dtype: self.dtype.clone(),
521 offsets: rhs_offsets,
522 values: self.values.clone(),
523 validity: rhs_validity,
524 },
525 )
526 }
527}
528
529impl<O: Offset> Array for Utf8Array<O> {
530 impl_common_array!();
531
532 fn validity(&self) -> Option<&Bitmap> {
533 self.validity.as_ref()
534 }
535
536 #[inline]
537 fn with_validity(&self, validity: Option<Bitmap>) -> Box<dyn Array> {
538 Box::new(self.clone().with_validity(validity))
539 }
540}
541
542unsafe impl<O: Offset> GenericBinaryArray<O> for Utf8Array<O> {
543 #[inline]
544 fn values(&self) -> &[u8] {
545 self.values()
546 }
547
548 #[inline]
549 fn offsets(&self) -> &[O] {
550 self.offsets().buffer()
551 }
552}
553
554impl<O: Offset> Default for Utf8Array<O> {
555 fn default() -> Self {
556 let dtype = if O::IS_LARGE {
557 ArrowDataType::LargeUtf8
558 } else {
559 ArrowDataType::Utf8
560 };
561 Utf8Array::new(dtype, Default::default(), Default::default(), None)
562 }
563}