use std::sync::Arc;
use polars_error::{polars_bail, PolarsResult};
use super::{MutableUtf8Array, StrAsBytes, Utf8Array};
use crate::array::physical_binary::*;
use crate::array::specification::{try_check_offsets_bounds, try_check_utf8};
use crate::array::{Array, ArrayValuesIter, MutableArray, TryExtend, TryExtendFromSelf, TryPush};
use crate::bitmap::MutableBitmap;
use crate::datatypes::ArrowDataType;
use crate::offset::{Offset, Offsets};
use crate::trusted_len::TrustedLen;
/// A growable array of UTF-8 strings that carries no validity bitmap.
///
/// Strings are stored back to back in a single byte buffer (`values`), and
/// `offsets` records where each string starts and ends inside that buffer.
/// The `unsafe` accessors of this type read slices of `values` as `str`
/// without re-validating them, so the two fields must always stay in sync.
#[derive(Debug, Clone)]
pub struct MutableUtf8ValuesArray<O: Offset> {
/// Logical type of the array. The constructors reject a type whose physical
/// type differs from that of `Self::default_data_type()`.
data_type: ArrowDataType,
/// Boundaries of every string inside `values`.
offsets: Offsets<O>,
/// Concatenated bytes of all stored strings.
values: Vec<u8>,
}
impl<O: Offset> From<MutableUtf8ValuesArray<O>> for Utf8Array<O> {
fn from(other: MutableUtf8ValuesArray<O>) -> Self {
unsafe {
Utf8Array::<O>::new_unchecked(
other.data_type,
other.offsets.into(),
other.values.into(),
None,
)
}
}
}
impl<O: Offset> From<MutableUtf8ValuesArray<O>> for MutableUtf8Array<O> {
fn from(other: MutableUtf8ValuesArray<O>) -> Self {
unsafe {
MutableUtf8Array::<O>::new_unchecked(other.data_type, other.offsets, other.values, None)
}
}
}
impl<O: Offset> Default for MutableUtf8ValuesArray<O> {
fn default() -> Self {
Self::new()
}
}
impl<O: Offset> MutableUtf8ValuesArray<O> {
pub fn new() -> Self {
Self {
data_type: Self::default_data_type(),
offsets: Offsets::new(),
values: Vec::<u8>::new(),
}
}
pub fn try_new(
data_type: ArrowDataType,
offsets: Offsets<O>,
values: Vec<u8>,
) -> PolarsResult<Self> {
try_check_utf8(&offsets, &values)?;
if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
polars_bail!(ComputeError: "MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
}
Ok(Self {
data_type,
offsets,
values,
})
}
pub unsafe fn new_unchecked(
data_type: ArrowDataType,
offsets: Offsets<O>,
values: Vec<u8>,
) -> Self {
try_check_offsets_bounds(&offsets, values.len())
.expect("The length of the values must be equal to the last offset value");
if data_type.to_physical_type() != Self::default_data_type().to_physical_type() {
panic!("MutableUtf8ValuesArray can only be initialized with DataType::Utf8 or DataType::LargeUtf8")
}
Self {
data_type,
offsets,
values,
}
}
pub fn default_data_type() -> ArrowDataType {
Utf8Array::<O>::default_data_type()
}
pub fn with_capacity(capacity: usize) -> Self {
Self::with_capacities(capacity, 0)
}
pub fn with_capacities(capacity: usize, values: usize) -> Self {
Self {
data_type: Self::default_data_type(),
offsets: Offsets::<O>::with_capacity(capacity),
values: Vec::<u8>::with_capacity(values),
}
}
#[inline]
pub fn values(&self) -> &Vec<u8> {
&self.values
}
#[inline]
pub fn offsets(&self) -> &Offsets<O> {
&self.offsets
}
#[inline]
pub fn reserve(&mut self, additional: usize, additional_values: usize) {
self.offsets.reserve(additional + 1);
self.values.reserve(additional_values);
}
pub fn capacity(&self) -> usize {
self.offsets.capacity()
}
#[inline]
pub fn len(&self) -> usize {
self.offsets.len_proxy()
}
#[inline]
pub fn push<T: AsRef<str>>(&mut self, value: T) {
self.try_push(value).unwrap()
}
pub fn pop(&mut self) -> Option<String> {
if self.len() == 0 {
return None;
}
self.offsets.pop()?;
let start = self.offsets.last().to_usize();
let value = self.values.split_off(start);
Some(unsafe { String::from_utf8_unchecked(value) })
}
#[inline]
pub fn value(&self, i: usize) -> &str {
assert!(i < self.len());
unsafe { self.value_unchecked(i) }
}
#[inline]
pub unsafe fn value_unchecked(&self, i: usize) -> &str {
let (start, end) = self.offsets.start_end(i);
let slice = self.values.get_unchecked(start..end);
std::str::from_utf8_unchecked(slice)
}
pub fn iter(&self) -> ArrayValuesIter<Self> {
ArrayValuesIter::new(self)
}
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
self.offsets.shrink_to_fit();
}
pub fn into_inner(self) -> (ArrowDataType, Offsets<O>, Vec<u8>) {
(self.data_type, self.offsets, self.values)
}
}
impl<O: Offset> MutableArray for MutableUtf8ValuesArray<O> {
fn len(&self) -> usize {
self.len()
}
fn validity(&self) -> Option<&MutableBitmap> {
None
}
fn as_box(&mut self) -> Box<dyn Array> {
let array: Utf8Array<O> = std::mem::take(self).into();
array.boxed()
}
fn as_arc(&mut self) -> Arc<dyn Array> {
let array: Utf8Array<O> = std::mem::take(self).into();
array.arced()
}
fn data_type(&self) -> &ArrowDataType {
&self.data_type
}
fn as_any(&self) -> &dyn std::any::Any {
self
}
fn as_mut_any(&mut self) -> &mut dyn std::any::Any {
self
}
#[inline]
fn push_null(&mut self) {
self.push::<&str>("")
}
fn reserve(&mut self, additional: usize) {
self.reserve(additional, 0)
}
fn shrink_to_fit(&mut self) {
self.shrink_to_fit()
}
}
impl<O: Offset, P: AsRef<str>> FromIterator<P> for MutableUtf8ValuesArray<O> {
    /// Collects an iterator of string-like items into a new array with the
    /// default data type.
    fn from_iter<I: IntoIterator<Item = P>>(iter: I) -> Self {
        let as_bytes = iter.into_iter().map(StrAsBytes);
        let (offsets, values) = values_iter(as_bytes);
        let data_type = Self::default_data_type();
        // SAFETY: every item came from an `AsRef<str>`, so the collected bytes
        // are UTF-8; `values_iter` is presumed to produce offsets that match
        // the item boundaries.
        unsafe { Self::new_unchecked(data_type, offsets, values) }
    }
}
impl<O: Offset> MutableUtf8ValuesArray<O> {
    /// Extends the array from an iterator of optional strings, recording the
    /// presence of each item in the caller-supplied `validity` bitmap.
    ///
    /// # Safety
    /// Forwards to the crate helper `extend_from_trusted_len_iter`; presumably
    /// the iterator must report an exact length through `size_hint` (confirm
    /// against that helper's contract).
    pub(crate) unsafe fn extend_from_trusted_len_iter<I, P>(
        &mut self,
        validity: &mut MutableBitmap,
        iterator: I,
    ) where
        P: AsRef<str>,
        I: Iterator<Item = Option<P>>,
    {
        let as_bytes = iterator.map(|item| item.map(StrAsBytes));
        extend_from_trusted_len_iter(&mut self.offsets, &mut self.values, validity, as_bytes);
    }

    /// Extends the array from a [`TrustedLen`] iterator of strings.
    #[inline]
    pub fn extend_trusted_len<I, P>(&mut self, iterator: I)
    where
        P: AsRef<str>,
        I: TrustedLen<Item = P>,
    {
        // SAFETY: the `TrustedLen` bound is taken as the guarantee that the
        // unchecked variant asks for.
        unsafe { self.extend_trusted_len_unchecked(iterator) }
    }

    /// Extends the array from an iterator of strings that is not statically
    /// known to be [`TrustedLen`].
    ///
    /// # Safety
    /// Forwards to the crate helper `extend_from_trusted_len_values_iter`;
    /// presumably the iterator must report an exact length through
    /// `size_hint` (confirm against that helper's contract).
    #[inline]
    pub unsafe fn extend_trusted_len_unchecked<I, P>(&mut self, iterator: I)
    where
        P: AsRef<str>,
        I: Iterator<Item = P>,
    {
        let as_bytes = iterator.map(StrAsBytes);
        extend_from_trusted_len_values_iter(&mut self.offsets, &mut self.values, as_bytes);
    }

    /// Creates a new array from a [`TrustedLen`] iterator of strings.
    #[inline]
    pub fn from_trusted_len_iter<I, P>(iterator: I) -> Self
    where
        P: AsRef<str>,
        I: TrustedLen<Item = P>,
    {
        // SAFETY: the `TrustedLen` bound is taken as the guarantee that the
        // unchecked variant asks for.
        unsafe { Self::from_trusted_len_iter_unchecked(iterator) }
    }

    /// Creates a new array from an iterator of strings that is not statically
    /// known to be [`TrustedLen`].
    ///
    /// # Safety
    /// Forwards to the crate helper `trusted_len_values_iter`; presumably the
    /// iterator must report an exact length through `size_hint` (confirm
    /// against that helper's contract).
    #[inline]
    pub unsafe fn from_trusted_len_iter_unchecked<I, P>(iterator: I) -> Self
    where
        P: AsRef<str>,
        I: Iterator<Item = P>,
    {
        let as_bytes = iterator.map(StrAsBytes);
        let (offsets, values) = trusted_len_values_iter(as_bytes);
        let data_type = Self::default_data_type();
        Self::new_unchecked(data_type, offsets, values)
    }

    /// Creates a new array from an iterator of strings, pushing items one by
    /// one.
    ///
    /// # Errors
    /// Returns the first error reported by [`TryPush::try_push`].
    pub fn try_from_iter<P: AsRef<str>, I: IntoIterator<Item = P>>(iter: I) -> PolarsResult<Self> {
        let mut items = iter.into_iter();
        // Pre-size the offsets from the iterator's lower bound.
        let mut array = Self::with_capacity(items.size_hint().0);
        items.try_for_each(|item| array.try_push(item))?;
        Ok(array)
    }

    /// Extends the array from an iterator of `Result`s, stopping at the first
    /// `Err` and returning it. Items pushed before the error stay in the
    /// array.
    ///
    /// # Panics
    /// If pushing a value fails, since values are added with [`Self::push`].
    pub fn extend_fallible<T, I, E>(&mut self, iter: I) -> std::result::Result<(), E>
    where
        E: std::error::Error,
        I: IntoIterator<Item = std::result::Result<T, E>>,
        T: AsRef<str>,
    {
        let items = iter.into_iter();
        let (lower_bound, _) = items.size_hint();
        self.reserve(lower_bound, 0);
        for item in items {
            self.push(item?);
        }
        Ok(())
    }
}
impl<O: Offset, T: AsRef<str>> Extend<T> for MutableUtf8ValuesArray<O> {
    /// Appends every string yielded by `iter`, delegating to the crate helper
    /// `extend_from_values_iter`.
    fn extend<I: IntoIterator<Item = T>>(&mut self, iter: I) {
        let as_bytes = iter.into_iter().map(StrAsBytes);
        let Self {
            offsets, values, ..
        } = self;
        extend_from_values_iter(offsets, values, as_bytes);
    }
}
impl<O: Offset, T: AsRef<str>> TryExtend<T> for MutableUtf8ValuesArray<O> {
    /// Appends every string yielded by `iter`, stopping at the first push
    /// that fails and returning its error. Items pushed before the failure
    /// stay in the array.
    fn try_extend<I: IntoIterator<Item = T>>(&mut self, iter: I) -> PolarsResult<()> {
        let items = iter.into_iter();
        // Pre-size the offsets from the iterator's lower bound.
        let (lower_bound, _) = items.size_hint();
        self.reserve(lower_bound, 0);
        for item in items {
            self.try_push(item)?;
        }
        Ok(())
    }
}
impl<O: Offset, T: AsRef<str>> TryPush<T> for MutableUtf8ValuesArray<O> {
    /// Appends `value` as a new item.
    ///
    /// # Errors
    /// Returns the error reported by the offsets' `try_push` when the new
    /// offset cannot be recorded. In that case the array is left exactly as it
    /// was before the call.
    #[inline]
    fn try_push(&mut self, value: T) -> PolarsResult<()> {
        let bytes = value.as_ref().as_bytes();
        let previous_len = self.values.len();
        self.values.extend_from_slice(bytes);

        let pushed = self.offsets.try_push(bytes.len());
        if pushed.is_err() {
            // Roll back the bytes appended above. Leaving them in place would
            // desynchronise `values` from `offsets`: later pushes would be
            // recorded at offsets pointing into these orphaned bytes, and
            // `value_unchecked` would then read the wrong (possibly
            // non-UTF-8-aligned) ranges as `str`.
            self.values.truncate(previous_len);
        }
        pushed
    }
}
impl<O: Offset> TryExtendFromSelf for MutableUtf8ValuesArray<O> {
    /// Appends all items of `other` to `self`.
    ///
    /// # Errors
    /// Returns the error reported by the offsets' `try_extend_from_self` when
    /// the combined offsets cannot be recorded. In that case the values buffer
    /// is restored to its length before the call.
    fn try_extend_from_self(&mut self, other: &Self) -> PolarsResult<()> {
        let previous_len = self.values.len();
        self.values.extend_from_slice(&other.values);

        let extended = self.offsets.try_extend_from_self(&other.offsets);
        if extended.is_err() {
            // Roll back the bytes appended above so that `values` never holds
            // bytes that no offset accounts for; otherwise later pushes would
            // be recorded at offsets pointing into the orphaned bytes.
            self.values.truncate(previous_len);
        }
        extended
    }
}