use arrow_array::*;
use arrow_array::{cast::AsArray, types::*};
use arrow_buffer::Buffer;
use arrow_data::ArrayData;
use arrow_schema::{ArrowError, DataType};
use std::sync::Arc;
macro_rules! unary_offsets {
($array: expr, $data_type: expr, $op: expr) => {{
let slice = $array.value_offsets();
let lengths = slice.windows(2).map(|offset| $op(offset[1] - offset[0]));
let buffer = unsafe { Buffer::from_trusted_len_iter(lengths) };
let null_bit_buffer = $array.nulls().map(|b| b.inner().sliced());
let data = unsafe {
ArrayData::new_unchecked(
$data_type,
$array.len(),
None,
null_bit_buffer,
0,
vec![buffer],
vec![],
)
};
make_array(data)
}};
}
macro_rules! kernel_dict {
($array: ident, $kernel: expr, $kt: ident, $($t: ident: $gt: ident), *) => {
match $kt.as_ref() {
$(&DataType::$t => {
let dict = $array
.as_any()
.downcast_ref::<DictionaryArray<$gt>>()
.unwrap_or_else(|| {
panic!("Expect 'DictionaryArray<{}>' but got array of data type {:?}",
stringify!($gt), $array.data_type())
});
let values = $kernel(dict.values())?;
let result = DictionaryArray::try_new(dict.keys().clone(), values)?;
Ok(Arc::new(result))
},
)*
t => panic!("Unsupported dictionary key type: {}", t)
}
}
}
fn length_list<O, T>(array: &dyn Array) -> ArrayRef
where
O: OffsetSizeTrait,
T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
{
let array = array
.as_any()
.downcast_ref::<GenericListArray<O>>()
.unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x)
}
fn length_list_fixed_size(array: &dyn Array, length: i32) -> ArrayRef {
let array = array.as_fixed_size_list();
let length_list = array.len();
let buffer = Buffer::from_vec(vec![length; length_list]);
let data = Int32Array::new(buffer.into(), array.nulls().cloned());
Arc::new(data)
}
fn length_binary<O, T>(array: &dyn Array) -> ArrayRef
where
O: OffsetSizeTrait,
T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
{
let array = array
.as_any()
.downcast_ref::<GenericBinaryArray<O>>()
.unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x)
}
fn length_string<O, T>(array: &dyn Array) -> ArrayRef
where
O: OffsetSizeTrait,
T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
{
let array = array
.as_any()
.downcast_ref::<GenericStringArray<O>>()
.unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x)
}
fn bit_length_binary<O, T>(array: &dyn Array) -> ArrayRef
where
O: OffsetSizeTrait,
T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
{
let array = array
.as_any()
.downcast_ref::<GenericBinaryArray<O>>()
.unwrap();
let bits_in_bytes = O::from_usize(8).unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes)
}
fn bit_length_string<O, T>(array: &dyn Array) -> ArrayRef
where
O: OffsetSizeTrait,
T: ArrowPrimitiveType,
T::Native: OffsetSizeTrait,
{
let array = array
.as_any()
.downcast_ref::<GenericStringArray<O>>()
.unwrap();
let bits_in_bytes = O::from_usize(8).unwrap();
unary_offsets!(array, T::DATA_TYPE, |x| x * bits_in_bytes)
}
pub fn length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
match array.data_type() {
DataType::Dictionary(kt, _) => {
kernel_dict!(
array,
|a| { length(a) },
kt,
Int8: Int8Type,
Int16: Int16Type,
Int32: Int32Type,
Int64: Int64Type,
UInt8: UInt8Type,
UInt16: UInt16Type,
UInt32: UInt32Type,
UInt64: UInt64Type
)
}
DataType::List(_) => Ok(length_list::<i32, Int32Type>(array)),
DataType::LargeList(_) => Ok(length_list::<i64, Int64Type>(array)),
DataType::Utf8 => Ok(length_string::<i32, Int32Type>(array)),
DataType::LargeUtf8 => Ok(length_string::<i64, Int64Type>(array)),
DataType::Binary => Ok(length_binary::<i32, Int32Type>(array)),
DataType::LargeBinary => Ok(length_binary::<i64, Int64Type>(array)),
DataType::FixedSizeList(_, len) => Ok(length_list_fixed_size(array, *len)),
other => Err(ArrowError::ComputeError(format!(
"length not supported for {other:?}"
))),
}
}
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
match array.data_type() {
DataType::Dictionary(kt, _) => {
kernel_dict!(
array,
|a| { bit_length(a) },
kt,
Int8: Int8Type,
Int16: Int16Type,
Int32: Int32Type,
Int64: Int64Type,
UInt8: UInt8Type,
UInt16: UInt16Type,
UInt32: UInt32Type,
UInt64: UInt64Type
)
}
DataType::Utf8 => Ok(bit_length_string::<i32, Int32Type>(array)),
DataType::LargeUtf8 => Ok(bit_length_string::<i64, Int64Type>(array)),
DataType::Binary => Ok(bit_length_binary::<i32, Int32Type>(array)),
DataType::LargeBinary => Ok(bit_length_binary::<i64, Int64Type>(array)),
other => Err(ArrowError::ComputeError(format!(
"bit_length not supported for {other:?}"
))),
}
}
#[cfg(test)]
mod tests {
use super::*;
use arrow_array::cast::AsArray;
use arrow_buffer::NullBuffer;
use arrow_schema::Field;
fn double_vec<T: Clone>(v: Vec<T>) -> Vec<T> {
[&v[..], &v[..]].concat()
}
fn length_cases_string() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
let mut values = vec!["one", "on", "o", ""];
let mut expected = vec![3, 2, 1, 0];
for _ in 0..10 {
values = double_vec(values);
expected = double_vec(expected);
}
vec![
(vec!["hello", " ", "world"], 3, vec![5, 1, 5]),
(vec!["hello", " ", "world", "!"], 4, vec![5, 1, 5, 1]),
(vec!["💖"], 1, vec![4]),
(values, 4096, expected),
]
}
macro_rules! length_binary_helper {
($offset_ty: ty, $result_ty: ty, $kernel: ident, $value: expr, $expected: expr) => {{
let array = GenericBinaryArray::<$offset_ty>::from($value);
let result = $kernel(&array).unwrap();
let result = result.as_any().downcast_ref::<$result_ty>().unwrap();
let expected: $result_ty = $expected.into();
assert_eq!(&expected, result);
}};
}
macro_rules! length_list_helper {
($offset_ty: ty, $result_ty: ty, $element_ty: ty, $value: expr, $expected: expr) => {{
let array =
GenericListArray::<$offset_ty>::from_iter_primitive::<$element_ty, _, _>(
$value,
);
let result = length(&array).unwrap();
let result = result.as_any().downcast_ref::<$result_ty>().unwrap();
let expected: $result_ty = $expected.into();
assert_eq!(&expected, result);
}};
}
#[test]
#[cfg_attr(miri, ignore)] fn length_test_string() {
length_cases_string()
.into_iter()
.for_each(|(input, len, expected)| {
let array = StringArray::from(input);
let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value, result.value(i));
});
})
}
#[test]
#[cfg_attr(miri, ignore)] fn length_test_large_string() {
length_cases_string()
.into_iter()
.for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value as i64, result.value(i));
});
})
}
#[test]
fn length_test_binary() {
let value: Vec<&[u8]> = vec![b"zero", b"one", &[0xff, 0xf8]];
let result: Vec<i32> = vec![4, 3, 2];
length_binary_helper!(i32, Int32Array, length, value, result)
}
#[test]
fn length_test_large_binary() {
let value: Vec<&[u8]> = vec![b"zero", &[0xff, 0xf8], b"two"];
let result: Vec<i64> = vec![4, 2, 3];
length_binary_helper!(i64, Int64Array, length, value, result)
}
#[test]
fn length_test_list() {
let value = vec![
Some(vec![]),
Some(vec![Some(1), Some(2), Some(4)]),
Some(vec![Some(0)]),
];
let result: Vec<i32> = vec![0, 3, 1];
length_list_helper!(i32, Int32Array, Int32Type, value, result)
}
#[test]
fn length_test_large_list() {
let value = vec![
Some(vec![]),
Some(vec![Some(1.1), Some(2.2), Some(3.3)]),
Some(vec![None]),
];
let result: Vec<i64> = vec![0, 3, 1];
length_list_helper!(i64, Int64Array, Float32Type, value, result)
}
type OptionStr = Option<&'static str>;
fn length_null_cases_string() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
vec![(
vec![Some("one"), None, Some("three"), Some("four")],
4,
vec![Some(3), None, Some(5), Some(4)],
)]
}
#[test]
fn length_null_string() {
length_null_cases_string()
.into_iter()
.for_each(|(input, len, expected)| {
let array = StringArray::from(input);
let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
let expected: Int32Array = expected.into();
assert_eq!(&expected, result);
})
}
#[test]
fn length_null_large_string() {
length_null_cases_string()
.into_iter()
.for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
let result = length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
let expected: Int64Array = expected
.iter()
.map(|e| e.map(|e| e as i64))
.collect::<Vec<_>>()
.into();
assert_eq!(&expected, result);
})
}
#[test]
fn length_null_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"zero"), None, Some(&[0xff, 0xf8]), Some(b"three")];
let result: Vec<Option<i32>> = vec![Some(4), None, Some(2), Some(5)];
length_binary_helper!(i32, Int32Array, length, value, result)
}
#[test]
fn length_null_large_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(&[0xff, 0xf8]), None, Some(b"two"), Some(b"three")];
let result: Vec<Option<i64>> = vec![Some(2), None, Some(3), Some(5)];
length_binary_helper!(i64, Int64Array, length, value, result)
}
#[test]
fn length_null_list() {
let value = vec![
Some(vec![]),
None,
Some(vec![Some(1), None, Some(2), Some(4)]),
Some(vec![Some(0)]),
];
let result: Vec<Option<i32>> = vec![Some(0), None, Some(4), Some(1)];
length_list_helper!(i32, Int32Array, Int8Type, value, result)
}
#[test]
fn length_null_large_list() {
let value = vec![
Some(vec![]),
None,
Some(vec![Some(1.1), None, Some(4.0)]),
Some(vec![Some(0.1)]),
];
let result: Vec<Option<i64>> = vec![Some(0), None, Some(3), Some(1)];
length_list_helper!(i64, Int64Array, Float32Type, value, result)
}
#[test]
fn length_wrong_type() {
let array: UInt64Array = vec![1u64].into();
assert!(length(&array).is_err());
}
#[test]
fn length_offsets_string() {
let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]);
let b = a.slice(1, 3);
let result = length(&b).unwrap();
let result: &Int32Array = result.as_primitive();
let expected = Int32Array::from(vec![Some(1), Some(5), None]);
assert_eq!(&expected, result);
}
#[test]
fn length_offsets_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"hello"), Some(b" "), Some(&[0xff, 0xf8]), None];
let a = BinaryArray::from(value);
let b = a.slice(1, 3);
let result = length(&b).unwrap();
let result: &Int32Array = result.as_primitive();
let expected = Int32Array::from(vec![Some(1), Some(2), None]);
assert_eq!(&expected, result);
}
fn bit_length_cases() -> Vec<(Vec<&'static str>, usize, Vec<i32>)> {
let mut values = vec!["one", "on", "o", ""];
let mut expected = vec![24, 16, 8, 0];
for _ in 0..10 {
values = double_vec(values);
expected = double_vec(expected);
}
vec![
(vec!["hello", " ", "world", "!"], 4, vec![40, 8, 40, 8]),
(vec!["💖"], 1, vec![32]),
(vec!["josé"], 1, vec![40]),
(values, 4096, expected),
]
}
#[test]
#[cfg_attr(miri, ignore)] fn bit_length_test_string() {
bit_length_cases()
.into_iter()
.for_each(|(input, len, expected)| {
let array = StringArray::from(input);
let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value, result.value(i));
});
})
}
#[test]
#[cfg_attr(miri, ignore)] fn bit_length_test_large_string() {
bit_length_cases()
.into_iter()
.for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
expected.iter().enumerate().for_each(|(i, value)| {
assert_eq!(*value as i64, result.value(i));
});
})
}
#[test]
fn bit_length_binary() {
let value: Vec<&[u8]> = vec![b"one", &[0xff, 0xf8], b"three"];
let expected: Vec<i32> = vec![24, 16, 40];
length_binary_helper!(i32, Int32Array, bit_length, value, expected)
}
#[test]
fn bit_length_large_binary() {
let value: Vec<&[u8]> = vec![b"zero", b" ", &[0xff, 0xf8]];
let expected: Vec<i64> = vec![32, 8, 16];
length_binary_helper!(i64, Int64Array, bit_length, value, expected)
}
fn bit_length_null_cases() -> Vec<(Vec<OptionStr>, usize, Vec<Option<i32>>)> {
vec![(
vec![Some("one"), None, Some("three"), Some("four")],
4,
vec![Some(24), None, Some(40), Some(32)],
)]
}
#[test]
fn bit_length_null_string() {
bit_length_null_cases()
.into_iter()
.for_each(|(input, len, expected)| {
let array = StringArray::from(input);
let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int32Array>().unwrap();
let expected: Int32Array = expected.into();
assert_eq!(&expected, result);
})
}
#[test]
fn bit_length_null_large_string() {
bit_length_null_cases()
.into_iter()
.for_each(|(input, len, expected)| {
let array = LargeStringArray::from(input);
let result = bit_length(&array).unwrap();
assert_eq!(len, result.len());
let result = result.as_any().downcast_ref::<Int64Array>().unwrap();
let expected: Int64Array = expected
.iter()
.map(|e| e.map(|e| e as i64))
.collect::<Vec<_>>()
.into();
assert_eq!(&expected, result);
})
}
#[test]
fn bit_length_null_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"one"), None, Some(b"three"), Some(&[0xff, 0xf8])];
let expected: Vec<Option<i32>> = vec![Some(24), None, Some(40), Some(16)];
length_binary_helper!(i32, Int32Array, bit_length, value, expected)
}
#[test]
fn bit_length_null_large_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"one"), None, Some(&[0xff, 0xf8]), Some(b"four")];
let expected: Vec<Option<i64>> = vec![Some(24), None, Some(16), Some(32)];
length_binary_helper!(i64, Int64Array, bit_length, value, expected)
}
#[test]
fn bit_length_wrong_type() {
let array: UInt64Array = vec![1u64].into();
assert!(bit_length(&array).is_err());
}
#[test]
fn bit_length_offsets_string() {
let a = StringArray::from(vec![Some("hello"), Some(" "), Some("world"), None]);
let b = a.slice(1, 3);
let result = bit_length(&b).unwrap();
let result: &Int32Array = result.as_primitive();
let expected = Int32Array::from(vec![Some(8), Some(40), None]);
assert_eq!(&expected, result);
}
#[test]
fn bit_length_offsets_binary() {
let value: Vec<Option<&[u8]>> =
vec![Some(b"hello"), Some(&[]), Some(b"world"), None];
let a = BinaryArray::from(value);
let b = a.slice(1, 3);
let result = bit_length(&b).unwrap();
let result: &Int32Array = result.as_primitive();
let expected = Int32Array::from(vec![Some(0), Some(40), None]);
assert_eq!(&expected, result);
}
#[test]
fn length_dictionary() {
_length_dictionary::<Int8Type>();
_length_dictionary::<Int16Type>();
_length_dictionary::<Int32Type>();
_length_dictionary::<Int64Type>();
_length_dictionary::<UInt8Type>();
_length_dictionary::<UInt16Type>();
_length_dictionary::<UInt32Type>();
_length_dictionary::<UInt64Type>();
}
fn _length_dictionary<K: ArrowDictionaryKeyType>() {
const TOTAL: i32 = 100;
let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"];
let data: Vec<Option<&str>> = (0..TOTAL)
.map(|n| {
let i = n % 5;
if i == 3 {
None
} else {
Some(v[i as usize])
}
})
.collect();
let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
let expected: Vec<Option<i32>> =
data.iter().map(|opt| opt.map(|s| s.len() as i32)).collect();
let res = length(&dict_array).unwrap();
let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
let actual: Vec<Option<i32>> = actual
.values()
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
.take_iter(dict_array.keys_iter())
.collect();
for i in 0..TOTAL as usize {
assert_eq!(expected[i], actual[i],);
}
}
#[test]
fn bit_length_dictionary() {
_bit_length_dictionary::<Int8Type>();
_bit_length_dictionary::<Int16Type>();
_bit_length_dictionary::<Int32Type>();
_bit_length_dictionary::<Int64Type>();
_bit_length_dictionary::<UInt8Type>();
_bit_length_dictionary::<UInt16Type>();
_bit_length_dictionary::<UInt32Type>();
_bit_length_dictionary::<UInt64Type>();
}
fn _bit_length_dictionary<K: ArrowDictionaryKeyType>() {
const TOTAL: i32 = 100;
let v = ["aaaa", "bb", "ccccc", "ddd", "eeeeee"];
let data: Vec<Option<&str>> = (0..TOTAL)
.map(|n| {
let i = n % 5;
if i == 3 {
None
} else {
Some(v[i as usize])
}
})
.collect();
let dict_array: DictionaryArray<K> = data.clone().into_iter().collect();
let expected: Vec<Option<i32>> = data
.iter()
.map(|opt| opt.map(|s| (s.chars().count() * 8) as i32))
.collect();
let res = bit_length(&dict_array).unwrap();
let actual = res.as_any().downcast_ref::<DictionaryArray<K>>().unwrap();
let actual: Vec<Option<i32>> = actual
.values()
.as_any()
.downcast_ref::<Int32Array>()
.unwrap()
.take_iter(dict_array.keys_iter())
.collect();
for i in 0..TOTAL as usize {
assert_eq!(expected[i], actual[i],);
}
}
#[test]
fn test_fixed_size_list_length() {
let value_data = ArrayData::builder(DataType::Int32)
.len(9)
.add_buffer(Buffer::from_slice_ref([0, 1, 2, 3, 4, 5, 6, 7, 8]))
.build()
.unwrap();
let list_data_type = DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::Int32, false)),
3,
);
let nulls = NullBuffer::from(vec![true, false, true]);
let list_data = ArrayData::builder(list_data_type)
.len(3)
.add_child_data(value_data)
.nulls(Some(nulls))
.build()
.unwrap();
let list_array = FixedSizeListArray::from(list_data);
let lengths = length(&list_array).unwrap();
let lengths = lengths.as_any().downcast_ref::<Int32Array>().unwrap();
assert_eq!(lengths.len(), 3);
assert_eq!(lengths.value(0), 3);
assert!(lengths.is_null(1));
assert_eq!(lengths.value(2), 3);
}
}