#[cfg(not(feature = "force_hash_collisions"))]
use std::sync::Arc;
use ahash::RandomState;
use arrow::array::*;
use arrow::datatypes::*;
use arrow::row::Rows;
#[cfg(not(feature = "force_hash_collisions"))]
use arrow::{downcast_dictionary_array, downcast_primitive_array};
use arrow_buffer::IntervalDayTime;
use arrow_buffer::IntervalMonthDayNano;
#[cfg(not(feature = "force_hash_collisions"))]
use crate::cast::{
as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
as_primitive_array, as_string_array, as_string_view_array, as_struct_array,
};
use crate::error::Result;
#[cfg(not(feature = "force_hash_collisions"))]
use crate::error::_internal_err;
#[inline]
pub fn combine_hashes(l: u64, r: u64) -> u64 {
let hash = (17 * 37u64).wrapping_add(l);
hash.wrapping_mul(37).wrapping_add(r)
}
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
if mul_col {
hashes_buffer.iter_mut().for_each(|hash| {
*hash = combine_hashes(random_state.hash_one(1), *hash);
})
} else {
hashes_buffer.iter_mut().for_each(|hash| {
*hash = random_state.hash_one(1);
})
}
}
pub trait HashValue {
fn hash_one(&self, state: &RandomState) -> u64;
}
impl<'a, T: HashValue + ?Sized> HashValue for &'a T {
fn hash_one(&self, state: &RandomState) -> u64 {
T::hash_one(self, state)
}
}
macro_rules! hash_value {
($($t:ty),+) => {
$(impl HashValue for $t {
fn hash_one(&self, state: &RandomState) -> u64 {
state.hash_one(self)
}
})+
};
}
hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
macro_rules! hash_float_value {
($(($t:ty, $i:ty)),+) => {
$(impl HashValue for $t {
fn hash_one(&self, state: &RandomState) -> u64 {
state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
}
})+
};
}
hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_array_primitive<T>(
array: &PrimitiveArray<T>,
random_state: &RandomState,
hashes_buffer: &mut [u64],
rehash: bool,
) where
T: ArrowPrimitiveType,
<T as arrow_array::ArrowPrimitiveType>::Native: HashValue,
{
assert_eq!(
hashes_buffer.len(),
array.len(),
"hashes_buffer and array should be of equal length"
);
if array.null_count() == 0 {
if rehash {
for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
*hash = combine_hashes(value.hash_one(random_state), *hash);
}
} else {
for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
*hash = value.hash_one(random_state);
}
}
} else if rehash {
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
if !array.is_null(i) {
let value = unsafe { array.value_unchecked(i) };
*hash = combine_hashes(value.hash_one(random_state), *hash);
}
}
} else {
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
if !array.is_null(i) {
let value = unsafe { array.value_unchecked(i) };
*hash = value.hash_one(random_state);
}
}
}
}
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_array<T>(
array: T,
random_state: &RandomState,
hashes_buffer: &mut [u64],
rehash: bool,
) where
T: ArrayAccessor,
T::Item: HashValue,
{
assert_eq!(
hashes_buffer.len(),
array.len(),
"hashes_buffer and array should be of equal length"
);
if array.null_count() == 0 {
if rehash {
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
let value = unsafe { array.value_unchecked(i) };
*hash = combine_hashes(value.hash_one(random_state), *hash);
}
} else {
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
let value = unsafe { array.value_unchecked(i) };
*hash = value.hash_one(random_state);
}
}
} else if rehash {
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
if !array.is_null(i) {
let value = unsafe { array.value_unchecked(i) };
*hash = combine_hashes(value.hash_one(random_state), *hash);
}
}
} else {
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
if !array.is_null(i) {
let value = unsafe { array.value_unchecked(i) };
*hash = value.hash_one(random_state);
}
}
}
}
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_dictionary<K: ArrowDictionaryKeyType>(
array: &DictionaryArray<K>,
random_state: &RandomState,
hashes_buffer: &mut [u64],
multi_col: bool,
) -> Result<()> {
let values = Arc::clone(array.values());
let mut dict_hashes = vec![0; values.len()];
create_hashes(&[values], random_state, &mut dict_hashes)?;
if multi_col {
for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
if let Some(key) = key {
*hash = combine_hashes(dict_hashes[key.as_usize()], *hash)
} }
} else {
for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
if let Some(key) = key {
*hash = dict_hashes[key.as_usize()]
} }
}
Ok(())
}
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_struct_array(
array: &StructArray,
random_state: &RandomState,
hashes_buffer: &mut [u64],
) -> Result<()> {
let nulls = array.nulls();
let row_len = array.len();
let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
nulls.valid_indices().collect()
} else {
(0..row_len).collect()
};
let mut values_hashes = vec![0u64; row_len];
create_hashes(array.columns(), random_state, &mut values_hashes)?;
for i in valid_row_indices {
let hash = &mut hashes_buffer[i];
*hash = combine_hashes(*hash, values_hashes[i]);
}
Ok(())
}
fn hash_map_array(
array: &MapArray,
random_state: &RandomState,
hashes_buffer: &mut [u64],
) -> Result<()> {
let nulls = array.nulls();
let offsets = array.offsets();
let mut values_hashes = vec![0u64; array.entries().len()];
create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
if let Some(nulls) = nulls {
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
if nulls.is_valid(i) {
let hash = &mut hashes_buffer[i];
for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
*hash = combine_hashes(*hash, *values_hash);
}
}
}
} else {
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
let hash = &mut hashes_buffer[i];
for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
*hash = combine_hashes(*hash, *values_hash);
}
}
}
Ok(())
}
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_list_array<OffsetSize>(
array: &GenericListArray<OffsetSize>,
random_state: &RandomState,
hashes_buffer: &mut [u64],
) -> Result<()>
where
OffsetSize: OffsetSizeTrait,
{
let values = Arc::clone(array.values());
let offsets = array.value_offsets();
let nulls = array.nulls();
let mut values_hashes = vec![0u64; values.len()];
create_hashes(&[values], random_state, &mut values_hashes)?;
if let Some(nulls) = nulls {
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
if nulls.is_valid(i) {
let hash = &mut hashes_buffer[i];
for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
*hash = combine_hashes(*hash, *values_hash);
}
}
}
} else {
for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
let hash = &mut hashes_buffer[i];
for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
*hash = combine_hashes(*hash, *values_hash);
}
}
}
Ok(())
}
#[cfg(not(feature = "force_hash_collisions"))]
fn hash_fixed_list_array(
array: &FixedSizeListArray,
random_state: &RandomState,
hashes_buffer: &mut [u64],
) -> Result<()> {
let values = Arc::clone(array.values());
let value_len = array.value_length();
let offset_size = value_len as usize / array.len();
let nulls = array.nulls();
let mut values_hashes = vec![0u64; values.len()];
create_hashes(&[values], random_state, &mut values_hashes)?;
if let Some(nulls) = nulls {
for i in 0..array.len() {
if nulls.is_valid(i) {
let hash = &mut hashes_buffer[i];
for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size]
{
*hash = combine_hashes(*hash, *values_hash);
}
}
}
} else {
for i in 0..array.len() {
let hash = &mut hashes_buffer[i];
for values_hash in &values_hashes[i * offset_size..(i + 1) * offset_size] {
*hash = combine_hashes(*hash, *values_hash);
}
}
}
Ok(())
}
#[cfg(feature = "force_hash_collisions")]
pub fn create_hashes<'a>(
_arrays: &[ArrayRef],
_random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
for hash in hashes_buffer.iter_mut() {
*hash = 0
}
Ok(hashes_buffer)
}
#[cfg(feature = "force_hash_collisions")]
pub fn create_row_hashes<'a>(
_rows: &[Vec<u8>],
_random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
for hash in hashes_buffer.iter_mut() {
*hash = 0
}
Ok(hashes_buffer)
}
#[cfg(not(feature = "force_hash_collisions"))]
pub fn create_row_hashes<'a>(
rows: &[Vec<u8>],
random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
for hash in hashes_buffer.iter_mut() {
*hash = 0
}
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
*hash = random_state.hash_one(&rows[i]);
}
Ok(hashes_buffer)
}
#[cfg(not(feature = "force_hash_collisions"))]
pub fn create_hashes<'a>(
arrays: &[ArrayRef],
random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
for (i, col) in arrays.iter().enumerate() {
let array = col.as_ref();
let rehash = i >= 1;
downcast_primitive_array! {
array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
DataType::Null => hash_null(random_state, hashes_buffer, rehash),
DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash),
DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
DataType::FixedSizeBinary(_) => {
let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
hash_array(array, random_state, hashes_buffer, rehash)
}
DataType::Decimal128(_, _) => {
let array = as_primitive_array::<Decimal128Type>(array)?;
hash_array_primitive(array, random_state, hashes_buffer, rehash)
}
DataType::Decimal256(_, _) => {
let array = as_primitive_array::<Decimal256Type>(array)?;
hash_array_primitive(array, random_state, hashes_buffer, rehash)
}
DataType::Dictionary(_, _) => downcast_dictionary_array! {
array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
_ => unreachable!()
}
DataType::Struct(_) => {
let array = as_struct_array(array)?;
hash_struct_array(array, random_state, hashes_buffer)?;
}
DataType::List(_) => {
let array = as_list_array(array)?;
hash_list_array(array, random_state, hashes_buffer)?;
}
DataType::LargeList(_) => {
let array = as_large_list_array(array)?;
hash_list_array(array, random_state, hashes_buffer)?;
}
DataType::Map(_, _) => {
let array = as_map_array(array)?;
hash_map_array(array, random_state, hashes_buffer)?;
}
DataType::FixedSizeList(_,_) => {
let array = as_fixed_size_list_array(array)?;
hash_fixed_list_array(array, random_state, hashes_buffer)?;
}
_ => {
return _internal_err!(
"Unsupported data type in hasher: {}",
col.data_type()
);
}
}
}
Ok(hashes_buffer)
}
#[cfg(feature = "force_hash_collisions")]
pub fn create_row_hashes_v2<'a>(
_rows: &Rows,
_random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
for hash in hashes_buffer.iter_mut() {
*hash = 0
}
Ok(hashes_buffer)
}
#[cfg(not(feature = "force_hash_collisions"))]
pub fn create_row_hashes_v2<'a>(
rows: &Rows,
random_state: &RandomState,
hashes_buffer: &'a mut Vec<u64>,
) -> Result<&'a mut Vec<u64>> {
for hash in hashes_buffer.iter_mut() {
*hash = 0
}
for (i, hash) in hashes_buffer.iter_mut().enumerate() {
*hash = random_state.hash_one(rows.row(i));
}
Ok(hashes_buffer)
}
#[cfg(test)]
mod tests {
use std::sync::Arc;
use arrow::array::*;
#[cfg(not(feature = "force_hash_collisions"))]
use arrow::datatypes::*;
use super::*;
#[test]
fn create_hashes_for_decimal_array() -> Result<()> {
let array = vec![1, 2, 3, 4]
.into_iter()
.map(Some)
.collect::<Decimal128Array>()
.with_precision_and_scale(20, 3)
.unwrap();
let array_ref = Arc::new(array);
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let hashes_buff = &mut vec![0; array_ref.len()];
let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
assert_eq!(hashes.len(), 4);
Ok(())
}
#[test]
fn create_hashes_for_float_arrays() -> Result<()> {
let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let hashes_buff = &mut vec![0; f32_arr.len()];
let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
assert_eq!(hashes.len(), 4,);
let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
assert_eq!(hashes.len(), 4,);
Ok(())
}
macro_rules! create_hash_binary {
($NAME:ident, $ARRAY:ty) => {
#[cfg(not(feature = "force_hash_collisions"))]
#[test]
fn $NAME() {
let binary = [
Some(b"short".to_byte_slice()),
None,
Some(b"long but different 12 bytes string"),
Some(b"short2"),
Some(b"Longer than 12 bytes string"),
Some(b"short"),
Some(b"Longer than 12 bytes string"),
];
let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut binary_hashes = vec![0; binary.len()];
create_hashes(&[binary_array], &random_state, &mut binary_hashes)
.unwrap();
let mut ref_hashes = vec![0; binary.len()];
create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
match val {
Some(_) => assert_ne!(*hash, 0),
None => assert_eq!(*hash, 0),
}
}
assert_eq!(binary_hashes, ref_hashes);
assert_eq!(binary[0], binary[5]);
assert_eq!(binary[4], binary[6]);
assert_ne!(binary[0], binary[2]);
}
};
}
create_hash_binary!(binary_array, BinaryArray);
create_hash_binary!(binary_view_array, BinaryViewArray);
#[test]
fn create_hashes_fixed_size_binary() -> Result<()> {
let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
let fixed_size_binary_array =
Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
let hashes =
create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
assert_eq!(hashes.len(), 3,);
Ok(())
}
macro_rules! create_hash_string {
($NAME:ident, $ARRAY:ty) => {
#[cfg(not(feature = "force_hash_collisions"))]
#[test]
fn $NAME() {
let strings = [
Some("short"),
None,
Some("long but different 12 bytes string"),
Some("short2"),
Some("Longer than 12 bytes string"),
Some("short"),
Some("Longer than 12 bytes string"),
];
let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
let dict_array = Arc::new(
strings
.iter()
.cloned()
.collect::<DictionaryArray<Int8Type>>(),
);
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut string_hashes = vec![0; strings.len()];
create_hashes(&[string_array], &random_state, &mut string_hashes)
.unwrap();
let mut dict_hashes = vec![0; strings.len()];
create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
for (val, hash) in strings.iter().zip(string_hashes.iter()) {
match val {
Some(_) => assert_ne!(*hash, 0),
None => assert_eq!(*hash, 0),
}
}
assert_eq!(string_hashes, dict_hashes);
assert_eq!(strings[0], strings[5]);
assert_eq!(strings[4], strings[6]);
assert_ne!(strings[0], strings[2]);
}
};
}
create_hash_string!(string_array, StringArray);
create_hash_string!(large_string_array, LargeStringArray);
create_hash_string!(string_view_array, StringArray);
create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_dict_arrays() {
let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
let string_array = Arc::new(strings.iter().cloned().collect::<StringArray>());
let dict_array = Arc::new(
strings
.iter()
.cloned()
.collect::<DictionaryArray<Int8Type>>(),
);
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut string_hashes = vec![0; strings.len()];
create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
let mut dict_hashes = vec![0; strings.len()];
create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
for (val, hash) in strings.iter().zip(string_hashes.iter()) {
match val {
Some(_) => assert_ne!(*hash, 0),
None => assert_eq!(*hash, 0),
}
}
assert_eq!(string_hashes, dict_hashes);
assert_eq!(strings[1], strings[4]);
assert_eq!(dict_hashes[1], dict_hashes[4]);
assert_eq!(strings[0], strings[3]);
assert_eq!(dict_hashes[0], dict_hashes[3]);
assert_ne!(strings[0], strings[2]);
assert_ne!(dict_hashes[0], dict_hashes[2]);
}
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_list_arrays() {
let data = vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), None, Some(5)]),
Some(vec![Some(3), None, Some(5)]),
None,
Some(vec![Some(0), Some(1), Some(2)]),
Some(vec![]),
];
let list_array =
Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut hashes = vec![0; list_array.len()];
create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
assert_eq!(hashes[0], hashes[5]);
assert_eq!(hashes[1], hashes[4]);
assert_eq!(hashes[2], hashes[3]);
assert_eq!(hashes[1], hashes[6]); }
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_fixed_size_list_arrays() {
let data = vec![
Some(vec![Some(0), Some(1), Some(2)]),
None,
Some(vec![Some(3), None, Some(5)]),
Some(vec![Some(3), None, Some(5)]),
None,
Some(vec![Some(0), Some(1), Some(2)]),
];
let list_array =
Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
data, 3,
)) as ArrayRef;
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut hashes = vec![0; list_array.len()];
create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
assert_eq!(hashes[0], hashes[5]);
assert_eq!(hashes[1], hashes[4]);
assert_eq!(hashes[2], hashes[3]);
}
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_struct_arrays() {
use arrow_buffer::Buffer;
let boolarr = Arc::new(BooleanArray::from(vec![
false, false, true, true, true, true,
]));
let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
let struct_array = StructArray::from((
vec![
(
Arc::new(Field::new("bool", DataType::Boolean, false)),
Arc::clone(&boolarr) as ArrayRef,
),
(
Arc::new(Field::new("i32", DataType::Int32, false)),
Arc::clone(&i32arr) as ArrayRef,
),
(
Arc::new(Field::new("i32", DataType::Int32, false)),
Arc::clone(&i32arr) as ArrayRef,
),
(
Arc::new(Field::new("bool", DataType::Boolean, false)),
Arc::clone(&boolarr) as ArrayRef,
),
],
Buffer::from(&[0b001011]),
));
assert!(struct_array.is_valid(0));
assert!(struct_array.is_valid(1));
assert!(struct_array.is_null(2));
assert!(struct_array.is_valid(3));
assert!(struct_array.is_null(4));
assert!(struct_array.is_null(5));
let array = Arc::new(struct_array) as ArrayRef;
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut hashes = vec![0; array.len()];
create_hashes(&[array], &random_state, &mut hashes).unwrap();
assert_eq!(hashes[0], hashes[1]);
assert_ne!(hashes[2], hashes[3]);
assert_eq!(hashes[4], hashes[5]);
}
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_struct_arrays_more_column_than_row() {
let struct_array = StructArray::from(vec![
(
Arc::new(Field::new("bool", DataType::Boolean, false)),
Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
),
(
Arc::new(Field::new("i32-1", DataType::Int32, false)),
Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
),
(
Arc::new(Field::new("i32-2", DataType::Int32, false)),
Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
),
(
Arc::new(Field::new("i32-3", DataType::Int32, false)),
Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
),
]);
assert!(struct_array.is_valid(0));
assert!(struct_array.is_valid(1));
let array = Arc::new(struct_array) as ArrayRef;
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut hashes = vec![0; array.len()];
create_hashes(&[array], &random_state, &mut hashes).unwrap();
assert_eq!(hashes[0], hashes[1]);
}
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_hashes_for_map_arrays() {
let mut builder =
MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
builder.keys().append_value("key1");
builder.keys().append_value("key2");
builder.values().append_value(1);
builder.values().append_value(2);
builder.append(true).unwrap();
builder.keys().append_value("key1");
builder.keys().append_value("key2");
builder.values().append_value(1);
builder.values().append_value(2);
builder.append(true).unwrap();
builder.keys().append_value("key1");
builder.keys().append_value("key2");
builder.values().append_value(1);
builder.values().append_value(3);
builder.append(true).unwrap();
builder.keys().append_value("key1");
builder.keys().append_value("key3");
builder.values().append_value(1);
builder.values().append_value(2);
builder.append(true).unwrap();
builder.keys().append_value("key1");
builder.values().append_value(1);
builder.append(true).unwrap();
builder.keys().append_value("key1");
builder.values().append_null();
builder.append(true).unwrap();
builder.append(true).unwrap();
builder.keys().append_value("key1");
builder.values().append_value(1);
builder.append(false).unwrap();
let array = Arc::new(builder.finish()) as ArrayRef;
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut hashes = vec![0; array.len()];
create_hashes(&[array], &random_state, &mut hashes).unwrap();
assert_eq!(hashes[0], hashes[1]); assert_ne!(hashes[0], hashes[2]); assert_ne!(hashes[0], hashes[3]); assert_ne!(hashes[0], hashes[4]); assert_ne!(hashes[4], hashes[5]); assert_eq!(hashes[6], hashes[7]); }
#[test]
#[cfg(not(feature = "force_hash_collisions"))]
fn create_multi_column_hash_for_dict_arrays() {
let strings1 = [Some("foo"), None, Some("bar")];
let strings2 = [Some("blarg"), Some("blah"), None];
let string_array = Arc::new(strings1.iter().cloned().collect::<StringArray>());
let dict_array = Arc::new(
strings2
.iter()
.cloned()
.collect::<DictionaryArray<Int32Type>>(),
);
let random_state = RandomState::with_seeds(0, 0, 0, 0);
let mut one_col_hashes = vec![0; strings1.len()];
create_hashes(
&[Arc::clone(&dict_array) as ArrayRef],
&random_state,
&mut one_col_hashes,
)
.unwrap();
let mut two_col_hashes = vec![0; strings1.len()];
create_hashes(
&[dict_array, string_array],
&random_state,
&mut two_col_hashes,
)
.unwrap();
assert_eq!(one_col_hashes.len(), 3);
assert_eq!(two_col_hashes.len(), 3);
assert_ne!(one_col_hashes, two_col_hashes);
}
}