datafusion_common/
hash_utils.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Functionality used both on logical and physical plans
19
20#[cfg(not(feature = "force_hash_collisions"))]
21use std::sync::Arc;
22
23use ahash::RandomState;
24use arrow::array::types::{IntervalDayTime, IntervalMonthDayNano};
25use arrow::array::*;
26use arrow::datatypes::*;
27#[cfg(not(feature = "force_hash_collisions"))]
28use arrow::{downcast_dictionary_array, downcast_primitive_array};
29
30#[cfg(not(feature = "force_hash_collisions"))]
31use crate::cast::{
32    as_binary_view_array, as_boolean_array, as_fixed_size_list_array,
33    as_generic_binary_array, as_large_list_array, as_list_array, as_map_array,
34    as_string_array, as_string_view_array, as_struct_array,
35};
36use crate::error::Result;
37#[cfg(not(feature = "force_hash_collisions"))]
38use crate::error::_internal_err;
39
40// Combines two hashes into one hash
41#[inline]
42pub fn combine_hashes(l: u64, r: u64) -> u64 {
43    let hash = (17 * 37u64).wrapping_add(l);
44    hash.wrapping_mul(37).wrapping_add(r)
45}
46
47#[cfg(not(feature = "force_hash_collisions"))]
48fn hash_null(random_state: &RandomState, hashes_buffer: &'_ mut [u64], mul_col: bool) {
49    if mul_col {
50        hashes_buffer.iter_mut().for_each(|hash| {
51            // stable hash for null value
52            *hash = combine_hashes(random_state.hash_one(1), *hash);
53        })
54    } else {
55        hashes_buffer.iter_mut().for_each(|hash| {
56            *hash = random_state.hash_one(1);
57        })
58    }
59}
60
61pub trait HashValue {
62    fn hash_one(&self, state: &RandomState) -> u64;
63}
64
65impl<T: HashValue + ?Sized> HashValue for &T {
66    fn hash_one(&self, state: &RandomState) -> u64 {
67        T::hash_one(self, state)
68    }
69}
70
71macro_rules! hash_value {
72    ($($t:ty),+) => {
73        $(impl HashValue for $t {
74            fn hash_one(&self, state: &RandomState) -> u64 {
75                state.hash_one(self)
76            }
77        })+
78    };
79}
80hash_value!(i8, i16, i32, i64, i128, i256, u8, u16, u32, u64);
81hash_value!(bool, str, [u8], IntervalDayTime, IntervalMonthDayNano);
82
83macro_rules! hash_float_value {
84    ($(($t:ty, $i:ty)),+) => {
85        $(impl HashValue for $t {
86            fn hash_one(&self, state: &RandomState) -> u64 {
87                state.hash_one(<$i>::from_ne_bytes(self.to_ne_bytes()))
88            }
89        })+
90    };
91}
92hash_float_value!((half::f16, u16), (f32, u32), (f64, u64));
93
94/// Builds hash values of PrimitiveArray and writes them into `hashes_buffer`
95/// If `rehash==true` this combines the previous hash value in the buffer
96/// with the new hash using `combine_hashes`
97#[cfg(not(feature = "force_hash_collisions"))]
98fn hash_array_primitive<T>(
99    array: &PrimitiveArray<T>,
100    random_state: &RandomState,
101    hashes_buffer: &mut [u64],
102    rehash: bool,
103) where
104    T: ArrowPrimitiveType<Native: HashValue>,
105{
106    assert_eq!(
107        hashes_buffer.len(),
108        array.len(),
109        "hashes_buffer and array should be of equal length"
110    );
111
112    if array.null_count() == 0 {
113        if rehash {
114            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
115                *hash = combine_hashes(value.hash_one(random_state), *hash);
116            }
117        } else {
118            for (hash, &value) in hashes_buffer.iter_mut().zip(array.values().iter()) {
119                *hash = value.hash_one(random_state);
120            }
121        }
122    } else if rehash {
123        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
124            if !array.is_null(i) {
125                let value = unsafe { array.value_unchecked(i) };
126                *hash = combine_hashes(value.hash_one(random_state), *hash);
127            }
128        }
129    } else {
130        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
131            if !array.is_null(i) {
132                let value = unsafe { array.value_unchecked(i) };
133                *hash = value.hash_one(random_state);
134            }
135        }
136    }
137}
138
139/// Hashes one array into the `hashes_buffer`
140/// If `rehash==true` this combines the previous hash value in the buffer
141/// with the new hash using `combine_hashes`
142#[cfg(not(feature = "force_hash_collisions"))]
143fn hash_array<T>(
144    array: T,
145    random_state: &RandomState,
146    hashes_buffer: &mut [u64],
147    rehash: bool,
148) where
149    T: ArrayAccessor,
150    T::Item: HashValue,
151{
152    assert_eq!(
153        hashes_buffer.len(),
154        array.len(),
155        "hashes_buffer and array should be of equal length"
156    );
157
158    if array.null_count() == 0 {
159        if rehash {
160            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
161                let value = unsafe { array.value_unchecked(i) };
162                *hash = combine_hashes(value.hash_one(random_state), *hash);
163            }
164        } else {
165            for (i, hash) in hashes_buffer.iter_mut().enumerate() {
166                let value = unsafe { array.value_unchecked(i) };
167                *hash = value.hash_one(random_state);
168            }
169        }
170    } else if rehash {
171        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
172            if !array.is_null(i) {
173                let value = unsafe { array.value_unchecked(i) };
174                *hash = combine_hashes(value.hash_one(random_state), *hash);
175            }
176        }
177    } else {
178        for (i, hash) in hashes_buffer.iter_mut().enumerate() {
179            if !array.is_null(i) {
180                let value = unsafe { array.value_unchecked(i) };
181                *hash = value.hash_one(random_state);
182            }
183        }
184    }
185}
186
187/// Hash the values in a dictionary array
188#[cfg(not(feature = "force_hash_collisions"))]
189fn hash_dictionary<K: ArrowDictionaryKeyType>(
190    array: &DictionaryArray<K>,
191    random_state: &RandomState,
192    hashes_buffer: &mut [u64],
193    multi_col: bool,
194) -> Result<()> {
195    // Hash each dictionary value once, and then use that computed
196    // hash for each key value to avoid a potentially expensive
197    // redundant hashing for large dictionary elements (e.g. strings)
198    let values = Arc::clone(array.values());
199    let mut dict_hashes = vec![0; values.len()];
200    create_hashes(&[values], random_state, &mut dict_hashes)?;
201
202    // combine hash for each index in values
203    if multi_col {
204        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
205            if let Some(key) = key {
206                *hash = combine_hashes(dict_hashes[key.as_usize()], *hash)
207            } // no update for Null, consistent with other hashes
208        }
209    } else {
210        for (hash, key) in hashes_buffer.iter_mut().zip(array.keys().iter()) {
211            if let Some(key) = key {
212                *hash = dict_hashes[key.as_usize()]
213            } // no update for Null, consistent with other hashes
214        }
215    }
216    Ok(())
217}
218
219#[cfg(not(feature = "force_hash_collisions"))]
220fn hash_struct_array(
221    array: &StructArray,
222    random_state: &RandomState,
223    hashes_buffer: &mut [u64],
224) -> Result<()> {
225    let nulls = array.nulls();
226    let row_len = array.len();
227
228    let valid_row_indices: Vec<usize> = if let Some(nulls) = nulls {
229        nulls.valid_indices().collect()
230    } else {
231        (0..row_len).collect()
232    };
233
234    // Create hashes for each row that combines the hashes over all the column at that row.
235    let mut values_hashes = vec![0u64; row_len];
236    create_hashes(array.columns(), random_state, &mut values_hashes)?;
237
238    for i in valid_row_indices {
239        let hash = &mut hashes_buffer[i];
240        *hash = combine_hashes(*hash, values_hashes[i]);
241    }
242
243    Ok(())
244}
245
246// only adding this `cfg` b/c this function is only used with this `cfg`
247#[cfg(not(feature = "force_hash_collisions"))]
248fn hash_map_array(
249    array: &MapArray,
250    random_state: &RandomState,
251    hashes_buffer: &mut [u64],
252) -> Result<()> {
253    let nulls = array.nulls();
254    let offsets = array.offsets();
255
256    // Create hashes for each entry in each row
257    let mut values_hashes = vec![0u64; array.entries().len()];
258    create_hashes(array.entries().columns(), random_state, &mut values_hashes)?;
259
260    // Combine the hashes for entries on each row with each other and previous hash for that row
261    if let Some(nulls) = nulls {
262        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
263            if nulls.is_valid(i) {
264                let hash = &mut hashes_buffer[i];
265                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
266                    *hash = combine_hashes(*hash, *values_hash);
267                }
268            }
269        }
270    } else {
271        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
272            let hash = &mut hashes_buffer[i];
273            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
274                *hash = combine_hashes(*hash, *values_hash);
275            }
276        }
277    }
278
279    Ok(())
280}
281
282#[cfg(not(feature = "force_hash_collisions"))]
283fn hash_list_array<OffsetSize>(
284    array: &GenericListArray<OffsetSize>,
285    random_state: &RandomState,
286    hashes_buffer: &mut [u64],
287) -> Result<()>
288where
289    OffsetSize: OffsetSizeTrait,
290{
291    let values = Arc::clone(array.values());
292    let offsets = array.value_offsets();
293    let nulls = array.nulls();
294    let mut values_hashes = vec![0u64; values.len()];
295    create_hashes(&[values], random_state, &mut values_hashes)?;
296    if let Some(nulls) = nulls {
297        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
298            if nulls.is_valid(i) {
299                let hash = &mut hashes_buffer[i];
300                for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
301                    *hash = combine_hashes(*hash, *values_hash);
302                }
303            }
304        }
305    } else {
306        for (i, (start, stop)) in offsets.iter().zip(offsets.iter().skip(1)).enumerate() {
307            let hash = &mut hashes_buffer[i];
308            for values_hash in &values_hashes[start.as_usize()..stop.as_usize()] {
309                *hash = combine_hashes(*hash, *values_hash);
310            }
311        }
312    }
313    Ok(())
314}
315
316#[cfg(not(feature = "force_hash_collisions"))]
317fn hash_fixed_list_array(
318    array: &FixedSizeListArray,
319    random_state: &RandomState,
320    hashes_buffer: &mut [u64],
321) -> Result<()> {
322    let values = Arc::clone(array.values());
323    let value_length = array.value_length() as usize;
324    let nulls = array.nulls();
325    let mut values_hashes = vec![0u64; values.len()];
326    create_hashes(&[values], random_state, &mut values_hashes)?;
327    if let Some(nulls) = nulls {
328        for i in 0..array.len() {
329            if nulls.is_valid(i) {
330                let hash = &mut hashes_buffer[i];
331                for values_hash in
332                    &values_hashes[i * value_length..(i + 1) * value_length]
333                {
334                    *hash = combine_hashes(*hash, *values_hash);
335                }
336            }
337        }
338    } else {
339        for i in 0..array.len() {
340            let hash = &mut hashes_buffer[i];
341            for values_hash in &values_hashes[i * value_length..(i + 1) * value_length] {
342                *hash = combine_hashes(*hash, *values_hash);
343            }
344        }
345    }
346    Ok(())
347}
348
349/// Test version of `create_hashes` that produces the same value for
350/// all hashes (to test collisions)
351///
352/// See comments on `hashes_buffer` for more details
353#[cfg(feature = "force_hash_collisions")]
354pub fn create_hashes<'a>(
355    _arrays: &[ArrayRef],
356    _random_state: &RandomState,
357    hashes_buffer: &'a mut Vec<u64>,
358) -> Result<&'a mut Vec<u64>> {
359    for hash in hashes_buffer.iter_mut() {
360        *hash = 0
361    }
362    Ok(hashes_buffer)
363}
364
365/// Creates hash values for every row, based on the values in the
366/// columns.
367///
368/// The number of rows to hash is determined by `hashes_buffer.len()`.
369/// `hashes_buffer` should be pre-sized appropriately
370#[cfg(not(feature = "force_hash_collisions"))]
371pub fn create_hashes<'a>(
372    arrays: &[ArrayRef],
373    random_state: &RandomState,
374    hashes_buffer: &'a mut Vec<u64>,
375) -> Result<&'a mut Vec<u64>> {
376    for (i, col) in arrays.iter().enumerate() {
377        let array = col.as_ref();
378        // combine hashes with `combine_hashes` for all columns besides the first
379        let rehash = i >= 1;
380        downcast_primitive_array! {
381            array => hash_array_primitive(array, random_state, hashes_buffer, rehash),
382            DataType::Null => hash_null(random_state, hashes_buffer, rehash),
383            DataType::Boolean => hash_array(as_boolean_array(array)?, random_state, hashes_buffer, rehash),
384            DataType::Utf8 => hash_array(as_string_array(array)?, random_state, hashes_buffer, rehash),
385            DataType::Utf8View => hash_array(as_string_view_array(array)?, random_state, hashes_buffer, rehash),
386            DataType::LargeUtf8 => hash_array(as_largestring_array(array), random_state, hashes_buffer, rehash),
387            DataType::Binary => hash_array(as_generic_binary_array::<i32>(array)?, random_state, hashes_buffer, rehash),
388            DataType::BinaryView => hash_array(as_binary_view_array(array)?, random_state, hashes_buffer, rehash),
389            DataType::LargeBinary => hash_array(as_generic_binary_array::<i64>(array)?, random_state, hashes_buffer, rehash),
390            DataType::FixedSizeBinary(_) => {
391                let array: &FixedSizeBinaryArray = array.as_any().downcast_ref().unwrap();
392                hash_array(array, random_state, hashes_buffer, rehash)
393            }
394            DataType::Dictionary(_, _) => downcast_dictionary_array! {
395                array => hash_dictionary(array, random_state, hashes_buffer, rehash)?,
396                _ => unreachable!()
397            }
398            DataType::Struct(_) => {
399                let array = as_struct_array(array)?;
400                hash_struct_array(array, random_state, hashes_buffer)?;
401            }
402            DataType::List(_) => {
403                let array = as_list_array(array)?;
404                hash_list_array(array, random_state, hashes_buffer)?;
405            }
406            DataType::LargeList(_) => {
407                let array = as_large_list_array(array)?;
408                hash_list_array(array, random_state, hashes_buffer)?;
409            }
410            DataType::Map(_, _) => {
411                let array = as_map_array(array)?;
412                hash_map_array(array, random_state, hashes_buffer)?;
413            }
414            DataType::FixedSizeList(_,_) => {
415                let array = as_fixed_size_list_array(array)?;
416                hash_fixed_list_array(array, random_state, hashes_buffer)?;
417            }
418            _ => {
419                // This is internal because we should have caught this before.
420                return _internal_err!(
421                    "Unsupported data type in hasher: {}",
422                    col.data_type()
423                );
424            }
425        }
426    }
427    Ok(hashes_buffer)
428}
429
430#[cfg(test)]
431mod tests {
432    use std::sync::Arc;
433
434    use arrow::array::*;
435    #[cfg(not(feature = "force_hash_collisions"))]
436    use arrow::datatypes::*;
437
438    use super::*;
439
440    #[test]
441    fn create_hashes_for_decimal_array() -> Result<()> {
442        let array = vec![1, 2, 3, 4]
443            .into_iter()
444            .map(Some)
445            .collect::<Decimal128Array>()
446            .with_precision_and_scale(20, 3)
447            .unwrap();
448        let array_ref = Arc::new(array);
449        let random_state = RandomState::with_seeds(0, 0, 0, 0);
450        let hashes_buff = &mut vec![0; array_ref.len()];
451        let hashes = create_hashes(&[array_ref], &random_state, hashes_buff)?;
452        assert_eq!(hashes.len(), 4);
453        Ok(())
454    }
455
456    #[test]
457    fn create_hashes_for_empty_fixed_size_lit() -> Result<()> {
458        let empty_array = FixedSizeListBuilder::new(StringBuilder::new(), 1).finish();
459        let random_state = RandomState::with_seeds(0, 0, 0, 0);
460        let hashes_buff = &mut vec![0; 0];
461        let hashes = create_hashes(&[Arc::new(empty_array)], &random_state, hashes_buff)?;
462        assert_eq!(hashes, &Vec::<u64>::new());
463        Ok(())
464    }
465
466    #[test]
467    fn create_hashes_for_float_arrays() -> Result<()> {
468        let f32_arr = Arc::new(Float32Array::from(vec![0.12, 0.5, 1f32, 444.7]));
469        let f64_arr = Arc::new(Float64Array::from(vec![0.12, 0.5, 1f64, 444.7]));
470
471        let random_state = RandomState::with_seeds(0, 0, 0, 0);
472        let hashes_buff = &mut vec![0; f32_arr.len()];
473        let hashes = create_hashes(&[f32_arr], &random_state, hashes_buff)?;
474        assert_eq!(hashes.len(), 4,);
475
476        let hashes = create_hashes(&[f64_arr], &random_state, hashes_buff)?;
477        assert_eq!(hashes.len(), 4,);
478
479        Ok(())
480    }
481
482    macro_rules! create_hash_binary {
483        ($NAME:ident, $ARRAY:ty) => {
484            #[cfg(not(feature = "force_hash_collisions"))]
485            #[test]
486            fn $NAME() {
487                let binary = [
488                    Some(b"short".to_byte_slice()),
489                    None,
490                    Some(b"long but different 12 bytes string"),
491                    Some(b"short2"),
492                    Some(b"Longer than 12 bytes string"),
493                    Some(b"short"),
494                    Some(b"Longer than 12 bytes string"),
495                ];
496
497                let binary_array = Arc::new(binary.iter().cloned().collect::<$ARRAY>());
498                let ref_array = Arc::new(binary.iter().cloned().collect::<BinaryArray>());
499
500                let random_state = RandomState::with_seeds(0, 0, 0, 0);
501
502                let mut binary_hashes = vec![0; binary.len()];
503                create_hashes(&[binary_array], &random_state, &mut binary_hashes)
504                    .unwrap();
505
506                let mut ref_hashes = vec![0; binary.len()];
507                create_hashes(&[ref_array], &random_state, &mut ref_hashes).unwrap();
508
509                // Null values result in a zero hash,
510                for (val, hash) in binary.iter().zip(binary_hashes.iter()) {
511                    match val {
512                        Some(_) => assert_ne!(*hash, 0),
513                        None => assert_eq!(*hash, 0),
514                    }
515                }
516
517                // same logical values should hash to the same hash value
518                assert_eq!(binary_hashes, ref_hashes);
519
520                // Same values should map to same hash values
521                assert_eq!(binary[0], binary[5]);
522                assert_eq!(binary[4], binary[6]);
523
524                // different binary should map to different hash values
525                assert_ne!(binary[0], binary[2]);
526            }
527        };
528    }
529
530    create_hash_binary!(binary_array, BinaryArray);
531    create_hash_binary!(binary_view_array, BinaryViewArray);
532
533    #[test]
534    fn create_hashes_fixed_size_binary() -> Result<()> {
535        let input_arg = vec![vec![1, 2], vec![5, 6], vec![5, 6]];
536        let fixed_size_binary_array =
537            Arc::new(FixedSizeBinaryArray::try_from_iter(input_arg.into_iter()).unwrap());
538
539        let random_state = RandomState::with_seeds(0, 0, 0, 0);
540        let hashes_buff = &mut vec![0; fixed_size_binary_array.len()];
541        let hashes =
542            create_hashes(&[fixed_size_binary_array], &random_state, hashes_buff)?;
543        assert_eq!(hashes.len(), 3,);
544
545        Ok(())
546    }
547
548    macro_rules! create_hash_string {
549        ($NAME:ident, $ARRAY:ty) => {
550            #[cfg(not(feature = "force_hash_collisions"))]
551            #[test]
552            fn $NAME() {
553                let strings = [
554                    Some("short"),
555                    None,
556                    Some("long but different 12 bytes string"),
557                    Some("short2"),
558                    Some("Longer than 12 bytes string"),
559                    Some("short"),
560                    Some("Longer than 12 bytes string"),
561                ];
562
563                let string_array = Arc::new(strings.iter().cloned().collect::<$ARRAY>());
564                let dict_array = Arc::new(
565                    strings
566                        .iter()
567                        .cloned()
568                        .collect::<DictionaryArray<Int8Type>>(),
569                );
570
571                let random_state = RandomState::with_seeds(0, 0, 0, 0);
572
573                let mut string_hashes = vec![0; strings.len()];
574                create_hashes(&[string_array], &random_state, &mut string_hashes)
575                    .unwrap();
576
577                let mut dict_hashes = vec![0; strings.len()];
578                create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
579
580                // Null values result in a zero hash,
581                for (val, hash) in strings.iter().zip(string_hashes.iter()) {
582                    match val {
583                        Some(_) => assert_ne!(*hash, 0),
584                        None => assert_eq!(*hash, 0),
585                    }
586                }
587
588                // same logical values should hash to the same hash value
589                assert_eq!(string_hashes, dict_hashes);
590
591                // Same values should map to same hash values
592                assert_eq!(strings[0], strings[5]);
593                assert_eq!(strings[4], strings[6]);
594
595                // different strings should map to different hash values
596                assert_ne!(strings[0], strings[2]);
597            }
598        };
599    }
600
601    create_hash_string!(string_array, StringArray);
602    create_hash_string!(large_string_array, LargeStringArray);
603    create_hash_string!(string_view_array, StringArray);
604    create_hash_string!(dict_string_array, DictionaryArray<Int8Type>);
605
606    #[test]
607    // Tests actual values of hashes, which are different if forcing collisions
608    #[cfg(not(feature = "force_hash_collisions"))]
609    fn create_hashes_for_dict_arrays() {
610        let strings = [Some("foo"), None, Some("bar"), Some("foo"), None];
611
612        let string_array = Arc::new(strings.iter().cloned().collect::<StringArray>());
613        let dict_array = Arc::new(
614            strings
615                .iter()
616                .cloned()
617                .collect::<DictionaryArray<Int8Type>>(),
618        );
619
620        let random_state = RandomState::with_seeds(0, 0, 0, 0);
621
622        let mut string_hashes = vec![0; strings.len()];
623        create_hashes(&[string_array], &random_state, &mut string_hashes).unwrap();
624
625        let mut dict_hashes = vec![0; strings.len()];
626        create_hashes(&[dict_array], &random_state, &mut dict_hashes).unwrap();
627
628        // Null values result in a zero hash,
629        for (val, hash) in strings.iter().zip(string_hashes.iter()) {
630            match val {
631                Some(_) => assert_ne!(*hash, 0),
632                None => assert_eq!(*hash, 0),
633            }
634        }
635
636        // same logical values should hash to the same hash value
637        assert_eq!(string_hashes, dict_hashes);
638
639        // Same values should map to same hash values
640        assert_eq!(strings[1], strings[4]);
641        assert_eq!(dict_hashes[1], dict_hashes[4]);
642        assert_eq!(strings[0], strings[3]);
643        assert_eq!(dict_hashes[0], dict_hashes[3]);
644
645        // different strings should map to different hash values
646        assert_ne!(strings[0], strings[2]);
647        assert_ne!(dict_hashes[0], dict_hashes[2]);
648    }
649
650    #[test]
651    // Tests actual values of hashes, which are different if forcing collisions
652    #[cfg(not(feature = "force_hash_collisions"))]
653    fn create_hashes_for_list_arrays() {
654        let data = vec![
655            Some(vec![Some(0), Some(1), Some(2)]),
656            None,
657            Some(vec![Some(3), None, Some(5)]),
658            Some(vec![Some(3), None, Some(5)]),
659            None,
660            Some(vec![Some(0), Some(1), Some(2)]),
661            Some(vec![]),
662        ];
663        let list_array =
664            Arc::new(ListArray::from_iter_primitive::<Int32Type, _, _>(data)) as ArrayRef;
665        let random_state = RandomState::with_seeds(0, 0, 0, 0);
666        let mut hashes = vec![0; list_array.len()];
667        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
668        assert_eq!(hashes[0], hashes[5]);
669        assert_eq!(hashes[1], hashes[4]);
670        assert_eq!(hashes[2], hashes[3]);
671        assert_eq!(hashes[1], hashes[6]); // null vs empty list
672    }
673
674    #[test]
675    // Tests actual values of hashes, which are different if forcing collisions
676    #[cfg(not(feature = "force_hash_collisions"))]
677    fn create_hashes_for_fixed_size_list_arrays() {
678        let data = vec![
679            Some(vec![Some(0), Some(1), Some(2)]),
680            None,
681            Some(vec![Some(3), None, Some(5)]),
682            Some(vec![Some(3), None, Some(5)]),
683            None,
684            Some(vec![Some(0), Some(1), Some(2)]),
685        ];
686        let list_array =
687            Arc::new(FixedSizeListArray::from_iter_primitive::<Int32Type, _, _>(
688                data, 3,
689            )) as ArrayRef;
690        let random_state = RandomState::with_seeds(0, 0, 0, 0);
691        let mut hashes = vec![0; list_array.len()];
692        create_hashes(&[list_array], &random_state, &mut hashes).unwrap();
693        assert_eq!(hashes[0], hashes[5]);
694        assert_eq!(hashes[1], hashes[4]);
695        assert_eq!(hashes[2], hashes[3]);
696    }
697
698    #[test]
699    // Tests actual values of hashes, which are different if forcing collisions
700    #[cfg(not(feature = "force_hash_collisions"))]
701    fn create_hashes_for_struct_arrays() {
702        use arrow::buffer::Buffer;
703
704        let boolarr = Arc::new(BooleanArray::from(vec![
705            false, false, true, true, true, true,
706        ]));
707        let i32arr = Arc::new(Int32Array::from(vec![10, 10, 20, 20, 30, 31]));
708
709        let struct_array = StructArray::from((
710            vec![
711                (
712                    Arc::new(Field::new("bool", DataType::Boolean, false)),
713                    Arc::clone(&boolarr) as ArrayRef,
714                ),
715                (
716                    Arc::new(Field::new("i32", DataType::Int32, false)),
717                    Arc::clone(&i32arr) as ArrayRef,
718                ),
719                (
720                    Arc::new(Field::new("i32", DataType::Int32, false)),
721                    Arc::clone(&i32arr) as ArrayRef,
722                ),
723                (
724                    Arc::new(Field::new("bool", DataType::Boolean, false)),
725                    Arc::clone(&boolarr) as ArrayRef,
726                ),
727            ],
728            Buffer::from(&[0b001011]),
729        ));
730
731        assert!(struct_array.is_valid(0));
732        assert!(struct_array.is_valid(1));
733        assert!(struct_array.is_null(2));
734        assert!(struct_array.is_valid(3));
735        assert!(struct_array.is_null(4));
736        assert!(struct_array.is_null(5));
737
738        let array = Arc::new(struct_array) as ArrayRef;
739
740        let random_state = RandomState::with_seeds(0, 0, 0, 0);
741        let mut hashes = vec![0; array.len()];
742        create_hashes(&[array], &random_state, &mut hashes).unwrap();
743        assert_eq!(hashes[0], hashes[1]);
744        // same value but the third row ( hashes[2] ) is null
745        assert_ne!(hashes[2], hashes[3]);
746        // different values but both are null
747        assert_eq!(hashes[4], hashes[5]);
748    }
749
750    #[test]
751    // Tests actual values of hashes, which are different if forcing collisions
752    #[cfg(not(feature = "force_hash_collisions"))]
753    fn create_hashes_for_struct_arrays_more_column_than_row() {
754        let struct_array = StructArray::from(vec![
755            (
756                Arc::new(Field::new("bool", DataType::Boolean, false)),
757                Arc::new(BooleanArray::from(vec![false, false])) as ArrayRef,
758            ),
759            (
760                Arc::new(Field::new("i32-1", DataType::Int32, false)),
761                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
762            ),
763            (
764                Arc::new(Field::new("i32-2", DataType::Int32, false)),
765                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
766            ),
767            (
768                Arc::new(Field::new("i32-3", DataType::Int32, false)),
769                Arc::new(Int32Array::from(vec![10, 10])) as ArrayRef,
770            ),
771        ]);
772
773        assert!(struct_array.is_valid(0));
774        assert!(struct_array.is_valid(1));
775
776        let array = Arc::new(struct_array) as ArrayRef;
777        let random_state = RandomState::with_seeds(0, 0, 0, 0);
778        let mut hashes = vec![0; array.len()];
779        create_hashes(&[array], &random_state, &mut hashes).unwrap();
780        assert_eq!(hashes[0], hashes[1]);
781    }
782
783    #[test]
784    // Tests actual values of hashes, which are different if forcing collisions
785    #[cfg(not(feature = "force_hash_collisions"))]
786    fn create_hashes_for_map_arrays() {
787        let mut builder =
788            MapBuilder::new(None, StringBuilder::new(), Int32Builder::new());
789        // Row 0
790        builder.keys().append_value("key1");
791        builder.keys().append_value("key2");
792        builder.values().append_value(1);
793        builder.values().append_value(2);
794        builder.append(true).unwrap();
795        // Row 1
796        builder.keys().append_value("key1");
797        builder.keys().append_value("key2");
798        builder.values().append_value(1);
799        builder.values().append_value(2);
800        builder.append(true).unwrap();
801        // Row 2
802        builder.keys().append_value("key1");
803        builder.keys().append_value("key2");
804        builder.values().append_value(1);
805        builder.values().append_value(3);
806        builder.append(true).unwrap();
807        // Row 3
808        builder.keys().append_value("key1");
809        builder.keys().append_value("key3");
810        builder.values().append_value(1);
811        builder.values().append_value(2);
812        builder.append(true).unwrap();
813        // Row 4
814        builder.keys().append_value("key1");
815        builder.values().append_value(1);
816        builder.append(true).unwrap();
817        // Row 5
818        builder.keys().append_value("key1");
819        builder.values().append_null();
820        builder.append(true).unwrap();
821        // Row 6
822        builder.append(true).unwrap();
823        // Row 7
824        builder.keys().append_value("key1");
825        builder.values().append_value(1);
826        builder.append(false).unwrap();
827
828        let array = Arc::new(builder.finish()) as ArrayRef;
829
830        let random_state = RandomState::with_seeds(0, 0, 0, 0);
831        let mut hashes = vec![0; array.len()];
832        create_hashes(&[array], &random_state, &mut hashes).unwrap();
833        assert_eq!(hashes[0], hashes[1]); // same value
834        assert_ne!(hashes[0], hashes[2]); // different value
835        assert_ne!(hashes[0], hashes[3]); // different key
836        assert_ne!(hashes[0], hashes[4]); // missing an entry
837        assert_ne!(hashes[4], hashes[5]); // filled vs null value
838        assert_eq!(hashes[6], hashes[7]); // empty vs null map
839    }
840
841    #[test]
842    // Tests actual values of hashes, which are different if forcing collisions
843    #[cfg(not(feature = "force_hash_collisions"))]
844    fn create_multi_column_hash_for_dict_arrays() {
845        let strings1 = [Some("foo"), None, Some("bar")];
846        let strings2 = [Some("blarg"), Some("blah"), None];
847
848        let string_array = Arc::new(strings1.iter().cloned().collect::<StringArray>());
849        let dict_array = Arc::new(
850            strings2
851                .iter()
852                .cloned()
853                .collect::<DictionaryArray<Int32Type>>(),
854        );
855
856        let random_state = RandomState::with_seeds(0, 0, 0, 0);
857
858        let mut one_col_hashes = vec![0; strings1.len()];
859        create_hashes(
860            &[Arc::clone(&dict_array) as ArrayRef],
861            &random_state,
862            &mut one_col_hashes,
863        )
864        .unwrap();
865
866        let mut two_col_hashes = vec![0; strings1.len()];
867        create_hashes(
868            &[dict_array, string_array],
869            &random_state,
870            &mut two_col_hashes,
871        )
872        .unwrap();
873
874        assert_eq!(one_col_hashes.len(), 3);
875        assert_eq!(two_col_hashes.len(), 3);
876
877        assert_ne!(one_col_hashes, two_col_hashes);
878    }
879}