polars_compute/filter/
mod.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
//! Contains operators to filter arrays such as [`filter`].
mod boolean;
mod primitive;
mod scalar;

#[cfg(all(target_arch = "x86_64", feature = "simd"))]
mod avx512;

use arrow::array::growable::make_growable;
use arrow::array::{new_empty_array, Array, BinaryViewArray, BooleanArray, PrimitiveArray};
use arrow::bitmap::utils::SlicesIterator;
use arrow::bitmap::Bitmap;
use arrow::with_match_primitive_type_full;
pub use boolean::filter_boolean_kernel;

pub fn filter(array: &dyn Array, mask: &BooleanArray) -> Box<dyn Array> {
    assert_eq!(array.len(), mask.len());

    // Treat null mask values as false.
    if let Some(validities) = mask.validity() {
        let combined_mask = mask.values() & validities;
        filter_with_bitmap(array, &combined_mask)
    } else {
        filter_with_bitmap(array, mask.values())
    }
}

pub fn filter_with_bitmap(array: &dyn Array, mask: &Bitmap) -> Box<dyn Array> {
    // Many filters involve filtering values in a subsection of the array. When we trim the leading
    // and trailing filtered items, we can close in on those items and not have to perform and
    // thinking about those. The overhead for when there are no leading or trailing filtered values
    // is very minimal: only a clone of the mask and the array.
    //
    // This also allows dispatching to the fast paths way, way, way more often.
    let mut mask = mask.clone();
    let leading_zeros = mask.take_leading_zeros();
    mask.take_trailing_zeros();
    let array = array.sliced(leading_zeros, mask.len());

    let mask = &mask;
    let array = array.as_ref();

    // Fast-path: completely empty or completely full mask.
    let false_count = mask.unset_bits();
    if false_count == mask.len() {
        return new_empty_array(array.dtype().clone());
    }
    if false_count == 0 {
        return array.to_boxed();
    }

    use arrow::datatypes::PhysicalType::*;
    match array.dtype().to_physical_type() {
        Primitive(primitive) => with_match_primitive_type_full!(primitive, |$T| {
            let array: &PrimitiveArray<$T> = array.as_any().downcast_ref().unwrap();
            let (values, validity) = primitive::filter_values_and_validity::<$T>(array.values(), array.validity(), mask);
            Box::new(PrimitiveArray::from_vec(values).with_validity(validity))
        }),
        Boolean => {
            let array = array.as_any().downcast_ref::<BooleanArray>().unwrap();
            let (values, validity) =
                boolean::filter_bitmap_and_validity(array.values(), array.validity(), mask);
            BooleanArray::new(array.dtype().clone(), values, validity).boxed()
        },
        BinaryView => {
            let array = array.as_any().downcast_ref::<BinaryViewArray>().unwrap();
            let views = array.views();
            let validity = array.validity();
            let (views, validity) = primitive::filter_values_and_validity(views, validity, mask);
            unsafe {
                BinaryViewArray::new_unchecked_unknown_md(
                    array.dtype().clone(),
                    views.into(),
                    array.data_buffers().clone(),
                    validity,
                    Some(array.total_buffer_len()),
                )
            }
            .boxed()
        },
        // Should go via BinaryView
        Utf8View => {
            unreachable!()
        },
        _ => {
            let iter = SlicesIterator::new(mask);
            let mut mutable = make_growable(&[array], false, iter.slots());
            // SAFETY:
            // we are in bounds
            iter.for_each(|(start, len)| unsafe { mutable.extend(0, start, len) });
            mutable.as_box()
        },
    }
}