polars_expr/groups/
mod.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
use std::any::Any;
use std::path::Path;

use polars_core::prelude::*;
use polars_utils::aliases::PlRandomState;
use polars_utils::cardinality_sketch::CardinalitySketch;
use polars_utils::hashing::HashPartitioner;
use polars_utils::IdxSize;

mod row_encoded;

/// A Grouper maps keys to groups, such that duplicate keys map to the same group.
///
/// Group indices are of type `IdxSize`. Implementors must also provide
/// [`Grouper::as_any`], which exposes the Grouper as [`Any`].
pub trait Grouper: Any + Send + Sync {
    /// Creates a new empty Grouper similar to this one.
    fn new_empty(&self) -> Box<dyn Grouper>;

    /// Reserves space for the given number of additional groups.
    fn reserve(&mut self, additional: usize);

    /// Returns the number of groups in this Grouper.
    fn num_groups(&self) -> IdxSize;

    /// Inserts the given keys into this Grouper, mutating `group_idxs` such
    /// that `group_idxs[i]` is the group index of `keys[..][i]`.
    fn insert_keys(&mut self, keys: &DataFrame, group_idxs: &mut Vec<IdxSize>);

    /// Adds the given Grouper into this one, mutating `group_idxs` such that
    /// the ith group of `other` now has group index `group_idxs[i]` in `self`.
    fn combine(&mut self, other: &dyn Grouper, group_idxs: &mut Vec<IdxSize>);

    /// Adds the given Grouper into this one, mutating `group_idxs` such that
    /// the group `subset[i]` of `other` now has group index `group_idxs[i]` in
    /// `self`.
    ///
    /// # Safety
    /// For all i, `subset[i]` must be a valid group index of `other`, that is
    /// `subset[i] < other.num_groups()`.
    unsafe fn gather_combine(
        &mut self,
        other: &dyn Grouper,
        subset: &[IdxSize],
        group_idxs: &mut Vec<IdxSize>,
    );

    /// Generate partition indices.
    ///
    /// After this function `partition_idxs[i]` will contain the indices for
    /// partition i, and `sketches[i]` will contain a cardinality sketch for
    /// partition i.
    fn gen_partition_idxs(
        &self,
        partitioner: &HashPartitioner,
        partition_idxs: &mut [Vec<IdxSize>],
        sketches: &mut [CardinalitySketch],
    );

    /// Returns the keys in this Grouper in group order, that is the key for
    /// group i is returned in row i.
    fn get_keys_in_group_order(&self) -> DataFrame;

    /// Stores this Grouper at the given path.
    ///
    /// NOTE(review): "ooc" presumably stands for out-of-core; confirm.
    ///
    /// # Panics
    /// The default implementation always panics via `unimplemented!()`;
    /// implementors that support storing must override it.
    fn store_ooc(&self, _path: &Path) {
        unimplemented!();
    }

    /// Loads this Grouper from the given path.
    ///
    /// # Panics
    /// The default implementation always panics via `unimplemented!()`;
    /// implementors that support loading must override it.
    fn load_ooc(&mut self, _path: &Path) {
        unimplemented!();
    }

    /// Returns a reference to this Grouper as [`Any`], which allows callers
    /// to attempt a downcast to a concrete Grouper type.
    fn as_any(&self) -> &dyn Any;
}

/// Creates a new boxed [`Grouper`] backed by `row_encoded::RowEncodedHashGrouper`.
///
/// `key_schema` and `random_state` are forwarded unchanged to
/// `RowEncodedHashGrouper::new`; the result is returned as a trait object.
pub fn new_hash_grouper(key_schema: Arc<Schema>, random_state: PlRandomState) -> Box<dyn Grouper> {
    let grouper = row_encoded::RowEncodedHashGrouper::new(key_schema, random_state);
    Box::new(grouper)
}