// polars_core/chunked_array/ops/chunkops.rs

use std::cell::Cell;

use arrow::bitmap::{Bitmap, MutableBitmap};
use arrow::legacy::kernels::concatenate::concatenate_owned_unchecked;
use polars_error::constants::LENGTH_LIMIT_MSG;

use super::*;
use crate::chunked_array::metadata::MetadataProperties;
#[cfg(feature = "object")]
use crate::chunked_array::object::builder::ObjectChunkedBuilder;
use crate::utils::slice_offsets;

/// Split `chunks` into a left and a right list of chunks at a logical `offset`.
///
/// `offset` is resolved against `own_length` through `slice_offsets`. Chunks that
/// end at or before the split point are cloned into the left list, the chunk that
/// contains the split point is divided with `split_at_boxed`, and every chunk
/// after it is cloned into the right list.
///
/// A side that would otherwise be empty receives a zero-length slice of
/// `chunks[0]`, so both returned lists always hold at least one array. Because of
/// that indexing, `chunks` must not be empty.
pub(crate) fn split_at(
    chunks: &[ArrayRef],
    offset: i64,
    own_length: usize,
) -> (Vec<ArrayRef>, Vec<ArrayRef>) {
    let (mut to_skip, _) = slice_offsets(offset, 0, own_length);
    let mut left: Vec<ArrayRef> = Vec::with_capacity(1);
    let mut right: Vec<ArrayRef> = Vec::with_capacity(1);

    let mut rest = chunks.iter();
    for arr in rest.by_ref() {
        let n = arr.len();
        if to_skip == 0 || to_skip < n {
            // The split point falls inside (or at the very start of) this chunk.
            let (head, tail) = arr.split_at_boxed(to_skip);
            left.push(head);
            right.push(tail);
            break;
        }

        // The whole chunk lies before the split point.
        to_skip -= n;
        left.push(arr.clone());
    }

    // Whatever was not visited above belongs to the right-hand side.
    right.extend(rest.cloned());

    if left.is_empty() {
        left.push(chunks[0].sliced(0, 0));
    }
    if right.is_empty() {
        right.push(chunks[0].sliced(0, 0));
    }
    (left, right)
}

/// Slice `chunks` by a logical `offset` and `slice_length`.
///
/// The requested window is resolved against `own_length` through `slice_offsets`.
/// Chunks that end at or before the window start are skipped; the remaining
/// chunks are sliced until the requested number of values has been collected.
///
/// Returns the sliced chunks together with the number of values they hold. If no
/// chunk contributed, a zero-length slice of `chunks[0]` is returned so the list
/// is never empty; `chunks` must therefore not be empty.
pub(crate) fn slice(
    chunks: &[ArrayRef],
    offset: i64,
    slice_length: usize,
    own_length: usize,
) -> (Vec<ArrayRef>, usize) {
    let (mut skip, mut wanted) = slice_offsets(offset, slice_length, own_length);
    let mut out = Vec::with_capacity(1);
    let mut total = 0;

    for arr in chunks {
        let n = arr.len();
        if skip != 0 && skip >= n {
            // This chunk lies entirely before the window.
            skip -= n;
            continue;
        }

        // Take what is still wanted, capped by what this chunk has left after `skip`.
        let take = std::cmp::min(wanted, n - skip);
        total += take;

        debug_assert!(skip + take <= arr.len());
        // SAFETY:
        // `skip < n` (or `skip == 0`) and `take <= n - skip`, so the slice is in bounds.
        let piece = unsafe { arr.sliced_unchecked(skip, take) };
        out.push(piece);

        wanted -= take;
        // Only the first contributing chunk is entered at a non-zero offset.
        skip = 0;
        if wanted == 0 {
            break;
        }
    }

    if out.is_empty() {
        out.push(chunks[0].sliced(0, 0));
    }
    (out, total)
}

// Taking the underlying values array of an array or list column as a Series can
// easily exceed the length limit. That call stack is hard to follow, so for this
// one case we make an exception and gate the check behind a thread local.
thread_local!(static CHECK_LENGTH: Cell<bool> = const { Cell::new(true) });

/// Enable or disable the length-limit check for the current thread.
///
/// Meant for internal use. In very rare conditions the check can be turned off.
///
/// # Safety
/// The caller must ensure that a Series exceeding the length limit gets
/// deconstructed into array values or list values first and is never used.
pub unsafe fn _set_check_length(check: bool) {
    CHECK_LENGTH.with(|flag| flag.set(check))
}

impl<T: PolarsDataType> ChunkedArray<T> {
    /// Get the length of the ChunkedArray
    #[inline]
    pub fn len(&self) -> usize {
        self.length
    }

    /// Return the number of null values in the ChunkedArray.
    #[inline]
    pub fn null_count(&self) -> usize {
        self.null_count
    }

    /// Set the null count directly.
    ///
    /// This can be useful after mutably adjusting the validity of the
    /// underlying arrays.
    ///
    /// # Safety
    /// The new null count must match the total null count of the underlying
    /// arrays.
    pub unsafe fn set_null_count(&mut self, null_count: usize) {
        self.null_count = null_count;
    }

    /// Check if ChunkedArray is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Recompute the cached length and null count from the chunks.
    ///
    /// # Panics
    /// Panics with `LENGTH_LIMIT_MSG` when the total length is `>= IdxSize::MAX`
    /// and the thread-local length check is enabled.
    pub(crate) fn compute_len(&mut self) {
        fn inner(chunks: &[ArrayRef]) -> usize {
            match chunks.len() {
                // fast path
                1 => chunks[0].len(),
                _ => chunks.iter().fold(0, |acc, arr| acc + arr.len()),
            }
        }
        let len = inner(&self.chunks);
        // Length limit is `IdxSize::MAX - 1`. We use `IdxSize::MAX` to indicate `NULL` in indexing.
        if len >= (IdxSize::MAX as usize) && CHECK_LENGTH.get() {
            panic!("{}", LENGTH_LIMIT_MSG);
        }
        self.length = len;
        self.null_count = self
            .chunks
            .iter()
            .map(|arr| arr.null_count())
            .sum::<usize>();
    }

    /// Concatenate all chunks into a single chunk.
    ///
    /// An array that already has exactly one chunk is returned as a clone. The
    /// sorted, fast-explode, min/max and distinct-count metadata properties are
    /// carried over to the result.
    ///
    /// # Panics
    /// Panics with "implementation error" for the `Object` dtype (only matched
    /// when the `object` feature is enabled), and if the concatenation fails.
    pub fn rechunk(&self) -> Self {
        match self.dtype() {
            #[cfg(feature = "object")]
            DataType::Object(_, _) => {
                panic!("implementation error")
            },
            _ => {
                fn inner_rechunk(chunks: &[ArrayRef]) -> Vec<ArrayRef> {
                    vec![concatenate_owned_unchecked(chunks).unwrap()]
                }

                if self.chunks.len() == 1 {
                    self.clone()
                } else {
                    let chunks = inner_rechunk(&self.chunks);

                    let mut ca = unsafe { self.copy_with_chunks(chunks) };

                    use MetadataProperties as P;
                    ca.copy_metadata(
                        self,
                        P::SORTED
                            | P::FAST_EXPLODE_LIST
                            | P::MIN_VALUE
                            | P::MAX_VALUE
                            | P::DISTINCT_COUNT,
                    );

                    ca
                }
            },
        }
    }

    /// Build one validity bitmap that spans all chunks.
    ///
    /// Returns the (cloned) validity of the only chunk when there is exactly one
    /// chunk, and `None` when the array has no nulls or is empty. Otherwise the
    /// per-chunk bitmaps are appended in order; a chunk without a bitmap
    /// contributes `true` (valid) for each of its values.
    pub fn rechunk_validity(&self) -> Option<Bitmap> {
        if self.chunks.len() == 1 {
            return self.chunks[0].validity().cloned();
        }

        if !self.has_nulls() || self.is_empty() {
            return None;
        }

        let mut bm = MutableBitmap::with_capacity(self.len());
        for arr in self.downcast_iter() {
            if let Some(v) = arr.validity() {
                bm.extend_from_bitmap(v);
            } else {
                bm.extend_constant(arr.len(), true);
            }
        }
        Some(bm.into())
    }

    /// Split the array. The chunks are reallocated the underlying data slices are zero copy.
    ///
    /// When offset is negative it will be counted from the end of the array.
    /// This method will never error,
    /// and will slice the best match when offset, or length is out of bounds
    ///
    /// Both halves keep the sorted and fast-explode metadata. When the array is
    /// flagged as sorted, the left half additionally keeps the bound that sits at
    /// the first position (min when ascending, max when descending) provided that
    /// position is not null, and the right half keeps the bound that sits at the
    /// last position provided that position is not null.
    pub fn split_at(&self, offset: i64) -> (Self, Self) {
        // A normal slice, slice the buffers and thus keep the whole memory allocated.
        let (l, r) = split_at(&self.chunks, offset, self.len());
        let mut out_l = unsafe { self.copy_with_chunks(l) };
        let mut out_r = unsafe { self.copy_with_chunks(r) };

        use MetadataProperties as P;
        let mut properties_l = P::SORTED | P::FAST_EXPLODE_LIST;
        let mut properties_r = P::SORTED | P::FAST_EXPLODE_LIST;

        let is_ascending = self.is_sorted_ascending_flag();
        let is_descending = self.is_sorted_descending_flag();

        // NOTE(review): one half can be empty (offset at 0 or at/after the end);
        // the bound is still copied onto it in that case — confirm this is intended.
        if is_ascending || is_descending {
            let has_nulls = self.null_count() != 0;

            // The left half starts at the first value of `self`.
            let has_nulls_at_start = has_nulls && first_value_is_null(&self.chunks);
            if !has_nulls_at_start {
                properties_l.set(P::MIN_VALUE, is_ascending);
                properties_l.set(P::MAX_VALUE, is_descending);
            }

            // The right half ends at the last value of `self`.
            let has_nulls_at_end = has_nulls && last_value_is_null(&self.chunks);
            if !has_nulls_at_end {
                properties_r.set(P::MIN_VALUE, is_descending);
                properties_r.set(P::MAX_VALUE, is_ascending);
            }
        }
        out_l.copy_metadata(self, properties_l);
        out_r.copy_metadata(self, properties_r);

        (out_l, out_r)
    }

    /// Slice the array. The chunks are reallocated the underlying data slices are zero copy.
    ///
    /// When offset is negative it will be counted from the end of the array.
    /// This method will never error,
    /// and will slice the best match when offset, or length is out of bounds
    ///
    /// A `length` of 0 returns `self.clear()` for every dtype except `Object`
    /// (which goes through the regular slicing path). When the array is flagged as
    /// sorted, the min/max metadata is kept only for a bound that is still part of
    /// the slice: the slice must touch the matching end of the array and the value
    /// at that end must not be null.
    pub fn slice(&self, offset: i64, length: usize) -> Self {
        // The len: 0 special cases ensure we release memory.
        // A normal slice, slice the buffers and thus keep the whole memory allocated.
        let exec = || {
            let (chunks, len) = slice(&self.chunks, offset, length, self.len());
            let mut out = unsafe { self.copy_with_chunks(chunks) };

            use MetadataProperties as P;
            let mut properties = P::SORTED | P::FAST_EXPLODE_LIST;

            let is_ascending = self.is_sorted_ascending_flag();
            let is_descending = self.is_sorted_descending_flag();

            if length != 0 && (is_ascending || is_descending) {
                let (raw_offset, slice_len) = slice_offsets(offset, length, self.len());
                let has_nulls = self.null_count() != 0;

                let mut can_copy_min_value = false;
                let mut can_copy_max_value = false;

                let is_at_start = raw_offset == 0;
                if is_at_start {
                    // The slice contains the first value: min if ascending, max if descending.
                    let has_nulls_at_start = has_nulls && first_value_is_null(&self.chunks);

                    can_copy_min_value |= !has_nulls_at_start && is_ascending;
                    can_copy_max_value |= !has_nulls_at_start && is_descending;
                }

                let is_until_end = raw_offset + slice_len == self.len();
                if is_until_end {
                    // The slice contains the last value: max if ascending, min if descending.
                    let has_nulls_at_end = has_nulls && last_value_is_null(&self.chunks);

                    can_copy_min_value |= !has_nulls_at_end && is_descending;
                    can_copy_max_value |= !has_nulls_at_end && is_ascending;
                }

                properties.set(P::MIN_VALUE, can_copy_min_value);
                properties.set(P::MAX_VALUE, can_copy_max_value);
            }

            out.copy_metadata(self, properties);
            out.length = len;

            out
        };

        match length {
            0 => match self.dtype() {
                #[cfg(feature = "object")]
                DataType::Object(_, _) => exec(),
                _ => self.clear(),
            },
            _ => exec(),
        }
    }

    /// Take a view of top n elements
    #[must_use]
    pub fn limit(&self, num_elements: usize) -> Self
    where
        Self: Sized,
    {
        self.slice(0, num_elements)
    }

    /// Get the head of the [`ChunkedArray`]
    ///
    /// Returns the first `length` values (10 when `length` is `None`), capped at
    /// the length of the array.
    #[must_use]
    pub fn head(&self, length: Option<usize>) -> Self
    where
        Self: Sized,
    {
        match length {
            Some(len) => self.slice(0, std::cmp::min(len, self.len())),
            None => self.slice(0, std::cmp::min(10, self.len())),
        }
    }

    /// Get the tail of the [`ChunkedArray`]
    ///
    /// Returns the last `length` values (10 when `length` is `None`), capped at
    /// the length of the array.
    #[must_use]
    pub fn tail(&self, length: Option<usize>) -> Self
    where
        Self: Sized,
    {
        let len = match length {
            Some(len) => std::cmp::min(len, self.len()),
            None => std::cmp::min(10, self.len()),
        };
        // A negative offset addresses the array from its end.
        self.slice(-(len as i64), len)
    }

    /// Remove empty chunks.
    ///
    /// The first chunk is always kept, even when it is empty, so the array never
    /// ends up without chunks.
    pub fn prune_empty_chunks(&mut self) {
        let mut count = 0u32;
        // SAFETY: only zero-length chunks are dropped, so the total length and
        // null count stay as they were (presumably what `chunks_mut` requires).
        unsafe {
            self.chunks_mut().retain(|arr| {
                count += 1;
                // Always keep at least one chunk
                if count == 1 {
                    true
                } else {
                    // Remove the empty chunks
                    arr.len() > 0
                }
            })
        }
    }
}

/// Whether the first value across `chunks` is null.
///
/// Looks at the first non-empty chunk. A set validity bit means the value is
/// valid, and a chunk without a validity bitmap has no nulls, so this is `true`
/// only when a bitmap exists and its first bit is unset.
fn first_value_is_null(chunks: &[ArrayRef]) -> bool {
    chunks
        .iter()
        .find(|arr| arr.len() > 0)
        .and_then(|arr| arr.validity())
        .is_some_and(|bm| bm.get(0) == Some(false))
}

/// Whether the last value across `chunks` is null.
///
/// Looks at the last non-empty chunk. A set validity bit means the value is
/// valid, and a chunk without a validity bitmap has no nulls, so this is `true`
/// only when a bitmap exists and its last bit is unset.
fn last_value_is_null(chunks: &[ArrayRef]) -> bool {
    chunks
        .iter()
        .rev()
        .find(|arr| arr.len() > 0)
        .and_then(|arr| arr.validity())
        .is_some_and(|bm| bm.len().checked_sub(1).and_then(|i| bm.get(i)) == Some(false))
}

#[cfg(feature = "object")]
impl<T: PolarsObject> ObjectChunked<T> {
    /// Rebuild the array as a single chunk by re-appending every value.
    ///
    /// An array that already has exactly one chunk is returned as a clone.
    /// Otherwise each value is cloned into a fresh builder, with nulls appended
    /// for invalid slots.
    pub(crate) fn rechunk_object(&self) -> Self {
        if self.chunks.len() == 1 {
            return self.clone();
        }

        let mut builder = ObjectChunkedBuilder::new(self.name().clone(), self.len());

        // todo! use iterators once implemented
        // Without nulls the per-value validity lookup is skipped entirely.
        let all_valid = !self.has_nulls();
        for arr in self.downcast_iter() {
            for i in 0..arr.len() {
                if all_valid || arr.is_valid(i) {
                    builder.append_value(arr.value(i).clone())
                } else {
                    builder.append_null()
                }
            }
        }
        builder.finish()
    }
}

#[cfg(test)]
mod test {
    #[cfg(feature = "dtype-categorical")]
    use crate::prelude::*;

    /// The reverse mapping of a categorical Series must still be populated after
    /// the Series has been appended to and rechunked.
    #[test]
    #[cfg(feature = "dtype-categorical")]
    fn test_categorical_map_after_rechunk() {
        let strings = Series::new(PlSmallStr::EMPTY, &["foo", "bar", "spam"]);
        let categorical_dtype = DataType::Categorical(None, Default::default());
        let mut cat = strings.cast(&categorical_dtype).unwrap();

        // Append a slice of the Series to itself before rechunking.
        let head = cat.slice(0, 2);
        cat.append(&head).unwrap();

        let rechunked = cat.rechunk();
        assert!(rechunked.categorical().unwrap().get_rev_map().len() > 0);
    }
}