polars_arrow/array/
specification.rs

1use polars_error::{polars_bail, polars_err, to_compute_err, PolarsResult};
2
3use crate::array::DictionaryKey;
4use crate::offset::{Offset, Offsets, OffsetsBuffer};
5
6/// Helper trait to support `Offset` and `OffsetBuffer`
7pub trait OffsetsContainer<O> {
8    fn last(&self) -> usize;
9    fn as_slice(&self) -> &[O];
10}
11
12impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {
13    #[inline]
14    fn last(&self) -> usize {
15        self.last().to_usize()
16    }
17
18    #[inline]
19    fn as_slice(&self) -> &[O] {
20        self.buffer()
21    }
22}
23
24impl<O: Offset> OffsetsContainer<O> for Offsets<O> {
25    #[inline]
26    fn last(&self) -> usize {
27        self.last().to_usize()
28    }
29
30    #[inline]
31    fn as_slice(&self) -> &[O] {
32        self.as_slice()
33    }
34}
35
36pub(crate) fn try_check_offsets_bounds<O: Offset>(
37    offsets: &[O],
38    values_len: usize,
39) -> PolarsResult<()> {
40    if offsets.last().unwrap().to_usize() > values_len {
41        polars_bail!(ComputeError: "offsets must not exceed the values length")
42    } else {
43        Ok(())
44    }
45}
46
47/// # Error
48/// * any offset is larger or equal to `values_len`.
49/// * any slice of `values` between two consecutive pairs from `offsets` is invalid `utf8`, or
50pub fn try_check_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> PolarsResult<()> {
51    if offsets.len() == 1 {
52        return Ok(());
53    }
54    assert!(offsets.len() > 1);
55    let end = offsets.last().unwrap().to_usize();
56    let start = offsets.first().unwrap().to_usize();
57
58    try_check_offsets_bounds(offsets, values.len())?;
59    let values_range = &values[start..end];
60
61    if values_range.is_ascii() {
62        Ok(())
63    } else {
64        simdutf8::basic::from_utf8(values_range).map_err(to_compute_err)?;
65
66        // offsets can be == values.len()
67        // find first offset from the end that is smaller
68        // Example:
69        // values.len() = 10
70        // offsets = [0, 5, 10, 10]
71        let last = offsets
72            .iter()
73            .enumerate()
74            .skip(1)
75            .rev()
76            .find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i));
77
78        let last = if let Some(last) = last {
79            // following the example: last = 1 (offset = 5)
80            last
81        } else {
82            // given `l = values.len()`, this branch is hit iff either:
83            // * `offsets = [0, l, l, ...]`, which was covered by `from_utf8(values)` above
84            // * `offsets = [0]`, which never happens because offsets.as_slice().len() == 1 is short-circuited above
85            return Ok(());
86        };
87
88        // truncate to relevant offsets. Note: `=last` because last was computed skipping the first item
89        // following the example: starts = [0, 5]
90        let starts = unsafe { offsets.get_unchecked(..=last) };
91
92        let mut any_invalid = false;
93        for start in starts {
94            let start = start.to_usize();
95
96            // SAFETY: `try_check_offsets_bounds` just checked for bounds
97            let b = *unsafe { values.get_unchecked(start) };
98
99            // A valid code-point iff it does not start with 0b10xxxxxx
100            // Bit-magic taken from `std::str::is_char_boundary`
101            any_invalid |= (b as i8) < -0x40;
102        }
103        if any_invalid {
104            polars_bail!(ComputeError: "non-valid char boundary detected")
105        }
106        Ok(())
107    }
108}
109
110/// Check dictionary indexes without checking usize conversion.
111/// # Safety
112/// The caller must ensure that `K::as_usize` always succeeds.
113pub(crate) unsafe fn check_indexes_unchecked<K: DictionaryKey>(
114    keys: &[K],
115    len: usize,
116) -> PolarsResult<()> {
117    let mut invalid = false;
118
119    // this loop is auto-vectorized
120    keys.iter().for_each(|k| invalid |= k.as_usize() > len);
121
122    if invalid {
123        let key = keys.iter().map(|k| k.as_usize()).max().unwrap();
124        polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")
125    } else {
126        Ok(())
127    }
128}
129
130pub fn check_indexes<K>(keys: &[K], len: usize) -> PolarsResult<()>
131where
132    K: std::fmt::Debug + Copy + TryInto<usize>,
133{
134    keys.iter().try_for_each(|key| {
135        let key: usize = (*key)
136            .try_into()
137            .map_err(|_| polars_err!(ComputeError: "The dictionary key must fit in a `usize`, but {key:?} does not")
138            )?;
139        if key >= len {
140            polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")
141        } else {
142            Ok(())
143        }
144    })
145}
146
147#[cfg(test)]
148mod tests {
149    use proptest::prelude::*;
150
151    use super::*;
152
153    pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
154        prop::collection::vec(any::<u8>(), 1..100)
155    }
156
157    proptest! {
158        // a bit expensive, feel free to run it when changing the code above
159        // #![proptest_config(ProptestConfig::with_cases(100000))]
160        #[test]
161        #[cfg_attr(miri, ignore)] // miri and proptest do not work well
162        fn check_utf8_validation(values in binary_strategy()) {
163
164            for offset in 0..values.len() - 1 {
165                let offsets: OffsetsBuffer<i32> = vec![0, offset as i32, values.len() as i32].try_into().unwrap();
166
167                let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();
168                is_valid &= std::str::from_utf8(&values[offset..]).is_ok();
169
170                assert_eq!(try_check_utf8::<i32>(&offsets, &values).is_ok(), is_valid)
171            }
172        }
173    }
174}