polars_arrow/array/
specification.rs1use polars_error::{polars_bail, polars_err, to_compute_err, PolarsResult};
2
3use crate::array::DictionaryKey;
4use crate::offset::{Offset, Offsets, OffsetsBuffer};
5
6pub trait OffsetsContainer<O> {
8 fn last(&self) -> usize;
9 fn as_slice(&self) -> &[O];
10}
11
12impl<O: Offset> OffsetsContainer<O> for OffsetsBuffer<O> {
13 #[inline]
14 fn last(&self) -> usize {
15 self.last().to_usize()
16 }
17
18 #[inline]
19 fn as_slice(&self) -> &[O] {
20 self.buffer()
21 }
22}
23
24impl<O: Offset> OffsetsContainer<O> for Offsets<O> {
25 #[inline]
26 fn last(&self) -> usize {
27 self.last().to_usize()
28 }
29
30 #[inline]
31 fn as_slice(&self) -> &[O] {
32 self.as_slice()
33 }
34}
35
36pub(crate) fn try_check_offsets_bounds<O: Offset>(
37 offsets: &[O],
38 values_len: usize,
39) -> PolarsResult<()> {
40 if offsets.last().unwrap().to_usize() > values_len {
41 polars_bail!(ComputeError: "offsets must not exceed the values length")
42 } else {
43 Ok(())
44 }
45}
46
47pub fn try_check_utf8<O: Offset>(offsets: &[O], values: &[u8]) -> PolarsResult<()> {
51 if offsets.len() == 1 {
52 return Ok(());
53 }
54 assert!(offsets.len() > 1);
55 let end = offsets.last().unwrap().to_usize();
56 let start = offsets.first().unwrap().to_usize();
57
58 try_check_offsets_bounds(offsets, values.len())?;
59 let values_range = &values[start..end];
60
61 if values_range.is_ascii() {
62 Ok(())
63 } else {
64 simdutf8::basic::from_utf8(values_range).map_err(to_compute_err)?;
65
66 let last = offsets
72 .iter()
73 .enumerate()
74 .skip(1)
75 .rev()
76 .find_map(|(i, offset)| (offset.to_usize() < values.len()).then(|| i));
77
78 let last = if let Some(last) = last {
79 last
81 } else {
82 return Ok(());
86 };
87
88 let starts = unsafe { offsets.get_unchecked(..=last) };
91
92 let mut any_invalid = false;
93 for start in starts {
94 let start = start.to_usize();
95
96 let b = *unsafe { values.get_unchecked(start) };
98
99 any_invalid |= (b as i8) < -0x40;
102 }
103 if any_invalid {
104 polars_bail!(ComputeError: "non-valid char boundary detected")
105 }
106 Ok(())
107 }
108}
109
110pub(crate) unsafe fn check_indexes_unchecked<K: DictionaryKey>(
114 keys: &[K],
115 len: usize,
116) -> PolarsResult<()> {
117 let mut invalid = false;
118
119 keys.iter().for_each(|k| invalid |= k.as_usize() > len);
121
122 if invalid {
123 let key = keys.iter().map(|k| k.as_usize()).max().unwrap();
124 polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")
125 } else {
126 Ok(())
127 }
128}
129
130pub fn check_indexes<K>(keys: &[K], len: usize) -> PolarsResult<()>
131where
132 K: std::fmt::Debug + Copy + TryInto<usize>,
133{
134 keys.iter().try_for_each(|key| {
135 let key: usize = (*key)
136 .try_into()
137 .map_err(|_| polars_err!(ComputeError: "The dictionary key must fit in a `usize`, but {key:?} does not")
138 )?;
139 if key >= len {
140 polars_bail!(ComputeError: "one of the dictionary keys is {key} but it must be < than the length of the dictionary values, which is {len}")
141 } else {
142 Ok(())
143 }
144 })
145}
146
147#[cfg(test)]
148mod tests {
149 use proptest::prelude::*;
150
151 use super::*;
152
153 pub(crate) fn binary_strategy() -> impl Strategy<Value = Vec<u8>> {
154 prop::collection::vec(any::<u8>(), 1..100)
155 }
156
157 proptest! {
158 #[test]
161 #[cfg_attr(miri, ignore)] fn check_utf8_validation(values in binary_strategy()) {
163
164 for offset in 0..values.len() - 1 {
165 let offsets: OffsetsBuffer<i32> = vec![0, offset as i32, values.len() as i32].try_into().unwrap();
166
167 let mut is_valid = std::str::from_utf8(&values[..offset]).is_ok();
168 is_valid &= std::str::from_utf8(&values[offset..]).is_ok();
169
170 assert_eq!(try_check_utf8::<i32>(&offsets, &values).is_ok(), is_valid)
171 }
172 }
173 }
174}