datafusion_functions/
strings.rs1use std::mem::size_of;
19
20use arrow::array::{
21 make_view, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ByteView,
22 GenericStringArray, LargeStringArray, NullBufferBuilder, OffsetSizeTrait,
23 StringArray, StringViewArray, StringViewBuilder,
24};
25use arrow::buffer::{MutableBuffer, NullBuffer};
26use arrow::datatypes::DataType;
27
28#[deprecated(since = "45.0.0", note = "Use arrow::array::StringArrayType instead")]
30pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
31 fn iter(&self) -> ArrayIter<Self>;
35
36 fn is_ascii(&self) -> bool;
38}
39
40#[allow(deprecated)]
41impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
42 fn iter(&self) -> ArrayIter<Self> {
43 GenericStringArray::<T>::iter(self)
44 }
45
46 fn is_ascii(&self) -> bool {
47 GenericStringArray::<T>::is_ascii(self)
48 }
49}
50
51#[allow(deprecated)]
52impl<'a> StringArrayType<'a> for &'a StringViewArray {
53 fn iter(&self) -> ArrayIter<Self> {
54 StringViewArray::iter(self)
55 }
56
57 fn is_ascii(&self) -> bool {
58 StringViewArray::is_ascii(self)
59 }
60}
61
62pub struct StringArrayBuilder {
66 offsets_buffer: MutableBuffer,
67 value_buffer: MutableBuffer,
68}
69
70impl StringArrayBuilder {
71 pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
72 let capacity = item_capacity
73 .checked_add(1)
74 .map(|i| i.saturating_mul(size_of::<i32>()))
75 .expect("capacity integer overflow");
76
77 let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
78 unsafe { offsets_buffer.push_unchecked(0_i32) };
80 Self {
81 offsets_buffer,
82 value_buffer: MutableBuffer::with_capacity(data_capacity),
83 }
84 }
85
86 pub fn write<const CHECK_VALID: bool>(
87 &mut self,
88 column: &ColumnarValueRef,
89 i: usize,
90 ) {
91 match column {
92 ColumnarValueRef::Scalar(s) => {
93 self.value_buffer.extend_from_slice(s);
94 }
95 ColumnarValueRef::NullableArray(array) => {
96 if !CHECK_VALID || array.is_valid(i) {
97 self.value_buffer
98 .extend_from_slice(array.value(i).as_bytes());
99 }
100 }
101 ColumnarValueRef::NullableLargeStringArray(array) => {
102 if !CHECK_VALID || array.is_valid(i) {
103 self.value_buffer
104 .extend_from_slice(array.value(i).as_bytes());
105 }
106 }
107 ColumnarValueRef::NullableStringViewArray(array) => {
108 if !CHECK_VALID || array.is_valid(i) {
109 self.value_buffer
110 .extend_from_slice(array.value(i).as_bytes());
111 }
112 }
113 ColumnarValueRef::NonNullableArray(array) => {
114 self.value_buffer
115 .extend_from_slice(array.value(i).as_bytes());
116 }
117 ColumnarValueRef::NonNullableLargeStringArray(array) => {
118 self.value_buffer
119 .extend_from_slice(array.value(i).as_bytes());
120 }
121 ColumnarValueRef::NonNullableStringViewArray(array) => {
122 self.value_buffer
123 .extend_from_slice(array.value(i).as_bytes());
124 }
125 }
126 }
127
128 pub fn append_offset(&mut self) {
129 let next_offset: i32 = self
130 .value_buffer
131 .len()
132 .try_into()
133 .expect("byte array offset overflow");
134 self.offsets_buffer.push(next_offset);
135 }
136
137 pub fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
145 let row_count = self.offsets_buffer.len() / size_of::<i32>() - 1;
146 if let Some(ref null_buffer) = null_buffer {
147 assert_eq!(
148 null_buffer.len(),
149 row_count,
150 "Null buffer and offsets buffer must be the same length"
151 );
152 }
153 let array_builder = ArrayDataBuilder::new(DataType::Utf8)
154 .len(row_count)
155 .add_buffer(self.offsets_buffer.into())
156 .add_buffer(self.value_buffer.into())
157 .nulls(null_buffer);
158 let array_data = unsafe { array_builder.build_unchecked() };
161 StringArray::from(array_data)
162 }
163}
164
165pub struct StringViewArrayBuilder {
166 builder: StringViewBuilder,
167 block: String,
168}
169
170impl StringViewArrayBuilder {
171 pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self {
172 let builder = StringViewBuilder::with_capacity(data_capacity);
173 Self {
174 builder,
175 block: String::new(),
176 }
177 }
178
179 pub fn write<const CHECK_VALID: bool>(
180 &mut self,
181 column: &ColumnarValueRef,
182 i: usize,
183 ) {
184 match column {
185 ColumnarValueRef::Scalar(s) => {
186 self.block.push_str(std::str::from_utf8(s).unwrap());
187 }
188 ColumnarValueRef::NullableArray(array) => {
189 if !CHECK_VALID || array.is_valid(i) {
190 self.block.push_str(
191 std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
192 );
193 }
194 }
195 ColumnarValueRef::NullableLargeStringArray(array) => {
196 if !CHECK_VALID || array.is_valid(i) {
197 self.block.push_str(
198 std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
199 );
200 }
201 }
202 ColumnarValueRef::NullableStringViewArray(array) => {
203 if !CHECK_VALID || array.is_valid(i) {
204 self.block.push_str(
205 std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
206 );
207 }
208 }
209 ColumnarValueRef::NonNullableArray(array) => {
210 self.block
211 .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
212 }
213 ColumnarValueRef::NonNullableLargeStringArray(array) => {
214 self.block
215 .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
216 }
217 ColumnarValueRef::NonNullableStringViewArray(array) => {
218 self.block
219 .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
220 }
221 }
222 }
223
224 pub fn append_offset(&mut self) {
225 self.builder.append_value(&self.block);
226 self.block = String::new();
227 }
228
229 pub fn finish(mut self) -> StringViewArray {
230 self.builder.finish()
231 }
232}
233
234pub struct LargeStringArrayBuilder {
235 offsets_buffer: MutableBuffer,
236 value_buffer: MutableBuffer,
237}
238
239impl LargeStringArrayBuilder {
240 pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
241 let capacity = item_capacity
242 .checked_add(1)
243 .map(|i| i.saturating_mul(size_of::<i64>()))
244 .expect("capacity integer overflow");
245
246 let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
247 unsafe { offsets_buffer.push_unchecked(0_i64) };
249 Self {
250 offsets_buffer,
251 value_buffer: MutableBuffer::with_capacity(data_capacity),
252 }
253 }
254
255 pub fn write<const CHECK_VALID: bool>(
256 &mut self,
257 column: &ColumnarValueRef,
258 i: usize,
259 ) {
260 match column {
261 ColumnarValueRef::Scalar(s) => {
262 self.value_buffer.extend_from_slice(s);
263 }
264 ColumnarValueRef::NullableArray(array) => {
265 if !CHECK_VALID || array.is_valid(i) {
266 self.value_buffer
267 .extend_from_slice(array.value(i).as_bytes());
268 }
269 }
270 ColumnarValueRef::NullableLargeStringArray(array) => {
271 if !CHECK_VALID || array.is_valid(i) {
272 self.value_buffer
273 .extend_from_slice(array.value(i).as_bytes());
274 }
275 }
276 ColumnarValueRef::NullableStringViewArray(array) => {
277 if !CHECK_VALID || array.is_valid(i) {
278 self.value_buffer
279 .extend_from_slice(array.value(i).as_bytes());
280 }
281 }
282 ColumnarValueRef::NonNullableArray(array) => {
283 self.value_buffer
284 .extend_from_slice(array.value(i).as_bytes());
285 }
286 ColumnarValueRef::NonNullableLargeStringArray(array) => {
287 self.value_buffer
288 .extend_from_slice(array.value(i).as_bytes());
289 }
290 ColumnarValueRef::NonNullableStringViewArray(array) => {
291 self.value_buffer
292 .extend_from_slice(array.value(i).as_bytes());
293 }
294 }
295 }
296
297 pub fn append_offset(&mut self) {
298 let next_offset: i64 = self
299 .value_buffer
300 .len()
301 .try_into()
302 .expect("byte array offset overflow");
303 self.offsets_buffer.push(next_offset);
304 }
305
306 pub fn finish(self, null_buffer: Option<NullBuffer>) -> LargeStringArray {
314 let row_count = self.offsets_buffer.len() / size_of::<i64>() - 1;
315 if let Some(ref null_buffer) = null_buffer {
316 assert_eq!(
317 null_buffer.len(),
318 row_count,
319 "Null buffer and offsets buffer must be the same length"
320 );
321 }
322 let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8)
323 .len(row_count)
324 .add_buffer(self.offsets_buffer.into())
325 .add_buffer(self.value_buffer.into())
326 .nulls(null_buffer);
327 let array_data = unsafe { array_builder.build_unchecked() };
330 LargeStringArray::from(array_data)
331 }
332}
333
334pub fn make_and_append_view(
348 views_buffer: &mut Vec<u128>,
349 null_builder: &mut NullBufferBuilder,
350 original_view: &u128,
351 substr: &str,
352 start_offset: u32,
353) {
354 let substr_len = substr.len();
355 let sub_view = if substr_len > 12 {
356 let view = ByteView::from(*original_view);
357 make_view(
358 substr.as_bytes(),
359 view.buffer_index,
360 view.offset + start_offset,
361 )
362 } else {
363 make_view(substr.as_bytes(), 0, 0)
365 };
366 views_buffer.push(sub_view);
367 null_builder.append_non_null();
368}
369
370#[derive(Debug)]
371pub enum ColumnarValueRef<'a> {
372 Scalar(&'a [u8]),
373 NullableArray(&'a StringArray),
374 NonNullableArray(&'a StringArray),
375 NullableLargeStringArray(&'a LargeStringArray),
376 NonNullableLargeStringArray(&'a LargeStringArray),
377 NullableStringViewArray(&'a StringViewArray),
378 NonNullableStringViewArray(&'a StringViewArray),
379}
380
381impl ColumnarValueRef<'_> {
382 #[inline]
383 pub fn is_valid(&self, i: usize) -> bool {
384 match &self {
385 Self::Scalar(_)
386 | Self::NonNullableArray(_)
387 | Self::NonNullableLargeStringArray(_)
388 | Self::NonNullableStringViewArray(_) => true,
389 Self::NullableArray(array) => array.is_valid(i),
390 Self::NullableStringViewArray(array) => array.is_valid(i),
391 Self::NullableLargeStringArray(array) => array.is_valid(i),
392 }
393 }
394
395 #[inline]
396 pub fn nulls(&self) -> Option<NullBuffer> {
397 match &self {
398 Self::Scalar(_)
399 | Self::NonNullableArray(_)
400 | Self::NonNullableStringViewArray(_)
401 | Self::NonNullableLargeStringArray(_) => None,
402 Self::NullableArray(array) => array.nulls().cloned(),
403 Self::NullableStringViewArray(array) => array.nulls().cloned(),
404 Self::NullableLargeStringArray(array) => array.nulls().cloned(),
405 }
406 }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Requesting `usize::MAX` rows cannot be sized and must panic with the
    /// builder's own overflow message.
    #[test]
    #[should_panic(expected = "capacity integer overflow")]
    fn test_overflow_string_array_builder() {
        let (items, bytes) = (usize::MAX, usize::MAX);
        let _builder = StringArrayBuilder::with_capacity(items, bytes);
    }

    /// Same overflow check for the builder with `i64` offsets.
    #[test]
    #[should_panic(expected = "capacity integer overflow")]
    fn test_overflow_large_string_array_builder() {
        let (items, bytes) = (usize::MAX, usize::MAX);
        let _builder = LargeStringArrayBuilder::with_capacity(items, bytes);
    }
}