datafusion_functions/
strings.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use std::mem::size_of;
19
20use arrow::array::{
21    make_view, Array, ArrayAccessor, ArrayDataBuilder, ArrayIter, ByteView,
22    GenericStringArray, LargeStringArray, NullBufferBuilder, OffsetSizeTrait,
23    StringArray, StringViewArray, StringViewBuilder,
24};
25use arrow::buffer::{MutableBuffer, NullBuffer};
26use arrow::datatypes::DataType;
27
28/// Abstracts iteration over different types of string arrays.
29#[deprecated(since = "45.0.0", note = "Use arrow::array::StringArrayType instead")]
30pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
31    /// Return an [`ArrayIter`]  over the values of the array.
32    ///
33    /// This iterator iterates returns `Option<&str>` for each item in the array.
34    fn iter(&self) -> ArrayIter<Self>;
35
36    /// Check if the array is ASCII only.
37    fn is_ascii(&self) -> bool;
38}
39
40#[allow(deprecated)]
41impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
42    fn iter(&self) -> ArrayIter<Self> {
43        GenericStringArray::<T>::iter(self)
44    }
45
46    fn is_ascii(&self) -> bool {
47        GenericStringArray::<T>::is_ascii(self)
48    }
49}
50
51#[allow(deprecated)]
52impl<'a> StringArrayType<'a> for &'a StringViewArray {
53    fn iter(&self) -> ArrayIter<Self> {
54        StringViewArray::iter(self)
55    }
56
57    fn is_ascii(&self) -> bool {
58        StringViewArray::is_ascii(self)
59    }
60}
61
62/// Optimized version of the StringBuilder in Arrow that:
63/// 1. Precalculating the expected length of the result, avoiding reallocations.
64/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
65pub struct StringArrayBuilder {
66    offsets_buffer: MutableBuffer,
67    value_buffer: MutableBuffer,
68}
69
70impl StringArrayBuilder {
71    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
72        let capacity = item_capacity
73            .checked_add(1)
74            .map(|i| i.saturating_mul(size_of::<i32>()))
75            .expect("capacity integer overflow");
76
77        let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
78        // SAFETY: the first offset value is definitely not going to exceed the bounds.
79        unsafe { offsets_buffer.push_unchecked(0_i32) };
80        Self {
81            offsets_buffer,
82            value_buffer: MutableBuffer::with_capacity(data_capacity),
83        }
84    }
85
86    pub fn write<const CHECK_VALID: bool>(
87        &mut self,
88        column: &ColumnarValueRef,
89        i: usize,
90    ) {
91        match column {
92            ColumnarValueRef::Scalar(s) => {
93                self.value_buffer.extend_from_slice(s);
94            }
95            ColumnarValueRef::NullableArray(array) => {
96                if !CHECK_VALID || array.is_valid(i) {
97                    self.value_buffer
98                        .extend_from_slice(array.value(i).as_bytes());
99                }
100            }
101            ColumnarValueRef::NullableLargeStringArray(array) => {
102                if !CHECK_VALID || array.is_valid(i) {
103                    self.value_buffer
104                        .extend_from_slice(array.value(i).as_bytes());
105                }
106            }
107            ColumnarValueRef::NullableStringViewArray(array) => {
108                if !CHECK_VALID || array.is_valid(i) {
109                    self.value_buffer
110                        .extend_from_slice(array.value(i).as_bytes());
111                }
112            }
113            ColumnarValueRef::NonNullableArray(array) => {
114                self.value_buffer
115                    .extend_from_slice(array.value(i).as_bytes());
116            }
117            ColumnarValueRef::NonNullableLargeStringArray(array) => {
118                self.value_buffer
119                    .extend_from_slice(array.value(i).as_bytes());
120            }
121            ColumnarValueRef::NonNullableStringViewArray(array) => {
122                self.value_buffer
123                    .extend_from_slice(array.value(i).as_bytes());
124            }
125        }
126    }
127
128    pub fn append_offset(&mut self) {
129        let next_offset: i32 = self
130            .value_buffer
131            .len()
132            .try_into()
133            .expect("byte array offset overflow");
134        self.offsets_buffer.push(next_offset);
135    }
136
137    /// Finalize the builder into a concrete [`StringArray`].
138    ///
139    /// # Panics
140    ///
141    /// This method can panic when:
142    ///
143    /// - the provided `null_buffer` is not the same length as the `offsets_buffer`.
144    pub fn finish(self, null_buffer: Option<NullBuffer>) -> StringArray {
145        let row_count = self.offsets_buffer.len() / size_of::<i32>() - 1;
146        if let Some(ref null_buffer) = null_buffer {
147            assert_eq!(
148                null_buffer.len(),
149                row_count,
150                "Null buffer and offsets buffer must be the same length"
151            );
152        }
153        let array_builder = ArrayDataBuilder::new(DataType::Utf8)
154            .len(row_count)
155            .add_buffer(self.offsets_buffer.into())
156            .add_buffer(self.value_buffer.into())
157            .nulls(null_buffer);
158        // SAFETY: all data that was appended was valid UTF8 and the values
159        // and offsets were created correctly
160        let array_data = unsafe { array_builder.build_unchecked() };
161        StringArray::from(array_data)
162    }
163}
164
165pub struct StringViewArrayBuilder {
166    builder: StringViewBuilder,
167    block: String,
168}
169
170impl StringViewArrayBuilder {
171    pub fn with_capacity(_item_capacity: usize, data_capacity: usize) -> Self {
172        let builder = StringViewBuilder::with_capacity(data_capacity);
173        Self {
174            builder,
175            block: String::new(),
176        }
177    }
178
179    pub fn write<const CHECK_VALID: bool>(
180        &mut self,
181        column: &ColumnarValueRef,
182        i: usize,
183    ) {
184        match column {
185            ColumnarValueRef::Scalar(s) => {
186                self.block.push_str(std::str::from_utf8(s).unwrap());
187            }
188            ColumnarValueRef::NullableArray(array) => {
189                if !CHECK_VALID || array.is_valid(i) {
190                    self.block.push_str(
191                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
192                    );
193                }
194            }
195            ColumnarValueRef::NullableLargeStringArray(array) => {
196                if !CHECK_VALID || array.is_valid(i) {
197                    self.block.push_str(
198                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
199                    );
200                }
201            }
202            ColumnarValueRef::NullableStringViewArray(array) => {
203                if !CHECK_VALID || array.is_valid(i) {
204                    self.block.push_str(
205                        std::str::from_utf8(array.value(i).as_bytes()).unwrap(),
206                    );
207                }
208            }
209            ColumnarValueRef::NonNullableArray(array) => {
210                self.block
211                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
212            }
213            ColumnarValueRef::NonNullableLargeStringArray(array) => {
214                self.block
215                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
216            }
217            ColumnarValueRef::NonNullableStringViewArray(array) => {
218                self.block
219                    .push_str(std::str::from_utf8(array.value(i).as_bytes()).unwrap());
220            }
221        }
222    }
223
224    pub fn append_offset(&mut self) {
225        self.builder.append_value(&self.block);
226        self.block = String::new();
227    }
228
229    pub fn finish(mut self) -> StringViewArray {
230        self.builder.finish()
231    }
232}
233
234pub struct LargeStringArrayBuilder {
235    offsets_buffer: MutableBuffer,
236    value_buffer: MutableBuffer,
237}
238
239impl LargeStringArrayBuilder {
240    pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
241        let capacity = item_capacity
242            .checked_add(1)
243            .map(|i| i.saturating_mul(size_of::<i64>()))
244            .expect("capacity integer overflow");
245
246        let mut offsets_buffer = MutableBuffer::with_capacity(capacity);
247        // SAFETY: the first offset value is definitely not going to exceed the bounds.
248        unsafe { offsets_buffer.push_unchecked(0_i64) };
249        Self {
250            offsets_buffer,
251            value_buffer: MutableBuffer::with_capacity(data_capacity),
252        }
253    }
254
255    pub fn write<const CHECK_VALID: bool>(
256        &mut self,
257        column: &ColumnarValueRef,
258        i: usize,
259    ) {
260        match column {
261            ColumnarValueRef::Scalar(s) => {
262                self.value_buffer.extend_from_slice(s);
263            }
264            ColumnarValueRef::NullableArray(array) => {
265                if !CHECK_VALID || array.is_valid(i) {
266                    self.value_buffer
267                        .extend_from_slice(array.value(i).as_bytes());
268                }
269            }
270            ColumnarValueRef::NullableLargeStringArray(array) => {
271                if !CHECK_VALID || array.is_valid(i) {
272                    self.value_buffer
273                        .extend_from_slice(array.value(i).as_bytes());
274                }
275            }
276            ColumnarValueRef::NullableStringViewArray(array) => {
277                if !CHECK_VALID || array.is_valid(i) {
278                    self.value_buffer
279                        .extend_from_slice(array.value(i).as_bytes());
280                }
281            }
282            ColumnarValueRef::NonNullableArray(array) => {
283                self.value_buffer
284                    .extend_from_slice(array.value(i).as_bytes());
285            }
286            ColumnarValueRef::NonNullableLargeStringArray(array) => {
287                self.value_buffer
288                    .extend_from_slice(array.value(i).as_bytes());
289            }
290            ColumnarValueRef::NonNullableStringViewArray(array) => {
291                self.value_buffer
292                    .extend_from_slice(array.value(i).as_bytes());
293            }
294        }
295    }
296
297    pub fn append_offset(&mut self) {
298        let next_offset: i64 = self
299            .value_buffer
300            .len()
301            .try_into()
302            .expect("byte array offset overflow");
303        self.offsets_buffer.push(next_offset);
304    }
305
306    /// Finalize the builder into a concrete [`LargeStringArray`].
307    ///
308    /// # Panics
309    ///
310    /// This method can panic when:
311    ///
312    /// - the provided `null_buffer` is not the same length as the `offsets_buffer`.
313    pub fn finish(self, null_buffer: Option<NullBuffer>) -> LargeStringArray {
314        let row_count = self.offsets_buffer.len() / size_of::<i64>() - 1;
315        if let Some(ref null_buffer) = null_buffer {
316            assert_eq!(
317                null_buffer.len(),
318                row_count,
319                "Null buffer and offsets buffer must be the same length"
320            );
321        }
322        let array_builder = ArrayDataBuilder::new(DataType::LargeUtf8)
323            .len(row_count)
324            .add_buffer(self.offsets_buffer.into())
325            .add_buffer(self.value_buffer.into())
326            .nulls(null_buffer);
327        // SAFETY: all data that was appended was valid Large UTF8 and the values
328        // and offsets were created correctly
329        let array_data = unsafe { array_builder.build_unchecked() };
330        LargeStringArray::from(array_data)
331    }
332}
333
334/// Append a new view to the views buffer with the given substr
335///
336/// # Safety
337///
338/// original_view must be a valid view (the format described on
339/// [`GenericByteViewArray`](arrow::array::GenericByteViewArray).
340///
341/// # Arguments
342/// - views_buffer: The buffer to append the new view to
343/// - null_builder: The buffer to append the null value to
344/// - original_view: The original view value
345/// - substr: The substring to append. Must be a valid substring of the original view
346/// - start_offset: The start offset of the substring in the view
347pub fn make_and_append_view(
348    views_buffer: &mut Vec<u128>,
349    null_builder: &mut NullBufferBuilder,
350    original_view: &u128,
351    substr: &str,
352    start_offset: u32,
353) {
354    let substr_len = substr.len();
355    let sub_view = if substr_len > 12 {
356        let view = ByteView::from(*original_view);
357        make_view(
358            substr.as_bytes(),
359            view.buffer_index,
360            view.offset + start_offset,
361        )
362    } else {
363        // inline value does not need block id or offset
364        make_view(substr.as_bytes(), 0, 0)
365    };
366    views_buffer.push(sub_view);
367    null_builder.append_non_null();
368}
369
370#[derive(Debug)]
371pub enum ColumnarValueRef<'a> {
372    Scalar(&'a [u8]),
373    NullableArray(&'a StringArray),
374    NonNullableArray(&'a StringArray),
375    NullableLargeStringArray(&'a LargeStringArray),
376    NonNullableLargeStringArray(&'a LargeStringArray),
377    NullableStringViewArray(&'a StringViewArray),
378    NonNullableStringViewArray(&'a StringViewArray),
379}
380
381impl ColumnarValueRef<'_> {
382    #[inline]
383    pub fn is_valid(&self, i: usize) -> bool {
384        match &self {
385            Self::Scalar(_)
386            | Self::NonNullableArray(_)
387            | Self::NonNullableLargeStringArray(_)
388            | Self::NonNullableStringViewArray(_) => true,
389            Self::NullableArray(array) => array.is_valid(i),
390            Self::NullableStringViewArray(array) => array.is_valid(i),
391            Self::NullableLargeStringArray(array) => array.is_valid(i),
392        }
393    }
394
395    #[inline]
396    pub fn nulls(&self) -> Option<NullBuffer> {
397        match &self {
398            Self::Scalar(_)
399            | Self::NonNullableArray(_)
400            | Self::NonNullableStringViewArray(_)
401            | Self::NonNullableLargeStringArray(_) => None,
402            Self::NullableArray(array) => array.nulls().cloned(),
403            Self::NullableStringViewArray(array) => array.nulls().cloned(),
404            Self::NullableLargeStringArray(array) => array.nulls().cloned(),
405        }
406    }
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Requesting `usize::MAX` rows must be rejected with the dedicated
    /// overflow message.
    #[test]
    #[should_panic(expected = "capacity integer overflow")]
    fn test_overflow_string_array_builder() {
        let (items, bytes) = (usize::MAX, usize::MAX);
        let _builder = StringArrayBuilder::with_capacity(items, bytes);
    }

    /// Same check for the `i64`-offset builder.
    #[test]
    #[should_panic(expected = "capacity integer overflow")]
    fn test_overflow_large_string_array_builder() {
        let (items, bytes) = (usize::MAX, usize::MAX);
        let _builder = LargeStringArrayBuilder::with_capacity(items, bytes);
    }
}
424}