arrow_array/array/
string_array.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18use crate::types::GenericStringType;
19use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20use arrow_schema::ArrowError;
21
/// A [`GenericByteArray`] for storing `str`
///
/// See [`StringArray`] and [`LargeStringArray`] for the aliases that fix the
/// offset type to `i32` and `i64` respectively.
pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26    /// Returns the number of `Unicode Scalar Value` in the string at index `i`.
27    /// # Performance
28    /// This function has `O(n)` time complexity where `n` is the string length.
29    /// If you can make sure that all chars in the string are in the range `U+0x0000` ~ `U+0x007F`,
30    /// please use the function [`value_length`](#method.value_length) which has O(1) time complexity.
31    pub fn num_chars(&self, i: usize) -> usize {
32        self.value(i).chars().count()
33    }
34
35    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
36    pub fn take_iter<'a>(
37        &'a self,
38        indexes: impl Iterator<Item = Option<usize>> + 'a,
39    ) -> impl Iterator<Item = Option<&'a str>> {
40        indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
41    }
42
43    /// Returns an iterator that returns the values of `array.value(i)` for an iterator with each element `i`
44    /// # Safety
45    ///
46    /// caller must ensure that the indexes in the iterator are less than the `array.len()`
47    pub unsafe fn take_iter_unchecked<'a>(
48        &'a self,
49        indexes: impl Iterator<Item = Option<usize>> + 'a,
50    ) -> impl Iterator<Item = Option<&'a str>> {
51        indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
52    }
53
54    /// Fallibly creates a [`GenericStringArray`] from a [`GenericBinaryArray`] returning
55    /// an error if [`GenericBinaryArray`] contains invalid UTF-8 data
56    pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
57        let (offsets, values, nulls) = v.into_parts();
58        Self::try_new(offsets, values, nulls)
59    }
60}
61
62impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
63    for GenericStringArray<OffsetSize>
64{
65    fn from(v: GenericListArray<OffsetSize>) -> Self {
66        GenericBinaryArray::<OffsetSize>::from(v).into()
67    }
68}
69
70impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
71    for GenericStringArray<OffsetSize>
72{
73    fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
74        Self::try_from_binary(v).unwrap()
75    }
76}
77
78impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
79    fn from(v: Vec<Option<&str>>) -> Self {
80        v.into_iter().collect()
81    }
82}
83
84impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
85    fn from(v: Vec<&str>) -> Self {
86        Self::from_iter_values(v)
87    }
88}
89
90impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
91    fn from(v: Vec<Option<String>>) -> Self {
92        v.into_iter().collect()
93    }
94}
95
96impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
97    fn from(v: Vec<String>) -> Self {
98        Self::from_iter_values(v)
99    }
100}
101
/// A [`GenericStringArray`] of `str` using `i32` offsets
///
/// Use [`LargeStringArray`] for the same array with `i64` offsets.
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::StringArray;
/// // Create from Vec<Option<&str>>
/// let arr = StringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = StringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from iter/collect (requires Option<&str>)
/// let arr: StringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::StringArray;
/// let array = StringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(0), "foo");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type StringArray = GenericStringArray<i32>;
128
/// A [`GenericStringArray`] of `str` using `i64` offsets
///
/// Use [`StringArray`] for the same array with `i32` offsets.
///
/// # Examples
///
/// Construction
///
/// ```
/// # use arrow_array::LargeStringArray;
/// // Create from Vec<Option<&str>>
/// let arr = LargeStringArray::from(vec![Some("foo"), Some("bar"), None, Some("baz")]);
/// // Create from Vec<&str>
/// let arr = LargeStringArray::from(vec!["foo", "bar", "baz"]);
/// // Create from iter/collect (requires Option<&str>)
/// let arr: LargeStringArray = std::iter::repeat(Some("foo")).take(10).collect();
/// ```
///
/// Construction and Access
///
/// ```
/// # use arrow_array::LargeStringArray;
/// let array = LargeStringArray::from(vec![Some("foo"), None, Some("bar")]);
/// assert_eq!(array.value(2), "bar");
/// ```
///
/// See [`GenericByteArray`] for more information and examples
pub type LargeStringArray = GenericStringArray<i64>;
155
156#[cfg(test)]
157mod tests {
158    use super::*;
159    use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
160    use crate::types::UInt8Type;
161    use crate::Array;
162    use arrow_buffer::Buffer;
163    use arrow_data::ArrayData;
164    use arrow_schema::{DataType, Field};
165    use std::sync::Arc;
166
167    #[test]
168    fn test_string_array_from_u8_slice() {
169        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
170
171        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
172        let string_array = StringArray::from(values);
173
174        assert_eq!(3, string_array.len());
175        assert_eq!(0, string_array.null_count());
176        assert_eq!("hello", string_array.value(0));
177        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
178        assert_eq!("", string_array.value(1));
179        assert_eq!("", unsafe { string_array.value_unchecked(1) });
180        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
181        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
182            string_array.value_unchecked(2)
183        });
184        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
185        assert_eq!(8, string_array.num_chars(2));
186        for i in 0..3 {
187            assert!(string_array.is_valid(i));
188            assert!(!string_array.is_null(i));
189        }
190    }
191
192    #[test]
193    #[should_panic(expected = "StringArray expects DataType::Utf8")]
194    fn test_string_array_from_int() {
195        let array = LargeStringArray::from(vec!["a", "b"]);
196        drop(StringArray::from(array.into_data()));
197    }
198
199    #[test]
200    fn test_large_string_array_from_u8_slice() {
201        let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
202
203        // Array data: ["hello", "", "A£ऀ𖼚𝌆৩ƐZ"]
204        let string_array = LargeStringArray::from(values);
205
206        assert_eq!(3, string_array.len());
207        assert_eq!(0, string_array.null_count());
208        assert_eq!("hello", string_array.value(0));
209        assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
210        assert_eq!("", string_array.value(1));
211        assert_eq!("", unsafe { string_array.value_unchecked(1) });
212        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
213        assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
214            string_array.value_unchecked(2)
215        });
216        assert_eq!(5, string_array.value_offsets()[2]);
217        assert_eq!(20, string_array.value_length(2)); // 1 + 2 + 3 + 4 + 4 + 3 + 2 + 1
218        assert_eq!(8, string_array.num_chars(2));
219        for i in 0..3 {
220            assert!(string_array.is_valid(i));
221            assert!(!string_array.is_null(i));
222        }
223    }
224
225    #[test]
226    fn test_nested_string_array() {
227        let string_builder = StringBuilder::with_capacity(3, 10);
228        let mut list_of_string_builder = ListBuilder::new(string_builder);
229
230        list_of_string_builder.values().append_value("foo");
231        list_of_string_builder.values().append_value("bar");
232        list_of_string_builder.append(true);
233
234        list_of_string_builder.values().append_value("foobar");
235        list_of_string_builder.append(true);
236        let list_of_strings = list_of_string_builder.finish();
237
238        assert_eq!(list_of_strings.len(), 2);
239
240        let first_slot = list_of_strings.value(0);
241        let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
242        assert_eq!(first_list.len(), 2);
243        assert_eq!(first_list.value(0), "foo");
244        assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
245        assert_eq!(first_list.value(1), "bar");
246        assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
247
248        let second_slot = list_of_strings.value(1);
249        let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
250        assert_eq!(second_list.len(), 1);
251        assert_eq!(second_list.value(0), "foobar");
252        assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
253    }
254
255    #[test]
256    #[should_panic(
257        expected = "Trying to access an element at index 4 from a StringArray of length 3"
258    )]
259    fn test_string_array_get_value_index_out_of_bound() {
260        let values: [u8; 12] = [
261            b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
262        ];
263        let offsets: [i32; 4] = [0, 5, 5, 12];
264        let array_data = ArrayData::builder(DataType::Utf8)
265            .len(3)
266            .add_buffer(Buffer::from_slice_ref(offsets))
267            .add_buffer(Buffer::from_slice_ref(values))
268            .build()
269            .unwrap();
270        let string_array = StringArray::from(array_data);
271        string_array.value(4);
272    }
273
274    #[test]
275    fn test_string_array_fmt_debug() {
276        let arr: StringArray = vec!["hello", "arrow"].into();
277        assert_eq!(
278            "StringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
279            format!("{arr:?}")
280        );
281    }
282
283    #[test]
284    fn test_large_string_array_fmt_debug() {
285        let arr: LargeStringArray = vec!["hello", "arrow"].into();
286        assert_eq!(
287            "LargeStringArray\n[\n  \"hello\",\n  \"arrow\",\n]",
288            format!("{arr:?}")
289        );
290    }
291
292    #[test]
293    fn test_string_array_from_iter() {
294        let data = [Some("hello"), None, Some("arrow")];
295        let data_vec = data.to_vec();
296        // from Vec<Option<&str>>
297        let array1 = StringArray::from(data_vec.clone());
298        // from Iterator<Option<&str>>
299        let array2: StringArray = data_vec.clone().into_iter().collect();
300        // from Iterator<Option<String>>
301        let array3: StringArray = data_vec
302            .into_iter()
303            .map(|x| x.map(|s| s.to_string()))
304            .collect();
305        // from Iterator<&Option<&str>>
306        let array4: StringArray = data.iter().collect::<StringArray>();
307
308        assert_eq!(array1, array2);
309        assert_eq!(array2, array3);
310        assert_eq!(array3, array4);
311    }
312
313    #[test]
314    fn test_string_array_from_iter_values() {
315        let data = ["hello", "hello2"];
316        let array1 = StringArray::from_iter_values(data.iter());
317
318        assert_eq!(array1.value(0), "hello");
319        assert_eq!(array1.value(1), "hello2");
320
321        // Also works with String types.
322        let data2 = ["goodbye".to_string(), "goodbye2".to_string()];
323        let array2 = StringArray::from_iter_values(data2.iter());
324
325        assert_eq!(array2.value(0), "goodbye");
326        assert_eq!(array2.value(1), "goodbye2");
327    }
328
329    #[test]
330    fn test_string_array_from_unbound_iter() {
331        // iterator that doesn't declare (upper) size bound
332        let string_iter = (0..)
333            .scan(0usize, |pos, i| {
334                if *pos < 10 {
335                    *pos += 1;
336                    Some(Some(format!("value {i}")))
337                } else {
338                    // actually returns up to 10 values
339                    None
340                }
341            })
342            // limited using take()
343            .take(100);
344
345        let (_, upper_size_bound) = string_iter.size_hint();
346        // the upper bound, defined by take above, is 100
347        assert_eq!(upper_size_bound, Some(100));
348        let string_array: StringArray = string_iter.collect();
349        // but the actual number of items in the array should be 10
350        assert_eq!(string_array.len(), 10);
351    }
352
353    #[test]
354    fn test_string_array_all_null() {
355        let data: Vec<Option<&str>> = vec![None];
356        let array = StringArray::from(data);
357        array
358            .into_data()
359            .validate_full()
360            .expect("All null array has valid array data");
361    }
362
363    #[test]
364    fn test_large_string_array_all_null() {
365        let data: Vec<Option<&str>> = vec![None];
366        let array = LargeStringArray::from(data);
367        array
368            .into_data()
369            .validate_full()
370            .expect("All null array has valid array data");
371    }
372
373    fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
374        let values = b"HelloArrowAndParquet";
375        // "ArrowAndParquet"
376        let child_data = ArrayData::builder(DataType::UInt8)
377            .len(15)
378            .offset(5)
379            .add_buffer(Buffer::from(values))
380            .build()
381            .unwrap();
382
383        let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
384        let null_buffer = Buffer::from_slice_ref([0b101]);
385        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
386            Field::new_list_field(DataType::UInt8, false),
387        ));
388
389        // [None, Some("Parquet")]
390        let array_data = ArrayData::builder(data_type)
391            .len(2)
392            .offset(1)
393            .add_buffer(Buffer::from_slice_ref(offsets))
394            .null_bit_buffer(Some(null_buffer))
395            .add_child_data(child_data)
396            .build()
397            .unwrap();
398        let list_array = GenericListArray::<O>::from(array_data);
399        let string_array = GenericStringArray::<O>::from(list_array);
400
401        assert_eq!(2, string_array.len());
402        assert_eq!(1, string_array.null_count());
403        assert!(string_array.is_null(0));
404        assert!(string_array.is_valid(1));
405        assert_eq!("Parquet", string_array.value(1));
406    }
407
408    #[test]
409    fn test_string_array_from_list_array() {
410        _test_generic_string_array_from_list_array::<i32>();
411    }
412
413    #[test]
414    fn test_large_string_array_from_list_array() {
415        _test_generic_string_array_from_list_array::<i64>();
416    }
417
418    fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
419        let values = b"HelloArrow";
420        let child_data = ArrayData::builder(DataType::UInt8)
421            .len(10)
422            .add_buffer(Buffer::from(values))
423            .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
424            .build()
425            .unwrap();
426
427        let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
428
429        // It is possible to create a null struct containing a non-nullable child
430        // see https://github.com/apache/arrow-rs/pull/3244 for details
431        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
432            Field::new_list_field(DataType::UInt8, true),
433        ));
434
435        // [None, Some(b"Parquet")]
436        let array_data = ArrayData::builder(data_type)
437            .len(2)
438            .add_buffer(Buffer::from_slice_ref(offsets))
439            .add_child_data(child_data)
440            .build()
441            .unwrap();
442        let list_array = GenericListArray::<O>::from(array_data);
443        drop(GenericStringArray::<O>::from(list_array));
444    }
445
446    #[test]
447    #[should_panic(expected = "The child array cannot contain null values.")]
448    fn test_string_array_from_list_array_with_child_nulls_failed() {
449        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
450    }
451
452    #[test]
453    #[should_panic(expected = "The child array cannot contain null values.")]
454    fn test_large_string_array_from_list_array_with_child_nulls_failed() {
455        _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
456    }
457
458    fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
459        let values = b"HelloArrow";
460        let child_data = ArrayData::builder(DataType::UInt16)
461            .len(5)
462            .add_buffer(Buffer::from(values))
463            .build()
464            .unwrap();
465
466        let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
467        let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
468            Field::new_list_field(DataType::UInt16, false),
469        ));
470
471        let array_data = ArrayData::builder(data_type)
472            .len(2)
473            .add_buffer(Buffer::from_slice_ref(offsets))
474            .add_child_data(child_data)
475            .build()
476            .unwrap();
477        let list_array = GenericListArray::<O>::from(array_data);
478        drop(GenericStringArray::<O>::from(list_array));
479    }
480
481    #[test]
482    #[should_panic(
483        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
484    )]
485    fn test_string_array_from_list_array_wrong_type() {
486        _test_generic_string_array_from_list_array_wrong_type::<i32>();
487    }
488
489    #[test]
490    #[should_panic(
491        expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
492    )]
493    fn test_large_string_array_from_list_array_wrong_type() {
494        _test_generic_string_array_from_list_array_wrong_type::<i64>();
495    }
496
497    #[test]
498    #[should_panic(
499        expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
500    )]
501    fn test_list_array_utf8_validation() {
502        let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
503        builder.values().append_value(0xFF);
504        builder.append(true);
505        let list = builder.finish();
506        let _ = StringArray::from(list);
507    }
508
509    #[test]
510    fn test_empty_offsets() {
511        let string = StringArray::from(
512            ArrayData::builder(DataType::Utf8)
513                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
514                .build()
515                .unwrap(),
516        );
517        assert_eq!(string.len(), 0);
518        assert_eq!(string.value_offsets(), &[0]);
519
520        let string = LargeStringArray::from(
521            ArrayData::builder(DataType::LargeUtf8)
522                .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
523                .build()
524                .unwrap(),
525        );
526        assert_eq!(string.len(), 0);
527        assert_eq!(string.value_offsets(), &[0]);
528    }
529
530    #[test]
531    fn test_into_builder() {
532        let array: StringArray = vec!["hello", "arrow"].into();
533
534        // Append values
535        let mut builder = array.into_builder().unwrap();
536
537        builder.append_value("rust");
538
539        let expected: StringArray = vec!["hello", "arrow", "rust"].into();
540        let array = builder.finish();
541        assert_eq!(expected, array);
542    }
543
544    #[test]
545    fn test_into_builder_err() {
546        let array: StringArray = vec!["hello", "arrow"].into();
547
548        // Clone it, so we cannot get a mutable builder back
549        let shared_array = array.clone();
550
551        let err_return = array.into_builder().unwrap_err();
552        assert_eq!(&err_return, &shared_array);
553    }
554}