1use crate::types::GenericStringType;
19use crate::{GenericBinaryArray, GenericByteArray, GenericListArray, OffsetSizeTrait};
20use arrow_schema::ArrowError;
21
22pub type GenericStringArray<OffsetSize> = GenericByteArray<GenericStringType<OffsetSize>>;
24
25impl<OffsetSize: OffsetSizeTrait> GenericStringArray<OffsetSize> {
26 pub fn num_chars(&self, i: usize) -> usize {
32 self.value(i).chars().count()
33 }
34
35 pub fn take_iter<'a>(
37 &'a self,
38 indexes: impl Iterator<Item = Option<usize>> + 'a,
39 ) -> impl Iterator<Item = Option<&'a str>> {
40 indexes.map(|opt_index| opt_index.map(|index| self.value(index)))
41 }
42
43 pub unsafe fn take_iter_unchecked<'a>(
48 &'a self,
49 indexes: impl Iterator<Item = Option<usize>> + 'a,
50 ) -> impl Iterator<Item = Option<&'a str>> {
51 indexes.map(|opt_index| opt_index.map(|index| self.value_unchecked(index)))
52 }
53
54 pub fn try_from_binary(v: GenericBinaryArray<OffsetSize>) -> Result<Self, ArrowError> {
57 let (offsets, values, nulls) = v.into_parts();
58 Self::try_new(offsets, values, nulls)
59 }
60}
61
62impl<OffsetSize: OffsetSizeTrait> From<GenericListArray<OffsetSize>>
63 for GenericStringArray<OffsetSize>
64{
65 fn from(v: GenericListArray<OffsetSize>) -> Self {
66 GenericBinaryArray::<OffsetSize>::from(v).into()
67 }
68}
69
70impl<OffsetSize: OffsetSizeTrait> From<GenericBinaryArray<OffsetSize>>
71 for GenericStringArray<OffsetSize>
72{
73 fn from(v: GenericBinaryArray<OffsetSize>) -> Self {
74 Self::try_from_binary(v).unwrap()
75 }
76}
77
78impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<&str>>> for GenericStringArray<OffsetSize> {
79 fn from(v: Vec<Option<&str>>) -> Self {
80 v.into_iter().collect()
81 }
82}
83
84impl<OffsetSize: OffsetSizeTrait> From<Vec<&str>> for GenericStringArray<OffsetSize> {
85 fn from(v: Vec<&str>) -> Self {
86 Self::from_iter_values(v)
87 }
88}
89
90impl<OffsetSize: OffsetSizeTrait> From<Vec<Option<String>>> for GenericStringArray<OffsetSize> {
91 fn from(v: Vec<Option<String>>) -> Self {
92 v.into_iter().collect()
93 }
94}
95
96impl<OffsetSize: OffsetSizeTrait> From<Vec<String>> for GenericStringArray<OffsetSize> {
97 fn from(v: Vec<String>) -> Self {
98 Self::from_iter_values(v)
99 }
100}
101
102pub type StringArray = GenericStringArray<i32>;
128
129pub type LargeStringArray = GenericStringArray<i64>;
155
156#[cfg(test)]
157mod tests {
158 use super::*;
159 use crate::builder::{ListBuilder, PrimitiveBuilder, StringBuilder};
160 use crate::types::UInt8Type;
161 use crate::Array;
162 use arrow_buffer::Buffer;
163 use arrow_data::ArrayData;
164 use arrow_schema::{DataType, Field};
165 use std::sync::Arc;
166
167 #[test]
168 fn test_string_array_from_u8_slice() {
169 let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
170
171 let string_array = StringArray::from(values);
173
174 assert_eq!(3, string_array.len());
175 assert_eq!(0, string_array.null_count());
176 assert_eq!("hello", string_array.value(0));
177 assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
178 assert_eq!("", string_array.value(1));
179 assert_eq!("", unsafe { string_array.value_unchecked(1) });
180 assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
181 assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
182 string_array.value_unchecked(2)
183 });
184 assert_eq!(20, string_array.value_length(2)); assert_eq!(8, string_array.num_chars(2));
186 for i in 0..3 {
187 assert!(string_array.is_valid(i));
188 assert!(!string_array.is_null(i));
189 }
190 }
191
192 #[test]
193 #[should_panic(expected = "StringArray expects DataType::Utf8")]
194 fn test_string_array_from_int() {
195 let array = LargeStringArray::from(vec!["a", "b"]);
196 drop(StringArray::from(array.into_data()));
197 }
198
199 #[test]
200 fn test_large_string_array_from_u8_slice() {
201 let values: Vec<&str> = vec!["hello", "", "A£ऀ𖼚𝌆৩ƐZ"];
202
203 let string_array = LargeStringArray::from(values);
205
206 assert_eq!(3, string_array.len());
207 assert_eq!(0, string_array.null_count());
208 assert_eq!("hello", string_array.value(0));
209 assert_eq!("hello", unsafe { string_array.value_unchecked(0) });
210 assert_eq!("", string_array.value(1));
211 assert_eq!("", unsafe { string_array.value_unchecked(1) });
212 assert_eq!("A£ऀ𖼚𝌆৩ƐZ", string_array.value(2));
213 assert_eq!("A£ऀ𖼚𝌆৩ƐZ", unsafe {
214 string_array.value_unchecked(2)
215 });
216 assert_eq!(5, string_array.value_offsets()[2]);
217 assert_eq!(20, string_array.value_length(2)); assert_eq!(8, string_array.num_chars(2));
219 for i in 0..3 {
220 assert!(string_array.is_valid(i));
221 assert!(!string_array.is_null(i));
222 }
223 }
224
225 #[test]
226 fn test_nested_string_array() {
227 let string_builder = StringBuilder::with_capacity(3, 10);
228 let mut list_of_string_builder = ListBuilder::new(string_builder);
229
230 list_of_string_builder.values().append_value("foo");
231 list_of_string_builder.values().append_value("bar");
232 list_of_string_builder.append(true);
233
234 list_of_string_builder.values().append_value("foobar");
235 list_of_string_builder.append(true);
236 let list_of_strings = list_of_string_builder.finish();
237
238 assert_eq!(list_of_strings.len(), 2);
239
240 let first_slot = list_of_strings.value(0);
241 let first_list = first_slot.as_any().downcast_ref::<StringArray>().unwrap();
242 assert_eq!(first_list.len(), 2);
243 assert_eq!(first_list.value(0), "foo");
244 assert_eq!(unsafe { first_list.value_unchecked(0) }, "foo");
245 assert_eq!(first_list.value(1), "bar");
246 assert_eq!(unsafe { first_list.value_unchecked(1) }, "bar");
247
248 let second_slot = list_of_strings.value(1);
249 let second_list = second_slot.as_any().downcast_ref::<StringArray>().unwrap();
250 assert_eq!(second_list.len(), 1);
251 assert_eq!(second_list.value(0), "foobar");
252 assert_eq!(unsafe { second_list.value_unchecked(0) }, "foobar");
253 }
254
255 #[test]
256 #[should_panic(
257 expected = "Trying to access an element at index 4 from a StringArray of length 3"
258 )]
259 fn test_string_array_get_value_index_out_of_bound() {
260 let values: [u8; 12] = [
261 b'h', b'e', b'l', b'l', b'o', b'p', b'a', b'r', b'q', b'u', b'e', b't',
262 ];
263 let offsets: [i32; 4] = [0, 5, 5, 12];
264 let array_data = ArrayData::builder(DataType::Utf8)
265 .len(3)
266 .add_buffer(Buffer::from_slice_ref(offsets))
267 .add_buffer(Buffer::from_slice_ref(values))
268 .build()
269 .unwrap();
270 let string_array = StringArray::from(array_data);
271 string_array.value(4);
272 }
273
274 #[test]
275 fn test_string_array_fmt_debug() {
276 let arr: StringArray = vec!["hello", "arrow"].into();
277 assert_eq!(
278 "StringArray\n[\n \"hello\",\n \"arrow\",\n]",
279 format!("{arr:?}")
280 );
281 }
282
283 #[test]
284 fn test_large_string_array_fmt_debug() {
285 let arr: LargeStringArray = vec!["hello", "arrow"].into();
286 assert_eq!(
287 "LargeStringArray\n[\n \"hello\",\n \"arrow\",\n]",
288 format!("{arr:?}")
289 );
290 }
291
292 #[test]
293 fn test_string_array_from_iter() {
294 let data = [Some("hello"), None, Some("arrow")];
295 let data_vec = data.to_vec();
296 let array1 = StringArray::from(data_vec.clone());
298 let array2: StringArray = data_vec.clone().into_iter().collect();
300 let array3: StringArray = data_vec
302 .into_iter()
303 .map(|x| x.map(|s| s.to_string()))
304 .collect();
305 let array4: StringArray = data.iter().collect::<StringArray>();
307
308 assert_eq!(array1, array2);
309 assert_eq!(array2, array3);
310 assert_eq!(array3, array4);
311 }
312
313 #[test]
314 fn test_string_array_from_iter_values() {
315 let data = ["hello", "hello2"];
316 let array1 = StringArray::from_iter_values(data.iter());
317
318 assert_eq!(array1.value(0), "hello");
319 assert_eq!(array1.value(1), "hello2");
320
321 let data2 = ["goodbye".to_string(), "goodbye2".to_string()];
323 let array2 = StringArray::from_iter_values(data2.iter());
324
325 assert_eq!(array2.value(0), "goodbye");
326 assert_eq!(array2.value(1), "goodbye2");
327 }
328
329 #[test]
330 fn test_string_array_from_unbound_iter() {
331 let string_iter = (0..)
333 .scan(0usize, |pos, i| {
334 if *pos < 10 {
335 *pos += 1;
336 Some(Some(format!("value {i}")))
337 } else {
338 None
340 }
341 })
342 .take(100);
344
345 let (_, upper_size_bound) = string_iter.size_hint();
346 assert_eq!(upper_size_bound, Some(100));
348 let string_array: StringArray = string_iter.collect();
349 assert_eq!(string_array.len(), 10);
351 }
352
353 #[test]
354 fn test_string_array_all_null() {
355 let data: Vec<Option<&str>> = vec![None];
356 let array = StringArray::from(data);
357 array
358 .into_data()
359 .validate_full()
360 .expect("All null array has valid array data");
361 }
362
363 #[test]
364 fn test_large_string_array_all_null() {
365 let data: Vec<Option<&str>> = vec![None];
366 let array = LargeStringArray::from(data);
367 array
368 .into_data()
369 .validate_full()
370 .expect("All null array has valid array data");
371 }
372
373 fn _test_generic_string_array_from_list_array<O: OffsetSizeTrait>() {
374 let values = b"HelloArrowAndParquet";
375 let child_data = ArrayData::builder(DataType::UInt8)
377 .len(15)
378 .offset(5)
379 .add_buffer(Buffer::from(values))
380 .build()
381 .unwrap();
382
383 let offsets = [0, 5, 8, 15].map(|n| O::from_usize(n).unwrap());
384 let null_buffer = Buffer::from_slice_ref([0b101]);
385 let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
386 Field::new_list_field(DataType::UInt8, false),
387 ));
388
389 let array_data = ArrayData::builder(data_type)
391 .len(2)
392 .offset(1)
393 .add_buffer(Buffer::from_slice_ref(offsets))
394 .null_bit_buffer(Some(null_buffer))
395 .add_child_data(child_data)
396 .build()
397 .unwrap();
398 let list_array = GenericListArray::<O>::from(array_data);
399 let string_array = GenericStringArray::<O>::from(list_array);
400
401 assert_eq!(2, string_array.len());
402 assert_eq!(1, string_array.null_count());
403 assert!(string_array.is_null(0));
404 assert!(string_array.is_valid(1));
405 assert_eq!("Parquet", string_array.value(1));
406 }
407
408 #[test]
409 fn test_string_array_from_list_array() {
410 _test_generic_string_array_from_list_array::<i32>();
411 }
412
413 #[test]
414 fn test_large_string_array_from_list_array() {
415 _test_generic_string_array_from_list_array::<i64>();
416 }
417
418 fn _test_generic_string_array_from_list_array_with_child_nulls_failed<O: OffsetSizeTrait>() {
419 let values = b"HelloArrow";
420 let child_data = ArrayData::builder(DataType::UInt8)
421 .len(10)
422 .add_buffer(Buffer::from(values))
423 .null_bit_buffer(Some(Buffer::from_slice_ref([0b1010101010])))
424 .build()
425 .unwrap();
426
427 let offsets = [0, 5, 10].map(|n| O::from_usize(n).unwrap());
428
429 let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
432 Field::new_list_field(DataType::UInt8, true),
433 ));
434
435 let array_data = ArrayData::builder(data_type)
437 .len(2)
438 .add_buffer(Buffer::from_slice_ref(offsets))
439 .add_child_data(child_data)
440 .build()
441 .unwrap();
442 let list_array = GenericListArray::<O>::from(array_data);
443 drop(GenericStringArray::<O>::from(list_array));
444 }
445
446 #[test]
447 #[should_panic(expected = "The child array cannot contain null values.")]
448 fn test_string_array_from_list_array_with_child_nulls_failed() {
449 _test_generic_string_array_from_list_array_with_child_nulls_failed::<i32>();
450 }
451
452 #[test]
453 #[should_panic(expected = "The child array cannot contain null values.")]
454 fn test_large_string_array_from_list_array_with_child_nulls_failed() {
455 _test_generic_string_array_from_list_array_with_child_nulls_failed::<i64>();
456 }
457
458 fn _test_generic_string_array_from_list_array_wrong_type<O: OffsetSizeTrait>() {
459 let values = b"HelloArrow";
460 let child_data = ArrayData::builder(DataType::UInt16)
461 .len(5)
462 .add_buffer(Buffer::from(values))
463 .build()
464 .unwrap();
465
466 let offsets = [0, 2, 3].map(|n| O::from_usize(n).unwrap());
467 let data_type = GenericListArray::<O>::DATA_TYPE_CONSTRUCTOR(Arc::new(
468 Field::new_list_field(DataType::UInt16, false),
469 ));
470
471 let array_data = ArrayData::builder(data_type)
472 .len(2)
473 .add_buffer(Buffer::from_slice_ref(offsets))
474 .add_child_data(child_data)
475 .build()
476 .unwrap();
477 let list_array = GenericListArray::<O>::from(array_data);
478 drop(GenericStringArray::<O>::from(list_array));
479 }
480
481 #[test]
482 #[should_panic(
483 expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
484 )]
485 fn test_string_array_from_list_array_wrong_type() {
486 _test_generic_string_array_from_list_array_wrong_type::<i32>();
487 }
488
489 #[test]
490 #[should_panic(
491 expected = "BinaryArray can only be created from List<u8> arrays, mismatched data types."
492 )]
493 fn test_large_string_array_from_list_array_wrong_type() {
494 _test_generic_string_array_from_list_array_wrong_type::<i64>();
495 }
496
497 #[test]
498 #[should_panic(
499 expected = "Encountered non UTF-8 data: invalid utf-8 sequence of 1 bytes from index 0"
500 )]
501 fn test_list_array_utf8_validation() {
502 let mut builder = ListBuilder::new(PrimitiveBuilder::<UInt8Type>::new());
503 builder.values().append_value(0xFF);
504 builder.append(true);
505 let list = builder.finish();
506 let _ = StringArray::from(list);
507 }
508
509 #[test]
510 fn test_empty_offsets() {
511 let string = StringArray::from(
512 ArrayData::builder(DataType::Utf8)
513 .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
514 .build()
515 .unwrap(),
516 );
517 assert_eq!(string.len(), 0);
518 assert_eq!(string.value_offsets(), &[0]);
519
520 let string = LargeStringArray::from(
521 ArrayData::builder(DataType::LargeUtf8)
522 .buffers(vec![Buffer::from(&[]), Buffer::from(&[])])
523 .build()
524 .unwrap(),
525 );
526 assert_eq!(string.len(), 0);
527 assert_eq!(string.value_offsets(), &[0]);
528 }
529
530 #[test]
531 fn test_into_builder() {
532 let array: StringArray = vec!["hello", "arrow"].into();
533
534 let mut builder = array.into_builder().unwrap();
536
537 builder.append_value("rust");
538
539 let expected: StringArray = vec!["hello", "arrow", "rust"].into();
540 let array = builder.finish();
541 assert_eq!(expected, array);
542 }
543
544 #[test]
545 fn test_into_builder_err() {
546 let array: StringArray = vec!["hello", "arrow"].into();
547
548 let shared_array = array.clone();
550
551 let err_return = array.into_builder().unwrap_err();
552 assert_eq!(&err_return, &shared_array);
553 }
554}