// polars_arrow/legacy/array/utf8.rs

use crate::array::{BinaryArray, Utf8Array};
use crate::datatypes::ArrowDataType;
use crate::legacy::trusted_len::TrustedLenPush;
use crate::offset::Offsets;

/// Copies the bytes of every item yielded by `iterator` into `values` and
/// records the cumulative byte count after each item in `offsets`.
///
/// A leading `0` is pushed onto `offsets` before any item is processed, so
/// `offsets` gains one entry more than the number of items consumed.
///
/// # Safety
/// Each per-item offset is written with `push_unchecked`, which is presumed to
/// skip the capacity check (its implementation is not visible here — confirm
/// against `TrustedLenPush`). The caller must therefore make sure that, once
/// the leading `0` has been pushed, `offsets` still has spare capacity for one
/// entry per item the iterator yields.
#[inline]
unsafe fn extend_from_trusted_len_values_iter<I, P>(
    offsets: &mut Vec<i64>,
    values: &mut Vec<u8>,
    iterator: I,
) where
    P: AsRef<[u8]>,
    I: Iterator<Item = P>,
{
    // Running total of bytes written so far; doubles as the next offset.
    let mut running_total: i64 = 0;
    offsets.push(running_total);

    for item in iterator {
        let bytes = item.as_ref();

        // Keep both buffers in lockstep: payload first, then its end offset.
        values.extend_from_slice(bytes);
        running_total += bytes.len() as i64;
        offsets.push_unchecked(running_total);
    }
}

/// Drains `iterator` into a freshly allocated pair of buffers: the offsets
/// (wrapped in [`Offsets`]) and the concatenated value bytes.
///
/// `value_capacity` is the number of bytes reserved up front for the values
/// buffer; `len` is the number of items the iterator is expected to yield and
/// sizes the offsets buffer at `len + 1` entries.
///
/// # Safety
/// The reported `len` must be correct: the offsets buffer is allocated for
/// exactly `len + 1` entries and is then filled through the unchecked helper
/// `extend_from_trusted_len_values_iter`.
#[inline]
unsafe fn fill_offsets_and_values<I, P>(
    iterator: I,
    value_capacity: usize,
    len: usize,
) -> (Offsets<i64>, Vec<u8>)
where
    P: AsRef<[u8]>,
    I: Iterator<Item = P>,
{
    // One slot per item plus the leading start offset.
    let mut offset_buf = Vec::with_capacity(len + 1);
    let mut value_buf: Vec<u8> = Vec::with_capacity(value_capacity);

    extend_from_trusted_len_values_iter(&mut offset_buf, &mut value_buf, iterator);

    let offsets = Offsets::new_unchecked(offset_buf);
    (offsets, value_buf)
}

/// Thin wrapper that lets any `AsRef<str>` value be viewed as `AsRef<[u8]>`,
/// yielding the bytes of the wrapped string.
struct StrAsBytes<S>(S);

impl<S> AsRef<[u8]> for StrAsBytes<S>
where
    S: AsRef<str>,
{
    #[inline(always)]
    fn as_ref(&self) -> &[u8] {
        let text: &str = self.0.as_ref();
        text.as_bytes()
    }
}

/// Construction of a [`Utf8Array<i64>`] from an iterator of string-like values.
pub trait Utf8FromIter {
    /// Builds a `LargeUtf8` array with no validity bitmap from `iter`.
    ///
    /// * `len` — the number of items `iter` yields. The offsets buffer is
    ///   allocated for `len + 1` entries, and at most `len` items are
    ///   consumed; any surplus items are ignored.
    /// * `size_hint` — number of bytes to reserve up front for the values
    ///   buffer. It is only a capacity hint; the buffer grows if needed.
    #[inline]
    fn from_values_iter<I, S>(iter: I, len: usize, size_hint: usize) -> Utf8Array<i64>
    where
        S: AsRef<str>,
        I: Iterator<Item = S>,
    {
        // This is a safe function, so `len` cannot be trusted blindly: capping
        // the iterator keeps the unchecked offset writes inside the `len + 1`
        // entries reserved by the helper even when `len` is under-reported.
        let iter = iter.take(len).map(StrAsBytes);
        // SAFETY: `take(len)` guarantees the iterator yields at most `len`
        // items, which is the bound the helper's offsets allocation relies on.
        let (offsets, values) = unsafe { fill_offsets_and_values(iter, size_hint, len) };
        // SAFETY: `values` is the concatenation of `str` items and every
        // offset is the running byte total at an item boundary, so each slice
        // delimited by consecutive offsets is valid UTF-8.
        unsafe {
            Utf8Array::new_unchecked(
                ArrowDataType::LargeUtf8,
                offsets.into(),
                values.into(),
                None,
            )
        }
    }
}

impl Utf8FromIter for Utf8Array<i64> {}

/// Construction of a [`BinaryArray<i64>`] from an iterator of byte-like values.
pub trait BinaryFromIter {
    /// Builds a `LargeBinary` array with no validity bitmap from `iter`.
    ///
    /// * `len` — the number of items `iter` yields. The offsets buffer is
    ///   allocated for `len + 1` entries, and at most `len` items are
    ///   consumed; any surplus items are ignored.
    /// * `value_cap` — number of bytes to reserve up front for the values
    ///   buffer. It is only a capacity hint; the buffer grows if needed.
    #[inline]
    fn from_values_iter<I, S>(iter: I, len: usize, value_cap: usize) -> BinaryArray<i64>
    where
        S: AsRef<[u8]>,
        I: Iterator<Item = S>,
    {
        // This is a safe function, so `len` cannot be trusted blindly: capping
        // the iterator keeps the unchecked offset writes inside the `len + 1`
        // entries reserved by the helper even when `len` is under-reported.
        let iter = iter.take(len);
        // SAFETY: `take(len)` guarantees the iterator yields at most `len`
        // items, which is the bound the helper's offsets allocation relies on.
        let (offsets, values) = unsafe { fill_offsets_and_values(iter, value_cap, len) };
        BinaryArray::new(
            ArrowDataType::LargeBinary,
            offsets.into(),
            values.into(),
            None,
        )
    }
}

impl BinaryFromIter for BinaryArray<i64> {}