polars_json/ndjson/
write.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
//! APIs to serialize and write to [NDJSON](http://ndjson.org/).
use std::io::Write;

use arrow::array::Array;
pub use fallible_streaming_iterator::FallibleStreamingIterator;
use polars_error::{PolarsError, PolarsResult};

use super::super::json::write::new_serializer;

/// Appends the NDJSON encoding of `array` to `buffer`.
///
/// One line is emitted per element of `array`: the bytes produced by the JSON
/// serializer for that element followed by a single `\n`. Existing contents of
/// `buffer` are kept; nothing is written for a zero-length array.
///
/// # Panics
/// Panics if the serializer yields fewer items than `array.len()`.
fn serialize(array: &dyn Array, buffer: &mut Vec<u8>) {
    // NOTE(review): `0` and `usize::MAX` are presumably an offset and a row limit that
    // together select the whole array — confirm against `new_serializer`.
    let mut rows = new_serializer(array, 0, usize::MAX);
    for _ in 0..array.len() {
        let encoded = rows.next().unwrap();
        buffer.extend_from_slice(encoded);
        buffer.push(b'\n');
    }
}

/// [`FallibleStreamingIterator`] that serializes an [`Array`] to bytes of valid NDJSON
/// where every line is an element of the array.
///
/// Arrays are pulled one at a time from the wrapped iterator and serialized into an
/// internal byte buffer that is exposed as the current item.
/// # Implementation
/// Advancing this iterator is CPU-bounded.
#[derive(Debug, Clone)]
pub struct Serializer<A, I>
where
    A: AsRef<dyn Array>,
    I: Iterator<Item = PolarsResult<A>>,
{
    /// Source of the arrays to serialize; items are fallible.
    arrays: I,
    /// Holds the serialized NDJSON bytes that the iterator currently exposes.
    buffer: Vec<u8>,
}

impl<A, I> Serializer<A, I>
where
    A: AsRef<dyn Array>,
    I: Iterator<Item = PolarsResult<A>>,
{
    /// Creates a new [`Serializer`].
    pub fn new(arrays: I, buffer: Vec<u8>) -> Self {
        Self { arrays, buffer }
    }
}

impl<A, I> FallibleStreamingIterator for Serializer<A, I>
where
    A: AsRef<dyn Array>,
    I: Iterator<Item = PolarsResult<A>>,
{
    type Item = [u8];

    type Error = PolarsError;

    /// Serializes the next array that produces output into the internal buffer.
    ///
    /// The buffer is cleared first, so the bytes exposed by a previous [`get`](Self::get)
    /// are replaced. Arrays that serialize to zero bytes (zero-length arrays) are skipped:
    /// `get` signals exhaustion through an empty buffer, so stopping on such an array
    /// would silently drop every array that follows it.
    ///
    /// # Errors
    /// Returns the first error yielded by the underlying array iterator. The buffer is
    /// empty when that happens.
    fn advance(&mut self) -> PolarsResult<()> {
        self.buffer.clear();
        for maybe_array in self.arrays.by_ref() {
            let array = maybe_array?;
            serialize(array.as_ref(), &mut self.buffer);
            // Stop at the first array that actually produced bytes; an empty buffer
            // must only ever mean "no more arrays".
            if !self.buffer.is_empty() {
                break;
            }
        }
        Ok(())
    }

    /// Returns the bytes serialized by the last [`advance`](Self::advance), or `None`
    /// when the buffer is empty (the source of arrays is exhausted).
    fn get(&self) -> Option<&Self::Item> {
        if self.buffer.is_empty() {
            None
        } else {
            Some(self.buffer.as_slice())
        }
    }
}

/// An iterator adapter that receives an implementer of [`Write`] and
/// an implementer of [`FallibleStreamingIterator`] (such as [`Serializer`])
/// and writes valid NDJSON to the writer.
///
/// The wrapped iterator must yield byte slices (`Item = [u8]`) and report failures as
/// [`PolarsError`].
/// # Implementation
/// Advancing this iterator mixes CPU-bounded tasks (serializing arrays) and IO-bounded
/// tasks (writing to the writer).
pub struct FileWriter<W, I>
where
    W: Write,
    I: FallibleStreamingIterator<Item = [u8], Error = PolarsError>,
{
    /// Destination of the serialized bytes.
    writer: W,
    /// Producer of the byte chunks to write.
    iterator: I,
}

impl<W, I> FileWriter<W, I>
where
    W: Write,
    I: FallibleStreamingIterator<Item = [u8], Error = PolarsError>,
{
    /// Creates a new [`FileWriter`].
    pub fn new(writer: W, iterator: I) -> Self {
        Self { writer, iterator }
    }

    /// Returns the inner content of this iterator
    ///
    /// There are two use-cases for this function:
    /// * to continue writing to its writer
    /// * to reuse an internal buffer of its iterator
    pub fn into_inner(self) -> (W, I) {
        (self.writer, self.iterator)
    }
}

impl<W, I> Iterator for FileWriter<W, I>
where
    W: Write,
    I: FallibleStreamingIterator<Item = [u8], Error = PolarsError>,
{
    type Item = PolarsResult<()>;

    fn next(&mut self) -> Option<Self::Item> {
        let item = self.iterator.next().transpose()?;
        Some(item.and_then(|x| {
            self.writer.write_all(x)?;
            Ok(())
        }))
    }
}