Struct arrow_json::reader::Decoder

source ·
pub struct Decoder { /* private fields */ }
Expand description

A low-level interface for reading JSON data from a byte stream

See Reader for a higher-level interface for interface with BufRead

The push-based interface facilitates integration with sources that yield arbitrarily delimited bytes ranges, such as BufRead, or a chunked byte stream received from object storage

fn read_from_json<R: BufRead>(
    mut reader: R,
    schema: SchemaRef,
) -> Result<impl Iterator<Item = Result<RecordBatch, ArrowError>>, ArrowError> {
    let mut decoder = ReaderBuilder::new(schema).build_decoder()?;
    let mut next = move || {
        loop {
            // Decoder is agnostic that buf doesn't contain whole records
            let buf = reader.fill_buf()?;
            if buf.is_empty() {
                break; // Input exhausted
            }
            let read = buf.len();
            let decoded = decoder.decode(buf)?;

            // Consume the number of bytes read
            reader.consume(decoded);
            if decoded != read {
                break; // Read batch size
            }
        }
        decoder.flush()
    };
    Ok(std::iter::from_fn(move || next().transpose()))
}

Implementations§

source§

impl Decoder

source

pub fn decode(&mut self, buf: &[u8]) -> Result<usize, ArrowError>

Read JSON objects from buf, returning the number of bytes read

This method returns once batch_size objects have been parsed since the last call to Self::flush, or buf is exhausted. Any remaining bytes should be included in the next call to Self::decode

There is no requirement that buf contains a whole number of records, facilitating integration with arbitrary byte streams, such as that yielded by BufRead

source

pub fn serialize<S: Serialize>(&mut self, rows: &[S]) -> Result<(), ArrowError>

Serialize rows to this Decoder

This provides a simple way to convert serde-compatible datastructures into arrow RecordBatch.

Custom conversion logic as described in arrow_array::builder will likely outperform this, especially where the schema is known at compile-time, however, this provides a mechanism to get something up and running quickly

It can be used with serde_json::Value

let json = vec![json!({"float": 2.3}), json!({"float": 5.7})];

let schema = Schema::new(vec![Field::new("float", DataType::Float32, true)]);
let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap();

decoder.serialize(&json).unwrap();
let batch = decoder.flush().unwrap().unwrap();
assert_eq!(batch.num_rows(), 2);
assert_eq!(batch.num_columns(), 1);
let values = batch.column(0).as_primitive::<Float32Type>().values();
assert_eq!(values, &[2.3, 5.7])

Or with arbitrary Serialize types

#[derive(Serialize)]
struct MyStruct {
    int32: i32,
    float: f32,
}

let schema = Schema::new(vec![
    Field::new("int32", DataType::Int32, false),
    Field::new("float", DataType::Float32, false),
]);

let rows = vec![
    MyStruct{ int32: 0, float: 3. },
    MyStruct{ int32: 4, float: 67.53 },
];

let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap();
decoder.serialize(&rows).unwrap();

let batch = decoder.flush().unwrap().unwrap();

// Expect batch containing two columns
let int32 = batch.column(0).as_primitive::<Int32Type>();
assert_eq!(int32.values(), &[0, 4]);

let float = batch.column(1).as_primitive::<Float32Type>();
assert_eq!(float.values(), &[3., 67.53]);

Or even complex nested types

#[derive(Serialize)]
struct MyStruct {
    int32: i32,
    list: Vec<f64>,
    nested: Vec<Option<Nested>>,
}

impl MyStruct {
    /// Returns the [`Fields`] for [`MyStruct`]
    fn fields() -> Fields {
        let nested = DataType::Struct(Nested::fields());
        Fields::from([
            Arc::new(Field::new("int32", DataType::Int32, false)),
            Arc::new(Field::new_list(
                "list",
                Field::new("element", DataType::Float64, false),
                false,
            )),
            Arc::new(Field::new_list(
                "nested",
                Field::new("element", nested, true),
                true,
            )),
        ])
    }
}

#[derive(Serialize)]
struct Nested {
    map: BTreeMap<String, Vec<String>>
}

impl Nested {
    /// Returns the [`Fields`] for [`Nested`]
    fn fields() -> Fields {
        let element = Field::new("element", DataType::Utf8, false);
        Fields::from([
            Arc::new(Field::new_map(
                "map",
                "entries",
                Field::new("key", DataType::Utf8, false),
                Field::new_list("value", element, false),
                false, // sorted
                false, // nullable
            ))
        ])
    }
}

let data = vec![
    MyStruct {
        int32: 34,
        list: vec![1., 2., 34.],
        nested: vec![
            None,
            Some(Nested {
                map: vec![
                    ("key1".to_string(), vec!["foo".to_string(), "bar".to_string()]),
                    ("key2".to_string(), vec!["baz".to_string()])
                ].into_iter().collect()
            })
        ]
    },
    MyStruct {
        int32: 56,
        list: vec![],
        nested: vec![]
    },
    MyStruct {
        int32: 24,
        list: vec![-1., 245.],
        nested: vec![None]
    }
];

let schema = Schema::new(MyStruct::fields());
let mut decoder = ReaderBuilder::new(Arc::new(schema)).build_decoder().unwrap();
decoder.serialize(&data).unwrap();
let batch = decoder.flush().unwrap().unwrap();
assert_eq!(batch.num_rows(), 3);
assert_eq!(batch.num_columns(), 3);

// Convert to StructArray to format
let s = StructArray::from(batch);
let options = FormatOptions::default().with_null("null");
let formatter = ArrayFormatter::try_new(&s, &options).unwrap();

assert_eq!(&formatter.value(0).to_string(), "{int32: 34, list: [1.0, 2.0, 34.0], nested: [null, {map: {key1: [foo, bar], key2: [baz]}}]}");
assert_eq!(&formatter.value(1).to_string(), "{int32: 56, list: [], nested: []}");
assert_eq!(&formatter.value(2).to_string(), "{int32: 24, list: [-1.0, 245.0], nested: [null]}");

Note: this ignores any batch size setting, and always decodes all rows

source

pub fn flush(&mut self) -> Result<Option<RecordBatch>, ArrowError>

Flushes the currently buffered data to a RecordBatch

Returns Ok(None) if no buffered data

Note: if called part way through decoding a record, this will return an error

Trait Implementations§

source§

impl Debug for Decoder

source§

fn fmt(&self, f: &mut Formatter<'_>) -> Result

Formats the value using the given formatter. Read more

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for Twhere T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for Twhere T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for Twhere T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for Twhere U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T, U> TryFrom<U> for Twhere U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for Twhere U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.