// polars_json/ndjson/file.rs
use std::io::BufRead;
use std::num::NonZeroUsize;
use arrow::datatypes::ArrowDataType;
use fallible_streaming_iterator::FallibleStreamingIterator;
use indexmap::IndexSet;
use polars_error::*;
use polars_utils::aliases::{PlIndexSet, PlRandomState};
use simd_json::BorrowedValue;
/// Fills `rows` with the next non-blank lines from `reader`.
///
/// Lines that contain only whitespace are skipped. Each stored line keeps
/// whatever line terminator `read_line` produced. Reading stops when the
/// reader is exhausted, when every slot in `rows` has been filled, or when
/// `limit` rows have been stored, whichever comes first.
///
/// Returns the number of leading slots in `rows` that now hold a line.
///
/// # Errors
///
/// Any I/O error from `read_line` is reported as a `ComputeError` whose
/// message is suffixed with the number of rows stored so far in this call.
fn read_rows<R: BufRead>(reader: &mut R, rows: &mut [String], limit: usize) -> PolarsResult<usize> {
    let mut filled = 0;
    // `take(limit)` caps the number of slots visited, so a limit of zero
    // touches neither the reader nor the buffers.
    for slot in rows.iter_mut().take(limit) {
        // Keep reading into this slot until it holds a line with content,
        // or the reader has nothing left to give.
        let reached_eof = loop {
            slot.clear();
            reader.read_line(slot).map_err(|e| {
                PolarsError::ComputeError(format!("{e} at line {filled}").into())
            })?;
            if slot.is_empty() {
                break true;
            }
            if !slot.trim().is_empty() {
                break false;
            }
        };
        if reached_eof {
            break;
        }
        filled += 1;
    }
    Ok(filled)
}
/// Reads NDJSON lines from a [`BufRead`] source in batches.
///
/// The caller supplies the line buffers up front; each `advance` refills
/// them via `read_rows` and exposes the filled prefix through `get`.
pub struct FileReader<R: BufRead> {
// Underlying line source.
reader: R,
// Reusable line buffers; their count bounds the size of one batch.
rows: Vec<String>,
// Number of leading entries of `rows` filled by the most recent `advance`.
number_of_rows: usize,
// Rows still allowed to be read before the limit is exhausted.
remaining: usize,
}
impl<R: BufRead> FileReader<R> {
    /// Builds a reader that pulls lines from `reader` into the buffers in `rows`.
    ///
    /// `rows` is reused for every batch. `limit` caps the total number of
    /// rows read over the reader's lifetime; `None` is stored as
    /// `usize::MAX`.
    pub fn new(reader: R, rows: Vec<String>, limit: Option<usize>) -> Self {
        let remaining = limit.unwrap_or(usize::MAX);
        Self {
            reader,
            rows,
            number_of_rows: 0,
            remaining,
        }
    }
}
impl<R: BufRead> FallibleStreamingIterator for FileReader<R> {
    type Error = PolarsError;
    type Item = [String];

    /// Reads the next batch of lines and charges it against the remaining
    /// row budget.
    fn advance(&mut self) -> PolarsResult<()> {
        let filled = read_rows(&mut self.reader, &mut self.rows, self.remaining)?;
        self.number_of_rows = filled;
        // `read_rows` stores at most `self.remaining` rows, so this cannot underflow.
        self.remaining -= filled;
        Ok(())
    }

    /// Returns the rows filled by the last `advance`, or `None` when that
    /// batch was empty.
    fn get(&self) -> Option<&Self::Item> {
        match self.number_of_rows {
            0 => None,
            filled => Some(&self.rows[..filled]),
        }
    }
}
/// Parses `val` as a single JSON value that borrows from `scratch`.
///
/// `val` is first copied into `scratch` (replacing its previous contents),
/// because the parser is handed a mutable buffer. A parse failure is
/// reported as a `ComputeError` carrying the parser's message.
fn parse_value<'a>(scratch: &'a mut Vec<u8>, val: &[u8]) -> PolarsResult<BorrowedValue<'a>> {
    scratch.clear();
    scratch.extend_from_slice(val);
    match simd_json::to_borrowed_value(scratch) {
        Ok(parsed) => Ok(parsed),
        Err(err) => Err(PolarsError::ComputeError(err.to_string().into())),
    }
}
/// Infers the data type of each NDJSON row in `reader` and returns the
/// distinct types in the order they were first seen.
///
/// At most `number_of_rows` rows are inspected when a limit is given;
/// otherwise the reader is consumed to the end.
///
/// # Errors
///
/// Returns a `ComputeError` if the reader is empty, or if a row fails to
/// parse. I/O errors and inference errors are propagated.
pub fn iter_unique_dtypes<R: std::io::BufRead>(
    reader: &mut R,
    number_of_rows: Option<NonZeroUsize>,
) -> PolarsResult<impl Iterator<Item = ArrowDataType>> {
    let source_is_empty = reader.fill_buf()?.is_empty();
    if source_is_empty {
        return Err(PolarsError::ComputeError(
            "Cannot infer NDJSON types on empty reader because empty string is not a valid JSON value".into(),
        ));
    }

    // A single line buffer: rows are inferred one at a time.
    let row_limit = number_of_rows.map(NonZeroUsize::get);
    let mut lines = FileReader::new(reader, vec![String::new()], row_limit);

    let mut seen = PlIndexSet::default();
    let mut scratch = vec![];
    while let Some(batch) = lines.next()? {
        // `get` only yields non-empty batches, so index 0 is always present.
        let json = parse_value(&mut scratch, batch[0].as_bytes())?;
        seen.insert(crate::json::infer(&json)?);
    }
    Ok(seen.into_iter())
}
/// Infers one data type that covers every JSON row yielded by `rows`.
///
/// Each row is parsed and its type inferred. Rows inferred as
/// `ArrowDataType::Null` are ignored, and the remaining distinct types are
/// combined with `coerce_dtype`.
///
/// # Errors
///
/// Fails on the first row that cannot be parsed or whose type cannot be inferred.
pub fn infer_iter<A: AsRef<str>>(rows: impl Iterator<Item = A>) -> PolarsResult<ArrowDataType> {
    let mut seen = IndexSet::<_, PlRandomState>::default();
    let mut scratch = vec![];
    for row in rows {
        let json = parse_value(&mut scratch, row.as_ref().as_bytes())?;
        let dtype = crate::json::infer(&json)?;
        // Null rows carry no type information, so they do not take part in coercion.
        if dtype == ArrowDataType::Null {
            continue;
        }
        seen.insert(dtype);
    }
    let candidates: Vec<&ArrowDataType> = seen.iter().collect();
    Ok(crate::json::infer_schema::coerce_dtype(&candidates))
}