// polars_parquet/arrow/read/schema/mod.rs
//! APIs to handle Parquet <-> Arrow schemas.
use arrow::datatypes::{ArrowSchema, TimeUnit};
mod convert;
mod metadata;
pub(crate) use convert::*;
pub use convert::{parquet_to_arrow_schema, parquet_to_arrow_schema_with_options};
pub use metadata::read_schema_from_metadata;
use polars_error::PolarsResult;
use self::metadata::parse_key_value_metadata;
pub use crate::parquet::metadata::{FileMetadata, KeyValue, SchemaDescriptor};
pub use crate::parquet::schema::types::ParquetType;
/// Options that control how an Arrow schema is inferred from a Parquet schema.
pub struct SchemaInferenceOptions {
/// The [`TimeUnit`] of the Arrow `Timestamp` type that is inferred from the Parquet INT96
/// timestamp type.
///
/// This defaults to `TimeUnit::Nanosecond`, but INT96 timestamps outside of the range of years
/// 1678-2262 will overflow when parsed as `Timestamp(TimeUnit::Nanosecond)`. Setting this to a
/// lower resolution (e.g. `TimeUnit::Millisecond`) results in a loss of precision, but supports
/// a larger range of dates without overflowing when parsing the data.
pub int96_coerce_to_timeunit: TimeUnit,
}
impl Default for SchemaInferenceOptions {
    /// Returns options in which INT96 timestamps are coerced to `TimeUnit::Nanosecond`.
    fn default() -> Self {
        Self {
            int96_coerce_to_timeunit: TimeUnit::Nanosecond,
        }
    }
}
/// Infers a [`ArrowSchema`] from parquet's [`FileMetadata`].
///
/// This first looks for the metadata key `"ARROW:schema"`; if it does not exist, it converts the
/// Parquet types declared in the file's Parquet schema to Arrow's equivalent.
///
/// # Error
/// This function errors iff the key `"ARROW:schema"` exists but is not correctly encoded,
/// indicating that the file's arrow metadata was incorrectly written.
pub fn infer_schema(file_metadata: &FileMetadata) -> PolarsResult<ArrowSchema> {
infer_schema_with_options(file_metadata, &None)
}
/// Like [`infer_schema`], but with configurable options that affect the behavior of inference.
///
/// The schema is first looked up in the file's key-value metadata. Only when none is found
/// there are the fields of the file's Parquet schema converted to Arrow, with `options`
/// forwarded to that conversion.
///
/// # Errors
/// Propagates the error returned while reading the schema from the key-value metadata.
pub fn infer_schema_with_options(
    file_metadata: &FileMetadata,
    options: &Option<SchemaInferenceOptions>,
) -> PolarsResult<ArrowSchema> {
    let mut key_values = parse_key_value_metadata(file_metadata.key_value_metadata());

    match read_schema_from_metadata(&mut key_values)? {
        // A schema stored in the metadata takes precedence over conversion.
        Some(embedded) => Ok(embedded),
        // Nothing stored: derive the Arrow schema from the Parquet fields instead.
        None => Ok(parquet_to_arrow_schema_with_options(
            file_metadata.schema().fields(),
            options,
        )),
    }
}