archive_to_parquet/
lib.rs

1#![doc = include_str!("../README.md")]
2use byte_unit::{Byte, Unit};
3use indicatif::DecimalBytes;
4pub use parquet::basic::Compression;
5use std::fmt::Display;
6use std::num::NonZeroUsize;
7
8mod batch;
9mod channel;
10mod converter;
11mod hasher;
12mod progress;
13mod sink;
14mod visitor;
15
16pub use anyreader_walker::{AnyWalker, ArchiveStack, EntryDetails, FileEntry, FormatKind};
17pub use channel::{new_record_batch_channel, ConversionCounter, RecordBatchChannel};
18pub use converter::{Converter, ProgressBarConverter, StandardConverter};
19pub use sink::{new_parquet_writer, IncludeType, ParquetSink};
20pub use visitor::*;
21
/// Options controlling a conversion run.
///
/// A positional `new` constructor is generated by `derive_new`, taking one
/// argument per field in declaration order; [`ConvertionOptions::const_default`]
/// provides a ready-made default value.
// NOTE(review): the `too_many_arguments` allow is presumably needed because the
// generated `new` takes one argument per field (nine in total) — confirm.
#[allow(clippy::too_many_arguments)]
#[derive(Debug, Clone, derive_new::new)]
pub struct ConvertionOptions {
    /// Thread count; the type guarantees it is never zero.
    pub threads: NonZeroUsize,
    /// Selects what is included, see [`IncludeType`].
    pub include: IncludeType,
    /// NOTE(review): presumably enables de-duplication of entries — confirm
    /// against the code that consumes this flag.
    pub unique: bool,
    /// Parquet compression codec (re-exported from `parquet::basic`).
    pub compression: Compression,
    /// Optional lower bound used by `get_size_range`; `None` means no lower bound.
    pub min_size: Option<Byte>,
    /// Optional upper bound used by `get_size_range`; `None` means no upper bound.
    pub max_size: Option<Byte>,
    /// NOTE(review): presumably the number of batches kept in flight — confirm.
    pub batch_count: usize,
    /// NOTE(review): presumably the target size of a single batch — confirm.
    pub batch_size: Byte,
    /// NOTE(review): presumably toggles string extraction from entry contents — confirm.
    pub extract_strings: bool,
}
35
36impl ConvertionOptions {
37    pub const fn const_default() -> Self {
38        Self {
39            threads: NonZeroUsize::new(8).unwrap(),
40            include: IncludeType::All,
41            unique: false,
42            compression: Compression::SNAPPY,
43            min_size: None,
44            max_size: None,
45            batch_count: 14,
46            // Also needs changing in the Args struct inside main.rs
47            batch_size: Byte::from_u64_with_unit(100, Unit::MB).unwrap(),
48            extract_strings: false,
49        }
50    }
51
52    #[inline(always)]
53    pub fn get_size_range(&self) -> Option<std::ops::Range<Byte>> {
54        match (self.min_size, self.max_size) {
55            (Some(min), Some(max)) => Some(min..max),
56            (None, Some(max)) => Some(Byte::from(0u64)..max),
57            (Some(min), None) => Some(min..Byte::from(u64::MAX)),
58            (None, None) => None,
59        }
60    }
61}
62
63impl Display for ConvertionOptions {
64    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
65        write!(
66            f,
67            "ConvertionOptions(threads={}, include={:?}, unique={}, compression={:?}",
68            self.threads, self.include, self.unique, self.compression
69        )?;
70        if let Some(min_size) = &self.min_size {
71            write!(f, ", min_size={}", DecimalBytes(min_size.as_u64()))?;
72        } else {
73            write!(f, ", min_size=None")?;
74        }
75
76        if let Some(max_size) = &self.max_size {
77            write!(f, ", max_size={}", DecimalBytes(max_size.as_u64()))?;
78        } else {
79            write!(f, ", size_range=None")?;
80        }
81        write!(
82            f,
83            ", batch_count={}, batch_size={:#.1})",
84            self.batch_count,
85            DecimalBytes(self.batch_size.as_u64())
86        )
87    }
88}