// archive_to_parquet/visitor.rs

use crate::batch::OutputBatch;
use crate::channel::RecordBatchSender;
use crate::progress::Counters;
use crate::ConvertionOptions;
use anyreader_walker::{AnyWalker, ArchiveStack, EntryDetails, FileEntry, FormatKind};
use std::io::Read;
use std::path::PathBuf;
use tracing::{debug, error, trace};

/// Walker state used while traversing one input and turning its entries into
/// record batches that are pushed onto a channel.
#[derive(Debug)]
pub struct Visitor {
    /// Path of the input being processed; passed along with every record that
    /// is added to `batch`.
    input_path: PathBuf,
    /// Accumulates records until they are turned into a record batch and sent.
    batch: OutputBatch,
    /// Sending side used to deliver finished batches (or a walk error).
    channel: RecordBatchSender,
    /// Details of the archives currently being visited; pushed when an archive
    /// visit begins and popped when it ends.
    stack: ArchiveStack,
    /// Progress counters updated as entries are read, archives are finished
    /// and batches are sent.
    counters: Counters,
}

impl Visitor {
    /// Builds a visitor for the input located at `path`.
    ///
    /// * `path` - converted into the `PathBuf` stored as the input path.
    /// * `channel` - sender that finished record batches are pushed onto.
    /// * `options` - forwarded to [`OutputBatch::new_with_options`].
    ///
    /// The archive stack and the counters start from their `Default` values.
    pub(crate) fn new(
        path: impl Into<PathBuf>,
        channel: RecordBatchSender,
        options: ConvertionOptions,
    ) -> Self {
        // Bind the computed fields first (same evaluation order as the fields
        // were originally initialised in) so the struct literal stays terse.
        let input_path = path.into();
        let batch = OutputBatch::new_with_options(options);
        let stack = ArchiveStack::default();
        let counters = Counters::default();

        Self {
            input_path,
            batch,
            channel,
            stack,
            counters,
        }
    }
}

impl Visitor {
    /// Returns a shared reference to the progress counters.
    pub fn counters(&self) -> &Counters {
        &self.counters
    }

    /// Turns the accumulated records into a record batch and sends it on the
    /// channel.
    ///
    /// The sent-batch counter is bumped before the send is attempted.
    ///
    /// # Errors
    ///
    /// Returns an `ErrorKind::Other` I/O error wrapping the failure if the
    /// record batch cannot be created, or the (converted) channel error if
    /// sending fails.
    fn send_batch(&mut self) -> std::io::Result<()> {
        let record_batch = match self.batch.create_record_batch_and_reset() {
            Ok(created) => created,
            Err(cause) => {
                return Err(std::io::Error::new(std::io::ErrorKind::Other, cause));
            }
        };
        trace!("Sending batch with {} rows", record_batch.num_rows());
        self.counters.sent_batch();
        self.channel.send_batch(Ok(record_batch))?;
        Ok(())
    }

    /// Walks `entry` and then flushes whatever records are still pending.
    ///
    /// `walk` is not defined in this file; presumably it is provided by the
    /// `AnyWalker` trait implemented for this type (verify against the crate).
    ///
    /// # Errors
    ///
    /// Propagates any error from the walk itself or from the final flush.
    fn try_walk(&mut self, entry: FileEntry<impl Read>) -> std::io::Result<()> {
        self.walk(entry)?;
        if self.batch.is_empty() {
            // Nothing left over after the walk, so there is no final batch.
            return Ok(());
        }
        self.send_batch()
    }

    /// Walks `entry`, reporting any failure instead of returning it.
    ///
    /// On error the failure is logged together with the current nested path
    /// and then forwarded on the channel as an `Err` item.
    pub fn start_walking(&mut self, entry: FileEntry<impl Read>) {
        debug!("Starting to walk: {}", entry.details());
        match self.try_walk(entry) {
            Ok(()) => {}
            Err(e) => {
                error!("Error while walking {:?}: {}", self.stack.nested_path(), e);
                // A failed send means the channel is disconnected; there is
                // nobody left to notify, so the result is deliberately dropped.
                let _ = self.channel.send_batch(Err(e));
            }
        }
    }
}

impl AnyWalker for Visitor {
    /// Adds `entry` to the pending batch and flushes the batch when it reports
    /// that it should be flushed.
    ///
    /// The value returned by `add_record` is fed to the entry counter.
    ///
    /// # Errors
    ///
    /// Returns the error from sending the batch, if a flush was needed and
    /// failed.
    fn visit_file_entry(&mut self, entry: &mut FileEntry<impl Read>) -> std::io::Result<()> {
        trace!(
            "Processing file: {}. Current source: {}",
            entry.details(),
            self.stack.nested_path().display()
        );

        let recorded_size =
            self.batch
                .add_record(&self.input_path, self.stack.nested_path(), entry);
        self.counters.read_entry(recorded_size);

        if !self.batch.should_flush() {
            return Ok(());
        }
        self.send_batch()
    }

    /// Called before an archive is visited; records it on the archive stack.
    ///
    /// Returns `Ok(false)` without touching the stack when the archive is a
    /// zip whose details equal the entry currently on top of the stack (a
    /// "quine" zip), and `Ok(true)` otherwise. Presumably the returned flag
    /// tells the walker whether to descend into the archive; confirm against
    /// the `AnyWalker` documentation.
    fn begin_visit_archive(
        &mut self,
        details: &EntryDetails,
        format: FormatKind,
    ) -> std::io::Result<bool> {
        // Detect quine zip files: a zip with the same details as the archive
        // we are already inside.
        let is_quine_zip = format.is_zip() && Some(details) == self.stack.last_entry();

        if !is_quine_zip {
            self.stack.push_details(details.clone());
            debug!(
                "Processing archive: {details} - {format}. Current source: {:?}",
                self.stack.nested_path()
            );
            return Ok(true);
        }

        debug!(
            "Skipping archive: quine zip. details: {details}. Current source: {:?}",
            self.stack.nested_path()
        );
        Ok(false)
    }

    /// Called after an archive has been visited; bumps the archive counter and
    /// pops the archive's details off the stack.
    ///
    /// Both arguments are unused: the popped stack entry is what gets logged.
    fn end_visit_archive(
        &mut self,
        _details: EntryDetails,
        _format: FormatKind,
    ) -> std::io::Result<()> {
        self.counters.read_archive();

        let finished = self.stack.pop_details();
        debug!(
            "Finished processing archive: {finished:?}. Current source: {:?}",
            self.stack.nested_path()
        );

        Ok(())
    }
}