rc_zip/fsm/
archive.rs

1use super::FsmResult;
2use crate::{
3    encoding::Encoding,
4    error::{Error, FormatError},
5    parse::{
6        Archive, CentralDirectoryFileHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator,
7        EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Entry, Located,
8    },
9};
10
11use ownable::traits::IntoOwned;
12use tracing::trace;
13use winnow::{
14    error::ErrMode,
15    stream::{AsBytes, Offset},
16    Parser, Partial,
17};
18
19/// [ArchiveFsm] parses a valid zip archive into an [Archive]. In particular, this struct finds
20/// an end of central directory record, parses the entire central directory, detects text encoding,
21/// and normalizes metadata.
22///
23/// The loop is as follows:
24///
25///   * Call [Self::wants_read] to check if more data is needed.
26///   * If it returns `Some(offset)`, read the file at that offset
27///     into [Self::space] and then call [Self::fill] with
28///     the number of bytes read.
29///   * Call [Self::process] to process the data.
30///   * If it returns [FsmResult::Continue], loop back to the first step.
31///
32/// Look at the integration tests or
33/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for concrete examples.
34pub struct ArchiveFsm {
35    /// Size of the entire zip file
36    size: u64,
37
38    /// Current stage: finding the eocd, reading the eocd, reading the eocd64
39    /// locator, reading the eocd64, or reading the central directory
40    state: State,
41
42    /// Buffer for reading data from the file
43    buffer: Buffer,
44}
45
46#[derive(Default)]
47enum State {
48    /// Finding and reading the end of central directory record
49    ReadEocd {
50        /// size of the haystack in which we're looking for the end of central
51        /// directory record.
52        /// this may be less than 65KiB if the file is smaller than that.
53        haystack_size: u64,
54    },
55
56    /// Reading the zip64 end of central directory record.
57    ReadEocd64Locator {
58        eocdr: Located<EndOfCentralDirectoryRecord<'static>>,
59    },
60
61    /// Reading the zip64 end of central directory record.
62    ReadEocd64 {
63        eocdr64_offset: u64,
64        eocdr: Located<EndOfCentralDirectoryRecord<'static>>,
65    },
66
67    /// Reading all headers from the central directory
68    ReadCentralDirectory {
69        eocd: EndOfCentralDirectory<'static>,
70        directory_headers: Vec<CentralDirectoryFileHeader<'static>>,
71    },
72
73    #[default]
74    Transitioning,
75}
76
77impl ArchiveFsm {
78    /// This should be > 65KiB, because the section at the end of the
79    /// file that we check for end of central directory record is 65KiB.
80    const DEFAULT_BUFFER_SIZE: usize = 256 * 1024;
81
82    /// Create a new archive reader with a specified file size.
83    pub fn new(size: u64) -> Self {
84        let haystack_size: u64 = 65 * 1024;
85        let haystack_size = if size < haystack_size {
86            size
87        } else {
88            haystack_size
89        };
90
91        Self {
92            size,
93            buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE),
94            state: State::ReadEocd { haystack_size },
95        }
96    }
97
98    /// If this returns `Some(offset)`, the caller should read data from
99    /// `offset` into [Self::space] — without forgetting to call
100    /// [Self::fill] with the number of bytes written.
101    pub fn wants_read(&self) -> Option<u64> {
102        use State as S;
103        match self.state {
104            S::ReadEocd { haystack_size } => {
105                Some(self.buffer.read_offset(self.size - haystack_size))
106            }
107            S::ReadEocd64Locator { ref eocdr } => {
108                let length = EndOfCentralDirectory64Locator::LENGTH as u64;
109                Some(self.buffer.read_offset(eocdr.offset - length))
110            }
111            S::ReadEocd64 { eocdr64_offset, .. } => Some(self.buffer.read_offset(eocdr64_offset)),
112            S::ReadCentralDirectory { ref eocd, .. } => {
113                Some(self.buffer.read_offset(eocd.directory_offset()))
114            }
115            S::Transitioning => unreachable!(),
116        }
117    }
118
119    /// Process buffered data
120    ///
121    /// Errors returned from this function are caused by invalid zip archives,
122    /// unsupported format quirks, or implementation bugs - never I/O errors.
123    ///
124    /// A result of [FsmResult::Continue] gives back ownership of the state
125    /// machine and indicates the I/O loop should continue, starting with
126    /// [Self::wants_read].
127    ///
128    /// A result of [FsmResult::Done] consumes the state machine and returns
129    /// a fully-parsed [Archive].
130    pub fn process(mut self) -> Result<FsmResult<Self, Archive>, Error> {
131        use State as S;
132        match self.state {
133            S::ReadEocd { haystack_size } => {
134                if self.buffer.read_bytes() < haystack_size {
135                    // read the entire haystack before we can continue
136                    return Ok(FsmResult::Continue(self));
137                }
138
139                let res = {
140                    let haystack = &self.buffer.data()[..haystack_size as usize];
141                    EndOfCentralDirectoryRecord::find_in_block(haystack)
142                };
143                match res {
144                    None => Err(FormatError::DirectoryEndSignatureNotFound.into()),
145                    Some(eocdr) => {
146                        trace!(
147                            ?eocdr,
148                            size = self.size,
149                            "ReadEocd | found end of central directory record"
150                        );
151                        let mut eocdr = eocdr.into_owned();
152                        self.buffer.reset();
153                        eocdr.offset += self.size - haystack_size;
154
155                        if eocdr.offset < EndOfCentralDirectory64Locator::LENGTH as u64 {
156                            // no room for an EOCD64 locator, definitely not a zip64 file
157                            trace!(
158                                offset = eocdr.offset,
159                                eocd64locator_length = EndOfCentralDirectory64Locator::LENGTH,
160                                "no room for an EOCD64 locator, definitely not a zip64 file"
161                            );
162                            transition!(self.state => (S::ReadEocd { .. }) {
163                                S::ReadCentralDirectory {
164                                    eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?,
165                                    directory_headers: vec![],
166                                }
167                            });
168                            Ok(FsmResult::Continue(self))
169                        } else {
170                            trace!("ReadEocd | transition to ReadEocd64Locator");
171                            self.buffer.reset();
172                            transition!(self.state => (S::ReadEocd { .. }) {
173                                S::ReadEocd64Locator { eocdr }
174                            });
175                            Ok(FsmResult::Continue(self))
176                        }
177                    }
178                }
179            }
180            S::ReadEocd64Locator { .. } => {
181                let input = Partial::new(self.buffer.data());
182                match EndOfCentralDirectory64Locator::parser.parse_peek(input) {
183                    Err(ErrMode::Incomplete(_)) => {
184                        // need more data
185                        Ok(FsmResult::Continue(self))
186                    }
187                    Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => {
188                        // we don't have a zip64 end of central directory locator - that's ok!
189                        trace!("ReadEocd64Locator | no zip64 end of central directory locator");
190                        trace!(
191                            "ReadEocd64Locator | data we got: {:02x?}",
192                            self.buffer.data()
193                        );
194                        self.buffer.reset();
195                        transition!(self.state => (S::ReadEocd64Locator { eocdr }) {
196                            S::ReadCentralDirectory {
197                                eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?,
198                                directory_headers: vec![],
199                            }
200                        });
201                        Ok(FsmResult::Continue(self))
202                    }
203                    Ok((_, locator)) => {
204                        trace!(
205                            ?locator,
206                            "ReadEocd64Locator | found zip64 end of central directory locator"
207                        );
208                        self.buffer.reset();
209                        transition!(self.state => (S::ReadEocd64Locator { eocdr }) {
210                            S::ReadEocd64 {
211                                eocdr64_offset: locator.directory_offset,
212                                eocdr,
213                            }
214                        });
215                        Ok(FsmResult::Continue(self))
216                    }
217                }
218            }
219            S::ReadEocd64 { .. } => {
220                let input = Partial::new(self.buffer.data());
221                match EndOfCentralDirectory64Record::parser.parse_peek(input) {
222                    Err(ErrMode::Incomplete(_)) => {
223                        // need more data
224                        Ok(FsmResult::Continue(self))
225                    }
226                    Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => {
227                        // at this point, we really expected to have a zip64 end
228                        // of central directory record, so, we want to propagate
229                        // that error.
230                        Err(FormatError::Directory64EndRecordInvalid.into())
231                    }
232                    Ok((_, eocdr64)) => {
233                        self.buffer.reset();
234                        transition!(self.state => (S::ReadEocd64 { eocdr, eocdr64_offset }) {
235                            S::ReadCentralDirectory {
236                                eocd: EndOfCentralDirectory::new(self.size, eocdr, Some(Located {
237                                    offset: eocdr64_offset,
238                                    inner: eocdr64
239                                }))?,
240                                directory_headers: vec![],
241                            }
242                        });
243                        Ok(FsmResult::Continue(self))
244                    }
245                }
246            }
247            S::ReadCentralDirectory {
248                ref eocd,
249                ref mut directory_headers,
250            } => {
251                trace!(
252                    "ReadCentralDirectory | process(), available: {}",
253                    self.buffer.available_data()
254                );
255                let mut valid_consumed = 0;
256                let mut input = Partial::new(self.buffer.data());
257                trace!(
258                    initial_offset = input.as_bytes().offset_from(&self.buffer.data()),
259                    initial_len = input.len(),
260                    "initial offset & len"
261                );
262                'read_headers: while !input.is_empty() {
263                    match CentralDirectoryFileHeader::parser.parse_next(&mut input) {
264                        Ok(dh) => {
265                            trace!(
266                                input_empty_now = input.is_empty(),
267                                offset = input.as_bytes().offset_from(&self.buffer.data()),
268                                len = input.len(),
269                                "ReadCentralDirectory | parsed directory header"
270                            );
271                            valid_consumed = input.as_bytes().offset_from(&self.buffer.data());
272                            directory_headers.push(dh.into_owned());
273                        }
274                        Err(ErrMode::Incomplete(_needed)) => {
275                            // need more data to read the full header
276                            trace!("ReadCentralDirectory | incomplete!");
277                            break 'read_headers;
278                        }
279                        Err(ErrMode::Backtrack(err)) | Err(ErrMode::Cut(err)) => {
280                            // this is the normal end condition when reading
281                            // the central directory (due to 65536-entries non-zip64 files)
282                            // let's just check a few numbers first.
283
284                            // only compare 16 bits here
285                            let expected_records = directory_headers.len() as u16;
286                            let actual_records = eocd.directory_records() as u16;
287
288                            if expected_records != actual_records {
289                                tracing::trace!(
290                                    "error while reading central records: we read {} records, but EOCD announced {}. the last failed with: {err:?} (display: {err}). at that point, input had length {}",
291                                    expected_records,
292                                    actual_records,
293                                    input.len()
294                                );
295
296                                // if we read the wrong number of directory entries,
297                                // error out.
298                                return Err(FormatError::InvalidCentralRecord {
299                                    expected: expected_records,
300                                    actual: actual_records,
301                                }
302                                .into());
303                            }
304
305                            let mut detectorng = chardetng::EncodingDetector::new();
306                            let mut all_utf8 = true;
307                            let mut had_suspicious_chars_for_cp437 = false;
308
309                            {
310                                let max_feed: usize = 4096;
311                                let mut total_fed: usize = 0;
312                                let mut feed = |slice: &[u8]| {
313                                    detectorng.feed(slice, false);
314                                    for b in slice {
315                                        if (0xB0..=0xDF).contains(b) {
316                                            // those are, like, box drawing characters
317                                            had_suspicious_chars_for_cp437 = true;
318                                        }
319                                    }
320
321                                    total_fed += slice.len();
322                                    total_fed < max_feed
323                                };
324
325                                'recognize_encoding: for fh in
326                                    directory_headers.iter().filter(|fh| fh.is_non_utf8())
327                                {
328                                    all_utf8 = false;
329                                    if !feed(&fh.name[..]) || !feed(&fh.comment[..]) {
330                                        break 'recognize_encoding;
331                                    }
332                                }
333                            }
334
335                            let encoding = {
336                                if all_utf8 {
337                                    Encoding::Utf8
338                                } else {
339                                    let encoding = detectorng.guess(None, true);
340                                    if encoding == encoding_rs::SHIFT_JIS {
341                                        // well hold on, sometimes Codepage 437 is detected as
342                                        // Shift-JIS by chardetng. If we have any characters
343                                        // that aren't valid DOS file names, then okay it's probably
344                                        // Shift-JIS. Otherwise, assume it's CP437.
345                                        if had_suspicious_chars_for_cp437 {
346                                            Encoding::ShiftJis
347                                        } else {
348                                            Encoding::Cp437
349                                        }
350                                    } else if encoding == encoding_rs::UTF_8 {
351                                        Encoding::Utf8
352                                    } else {
353                                        Encoding::Cp437
354                                    }
355                                }
356                            };
357
358                            let global_offset = eocd.global_offset as u64;
359                            let entries: Result<Vec<Entry>, Error> = directory_headers
360                                .iter()
361                                .map(|x| x.as_entry(encoding, global_offset))
362                                .collect();
363                            let entries = entries?;
364
365                            let comment = encoding.decode(eocd.comment())?;
366
367                            return Ok(FsmResult::Done(Archive {
368                                size: self.size,
369                                comment,
370                                entries,
371                                encoding,
372                            }));
373                        }
374                    }
375                }
376                let consumed = valid_consumed;
377                tracing::trace!(%consumed, "ReadCentralDirectory total consumed");
378                self.buffer.consume(consumed);
379
380                // need more data
381                Ok(FsmResult::Continue(self))
382            }
383            S::Transitioning => unreachable!(),
384        }
385    }
386
387    /// Returns a mutable slice with all the available space to write to.
388    ///
389    /// After writing to this, call [Self::fill] with the number of bytes written.
390    #[inline]
391    pub fn space(&mut self) -> &mut [u8] {
392        if self.buffer.available_space() == 0 {
393            self.buffer.shift();
394        }
395        self.buffer.space()
396    }
397
398    /// After having written data to [Self::space], call this to indicate how
399    /// many bytes were written.
400    #[inline]
401    pub fn fill(&mut self, count: usize) -> usize {
402        self.buffer.fill(count)
403    }
404}
405
406/// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since
407/// initialization or the last reset.
408pub(crate) struct Buffer {
409    pub(crate) buffer: oval::Buffer,
410    pub(crate) read_bytes: u64,
411}
412
413impl Buffer {
414    /// creates a new buffer with the specified capacity
415    pub(crate) fn with_capacity(size: usize) -> Self {
416        Self {
417            buffer: oval::Buffer::with_capacity(size),
418            read_bytes: 0,
419        }
420    }
421
422    /// resets the buffer (so that data() returns an empty slice,
423    /// and space() returns the full capacity), along with th e
424    /// read bytes counter.
425    pub(crate) fn reset(&mut self) {
426        self.read_bytes = 0;
427        self.buffer.reset();
428    }
429
430    /// returns the number of read bytes since the last reset
431    #[inline]
432    pub(crate) fn read_bytes(&self) -> u64 {
433        self.read_bytes
434    }
435
436    /// returns a slice with all the available data
437    #[inline]
438    pub(crate) fn data(&self) -> &[u8] {
439        self.buffer.data()
440    }
441
442    /// returns how much data can be read from the buffer
443    #[inline]
444    pub(crate) fn available_data(&self) -> usize {
445        self.buffer.available_data()
446    }
447
448    /// returns how much free space is available to write to
449    #[inline]
450    pub fn available_space(&self) -> usize {
451        self.buffer.available_space()
452    }
453
454    /// returns a mutable slice with all the available space to
455    /// write to
456    #[inline]
457    pub(crate) fn space(&mut self) -> &mut [u8] {
458        self.buffer.space()
459    }
460
461    /// moves the data at the beginning of the buffer
462    ///
463    /// if the position was more than 0, it is now 0
464    #[inline]
465    pub fn shift(&mut self) {
466        self.buffer.shift()
467    }
468
469    /// after having written data to the buffer, use this function
470    /// to indicate how many bytes were written
471    ///
472    /// if there is not enough available space, this function can call
473    /// `shift()` to move the remaining data to the beginning of the
474    /// buffer
475    #[inline]
476    pub(crate) fn fill(&mut self, count: usize) -> usize {
477        let n = self.buffer.fill(count);
478        self.read_bytes += n as u64;
479        n
480    }
481
482    /// advances the position tracker
483    ///
484    /// if the position gets past the buffer's half,
485    /// this will call `shift()` to move the remaining data
486    /// to the beginning of the buffer
487    #[inline]
488    pub(crate) fn consume(&mut self, size: usize) {
489        self.buffer.consume(size);
490    }
491
492    /// adds already-read bytes to the given offset. this is useful in
493    /// [ArchiveFsm], when we read records at fixed offsets within the file,
494    /// that possibly take several reads to fully parse.
495    pub(crate) fn read_offset(&self, offset: u64) -> u64 {
496        self.read_bytes + offset
497    }
498}