rc_zip/fsm/archive.rs
1use super::FsmResult;
2use crate::{
3 encoding::Encoding,
4 error::{Error, FormatError},
5 parse::{
6 Archive, CentralDirectoryFileHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator,
7 EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Entry, Located,
8 },
9};
10
11use ownable::traits::IntoOwned;
12use tracing::trace;
13use winnow::{
14 error::ErrMode,
15 stream::{AsBytes, Offset},
16 Parser, Partial,
17};
18
/// [ArchiveFsm] parses a valid zip archive into an [Archive]. In particular, this struct finds
/// an end of central directory record, parses the entire central directory, detects text encoding,
/// and normalizes metadata.
///
/// The state machine performs no I/O itself: the caller reads from the file
/// and feeds the bytes in.
///
/// The loop is as follows:
///
///   * Call [Self::wants_read] to check if more data is needed.
///   * If it returns `Some(offset)`, read the file at that offset
///     into [Self::space] and then call [Self::fill] with
///     the number of bytes read.
///   * Call [Self::process] to process the data.
///   * If it returns [FsmResult::Continue], loop back to the first step.
///   * If it returns [FsmResult::Done], the state machine has been consumed
///     and the fully-parsed [Archive] is available.
///
/// Look at the integration tests or
/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) for concrete examples.
pub struct ArchiveFsm {
    /// Size of the entire zip file, in bytes, as given to [Self::new]
    size: u64,

    /// Current stage: finding the eocd, reading the eocd, reading the eocd64
    /// locator, reading the eocd64, or reading the central directory
    state: State,

    /// Buffer for reading data from the file; it also counts how many bytes
    /// were filled in since it was last reset
    buffer: Buffer,
}
45
/// The stages [ArchiveFsm] goes through, each carrying the data gathered so far.
#[derive(Default)]
enum State {
    /// Finding and reading the end of central directory record
    ReadEocd {
        /// size of the haystack in which we're looking for the end of central
        /// directory record.
        /// this may be less than 65KiB if the file is smaller than that.
        haystack_size: u64,
    },

    /// Reading the zip64 end of central directory locator, which (if present)
    /// sits right before the end of central directory record.
    ReadEocd64Locator {
        /// the end of central directory record found in the previous stage,
        /// along with its offset in the file
        eocdr: Located<EndOfCentralDirectoryRecord<'static>>,
    },

    /// Reading the zip64 end of central directory record.
    ReadEocd64 {
        /// offset of the zip64 end of central directory record, as announced
        /// by the zip64 locator
        eocdr64_offset: u64,
        /// the (non-zip64) end of central directory record found earlier
        eocdr: Located<EndOfCentralDirectoryRecord<'static>>,
    },

    /// Reading all headers from the central directory
    ReadCentralDirectory {
        /// combined end of central directory information
        eocd: EndOfCentralDirectory<'static>,
        /// central directory headers parsed so far
        directory_headers: Vec<CentralDirectoryFileHeader<'static>>,
    },

    /// Placeholder state, also the [Default] value. It is never expected to be
    /// observed by [ArchiveFsm::wants_read] or [ArchiveFsm::process], which
    /// treat it as unreachable.
    // NOTE(review): presumably this is what `transition!` leaves in place while
    // the next state is being built - the macro is defined elsewhere, confirm.
    #[default]
    Transitioning,
}
76
77impl ArchiveFsm {
78 /// This should be > 65KiB, because the section at the end of the
79 /// file that we check for end of central directory record is 65KiB.
80 const DEFAULT_BUFFER_SIZE: usize = 256 * 1024;
81
82 /// Create a new archive reader with a specified file size.
83 pub fn new(size: u64) -> Self {
84 let haystack_size: u64 = 65 * 1024;
85 let haystack_size = if size < haystack_size {
86 size
87 } else {
88 haystack_size
89 };
90
91 Self {
92 size,
93 buffer: Buffer::with_capacity(Self::DEFAULT_BUFFER_SIZE),
94 state: State::ReadEocd { haystack_size },
95 }
96 }
97
98 /// If this returns `Some(offset)`, the caller should read data from
99 /// `offset` into [Self::space] — without forgetting to call
100 /// [Self::fill] with the number of bytes written.
101 pub fn wants_read(&self) -> Option<u64> {
102 use State as S;
103 match self.state {
104 S::ReadEocd { haystack_size } => {
105 Some(self.buffer.read_offset(self.size - haystack_size))
106 }
107 S::ReadEocd64Locator { ref eocdr } => {
108 let length = EndOfCentralDirectory64Locator::LENGTH as u64;
109 Some(self.buffer.read_offset(eocdr.offset - length))
110 }
111 S::ReadEocd64 { eocdr64_offset, .. } => Some(self.buffer.read_offset(eocdr64_offset)),
112 S::ReadCentralDirectory { ref eocd, .. } => {
113 Some(self.buffer.read_offset(eocd.directory_offset()))
114 }
115 S::Transitioning => unreachable!(),
116 }
117 }
118
119 /// Process buffered data
120 ///
121 /// Errors returned from this function are caused by invalid zip archives,
122 /// unsupported format quirks, or implementation bugs - never I/O errors.
123 ///
124 /// A result of [FsmResult::Continue] gives back ownership of the state
125 /// machine and indicates the I/O loop should continue, starting with
126 /// [Self::wants_read].
127 ///
128 /// A result of [FsmResult::Done] consumes the state machine and returns
129 /// a fully-parsed [Archive].
130 pub fn process(mut self) -> Result<FsmResult<Self, Archive>, Error> {
131 use State as S;
132 match self.state {
133 S::ReadEocd { haystack_size } => {
134 if self.buffer.read_bytes() < haystack_size {
135 // read the entire haystack before we can continue
136 return Ok(FsmResult::Continue(self));
137 }
138
139 let res = {
140 let haystack = &self.buffer.data()[..haystack_size as usize];
141 EndOfCentralDirectoryRecord::find_in_block(haystack)
142 };
143 match res {
144 None => Err(FormatError::DirectoryEndSignatureNotFound.into()),
145 Some(eocdr) => {
146 trace!(
147 ?eocdr,
148 size = self.size,
149 "ReadEocd | found end of central directory record"
150 );
151 let mut eocdr = eocdr.into_owned();
152 self.buffer.reset();
153 eocdr.offset += self.size - haystack_size;
154
155 if eocdr.offset < EndOfCentralDirectory64Locator::LENGTH as u64 {
156 // no room for an EOCD64 locator, definitely not a zip64 file
157 trace!(
158 offset = eocdr.offset,
159 eocd64locator_length = EndOfCentralDirectory64Locator::LENGTH,
160 "no room for an EOCD64 locator, definitely not a zip64 file"
161 );
162 transition!(self.state => (S::ReadEocd { .. }) {
163 S::ReadCentralDirectory {
164 eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?,
165 directory_headers: vec![],
166 }
167 });
168 Ok(FsmResult::Continue(self))
169 } else {
170 trace!("ReadEocd | transition to ReadEocd64Locator");
171 self.buffer.reset();
172 transition!(self.state => (S::ReadEocd { .. }) {
173 S::ReadEocd64Locator { eocdr }
174 });
175 Ok(FsmResult::Continue(self))
176 }
177 }
178 }
179 }
180 S::ReadEocd64Locator { .. } => {
181 let input = Partial::new(self.buffer.data());
182 match EndOfCentralDirectory64Locator::parser.parse_peek(input) {
183 Err(ErrMode::Incomplete(_)) => {
184 // need more data
185 Ok(FsmResult::Continue(self))
186 }
187 Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => {
188 // we don't have a zip64 end of central directory locator - that's ok!
189 trace!("ReadEocd64Locator | no zip64 end of central directory locator");
190 trace!(
191 "ReadEocd64Locator | data we got: {:02x?}",
192 self.buffer.data()
193 );
194 self.buffer.reset();
195 transition!(self.state => (S::ReadEocd64Locator { eocdr }) {
196 S::ReadCentralDirectory {
197 eocd: EndOfCentralDirectory::new(self.size, eocdr, None)?,
198 directory_headers: vec![],
199 }
200 });
201 Ok(FsmResult::Continue(self))
202 }
203 Ok((_, locator)) => {
204 trace!(
205 ?locator,
206 "ReadEocd64Locator | found zip64 end of central directory locator"
207 );
208 self.buffer.reset();
209 transition!(self.state => (S::ReadEocd64Locator { eocdr }) {
210 S::ReadEocd64 {
211 eocdr64_offset: locator.directory_offset,
212 eocdr,
213 }
214 });
215 Ok(FsmResult::Continue(self))
216 }
217 }
218 }
219 S::ReadEocd64 { .. } => {
220 let input = Partial::new(self.buffer.data());
221 match EndOfCentralDirectory64Record::parser.parse_peek(input) {
222 Err(ErrMode::Incomplete(_)) => {
223 // need more data
224 Ok(FsmResult::Continue(self))
225 }
226 Err(ErrMode::Backtrack(_)) | Err(ErrMode::Cut(_)) => {
227 // at this point, we really expected to have a zip64 end
228 // of central directory record, so, we want to propagate
229 // that error.
230 Err(FormatError::Directory64EndRecordInvalid.into())
231 }
232 Ok((_, eocdr64)) => {
233 self.buffer.reset();
234 transition!(self.state => (S::ReadEocd64 { eocdr, eocdr64_offset }) {
235 S::ReadCentralDirectory {
236 eocd: EndOfCentralDirectory::new(self.size, eocdr, Some(Located {
237 offset: eocdr64_offset,
238 inner: eocdr64
239 }))?,
240 directory_headers: vec![],
241 }
242 });
243 Ok(FsmResult::Continue(self))
244 }
245 }
246 }
247 S::ReadCentralDirectory {
248 ref eocd,
249 ref mut directory_headers,
250 } => {
251 trace!(
252 "ReadCentralDirectory | process(), available: {}",
253 self.buffer.available_data()
254 );
255 let mut valid_consumed = 0;
256 let mut input = Partial::new(self.buffer.data());
257 trace!(
258 initial_offset = input.as_bytes().offset_from(&self.buffer.data()),
259 initial_len = input.len(),
260 "initial offset & len"
261 );
262 'read_headers: while !input.is_empty() {
263 match CentralDirectoryFileHeader::parser.parse_next(&mut input) {
264 Ok(dh) => {
265 trace!(
266 input_empty_now = input.is_empty(),
267 offset = input.as_bytes().offset_from(&self.buffer.data()),
268 len = input.len(),
269 "ReadCentralDirectory | parsed directory header"
270 );
271 valid_consumed = input.as_bytes().offset_from(&self.buffer.data());
272 directory_headers.push(dh.into_owned());
273 }
274 Err(ErrMode::Incomplete(_needed)) => {
275 // need more data to read the full header
276 trace!("ReadCentralDirectory | incomplete!");
277 break 'read_headers;
278 }
279 Err(ErrMode::Backtrack(err)) | Err(ErrMode::Cut(err)) => {
280 // this is the normal end condition when reading
281 // the central directory (due to 65536-entries non-zip64 files)
282 // let's just check a few numbers first.
283
284 // only compare 16 bits here
285 let expected_records = directory_headers.len() as u16;
286 let actual_records = eocd.directory_records() as u16;
287
288 if expected_records != actual_records {
289 tracing::trace!(
290 "error while reading central records: we read {} records, but EOCD announced {}. the last failed with: {err:?} (display: {err}). at that point, input had length {}",
291 expected_records,
292 actual_records,
293 input.len()
294 );
295
296 // if we read the wrong number of directory entries,
297 // error out.
298 return Err(FormatError::InvalidCentralRecord {
299 expected: expected_records,
300 actual: actual_records,
301 }
302 .into());
303 }
304
305 let mut detectorng = chardetng::EncodingDetector::new();
306 let mut all_utf8 = true;
307 let mut had_suspicious_chars_for_cp437 = false;
308
309 {
310 let max_feed: usize = 4096;
311 let mut total_fed: usize = 0;
312 let mut feed = |slice: &[u8]| {
313 detectorng.feed(slice, false);
314 for b in slice {
315 if (0xB0..=0xDF).contains(b) {
316 // those are, like, box drawing characters
317 had_suspicious_chars_for_cp437 = true;
318 }
319 }
320
321 total_fed += slice.len();
322 total_fed < max_feed
323 };
324
325 'recognize_encoding: for fh in
326 directory_headers.iter().filter(|fh| fh.is_non_utf8())
327 {
328 all_utf8 = false;
329 if !feed(&fh.name[..]) || !feed(&fh.comment[..]) {
330 break 'recognize_encoding;
331 }
332 }
333 }
334
335 let encoding = {
336 if all_utf8 {
337 Encoding::Utf8
338 } else {
339 let encoding = detectorng.guess(None, true);
340 if encoding == encoding_rs::SHIFT_JIS {
341 // well hold on, sometimes Codepage 437 is detected as
342 // Shift-JIS by chardetng. If we have any characters
343 // that aren't valid DOS file names, then okay it's probably
344 // Shift-JIS. Otherwise, assume it's CP437.
345 if had_suspicious_chars_for_cp437 {
346 Encoding::ShiftJis
347 } else {
348 Encoding::Cp437
349 }
350 } else if encoding == encoding_rs::UTF_8 {
351 Encoding::Utf8
352 } else {
353 Encoding::Cp437
354 }
355 }
356 };
357
358 let global_offset = eocd.global_offset as u64;
359 let entries: Result<Vec<Entry>, Error> = directory_headers
360 .iter()
361 .map(|x| x.as_entry(encoding, global_offset))
362 .collect();
363 let entries = entries?;
364
365 let comment = encoding.decode(eocd.comment())?;
366
367 return Ok(FsmResult::Done(Archive {
368 size: self.size,
369 comment,
370 entries,
371 encoding,
372 }));
373 }
374 }
375 }
376 let consumed = valid_consumed;
377 tracing::trace!(%consumed, "ReadCentralDirectory total consumed");
378 self.buffer.consume(consumed);
379
380 // need more data
381 Ok(FsmResult::Continue(self))
382 }
383 S::Transitioning => unreachable!(),
384 }
385 }
386
387 /// Returns a mutable slice with all the available space to write to.
388 ///
389 /// After writing to this, call [Self::fill] with the number of bytes written.
390 #[inline]
391 pub fn space(&mut self) -> &mut [u8] {
392 if self.buffer.available_space() == 0 {
393 self.buffer.shift();
394 }
395 self.buffer.space()
396 }
397
398 /// After having written data to [Self::space], call this to indicate how
399 /// many bytes were written.
400 #[inline]
401 pub fn fill(&mut self, count: usize) -> usize {
402 self.buffer.fill(count)
403 }
404}
405
/// A wrapper around [oval::Buffer] that keeps track of how many bytes we've read since
/// initialization or the last reset.
pub(crate) struct Buffer {
    /// the underlying byte buffer, to which every operation is delegated
    pub(crate) buffer: oval::Buffer,
    /// number of bytes filled in (see [Buffer::fill]) since construction or
    /// the last call to [Buffer::reset]
    pub(crate) read_bytes: u64,
}
412
413impl Buffer {
414 /// creates a new buffer with the specified capacity
415 pub(crate) fn with_capacity(size: usize) -> Self {
416 Self {
417 buffer: oval::Buffer::with_capacity(size),
418 read_bytes: 0,
419 }
420 }
421
422 /// resets the buffer (so that data() returns an empty slice,
423 /// and space() returns the full capacity), along with th e
424 /// read bytes counter.
425 pub(crate) fn reset(&mut self) {
426 self.read_bytes = 0;
427 self.buffer.reset();
428 }
429
430 /// returns the number of read bytes since the last reset
431 #[inline]
432 pub(crate) fn read_bytes(&self) -> u64 {
433 self.read_bytes
434 }
435
436 /// returns a slice with all the available data
437 #[inline]
438 pub(crate) fn data(&self) -> &[u8] {
439 self.buffer.data()
440 }
441
442 /// returns how much data can be read from the buffer
443 #[inline]
444 pub(crate) fn available_data(&self) -> usize {
445 self.buffer.available_data()
446 }
447
448 /// returns how much free space is available to write to
449 #[inline]
450 pub fn available_space(&self) -> usize {
451 self.buffer.available_space()
452 }
453
454 /// returns a mutable slice with all the available space to
455 /// write to
456 #[inline]
457 pub(crate) fn space(&mut self) -> &mut [u8] {
458 self.buffer.space()
459 }
460
461 /// moves the data at the beginning of the buffer
462 ///
463 /// if the position was more than 0, it is now 0
464 #[inline]
465 pub fn shift(&mut self) {
466 self.buffer.shift()
467 }
468
469 /// after having written data to the buffer, use this function
470 /// to indicate how many bytes were written
471 ///
472 /// if there is not enough available space, this function can call
473 /// `shift()` to move the remaining data to the beginning of the
474 /// buffer
475 #[inline]
476 pub(crate) fn fill(&mut self, count: usize) -> usize {
477 let n = self.buffer.fill(count);
478 self.read_bytes += n as u64;
479 n
480 }
481
482 /// advances the position tracker
483 ///
484 /// if the position gets past the buffer's half,
485 /// this will call `shift()` to move the remaining data
486 /// to the beginning of the buffer
487 #[inline]
488 pub(crate) fn consume(&mut self, size: usize) {
489 self.buffer.consume(size);
490 }
491
492 /// adds already-read bytes to the given offset. this is useful in
493 /// [ArchiveFsm], when we read records at fixed offsets within the file,
494 /// that possibly take several reads to fully parse.
495 pub(crate) fn read_offset(&self, offset: u64) -> u64 {
496 self.read_bytes + offset
497 }
498}