rc_zip/parse/
archive.rs

1use chrono::{offset::Utc, DateTime, TimeZone};
2use num_enum::{FromPrimitive, IntoPrimitive};
3use ownable::{IntoOwned, ToOwned};
4use winnow::{binary::le_u16, PResult, Partial};
5
6use crate::{
7    encoding::Encoding,
8    parse::{Mode, Version},
9};
10
11use super::{zero_datetime, ExtraField, NtfsAttr};
12
13/// An Archive contains general information about a zip files, along with a list
14/// of [entries][Entry].
15///
16/// It is obtained through a state machine like
17/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
18/// higher-levelr interfaces like
19/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
20/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
21pub struct Archive {
22    pub(crate) size: u64,
23    pub(crate) encoding: Encoding,
24    pub(crate) entries: Vec<Entry>,
25    pub(crate) comment: String,
26}
27
28impl Archive {
29    /// The size of .zip file that was read, in bytes.
30    #[inline(always)]
31    pub fn size(&self) -> u64 {
32        self.size
33    }
34
35    /// Iterate over all files in this zip, read from the central directory.
36    pub fn entries(&self) -> impl Iterator<Item = &Entry> {
37        self.entries.iter()
38    }
39
40    /// Attempts to look up an entry by name. This is usually a bad idea,
41    /// as names aren't necessarily normalized in zip archives.
42    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> {
43        self.entries.iter().find(|&x| x.name == name.as_ref())
44    }
45
46    /// Returns the detected character encoding for text fields
47    /// (names, comments) inside this zip archive.
48    #[inline(always)]
49    pub fn encoding(&self) -> Encoding {
50        self.encoding
51    }
52
53    /// Returns the comment for this archive, if any. When reading
54    /// a zip file with an empty comment field, this will return None.
55    #[inline(always)]
56    pub fn comment(&self) -> &str {
57        &self.comment
58    }
59}
60
61/// Describes a zip archive entry (a file, a directory, a symlink)
62#[derive(Clone)]
63pub struct Entry {
64    /// Name of the file
65    ///
66    /// This should be a relative path, separated by `/`. However, there are zip
67    /// files in the wild with all sorts of evil variants, so, be conservative
68    /// in what you accept.
69    ///
70    /// See also [Self::sanitized_name], which returns a sanitized version of
71    /// the name, working around zip slip vulnerabilities.
72    pub name: String,
73
74    /// Compression method: Store, Deflate, Bzip2, etc.
75    pub method: Method,
76
77    /// Comment is any arbitrary user-defined string shorter than 64KiB
78    pub comment: String,
79
80    /// This entry's "last modified" timestamp - with caveats
81    ///
82    /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset
83    /// by a few hours, if there is no extended timestamp information. It may have a resolution
84    /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix
85    /// epoch, if something went really wrong.
86    ///
87    /// If you're reading this after the year 2038, or after the year 2108, godspeed.
88    pub modified: DateTime<Utc>,
89
90    /// This entry's "created" timestamp, if available.
91    ///
92    /// See [Self::modified] for caveats.
93    pub created: Option<DateTime<Utc>>,
94
95    /// This entry's "last accessed" timestamp, if available.
96    ///
97    /// See [Self::accessed] for caveats.
98    pub accessed: Option<DateTime<Utc>>,
99
100    /// Offset of the local file header in the zip file
101    ///
102    /// ```text
103    /// [optional non-zip data]
104    /// [local file header 1] <------ header_offset points here
105    /// [encryption header 1]
106    /// [file data 1]
107    /// [data descriptor 1]
108    /// ...
109    /// [central directory]
110    /// [optional zip64 end of central directory info]
111    /// [end of central directory record]
112    /// ```
113    pub header_offset: u64,
114
115    /// Version of zip needed to extract this archive.
116    pub reader_version: Version,
117
118    /// General purpose bit flag
119    ///
120    /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
121    /// Other flags can indicate: encryption (unsupported), various compression
122    /// settings (depending on the [Method] used).
123    ///
124    /// For LZMA, general-purpose bit 1 denotes the EOS marker.
125    pub flags: u16,
126
127    /// Unix user ID
128    ///
129    /// Only present if a Unix extra field or New Unix extra field was found.
130    pub uid: Option<u32>,
131
132    /// Unix group ID
133    ///
134    /// Only present if a Unix extra field or New Unix extra field was found.
135    pub gid: Option<u32>,
136
137    /// CRC-32 hash as found in the central directory.
138    ///
139    /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
140    /// commonly) in the data descriptor instead.
141    pub crc32: u32,
142
143    /// Size in bytes, after compression
144    pub compressed_size: u64,
145
146    /// Size in bytes, before compression
147    ///
148    /// This will be zero for directories.
149    pub uncompressed_size: u64,
150
151    /// File mode.
152    pub mode: Mode,
153}
154
155impl Entry {
156    /// Returns a sanitized version of the entry's name, if it
157    /// seems safe. In particular, if this method feels like the
158    /// entry name is trying to do a zip slip (cf.
159    /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
160    /// None.
161    ///
162    /// Other than that, it will strip any leading slashes on non-Windows OSes.
163    pub fn sanitized_name(&self) -> Option<&str> {
164        let name = self.name.as_str();
165
166        // refuse entries with traversed/absolute path to mitigate zip slip
167        if name.contains("..") {
168            return None;
169        }
170
171        #[cfg(windows)]
172        {
173            if name.contains(":\\") || name.starts_with("\\") {
174                return None;
175            }
176            Some(name)
177        }
178
179        #[cfg(not(windows))]
180        {
181            // strip absolute prefix on entries pointing to root path
182            let mut entry_chars = name.chars();
183            let mut name = name;
184            while name.starts_with('/') {
185                entry_chars.next();
186                name = entry_chars.as_str()
187            }
188            Some(name)
189        }
190    }
191
192    /// Apply the extra field to the entry, updating its metadata.
193    pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) {
194        match &ef {
195            ExtraField::Zip64(z64) => {
196                self.uncompressed_size = z64.uncompressed_size;
197                self.compressed_size = z64.compressed_size;
198                self.header_offset = z64.header_offset;
199            }
200            ExtraField::Timestamp(ts) => {
201                self.modified = Utc
202                    .timestamp_opt(ts.mtime as i64, 0)
203                    .single()
204                    .unwrap_or_else(zero_datetime);
205            }
206            ExtraField::Ntfs(nf) => {
207                for attr in &nf.attrs {
208                    // note: other attributes are unsupported
209                    if let NtfsAttr::Attr1(attr) = attr {
210                        self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime);
211                        self.created = attr.ctime.to_datetime();
212                        self.accessed = attr.atime.to_datetime();
213                    }
214                }
215            }
216            ExtraField::Unix(uf) => {
217                self.modified = Utc
218                    .timestamp_opt(uf.mtime as i64, 0)
219                    .single()
220                    .unwrap_or_else(zero_datetime);
221
222                if self.uid.is_none() {
223                    self.uid = Some(uf.uid as u32);
224                }
225
226                if self.gid.is_none() {
227                    self.gid = Some(uf.gid as u32);
228                }
229            }
230            ExtraField::NewUnix(uf) => {
231                self.uid = Some(uf.uid as u32);
232                self.gid = Some(uf.uid as u32);
233            }
234            _ => {}
235        };
236    }
237}
238
/// The entry's file type: a directory, a file, or a symbolic link.
///
/// This is a fieldless enum, so values can be freely copied, compared and
/// used as keys in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum EntryKind {
    /// The entry is a directory
    Directory,

    /// The entry is a file
    File,

    /// The entry is a symbolic link
    Symlink,
}
251
252impl Entry {
253    /// Determine the kind of this entry based on its mode.
254    pub fn kind(&self) -> EntryKind {
255        if self.mode.has(Mode::SYMLINK) {
256            EntryKind::Symlink
257        } else if self.mode.has(Mode::DIR) {
258            EntryKind::Directory
259        } else {
260            EntryKind::File
261        }
262    }
263}
264
265/// Compression method used for a file entry.
266///
267/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
268/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
269///
270/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
271/// [Lzma][Method::Lzma] or others.
272#[derive(
273    Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive, IntoOwned, ToOwned,
274)]
275#[repr(u16)]
276pub enum Method {
277    /// No compression is applied
278    Store = 0,
279
280    /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
281    Deflate = 8,
282
283    /// [DEFLATE64](https://deflate64.com/)
284    Deflate64 = 9,
285
286    /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
287    Bzip2 = 12,
288
289    /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
290    Lzma = 14,
291
292    /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
293    Zstd = 93,
294
295    /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
296    Mp3 = 94,
297
298    /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
299    Xz = 95,
300
301    /// [JPEG](https://jpeg.org/jpeg/)
302    Jpeg = 96,
303
304    /// [WavPack](https://www.wavpack.com/)
305    WavPack = 97,
306
307    /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
308    Ppmd = 98,
309
310    /// AE-x encryption marker (see Appendix E of appnote)
311    Aex = 99,
312
313    /// A compression method that isn't recognized by this crate.
314    #[num_enum(catch_all)]
315    Unrecognized(u16),
316}
317
318impl Method {
319    /// Parse a method from a byte slice
320    pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> {
321        le_u16(i).map(From::from)
322    }
323}