rc_zip/parse/archive.rs
1use chrono::{offset::Utc, DateTime, TimeZone};
2use num_enum::{FromPrimitive, IntoPrimitive};
3use ownable::{IntoOwned, ToOwned};
4use winnow::{binary::le_u16, PResult, Partial};
5
6use crate::{
7 encoding::Encoding,
8 parse::{Mode, Version},
9};
10
11use super::{zero_datetime, ExtraField, NtfsAttr};
12
13/// An Archive contains general information about a zip files, along with a list
14/// of [entries][Entry].
15///
16/// It is obtained through a state machine like
17/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
18/// higher-levelr interfaces like
19/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
20/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
21pub struct Archive {
22 pub(crate) size: u64,
23 pub(crate) encoding: Encoding,
24 pub(crate) entries: Vec<Entry>,
25 pub(crate) comment: String,
26}
27
28impl Archive {
29 /// The size of .zip file that was read, in bytes.
30 #[inline(always)]
31 pub fn size(&self) -> u64 {
32 self.size
33 }
34
35 /// Iterate over all files in this zip, read from the central directory.
36 pub fn entries(&self) -> impl Iterator<Item = &Entry> {
37 self.entries.iter()
38 }
39
40 /// Attempts to look up an entry by name. This is usually a bad idea,
41 /// as names aren't necessarily normalized in zip archives.
42 pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> {
43 self.entries.iter().find(|&x| x.name == name.as_ref())
44 }
45
46 /// Returns the detected character encoding for text fields
47 /// (names, comments) inside this zip archive.
48 #[inline(always)]
49 pub fn encoding(&self) -> Encoding {
50 self.encoding
51 }
52
53 /// Returns the comment for this archive, if any. When reading
54 /// a zip file with an empty comment field, this will return None.
55 #[inline(always)]
56 pub fn comment(&self) -> &str {
57 &self.comment
58 }
59}
60
61/// Describes a zip archive entry (a file, a directory, a symlink)
62#[derive(Clone)]
63pub struct Entry {
64 /// Name of the file
65 ///
66 /// This should be a relative path, separated by `/`. However, there are zip
67 /// files in the wild with all sorts of evil variants, so, be conservative
68 /// in what you accept.
69 ///
70 /// See also [Self::sanitized_name], which returns a sanitized version of
71 /// the name, working around zip slip vulnerabilities.
72 pub name: String,
73
74 /// Compression method: Store, Deflate, Bzip2, etc.
75 pub method: Method,
76
77 /// Comment is any arbitrary user-defined string shorter than 64KiB
78 pub comment: String,
79
80 /// This entry's "last modified" timestamp - with caveats
81 ///
82 /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset
83 /// by a few hours, if there is no extended timestamp information. It may have a resolution
84 /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix
85 /// epoch, if something went really wrong.
86 ///
87 /// If you're reading this after the year 2038, or after the year 2108, godspeed.
88 pub modified: DateTime<Utc>,
89
90 /// This entry's "created" timestamp, if available.
91 ///
92 /// See [Self::modified] for caveats.
93 pub created: Option<DateTime<Utc>>,
94
95 /// This entry's "last accessed" timestamp, if available.
96 ///
97 /// See [Self::accessed] for caveats.
98 pub accessed: Option<DateTime<Utc>>,
99
100 /// Offset of the local file header in the zip file
101 ///
102 /// ```text
103 /// [optional non-zip data]
104 /// [local file header 1] <------ header_offset points here
105 /// [encryption header 1]
106 /// [file data 1]
107 /// [data descriptor 1]
108 /// ...
109 /// [central directory]
110 /// [optional zip64 end of central directory info]
111 /// [end of central directory record]
112 /// ```
113 pub header_offset: u64,
114
115 /// Version of zip needed to extract this archive.
116 pub reader_version: Version,
117
118 /// General purpose bit flag
119 ///
120 /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
121 /// Other flags can indicate: encryption (unsupported), various compression
122 /// settings (depending on the [Method] used).
123 ///
124 /// For LZMA, general-purpose bit 1 denotes the EOS marker.
125 pub flags: u16,
126
127 /// Unix user ID
128 ///
129 /// Only present if a Unix extra field or New Unix extra field was found.
130 pub uid: Option<u32>,
131
132 /// Unix group ID
133 ///
134 /// Only present if a Unix extra field or New Unix extra field was found.
135 pub gid: Option<u32>,
136
137 /// CRC-32 hash as found in the central directory.
138 ///
139 /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
140 /// commonly) in the data descriptor instead.
141 pub crc32: u32,
142
143 /// Size in bytes, after compression
144 pub compressed_size: u64,
145
146 /// Size in bytes, before compression
147 ///
148 /// This will be zero for directories.
149 pub uncompressed_size: u64,
150
151 /// File mode.
152 pub mode: Mode,
153}
154
155impl Entry {
156 /// Returns a sanitized version of the entry's name, if it
157 /// seems safe. In particular, if this method feels like the
158 /// entry name is trying to do a zip slip (cf.
159 /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
160 /// None.
161 ///
162 /// Other than that, it will strip any leading slashes on non-Windows OSes.
163 pub fn sanitized_name(&self) -> Option<&str> {
164 let name = self.name.as_str();
165
166 // refuse entries with traversed/absolute path to mitigate zip slip
167 if name.contains("..") {
168 return None;
169 }
170
171 #[cfg(windows)]
172 {
173 if name.contains(":\\") || name.starts_with("\\") {
174 return None;
175 }
176 Some(name)
177 }
178
179 #[cfg(not(windows))]
180 {
181 // strip absolute prefix on entries pointing to root path
182 let mut entry_chars = name.chars();
183 let mut name = name;
184 while name.starts_with('/') {
185 entry_chars.next();
186 name = entry_chars.as_str()
187 }
188 Some(name)
189 }
190 }
191
192 /// Apply the extra field to the entry, updating its metadata.
193 pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) {
194 match &ef {
195 ExtraField::Zip64(z64) => {
196 self.uncompressed_size = z64.uncompressed_size;
197 self.compressed_size = z64.compressed_size;
198 self.header_offset = z64.header_offset;
199 }
200 ExtraField::Timestamp(ts) => {
201 self.modified = Utc
202 .timestamp_opt(ts.mtime as i64, 0)
203 .single()
204 .unwrap_or_else(zero_datetime);
205 }
206 ExtraField::Ntfs(nf) => {
207 for attr in &nf.attrs {
208 // note: other attributes are unsupported
209 if let NtfsAttr::Attr1(attr) = attr {
210 self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime);
211 self.created = attr.ctime.to_datetime();
212 self.accessed = attr.atime.to_datetime();
213 }
214 }
215 }
216 ExtraField::Unix(uf) => {
217 self.modified = Utc
218 .timestamp_opt(uf.mtime as i64, 0)
219 .single()
220 .unwrap_or_else(zero_datetime);
221
222 if self.uid.is_none() {
223 self.uid = Some(uf.uid as u32);
224 }
225
226 if self.gid.is_none() {
227 self.gid = Some(uf.gid as u32);
228 }
229 }
230 ExtraField::NewUnix(uf) => {
231 self.uid = Some(uf.uid as u32);
232 self.gid = Some(uf.uid as u32);
233 }
234 _ => {}
235 };
236 }
237}
238
/// File type of an entry: directory, regular file, or symbolic link.
#[derive(Debug, Eq, PartialEq)]
pub enum EntryKind {
    /// A directory entry.
    Directory,

    /// A file entry.
    File,

    /// A symbolic link entry.
    Symlink,
}
251
252impl Entry {
253 /// Determine the kind of this entry based on its mode.
254 pub fn kind(&self) -> EntryKind {
255 if self.mode.has(Mode::SYMLINK) {
256 EntryKind::Symlink
257 } else if self.mode.has(Mode::DIR) {
258 EntryKind::Directory
259 } else {
260 EntryKind::File
261 }
262 }
263}
264
265/// Compression method used for a file entry.
266///
267/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
268/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
269///
270/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
271/// [Lzma][Method::Lzma] or others.
272#[derive(
273 Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive, IntoOwned, ToOwned,
274)]
275#[repr(u16)]
276pub enum Method {
277 /// No compression is applied
278 Store = 0,
279
280 /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
281 Deflate = 8,
282
283 /// [DEFLATE64](https://deflate64.com/)
284 Deflate64 = 9,
285
286 /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
287 Bzip2 = 12,
288
289 /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
290 Lzma = 14,
291
292 /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
293 Zstd = 93,
294
295 /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
296 Mp3 = 94,
297
298 /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
299 Xz = 95,
300
301 /// [JPEG](https://jpeg.org/jpeg/)
302 Jpeg = 96,
303
304 /// [WavPack](https://www.wavpack.com/)
305 WavPack = 97,
306
307 /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
308 Ppmd = 98,
309
310 /// AE-x encryption marker (see Appendix E of appnote)
311 Aex = 99,
312
313 /// A compression method that isn't recognized by this crate.
314 #[num_enum(catch_all)]
315 Unrecognized(u16),
316}
317
318impl Method {
319 /// Parse a method from a byte slice
320 pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> {
321 le_u16(i).map(From::from)
322 }
323}