1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
use chrono::{offset::Utc, DateTime, TimeZone};
use num_enum::{FromPrimitive, IntoPrimitive};
use ownable::{IntoOwned, ToOwned};
use winnow::{binary::le_u16, PResult, Partial};

use crate::{
    encoding::Encoding,
    parse::{Mode, Version},
};

use super::{zero_datetime, ExtraField, NtfsAttr};

/// An Archive contains general information about a zip files, along with a list
/// of [entries][Entry].
///
/// It is obtained through a state machine like
/// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use
/// higher-levelr interfaces like
/// [rc-zip-sync](https://crates.io/crates/rc-zip-sync) or
/// [rc-zip-tokio](https://crates.io/crates/rc-zip-tokio).
pub struct Archive {
    pub(crate) size: u64,
    pub(crate) encoding: Encoding,
    pub(crate) entries: Vec<Entry>,
    pub(crate) comment: String,
}

impl Archive {
    /// The size of .zip file that was read, in bytes.
    #[inline(always)]
    pub fn size(&self) -> u64 {
        self.size
    }

    /// Iterate over all files in this zip, read from the central directory.
    pub fn entries(&self) -> impl Iterator<Item = &Entry> {
        self.entries.iter()
    }

    /// Attempts to look up an entry by name. This is usually a bad idea,
    /// as names aren't necessarily normalized in zip archives.
    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> {
        self.entries.iter().find(|&x| x.name == name.as_ref())
    }

    /// Returns the detected character encoding for text fields
    /// (names, comments) inside this zip archive.
    #[inline(always)]
    pub fn encoding(&self) -> Encoding {
        self.encoding
    }

    /// Returns the comment for this archive, if any. When reading
    /// a zip file with an empty comment field, this will return None.
    #[inline(always)]
    pub fn comment(&self) -> &str {
        &self.comment
    }
}

/// Describes a zip archive entry (a file, a directory, a symlink)
#[derive(Clone)]
pub struct Entry {
    /// Name of the file
    ///
    /// This should be a relative path, separated by `/`. However, there are zip
    /// files in the wild with all sorts of evil variants, so, be conservative
    /// in what you accept.
    ///
    /// See also [Self::sanitized_name], which returns a sanitized version of
    /// the name, working around zip slip vulnerabilities.
    pub name: String,

    /// Compression method: Store, Deflate, Bzip2, etc.
    pub method: Method,

    /// Comment is any arbitrary user-defined string shorter than 64KiB
    pub comment: String,

    /// This entry's "last modified" timestamp - with caveats
    ///
    /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset
    /// by a few hours, if there is no extended timestamp information. It may have a resolution
    /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix
    /// epoch, if something went really wrong.
    ///
    /// If you're reading this after the year 2038, or after the year 2108, godspeed.
    pub modified: DateTime<Utc>,

    /// This entry's "created" timestamp, if available.
    ///
    /// See [Self::modified] for caveats.
    pub created: Option<DateTime<Utc>>,

    /// This entry's "last accessed" timestamp, if available.
    ///
    /// See [Self::accessed] for caveats.
    pub accessed: Option<DateTime<Utc>>,

    /// Offset of the local file header in the zip file
    ///
    /// ```text
    /// [optional non-zip data]
    /// [local file header 1] <------ header_offset points here
    /// [encryption header 1]
    /// [file data 1]
    /// [data descriptor 1]
    /// ...
    /// [central directory]
    /// [optional zip64 end of central directory info]
    /// [end of central directory record]
    /// ```
    pub header_offset: u64,

    /// Version of zip needed to extract this archive.
    pub reader_version: Version,

    /// General purpose bit flag
    ///
    /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
    /// Other flags can indicate: encryption (unsupported), various compression
    /// settings (depending on the [Method] used).
    ///
    /// For LZMA, general-purpose bit 1 denotes the EOS marker.
    pub flags: u16,

    /// Unix user ID
    ///
    /// Only present if a Unix extra field or New Unix extra field was found.
    pub uid: Option<u32>,

    /// Unix group ID
    ///
    /// Only present if a Unix extra field or New Unix extra field was found.
    pub gid: Option<u32>,

    /// CRC-32 hash as found in the central directory.
    ///
    /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
    /// commonly) in the data descriptor instead.
    pub crc32: u32,

    /// Size in bytes, after compression
    pub compressed_size: u64,

    /// Size in bytes, before compression
    ///
    /// This will be zero for directories.
    pub uncompressed_size: u64,

    /// File mode.
    pub mode: Mode,
}

impl Entry {
    /// Returns a sanitized version of the entry's name, if it
    /// seems safe. In particular, if this method feels like the
    /// entry name is trying to do a zip slip (cf.
    /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
    /// None.
    ///
    /// Other than that, it will strip any leading slashes on non-Windows OSes.
    pub fn sanitized_name(&self) -> Option<&str> {
        let name = self.name.as_str();

        // refuse entries with traversed/absolute path to mitigate zip slip
        if name.contains("..") {
            return None;
        }

        #[cfg(windows)]
        {
            if name.contains(":\\") || name.starts_with("\\") {
                return None;
            }
            Some(name)
        }

        #[cfg(not(windows))]
        {
            // strip absolute prefix on entries pointing to root path
            let mut entry_chars = name.chars();
            let mut name = name;
            while name.starts_with('/') {
                entry_chars.next();
                name = entry_chars.as_str()
            }
            Some(name)
        }
    }

    /// Apply the extra field to the entry, updating its metadata.
    pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) {
        match &ef {
            ExtraField::Zip64(z64) => {
                self.uncompressed_size = z64.uncompressed_size;
                self.compressed_size = z64.compressed_size;
                self.header_offset = z64.header_offset;
            }
            ExtraField::Timestamp(ts) => {
                self.modified = Utc
                    .timestamp_opt(ts.mtime as i64, 0)
                    .single()
                    .unwrap_or_else(zero_datetime);
            }
            ExtraField::Ntfs(nf) => {
                for attr in &nf.attrs {
                    // note: other attributes are unsupported
                    if let NtfsAttr::Attr1(attr) = attr {
                        self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime);
                        self.created = attr.ctime.to_datetime();
                        self.accessed = attr.atime.to_datetime();
                    }
                }
            }
            ExtraField::Unix(uf) => {
                self.modified = Utc
                    .timestamp_opt(uf.mtime as i64, 0)
                    .single()
                    .unwrap_or_else(zero_datetime);

                if self.uid.is_none() {
                    self.uid = Some(uf.uid as u32);
                }

                if self.gid.is_none() {
                    self.gid = Some(uf.gid as u32);
                }
            }
            ExtraField::NewUnix(uf) => {
                self.uid = Some(uf.uid as u32);
                self.gid = Some(uf.uid as u32);
            }
            _ => {}
        };
    }
}

/// The entry's file type: a directory, a file, or a symbolic link.
#[derive(Debug)]
pub enum EntryKind {
    /// The entry is a directory
    Directory,

    /// The entry is a file
    File,

    /// The entry is a symbolic link
    Symlink,
}

impl Entry {
    /// Determine the kind of this entry based on its mode.
    pub fn kind(&self) -> EntryKind {
        if self.mode.has(Mode::SYMLINK) {
            EntryKind::Symlink
        } else if self.mode.has(Mode::DIR) {
            EntryKind::Directory
        } else {
            EntryKind::File
        }
    }
}

/// Compression method used for a file entry.
///
/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only
/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used.
///
/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2],
/// [Lzma][Method::Lzma] or others.
#[derive(
    Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive, IntoOwned, ToOwned,
)]
#[repr(u16)]
pub enum Method {
    /// No compression is applied
    Store = 0,

    /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
    Deflate = 8,

    /// [DEFLATE64](https://deflate64.com/)
    Deflate64 = 9,

    /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
    Bzip2 = 12,

    /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
    Lzma = 14,

    /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
    Zstd = 93,

    /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
    Mp3 = 94,

    /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
    Xz = 95,

    /// [JPEG](https://jpeg.org/jpeg/)
    Jpeg = 96,

    /// [WavPack](https://www.wavpack.com/)
    WavPack = 97,

    /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
    Ppmd = 98,

    /// AE-x encryption marker (see Appendix E of appnote)
    Aex = 99,

    /// A compression method that isn't recognized by this crate.
    #[num_enum(catch_all)]
    Unrecognized(u16),
}

impl Method {
    /// Parse a method from a byte slice
    pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> {
        le_u16(i).map(From::from)
    }
}