gzip_header/
lib.rs

1//! A library to decode and encode headers for the
2//! [gzip format](http://www.gzip.org/zlib/rfc-gzip.html).
//! The library also contains a reader abstraction over a CRC checksum hasher.
4//!
5//! A file in the gzip format contains a gzip header, a number of compressed data blocks in the
6//! [DEFLATE](http://www.gzip.org/zlib/rfc-deflate.html) format, and ends with the CRC32-checksum
7//! (in the IEEE format) and number of bytes (modulo `2^32`) of the uncompressed data.
8//!
9//! The gzip header is purely a set of metadata, and doesn't have any impact on the decoding of the
10//! compressed data other than the fact that `DEFLATE`-encoded data with a gzip-header is
11//! checked using the CRC32 algorithm.
12//!
13//! This library is based on the gzip header functionality in the
14//! [flate2](https://crates.io/crates/flate2) crate.
15
16#![forbid(unsafe_code)]
17extern crate crc32fast;
18
19mod crc_reader;
20
21use std::borrow::Cow;
22use std::default::Default;
23use std::ffi::CString;
24use std::fmt;
25use std::io::Read;
26use std::{env, io, time};
27
28pub use crc_reader::{Crc, CrcReader};
29
// Bit masks for the `FLG` byte of the gzip header.
// A checksum of the header follows the optional fields.
const FHCRC: u8 = 0b0000_0010;
// A length-prefixed "extra" field is present.
const FEXTRA: u8 = 0b0000_0100;
// A zero-terminated file name is present.
const FNAME: u8 = 0b0000_1000;
// A zero-terminated comment is present.
const FCOMMENT: u8 = 0b0001_0000;
34
/// An enum describing the different OS types described in the gzip format.
/// See http://www.gzip.org/format.txt (Additionally, the Apple(19) value is defined in the zlib
/// library).
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum FileSystemType {
    /// MS-DOS/old FAT filesystem
    Fat = 0,
    /// Amiga
    Amiga = 1,
    /// VMS or OpenVMS
    Vms = 2,
    /// Unix type system/Linux
    Unix = 3,
    /// VM/CMS
    Vcms = 4,
    /// Atari TOS
    AtariTos = 5,
    /// HPFS filesystem (OS/2, NT)
    Hpfs = 6,
    /// Used for apple platforms. Newer encoders may use 19 instead for modern systems.
    Macintosh = 7,
    /// Z-System
    Zsystem = 8,
    /// CP/M
    Cpm = 9,
    /// This is used for Windows/NTFS in zlib newer than 1.2.11, but not in gzip due to following
    /// updates to the ZIP format.
    /// See https://github.com/madler/zlib/issues/235 and
    /// https://github.com/madler/zlib/commit/ce12c5cd00628bf8f680c98123a369974d32df15
    Tops20OrNTFS = 10,
    /// Used for Windows platforms for older zlib versions and other encoders.
    NTFS = 11,
    /// SMS/QDOS
    SmsQdos = 12,
    /// Acorn RISC OS
    Riscos = 13,
    /// Newer fat filesystems (i.e FAT32).
    Vfat = 14,
    /// MVS or PRIMOS
    Mvs = 15,
    /// BeOS
    Beos = 16,
    /// Tandem/NSK
    TandemNsk = 17,
    /// THEOS
    Theos = 18,
    /// Modern apple platforms.
    /// Defined in the zlib library (see zutil.h)
    Apple = 19,
    /// Unknown or unset. Also what `from_u8` yields for every unassigned value.
    Unknown = 255,
}

impl FileSystemType {
    /// Get the raw byte value of this `FileSystemType` variant.
    pub const fn as_u8(&self) -> u8 {
        *self as u8
    }

    /// Get the corresponding `FileSystemType` value from a raw byte.
    ///
    /// Returns `FileSystemType::Unknown` (defined as 255 as that is the value used in the
    /// specification for `Unknown`) if the value is not one of the currently known types
    /// (Which currently means any value > 19).
    ///
    /// This is a `const fn`, like [`as_u8`](#method.as_u8), so it can be used to build constants.
    pub const fn from_u8(value: u8) -> FileSystemType {
        match value {
            0 => FileSystemType::Fat,
            1 => FileSystemType::Amiga,
            2 => FileSystemType::Vms,
            3 => FileSystemType::Unix,
            4 => FileSystemType::Vcms,
            5 => FileSystemType::AtariTos,
            6 => FileSystemType::Hpfs,
            7 => FileSystemType::Macintosh,
            8 => FileSystemType::Zsystem,
            9 => FileSystemType::Cpm,
            10 => FileSystemType::Tops20OrNTFS,
            11 => FileSystemType::NTFS,
            12 => FileSystemType::SmsQdos,
            13 => FileSystemType::Riscos,
            14 => FileSystemType::Vfat,
            15 => FileSystemType::Mvs,
            16 => FileSystemType::Beos,
            17 => FileSystemType::TandemNsk,
            18 => FileSystemType::Theos,
            19 => FileSystemType::Apple,
            // Everything else, including 255 itself, is reported as unknown.
            _ => FileSystemType::Unknown,
        }
    }
}
112
113impl fmt::Display for FileSystemType {
114    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
115        use FileSystemType::*;
116        match *self {
117            Fat => "FAT filesystem (MS-DOS, OS/2, NT/Win32)",
118            Amiga => "Amiga",
119            Vms => "VMS or OpenVMS",
120            Unix => "Unix type system/Linux",
121            Vcms => "VM/CMS",
122            AtariTos => "Atari TOS",
123            Hpfs => "HPFS filesystem (OS/2, NT)",
124            Macintosh => "Macintosh operating system (Classic Mac OS, OS/X, macOS, iOS etc.)",
125            Zsystem => "Z-System",
126            Cpm => "CP/M",
127            Tops20OrNTFS => "NTFS (New zlib versions) or TOPS-20",
128            NTFS => "NTFS",
129            SmsQdos => "SMS/QDOS",
130            Riscos => "Acorn RISC OS",
131            Vfat => "VFAT file system (Win95, NT)",
132            Mvs => "MVS or PRIMOS",
133            Beos => "BeOS",
134            TandemNsk => "Tandem/NSK",
135            Theos => "THEOS",
136            Apple => "macOS, OS/X, iOS or watchOS",
137            _ => "Unknown or unset",
138        }
139        .fmt(f)
140    }
141}
142
/// Valid values for the extra flag in the gzip specification.
///
/// This is a field to be used by the compression methods. For deflate, which is the only
/// specified compression method, this is a value indicating the level of compression of the
/// contained compressed data. This value does not have to correspond to the actual compression
/// level of the contained data, it's only a hint that the encoder may set.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
#[repr(u8)]
pub enum ExtraFlags {
    /// No extra flags set. Also what `from_u8` yields for any unrecognised value.
    Default = 0,
    /// The encoder used its maximum compression setting.
    MaximumCompression = 2,
    /// The encoder used its fastest compression setting.
    FastestCompression = 4,
}

impl ExtraFlags {
    /// Get the corresponding `ExtraFlags` value from a raw byte.
    ///
    /// Returns `ExtraFlags::Default` (defined as 0 by the gzip specification) for values other than
    /// 2 and 4.
    ///
    /// This is a `const fn`, like [`as_u8`](#method.as_u8), so it can be used to build constants.
    pub const fn from_u8(value: u8) -> ExtraFlags {
        match value {
            2 => ExtraFlags::MaximumCompression,
            4 => ExtraFlags::FastestCompression,
            _ => ExtraFlags::Default,
        }
    }

    /// Get the raw byte value of this `ExtraFlags` variant.
    pub const fn as_u8(&self) -> u8 {
        *self as u8
    }
}
176
177impl fmt::Display for ExtraFlags {
178    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
179        match *self {
180            ExtraFlags::Default => "No extra flags (Default) or unknown.",
181            ExtraFlags::MaximumCompression => "Maximum compression algorithm (DEFLATE).",
182            ExtraFlags::FastestCompression => "Fastest compression algorithm (DEFLATE)",
183        }
184        .fmt(f)
185    }
186}
187
188impl Default for ExtraFlags {
189    fn default() -> ExtraFlags {
190        ExtraFlags::Default
191    }
192}
193
194/// A builder structure to create a new gzip header.
195///
196/// This structure controls header configuration options such as the filename.
197#[derive(Debug, Default, Clone, Eq, PartialEq)]
198pub struct GzBuilder {
199    extra: Option<Vec<u8>>,
200    filename: Option<CString>,
201    comment: Option<CString>,
202    // Whether this should be signed is a bit unclear, the gzip spec says mtime is in the unix
203    // time format, which is normally signed, however zlib seems to use an unsigned long for this
204    // field.
205    mtime: u32,
206    os: Option<FileSystemType>,
207    xfl: ExtraFlags,
208}
209
210impl GzBuilder {
211    /// Create a new blank builder with no header by default.
212    pub fn new() -> GzBuilder {
213        GzBuilder {
214            extra: None,
215            filename: None,
216            comment: None,
217            mtime: 0,
218            os: None,
219            xfl: ExtraFlags::Default,
220        }
221    }
222
223    /// Configure the `mtime` field in the gzip header.
224    pub fn mtime(mut self, mtime: u32) -> GzBuilder {
225        self.mtime = mtime;
226        self
227    }
228
229    /// Configure the `extra` field in the gzip header.
230    pub fn extra<T: Into<Vec<u8>>>(mut self, extra: T) -> GzBuilder {
231        self.extra = Some(extra.into());
232        self
233    }
234
235    /// Configure the `filename` field in the gzip header.
236    ///
237    /// # Panics
238    /// Panics if the filename argument contains a byte with the value 0.
239    pub fn filename<T: Into<Vec<u8>>>(mut self, filename: T) -> GzBuilder {
240        self.filename = Some(CString::new(filename).unwrap());
241        self
242    }
243
244    /// Configure the `comment` field in the gzip header.
245    ///
246    /// # Panics
247    /// Panics if the comment argument contains a byte with the value 0.
248    pub fn comment<T: Into<Vec<u8>>>(mut self, comment: T) -> GzBuilder {
249        self.comment = Some(CString::new(comment).unwrap());
250        self
251    }
252
253    /// Configure the `os` field in the gzip header.
254    ///
255    /// This is taken from `std::env::consts::OS` if not set explicitly.
256    pub fn os(mut self, os: FileSystemType) -> GzBuilder {
257        self.os = Some(os);
258        self
259    }
260
261    /// Configure the `xfl` field in the gzip header.
262    ///
263    /// The default is `ExtraFlags::Default` (meaning not set).
264    pub fn xfl(mut self, xfl: ExtraFlags) -> GzBuilder {
265        self.xfl = xfl;
266        self
267    }
268
269    /// Transforms this builder structure into a raw vector of bytes, setting the `XFL` field to the
270    /// value specified by `lvl`.
271    pub fn into_header_xfl(mut self, lvl: ExtraFlags) -> Vec<u8> {
272        self.xfl = lvl;
273        self.into_header()
274    }
275
276    /// Transforms this builder structure into a raw vector of bytes.
277    pub fn into_header(self) -> Vec<u8> {
278        self.into_header_inner(false)
279    }
280
281    /// Transforms this builder structure into a raw vector of bytes.
282    pub fn into_header_with_checksum(self) -> Vec<u8> {
283        self.into_header_inner(true)
284    }
285
286    fn into_header_inner(self, use_crc: bool) -> Vec<u8> {
287        let GzBuilder {
288            extra,
289            filename,
290            comment,
291            mtime,
292            os,
293            xfl,
294        } = self;
295        let os = match os {
296            Some(f) => f,
297            // Set the OS based on the system the binary is compiled for if not set,
298            // as this is a required field.
299            // These defaults are taken from what modern zlib uses, which are not the same as
300            // what's used in flate2.
301            None => match env::consts::OS {
302                "linux" | "freebsd" | "dragonfly" | "netbsd" | "openbsd" | "solaris" | "bitrig" => {
303                    FileSystemType::Unix
304                }
305                "macos" => FileSystemType::Apple,
306                "win32" => FileSystemType::Tops20OrNTFS,
307                _ => FileSystemType::Unknown,
308            },
309        };
310        let mut flg = 0;
311        if use_crc {
312            flg |= FHCRC;
313        };
314        let mut header = vec![0u8; 10];
315
316        if let Some(v) = extra {
317            flg |= FEXTRA;
318            header.push((v.len()/* >> 0*/) as u8);
319            header.push((v.len() >> 8) as u8);
320            header.extend(v);
321        }
322
323        if let Some(filename) = filename {
324            flg |= FNAME;
325            header.extend(filename.as_bytes_with_nul().iter().cloned());
326        }
327
328        if let Some(comment) = comment {
329            flg |= FCOMMENT;
330            header.extend(comment.as_bytes_with_nul().iter().cloned());
331        }
332
333        header[0] = 0x1f;
334        header[1] = 0x8b;
335        header[2] = 8;
336        header[3] = flg;
337        header[4] = mtime /*>> 0*/ as u8;
338        header[5] = (mtime >> 8) as u8;
339        header[6] = (mtime >> 16) as u8;
340        header[7] = (mtime >> 24) as u8;
341        header[8] = xfl.as_u8();
342        header[9] = os.as_u8();
343
344        if use_crc {
345            let mut crc = Crc::new();
346            crc.update(&header);
347            let checksum = crc.sum() as u16;
348            header.extend(&[checksum as u8, (checksum >> 8) as u8]);
349        }
350
351        header
352    }
353}
354
/// A structure representing the raw header of a gzip stream.
///
/// The header can contain metadata about the file that was compressed, if
/// present.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct GzHeader {
    extra: Option<Vec<u8>>,
    filename: Option<Vec<u8>>,
    comment: Option<Vec<u8>>,
    mtime: u32,
    os: u8,
    xfl: u8,
}

impl GzHeader {
    /// Returns the `filename` field of this gzip header, if present.
    ///
    /// The `filename` field in the gzip header is supposed to be stored using ISO 8859-1
    /// (LATIN-1) encoding and be zero-terminated if following the specification.
    pub fn filename(&self) -> Option<&[u8]> {
        self.filename.as_deref()
    }

    /// Returns the `extra` field of this gzip header, if present.
    pub fn extra(&self) -> Option<&[u8]> {
        self.extra.as_deref()
    }

    /// Returns the `comment` field of this gzip stream's header, if present.
    ///
    /// The `comment` field in the gzip header is supposed to be stored using ISO 8859-1 (LATIN-1)
    /// encoding and be zero-terminated if following the specification.
    pub fn comment(&self) -> Option<&[u8]> {
        self.comment.as_deref()
    }

    /// Returns the `mtime` field of this gzip header.
    ///
    /// This gives the most recent modification time of the contained file, or alternatively
    /// the timestamp of when the file was compressed if the data did not come from a file, or
    /// a timestamp was not available when compressing. The time is specified in the Unix format,
    /// that is: seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this may cause problems for
    /// MS-DOS and other systems that use local rather than Universal time.)
    /// An `mtime` value of 0 means that the timestamp is not set.
    pub const fn mtime(&self) -> u32 {
        self.mtime
    }

    /// Returns the `mtime` field of this gzip header as a `SystemTime` if present.
    ///
    /// Returns `None` if the `mtime` is not set, i.e 0.
    /// See [`mtime`](#method.mtime) for more detail.
    pub fn mtime_as_datetime(&self) -> Option<time::SystemTime> {
        match self.mtime {
            0 => None,
            secs => Some(time::UNIX_EPOCH + time::Duration::from_secs(u64::from(secs))),
        }
    }

    /// Returns the `os` field of this gzip stream's header.
    pub const fn os(&self) -> u8 {
        self.os
    }

    /// Returns the `xfl` field of this gzip stream's header.
    pub const fn xfl(&self) -> u8 {
        self.xfl
    }
}
427
// Renders an optional byte field for display: lossy UTF-8 if present, a placeholder otherwise.
#[inline]
fn into_string(data: Option<&[u8]>) -> Cow<str> {
    match data {
        Some(bytes) => String::from_utf8_lossy(bytes),
        None => Cow::Borrowed("(Not set)"),
    }
}
435
436impl fmt::Display for GzHeader {
437    /// Crudely display the contents of the header
438    ///
439    /// Note that filename/commend are required to be ISO 8859-1 (LATIN-1) encoded by the spec,
440    /// however to avoid dragging in dependencies we simply interpret them as UTF-8.
441    /// This may result in garbled output if the names contain special characters.
442    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
443        write!(
444            f,
445            "Filename: {}\n\
446             Comment: {}\n\
447             Extra: {:?}\n\
448             mtime: {}\n\
449             os: {}\n\
450             xfl: {}",
451            into_string(self.filename()),
452            into_string(self.comment()),
453            // We display extra as raw bytes for now.
454            self.extra,
455            self.mtime,
456            FileSystemType::from_u8(self.os),
457            ExtraFlags::Default, //ExtraFlags::from_u8(self.xfl),
458        )
459    }
460}
461
// Error returned when the stored header checksum does not match the computed one.
fn corrupt() -> io::Error {
    let message = "corrupt gzip stream does not have a matching header checksum";
    io::Error::new(io::ErrorKind::InvalidInput, message)
}
468
// Error returned when the fixed part of the header is not a valid gzip header.
fn bad_header() -> io::Error {
    let kind = io::ErrorKind::InvalidInput;
    io::Error::new(kind, "invalid gzip header")
}
472
/// Reads exactly two bytes from `r` and interprets them as a little-endian `u16`.
///
/// Any error from the underlying `read_exact` call is returned unchanged.
fn read_le_u16<R: Read>(r: &mut R) -> io::Result<u16> {
    let mut raw = [0u8; 2];
    r.read_exact(&mut raw).map(|()| u16::from_le_bytes(raw))
}
479
480/// Try to read a gzip header from the provided reader.
481///
482/// Returns a `GzHeader` with the fields filled out if sucessful, or an `io::Error` with
483/// `ErrorKind::InvalidInput` if decoding of the header.
484///
485/// Note that a gzip steam can contain multiple "members". Each member contains a header,
486/// followed by compressed data and finally a checksum and byte count.
487/// This method will only read the header for the "member" at the start of the stream.
488pub fn read_gz_header<R: Read>(r: &mut R) -> io::Result<GzHeader> {
489    let mut crc_reader = CrcReader::new(r);
490    let mut header = [0; 10];
491    crc_reader.read_exact(&mut header)?;
492
493    // `ID1` and `ID2` are fixed values to identify a gzip file.
494    let id1 = header[0];
495    let id2 = header[1];
496    if id1 != 0x1f || id2 != 0x8b {
497        return Err(bad_header());
498    }
499    // `CM` describes the compression method. Currently only method 8 (DEFLATE) is specified.
500    // by the gzip format.
501    let cm = header[2];
502    if cm != 8 {
503        return Err(bad_header());
504    }
505
506    // `FLG` the bits in this field indicates whether the `FTEXT`, `FHCRC`, `FEXTRA`, `FNAME` and
507    // `FCOMMENT` fields are present in the header.
508    let flg = header[3];
509    let mtime = (header[4] as u32/* << 0*/)
510        | ((header[5] as u32) << 8)
511        | ((header[6] as u32) << 16)
512        | ((header[7] as u32) << 24);
513    // `XFL` describes the compression level used by the encoder. (May not actually
514    // match what the encoder used and has no impact on decompression.)
515    let xfl = header[8];
516    // `os` describes what type of operating system/file system the file was created on.
517    let os = header[9];
518
519    let extra = if flg & FEXTRA != 0 {
520        // Length of the FEXTRA field.
521        let xlen = read_le_u16(&mut crc_reader)?;
522        let mut extra = vec![0; xlen as usize];
523        crc_reader.read_exact(&mut extra)?;
524        Some(extra)
525    } else {
526        None
527    };
528    let filename = if flg & FNAME != 0 {
529        // wow this is slow
530        let mut b = Vec::new();
531        for byte in crc_reader.by_ref().bytes() {
532            let byte = byte?;
533            if byte == 0 {
534                break;
535            }
536            b.push(byte);
537        }
538        Some(b)
539    } else {
540        None
541    };
542    let comment = if flg & FCOMMENT != 0 {
543        // wow this is slow
544        let mut b = Vec::new();
545        for byte in crc_reader.by_ref().bytes() {
546            let byte = byte?;
547            if byte == 0 {
548                break;
549            }
550            b.push(byte);
551        }
552        Some(b)
553    } else {
554        None
555    };
556
557    // If the `FHCRC` flag is set, the header contains a two-byte CRC16 checksum of the header bytes
558    // that needs to be validated.
559    if flg & FHCRC != 0 {
560        let calced_crc = crc_reader.crc().sum() as u16;
561        let stored_crc = read_le_u16(&mut crc_reader)?;
562        if calced_crc != stored_crc {
563            return Err(corrupt());
564        }
565    }
566
567    Ok(GzHeader {
568        extra,
569        filename,
570        comment,
571        mtime,
572        os,
573        xfl,
574    })
575}
576
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Builds a header with every optional field set, parses it back and checks each field.
    fn roundtrip_inner(use_crc: bool) {
        const COMMENT: &[u8] = b"Comment";
        const FILENAME: &[u8] = b"Filename";
        const EXTRA: &[u8] = &[1, 2, 3, 4];
        const MTIME: u32 = 12345;
        const OS: FileSystemType = FileSystemType::NTFS;
        const XFL: ExtraFlags = ExtraFlags::FastestCompression;

        let header = GzBuilder::new()
            .extra(EXTRA)
            .comment(COMMENT)
            .filename(FILENAME)
            .mtime(MTIME)
            .os(OS)
            .xfl(XFL)
            .into_header_inner(use_crc);

        let mut reader = Cursor::new(header);

        let header_read = read_gz_header(&mut reader).unwrap();

        assert_eq!(header_read.extra().unwrap(), EXTRA);
        assert_eq!(header_read.comment().unwrap(), COMMENT);
        assert_eq!(header_read.filename().unwrap(), FILENAME);
        assert_eq!(header_read.mtime(), MTIME);
        assert_eq!(header_read.os(), OS.as_u8());
        assert_eq!(header_read.xfl(), XFL.as_u8());
    }

    #[test]
    fn roundtrip() {
        roundtrip_inner(false);
    }

    #[test]
    fn roundtrip_with_crc() {
        roundtrip_inner(true);
    }

    /// Every known value maps to itself, every other value maps to `Unknown`.
    #[test]
    fn filesystem_enum() {
        for n in 0..20 {
            assert_eq!(n, FileSystemType::from_u8(n).as_u8());
        }

        for n in 20..=u8::max_value() {
            assert_eq!(FileSystemType::from_u8(n), FileSystemType::Unknown);
        }
    }
}