1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
//! Character encodings used in ZIP files.
//!
//! ZIP entry paths may be encoded in a variety of character encodings:
//! historically, CP-437 was used, but many modern zip files use UTF-8 with an
//! optional UTF-8 flag.
//!
//! Others use the system's local character encoding, and we have no choice but
//! to make an educated guess thanks to the chardet-ng crate.

use std::fmt;

/// Encodings supported by this crate
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
pub enum Encoding {
    /// [UTF-8](https://en.wikipedia.org/wiki/UTF-8), opt-in for ZIP files.
    Utf8,

    /// [Codepage 437](https://en.wikipedia.org/wiki/Code_page_437), also known as
    /// OEM-US, PC-8, or DOS Latin US.
    ///
    /// This is the fallback if UTF-8 is not specified and no other encoding
    /// is auto-detected. It was the original encoding of the zip format.
    Cp437,

    /// [Shift JIS](https://en.wikipedia.org/wiki/Shift_JIS), also known as SJIS.
    ///
    /// Still in use by some Japanese users as of 2019.
    ShiftJis,
}

impl fmt::Display for Encoding {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        use Encoding as T;
        match self {
            T::Utf8 => write!(f, "utf-8"),
            T::Cp437 => write!(f, "cp-437"),
            T::ShiftJis => write!(f, "shift-jis"),
        }
    }
}

/// Errors encountered while converting text to UTF-8.
#[derive(Debug, thiserror::Error)]
pub enum DecodingError {
    /// Text claimed to be UTF-8, but wasn't (as far as we can tell).
    #[error("invalid utf-8: {0}")]
    Utf8Error(std::str::Utf8Error),

    /// Text is too large to be converted.
    ///
    /// In practice, this happens if the text's length is larger than
    /// [usize::MAX], which seems unlikely.
    #[error("text too large to be converted")]
    StringTooLarge,

    /// Text is not valid in the given encoding.
    #[error("encoding error: {0}")]
    EncodingError(&'static str),
}

impl From<std::str::Utf8Error> for DecodingError {
    fn from(e: std::str::Utf8Error) -> Self {
        DecodingError::Utf8Error(e)
    }
}

impl Encoding {
    pub(crate) fn decode(&self, i: &[u8]) -> Result<String, DecodingError> {
        match self {
            Encoding::Utf8 => {
                let s = std::str::from_utf8(i)?;
                Ok(s.to_string())
            }
            Encoding::Cp437 => Ok(oem_cp::decode_string_complete_table(
                i,
                &oem_cp::code_table::DECODING_TABLE_CP437,
            )),
            Encoding::ShiftJis => self.decode_as(i, encoding_rs::SHIFT_JIS),
        }
    }

    fn decode_as(
        &self,
        i: &[u8],
        encoding: &'static encoding_rs::Encoding,
    ) -> Result<String, DecodingError> {
        let mut decoder = encoding.new_decoder();
        let len = decoder
            .max_utf8_buffer_length(i.len())
            .ok_or(DecodingError::StringTooLarge)?;
        let mut v = vec![0u8; len];
        let last = true;
        let (_decoder_result, _decoder_read, decoder_written, had_errors) =
            decoder.decode_to_utf8(i, &mut v, last);
        if had_errors {
            return Err(DecodingError::EncodingError(encoding.name()));
        }
        v.resize(decoder_written, 0u8);
        Ok(unsafe { String::from_utf8_unchecked(v) })
    }
}

// detect_utf8 reports whether s is a valid UTF-8 string, and whether the string
// must be considered UTF-8 encoding (i.e., not compatible with CP-437, ASCII,
// or any other common encoding).
pub(crate) fn detect_utf8(input: &[u8]) -> (bool, bool) {
    match std::str::from_utf8(input) {
        Err(_) => {
            // not valid utf-8
            (false, false)
        }
        Ok(s) => {
            let mut require = false;

            // Officially, ZIP uses CP-437, but many readers use the system's
            // local character encoding. Most encoding are compatible with a large
            // subset of CP-437, which itself is ASCII-like.
            //
            // Forbid 0x7e and 0x5c since EUC-KR and Shift-JIS replace those
            // characters with localized currency and overline characters.
            for c in s.chars() {
                if c < 0x20 as char || c > 0x7d as char || c == 0x5c as char {
                    require = true
                }
            }
            (true, require)
        }
    }
}