oem_cp 2.0.0

Rust library that handles OEM code pages (e.g. CP{437,737,850}) for single byte character sets
Documentation
use std::collections::HashMap;
use std::convert::TryInto;
use std::fs::File;
use std::io::{BufReader, BufWriter, Write};
use std::path::PathBuf;
use std::{env, io};

use serde::Deserialize;

enum Table {
    Complete([char; 128]),
    Incomplete([Option<char>; 128]),
}

/// Parsed code tables from `assets/code_tables.json`
struct CodeTables {
    /// The file creation time as a ISO 8601 Timestamp
    created: String,
    /// The code tables
    ///
    /// `(code_page, table)`
    tables: Vec<(u16, Table)>,
}

fn main() -> io::Result<()> {
    generate_tables()?;

    Ok(())
}

/// Generates `$OUT_DIR/code_table.rs` from `./assets/code_tables.json`
fn generate_tables() -> io::Result<()> {
    let code_tables = parse_code_tables()?;
    let mut output = open_output()?;

    write_header(&mut output, code_tables.created)?;

    for (code_page, table) in &code_tables.tables {
        write_decoding(&mut output, *code_page, table)?;
    }

    for (code_page, table) in &code_tables.tables {
        write_encoding(&mut output, *code_page, table)?;
    }

    write_decoding_table_cp_map(&mut output, &code_tables.tables)?;
    write_encoding_table_cp_map(&mut output, &code_tables.tables)?;

    write_footer(&mut output)?;

    Ok(())
}

fn open_output() -> io::Result<BufWriter<File>> {
    let path = {
        let mut path = PathBuf::from(env::var("OUT_DIR").unwrap());
        path.push("code_table.rs");
        path
    };
    let output = BufWriter::new(File::create(path)?);
    Ok(output)
}

fn parse_code_tables() -> io::Result<CodeTables> {
    let path = {
        let mut path = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
        path.push("assets");
        path.push("code_tables.json");
        path
    };
    let file = BufReader::new(File::open(path)?);

    #[derive(Deserialize)]
    struct JsonCodeTables {
        created: String,
        tables: HashMap<String, Vec<Option<u32>>>,
    }
    let JsonCodeTables { created, tables } = serde_json::from_reader(file).unwrap();

    let mut tables = tables
        .into_iter()
        .map(|(code_page, table)| {
            let complete = table.iter().all(Option::is_some);
            let code_page = code_page.parse().unwrap();
            let table = table
                .into_iter()
                .skip(128)
                .map(|i| i.map(|i| char::from_u32(i).unwrap()));
            let table = if complete {
                Table::Complete(
                    table
                        .map(Option::unwrap)
                        .collect::<Vec<_>>()
                        .try_into()
                        .unwrap(),
                )
            } else {
                Table::Incomplete(table.collect::<Vec<_>>().try_into().unwrap())
            };
            (code_page, table)
        })
        .collect::<Vec<_>>();

    tables.sort_unstable_by_key(|(code_page, _table)| *code_page);

    Ok(CodeTables { created, tables })
}

fn write_header(mut dst: impl Write, created: String) -> io::Result<()> {
    writeln!(
        &mut dst,
        "/// Code table
/// Generated at {created}
pub mod code_table {{

use super::code_table_type::TableType;
use super::OEMCPHashMap;
use TableType::*;
"
    )
}

fn write_decoding(mut dst: impl Write, code_page: u16, table: &Table) -> io::Result<()> {
    writeln!(&mut dst, "/// Decoding table (CP{code_page} to Unicode)")?;
    match table {
        Table::Complete(table) => {
            writeln!(
                &mut dst,
                "pub static DECODING_TABLE_CP{code_page}: [char; 128] = {table:?};"
            )?;
        }
        Table::Incomplete(table) => {
            writeln!(
                &mut dst,
                "pub static DECODING_TABLE_CP{code_page}: [Option<char>; 128] = {table:?};"
            )?;
        }
    }

    writeln!(&mut dst)?;

    Ok(())
}

fn write_encoding(mut dst: impl Write, code_page: u16, table: &Table) -> io::Result<()> {
    let mut map = phf_codegen::Map::new();

    match table {
        Table::Complete(table) => {
            for (i, c) in table
                .iter()
                .copied()
                .enumerate()
                .map(|(i, c)| (i + 0x80, c))
            {
                map.entry(c, &i.to_string());
            }
        }
        Table::Incomplete(table) => {
            for (i, c) in table
                .iter()
                .copied()
                .enumerate()
                .filter_map(|(i, c)| c.map(|c| (i + 0x80, c)))
            {
                map.entry(c, &i.to_string());
            }
        }
    }

    write!(
        &mut dst,
        "/// Encoding table (Unicode to CP{code_page})
pub static ENCODING_TABLE_CP{code_page}: OEMCPHashMap<char, u8> = {map};",
        map = map.build()
    )?;

    Ok(())
}

fn write_decoding_table_cp_map(mut dst: impl Write, tables: &[(u16, Table)]) -> io::Result<()> {
    let mut map = phf_codegen::Map::new();

    for (code_page, table) in tables {
        let ty = match table {
            Table::Complete(_) => "Complete",
            Table::Incomplete(_) => "Incomplete",
        };
        map.entry(code_page, &format!("{ty}(&DECODING_TABLE_CP{code_page})"));
    }

    writeln!(
        &mut dst,
        r#"/// map from codepage to decoding table
///
/// `.get` returns `code_table_type::{{Complete,Incomplete}}`.
///
/// * `Complete`: the decoding table doesn't have undefined mapping.
/// * `Incomplete`:  it have some undefined mapping.
///
/// This enumerate provides methods `decode_string_lossy` and `decode_string_checked`.
/// The following examples show the use of them.  `if let Some(decoder) = *snip* decoder.decode_string_*snip*` is convenient for practical use.
///
/// # Examples
///
/// ```
/// use oem_cp::code_table::{{DECODING_TABLE_CP_MAP, DECODING_TABLE_CP437}};
/// use oem_cp::code_table_type::TableType::*;
/// assert_eq!(DECODING_TABLE_CP_MAP.get(&437).unwrap().decode_string_lossy(vec![0x31, 0xF6, 0xAB, 0x3D, 0x32]), "1÷½=2".to_string());
/// if let Some(cp874_table) = DECODING_TABLE_CP_MAP.get(&874) {{
///     // means shrimp in Thai (U+E49 => 0xE9)
///     assert_eq!(cp874_table.decode_string_checked(vec![0xA1, 0xD8, 0xE9, 0xA7]), Some("กุ้ง".to_string()));
///     // undefined mapping 0xDB for CP874 Windows dialect (strict mode with MB_ERR_INVALID_CHARS)
///     assert_eq!(cp874_table.decode_string_checked(vec![0xDB]), None);
/// }} else {{
///     panic!("CP874 must be defined in DECODING_TABLE_CP_MAP");
/// }}
/// ```
pub static DECODING_TABLE_CP_MAP: OEMCPHashMap<u16, TableType> = {map};"#,
        map = map.build()
    )?;

    Ok(())
}

fn write_encoding_table_cp_map(mut dst: impl Write, tables: &[(u16, Table)]) -> io::Result<()> {
    let mut map = phf_codegen::Map::new();

    for (code_page, _table) in tables {
        map.entry(*code_page, &format!("&ENCODING_TABLE_CP{code_page}"));
    }

    writeln!(
        &mut dst,
        r#"/// map from codepage to encoding table
///
/// # Examples
///
/// ```
/// # use std::ptr;
/// use oem_cp::code_table::{{ENCODING_TABLE_CP_MAP, ENCODING_TABLE_CP437}};
/// assert!(ptr::eq(*ENCODING_TABLE_CP_MAP.get(&437).unwrap(), &ENCODING_TABLE_CP437));
/// // CP932 (Shift-JIS; Japanese MBCS) is unsupported
/// assert!(ENCODING_TABLE_CP_MAP.get(&932).is_none());
///
/// use oem_cp::encode_string_checked;
///
/// if let Some(cp437_table) = ENCODING_TABLE_CP_MAP.get(&437) {{
///     assert_eq!(encode_string_checked("π≈22/7", cp437_table), Some(vec![0xE3, 0xF7, 0x32, 0x32, 0x2F, 0x37]));
/// }} else {{
///     panic!("CP437 must be registered in ENCODING_TABLE_CP_MAP");
/// }}
/// ```
pub static ENCODING_TABLE_CP_MAP: OEMCPHashMap<u16, &'static OEMCPHashMap<char, u8>> = {map};"#,
        map = map.build()
    )?;

    Ok(())
}

fn write_footer(mut dst: impl Write) -> io::Result<()> {
    writeln!(&mut dst, "}}")
}