xoev-xwasser-codelists 0.700.7+0.7.0

"XOEV XWasser XML Standard"
use std::io::Write;
use std::path::{Path, PathBuf};

#[path = "src/cl/mod.rs"]
mod cl;

use cl::model::{CodeList, DataSource};

use raxb::de::from_str;
use raxb::quick_xml::events::Event;

/// Serializable wrapper around the list of entries that make up one data set.
#[derive(serde::Serialize)]
pub struct Data<T> {
    /// The collected entries.
    items: Vec<T>,
}

impl<T> Default for Data<T> {
    fn default() -> Self {
        Self { items: Vec::new() }
    }
}

/// A set of entries together with the open file they are written to by `save`.
pub struct DataSet<T> {
    /// Destination file that `save` writes the encoded data into.
    file: std::fs::File,
    /// Entries held in memory until they are saved.
    data: Data<T>,
}

impl<T> DataSet<T> {
    /// Appends `entry` to the in-memory list of items.
    ///
    /// This only touches memory; the file is not written here.
    pub fn insert(&mut self, entry: T) {
        let items = &mut self.data.items;
        items.push(entry);
    }
}

impl<T> DataSet<T>
where
    T: serde::ser::Serialize,
{
    /// Encodes the collected data as BSON and writes the bytes to the file.
    ///
    /// # Errors
    ///
    /// Returns an error if the BSON encoding or the file write fails.
    pub fn save(&mut self) -> anyhow::Result<()> {
        let encoded = bson::to_vec(&self.data)?;
        self.file.write_all(&encoded)?;
        Ok(())
    }
}

/// Creates [`DataSet`]s whose files all live inside one directory.
pub struct DataSetFactory {
    /// Directory in which the data set files are created.
    data_set_path: PathBuf,
}

impl DataSetFactory {
    /// Creates a factory rooted at `data_set_path`.
    ///
    /// The directory (including missing parents) is created when the path does
    /// not exist yet.
    ///
    /// # Errors
    ///
    /// Returns an error if the directory cannot be created.
    pub fn new<P: AsRef<Path>>(data_set_path: P) -> anyhow::Result<Self> {
        let dir = data_set_path.as_ref();
        if !dir.exists() {
            std::fs::create_dir_all(dir)?;
        }
        let data_set_path = dir.to_path_buf();
        Ok(Self { data_set_path })
    }

    /// Opens a fresh file named `T::name()` inside the factory directory and
    /// returns a [`DataSet`] pre-filled with `items`.
    ///
    /// A file already present at that location is removed before the new one
    /// is created.
    ///
    /// # Errors
    ///
    /// Returns an error if removing the old file or creating the new one fails.
    pub fn create<T>(&self, items: Vec<T>) -> anyhow::Result<DataSet<T>>
    where
        T: DataSource,
    {
        let target = self.data_set_path.join(T::name());
        if target.exists() {
            std::fs::remove_file(&target)?;
        }
        let file = std::fs::File::create(&target)?;
        let data = Data { items };
        Ok(DataSet { file, data })
    }
}

/// Appends a Markdown section describing `codelist` to `s`.
///
/// The section consists of a level-2 heading with the long name, the codelist
/// description, a two-column table with the identification values, and a
/// table with one column per header field listing its type, usage and
/// language.
///
/// # Errors
///
/// Returns an error if formatting into `s` fails.
fn write_codelist_docs(s: &mut String, codelist: &CodeList) -> anyhow::Result<()> {
    use std::fmt::Write;

    let header = &codelist.header;
    let ident = &header.identification;

    // Heading followed by the free-text description.
    writeln!(s, "## {}", ident.long_name.as_ref())?;
    writeln!(s)?;
    writeln!(s, "{}", header.description.codelist_description)?;
    writeln!(s)?;

    // Identification table (its header row is intentionally blank).
    writeln!(s, "| | |")?;
    writeln!(s, "| -- | -- |")?;
    writeln!(s, "| short name | {} |", ident.short_name.as_ref())?;
    writeln!(s, "| canonical uri | `{}` |", ident.canonical_uri.as_ref())?;
    writeln!(
        s,
        "| canonical version uri | {} |",
        ident.canonical_version_uri.as_ref()
    )?;
    writeln!(s)?;

    // Field table: one column per field, titled with the field id and its index.
    let mut column_titles: Vec<String> = Vec::new();
    for (id, field) in header.fields.iter().enumerate() {
        column_titles.push(format!("`{}` ({id})", field.id.as_ref()));
    }
    writeln!(s, "| Field | {} |", column_titles.join(" | "))?;
    // One separator cell for the label column plus one per field.
    writeln!(s, "|{}", " -- |".repeat(column_titles.len() + 1))?;

    let type_cells: String = header
        .fields
        .iter()
        .map(|field| format!("| {:?} ", field.field_type))
        .collect();
    writeln!(s, "| type {type_cells}|")?;

    let usage_cells: String = header
        .fields
        .iter()
        .map(|field| format!("| {:?} ", field.usage))
        .collect();
    writeln!(s, "| Usage {usage_cells}|")?;

    let lang_cells: String = header
        .fields
        .iter()
        .map(|field| format!("| {:?} ", field.lang))
        .collect();
    writeln!(s, "| Lang {lang_cells}|")?;

    writeln!(s)?;

    Ok(())
}

/// Renders the README for one codelist version as a Markdown string.
///
/// The document starts with a title naming `version`, followed by one section
/// per entry of `items`, in slice order.
///
/// # Errors
///
/// Returns the first error reported while rendering a codelist section.
fn write_readme(items: &[CodeList], version: &str) -> anyhow::Result<String> {
    let mut readme = format!("# XWasser codelists - version `{version}`\n\n");
    items
        .iter()
        .try_for_each(|codelist| write_codelist_docs(&mut readme, codelist))?;
    Ok(readme)
}

/// Loads the XML document at `path` and returns its content as UTF-8 text.
///
/// When the XML declaration names the encoding `ISO-8859-1`, the bytes are
/// decoded via `encoding_rs`, every occurrence of `ISO-8859-1` in the decoded
/// text is replaced with `UTF-8`, and the converted text is written back to
/// `path`. In every other case the raw bytes must already be valid UTF-8.
///
/// # Errors
///
/// Returns an error if the file cannot be read or rewritten, if the XML
/// reader reports an error while looking for the declaration, or if the
/// content is not valid UTF-8.
fn read_xml_as_utf8(path: &Path) -> anyhow::Result<String> {
    let xml = std::fs::read(path)?;
    let mut transcoded: Option<String> = None;
    {
        // The reader only borrows the bytes, so the file content is not copied.
        // The inner scope ends that borrow before `xml` is moved below.
        let mut rdr = raxb::quick_xml::reader::Reader::from_reader(xml.as_slice());
        let mut buf = Vec::new();
        loop {
            match rdr.read_event_into(&mut buf) {
                Ok(Event::Decl(decl)) => match decl.encoding() {
                    Some(Ok(encoding)) => {
                        if encoding.as_ref() == b"ISO-8859-1" {
                            let encoder = encoding_rs::Encoding::for_label(b"iso-8859-1")
                                .ok_or_else(|| anyhow::anyhow!("unknown encoding ISO_8859_1"))?;
                            // NOTE(review): the flag is assumed to mean malformed input
                            // was replaced while decoding - confirm against encoding_rs.
                            let (result, had_errors) = encoder.decode_with_bom_removal(&xml);
                            eprintln!("{had_errors}");
                            let utf8 = result.replace("ISO-8859-1", "UTF-8");
                            std::fs::write(path, utf8.as_bytes())?;
                            transcoded = Some(utf8);
                        }
                    }
                    _ => break,
                },
                Err(err) => {
                    return Err(anyhow::anyhow!(
                        "unable to read xml {}: {:#?}",
                        path.display(),
                        err
                    ));
                }
                _ => break,
            }
        }
    }
    match transcoded {
        Some(text) => Ok(text),
        None => String::from_utf8(xml).map_err(|err| {
            anyhow::anyhow!("{} is not valid UTF-8: {}", path.display(), err)
        }),
    }
}

/// Converts the XML codelists below `./data/<version>/` into generated artifacts.
///
/// For every entry of `./data` this
/// 1. parses each `*.xml` file of that version (files declared as
///    `ISO-8859-1` are converted to UTF-8 and rewritten in place),
/// 2. keeps the codelists that contain at least one value,
/// 3. writes `./public/<version>/README.md` and
///    `./public/<version>/codelist.json`, and
/// 4. stores the codelists as a BSON data set in `$OUT_DIR/data/<version>/`.
///
/// # Errors
///
/// Returns an error on any I/O failure, when an XML file cannot be read or
/// deserialized, when a version name is not valid UTF-8, or when `OUT_DIR`
/// is not set.
fn main() -> anyhow::Result<()> {
    let public_out = Path::new("./public");
    let data_dir = Path::new("./data");
    // NOTE: `read_dir` yields entries in a platform and filesystem dependent
    // order, so the order of the generated sections follows that order.
    for e in std::fs::read_dir(data_dir)? {
        let version = e?.file_name();
        let version_name = version.to_str().ok_or_else(|| {
            anyhow::anyhow!("version directory name {:?} is not valid UTF-8", version)
        })?;
        let version_dir = data_dir.join(&version);
        let json_dir = public_out.join(&version);
        let xml_files = std::fs::read_dir(&version_dir)?;
        let mut items = Vec::new();
        let mut json_content = Vec::new();
        std::fs::create_dir_all(&json_dir)?;
        let json_file = json_dir.join("codelist.json");
        let readme_file = json_dir.join("README.md");
        for xml_file in xml_files {
            let xml_path = xml_file?.path();
            eprintln!("process {}", xml_path.display());
            let is_xml = xml_path
                .extension()
                .map_or(false, |ext| ext.eq_ignore_ascii_case("xml"));
            if !is_xml {
                continue;
            }
            let s = read_xml_as_utf8(&xml_path)?;
            let result: Result<cl::parser::input::CodeList, _> = from_str(&s);
            let result = result.map_err(|err| {
                anyhow::anyhow!(
                    "unable to deserialize xml {}: {:?}",
                    xml_path.display(),
                    err
                )
            })?;
            let parsed: CodeList = result.into();
            // Codelists without values are left out of every generated artifact.
            if !parsed.values.is_empty() {
                json_content.push(serde_json::to_string(&parsed)?);
                items.push(parsed);
            }
        }
        std::fs::write(&readme_file, write_readme(&items, version_name)?)?;
        let out_dir = std::env::var("OUT_DIR")
            .map_err(|err| anyhow::anyhow!("OUT_DIR variable: {}", err))?;
        DataSetFactory::new(PathBuf::from(out_dir).join("data").join(&version))?
            .create::<CodeList>(items)?
            .save()?;
        std::fs::write(
            &json_file,
            format!("[\n  {}\n]", json_content.join(",\n  ")),
        )?;
    }
    Ok(())
}