parse_book_source/analyzer/
html.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
use super::Analyzer;
use anyhow::anyhow;
use regex::Regex;
use scraper::{Html, Selector};

/// Decodes the small set of HTML entities this module cares about and turns
/// literal `<br/>` tags into newlines.
///
/// Handled entities: `&lt;`, `&gt;`, `&nbsp;` (to a plain space), `&#39;`,
/// `&quot;` and `&amp;`. Any other entity is left untouched.
///
/// `&amp;` is decoded *after* the other entities so that doubly-escaped input
/// such as `&amp;lt;` becomes the literal text `&lt;` instead of being decoded
/// twice into `<`. The `<br/>` replacement runs last, so an escaped
/// `&lt;br/&gt;` also ends up as a newline.
fn html_decode(s: &str) -> String {
    // Order matters: see the doc comment above.
    const REPLACEMENTS: [(&str, &str); 7] = [
        ("&lt;", "<"),
        ("&gt;", ">"),
        ("&nbsp;", " "),
        ("&#39;", "'"),
        ("&quot;", "\""),
        ("&amp;", "&"),
        ("<br/>", "\n"),
    ];

    REPLACEMENTS
        .iter()
        .fold(s.to_string(), |acc, &(from, to)| acc.replace(from, to))
}

/// Converts an HTML snippet into newline-separated text-ish markup.
///
/// Steps, in order:
/// 1. every opening or closing tag matched by the block-tag pattern is
///    replaced with a newline (the pattern matches on the listed name followed
///    by anything up to the next `>`, so it also catches tags that merely
///    start with one of those names);
/// 2. HTML comments are removed;
/// 3. the remainder is passed through `html_decode`.
fn get_html_string(html: &str) -> String {
    let block_tags = Regex::new(r"</?(?:div|p|br|hr|h\d|article|b|dd|dl|html)[^>]*>").unwrap();
    let comments = Regex::new(r"<!--[\w\W\r\n]*?-->").unwrap();

    let without_tags = block_tags.replace_all(html, "\n");
    let without_comments = comments.replace_all(&without_tags, "");

    html_decode(&without_comments)
}

/// Analyzer that evaluates rules against an HTML document held as a string.
pub struct HtmlAnalyzer {
    /// Raw HTML source; it is re-parsed on each query rather than cached.
    content: String,
}

impl Analyzer for HtmlAnalyzer {
    /// Stores a copy of `content`; no HTML parsing happens at this point.
    fn parse(content: &str) -> crate::Result<Self>
    where
        Self: Sized,
    {
        Ok(Self {
            content: content.to_string(),
        })
    }

    /// Returns the outer HTML of every element matching the CSS selector
    /// `rule` (surrounding whitespace is ignored).
    ///
    /// # Errors
    /// Returns an error when `rule` is not a valid selector.
    fn get_elements(&self, rule: &str) -> crate::Result<Vec<String>> {
        let document = Html::parse_document(&self.content);
        let selector = Selector::parse(rule.trim()).map_err(|e| anyhow!("{e}"))?;

        Ok(document.select(&selector).map(|el| el.html()).collect())
    }

    /// Evaluates `rule` like [`get_string_list`](Self::get_string_list) and
    /// joins the results with two spaces.
    fn get_string(&self, rule: &str) -> crate::Result<String> {
        Ok(self.get_string_list(rule)?.join("  "))
    }

    /// Evaluates a rule of the form `selector@extractor`.
    ///
    /// * Without an `@`, the whole rule is treated as the extractor and is
    ///   applied to the full content, yielding a single-element list.
    /// * With an `@`, the part before the first `@` is a CSS selector and the
    ///   part after it is the extractor applied to each matched element.
    /// * A blank selector part yields an empty list.
    ///
    /// # Errors
    /// Returns an error when the selector part is not a valid selector
    /// (previously this panicked, unlike `get_elements`).
    fn get_string_list(&self, rule: &str) -> crate::Result<Vec<String>> {
        let (selectors, last_rule) = match rule.split_once('@') {
            Some(parts) => parts,
            None => return Ok(vec![self._get_result(rule, None)]),
        };

        // Trim for consistency with `get_elements`.
        let selectors = selectors.trim();
        if selectors.is_empty() {
            return Ok(Vec::new());
        }

        // Validate the selector before paying for a document parse, and
        // report bad selectors as errors instead of panicking on rule input.
        let selector = Selector::parse(selectors).map_err(|e| anyhow!("{e}"))?;
        let document = Html::parse_document(&self.content);

        Ok(document
            .select(&selector)
            .map(|el| self._get_result(last_rule, Some(el.html().as_str())))
            .collect())
    }
}

impl HtmlAnalyzer {
    /// Applies the extractor `last_rule` to `html`, or to the analyzer's whole
    /// content when `html` is `None`. The input is parsed as an HTML fragment.
    ///
    /// Supported extractors:
    /// * `"text"` – all text under the fragment's root element, concatenated;
    /// * `"textNodes"` – the text of each element matched by `:root > *`,
    ///   joined with newlines and trimmed;
    /// * `"outerHtml"` – the serialized fragment;
    /// * `"innerHtml"` – the inner HTML of the element(s) matched by `:root`,
    ///   joined with newlines and trimmed;
    /// * `"html"` – the serialized fragment passed through `get_html_string`;
    /// * anything else – treated as an attribute name and looked up on the
    ///   first child element of the root.
    ///
    /// For the attribute case an empty string is returned when the attribute
    /// is missing or when the fragment has no child element at all (the
    /// latter used to panic).
    fn _get_result(&self, last_rule: &str, html: Option<&str>) -> String {
        let document = Html::parse_fragment(html.unwrap_or(&self.content));

        match last_rule {
            "text" => document.root_element().text().collect::<String>(),
            "textNodes" => {
                let selector =
                    Selector::parse(":root > *").expect("hard-coded selector is valid");
                document
                    .select(&selector)
                    .map(|el| el.text().collect::<String>())
                    .collect::<Vec<String>>()
                    .join("\n")
                    .trim()
                    .to_string()
            }
            "outerHtml" => document.html(),
            "innerHtml" => {
                let selector = Selector::parse(":root").expect("hard-coded selector is valid");
                document
                    .select(&selector)
                    .map(|el| el.inner_html())
                    .collect::<Vec<String>>()
                    .join("\n")
                    .trim()
                    .to_string()
            }
            "html" => get_html_string(document.html().as_str()),
            // Attribute lookup: text-only or empty fragments have no child
            // element, so fall back to "" rather than unwrapping.
            _ => document
                .root_element()
                .child_elements()
                .next()
                .map(|el| el.attr(last_rule).unwrap_or("").to_string())
                .unwrap_or_default(),
        }
    }
}