use crate::{string_parser::is_char_supported, ParserResult};
use nom::{
branch::alt,
bytes::complete::tag,
character::complete::{anychar, char, line_ending, multispace1},
combinator::{cut, map, recognize, value, verify},
error::{ErrorKind, VerboseError, VerboseErrorKind},
multi::fold_many0,
sequence::{preceded, terminated},
};
/// Namespace for the whitespace- and comment-skipping parsers.
///
/// The type carries no data; every parser is an associated function that takes the
/// input string and returns a `ParserResult`.
pub struct Sanitizer;
impl Sanitizer {
    /// Skips leading whitespace, then any number of comments.
    ///
    /// The leading whitespace is consumed but is not part of the output; the output
    /// is the span matched by `parse_comments` (each comment together with the
    /// whitespace that follows it), which is empty when no comment is present.
    pub fn parse(string: &str) -> ParserResult<&str> {
        let (after_whitespaces, _) = Self::parse_whitespaces(string)?;
        Self::parse_comments(after_whitespaces)
    }

    /// Consumes zero or more runs of whitespace and backslash-newline line
    /// continuations, returning the consumed span.
    ///
    /// Input that starts with neither yields an empty output and is left untouched.
    pub fn parse_whitespaces(string: &str) -> ParserResult<&str> {
        let blank = alt((multispace1, tag("\\\n")));
        recognize(Self::many0_(blank))(string)
    }

    /// Consumes zero or more comments, each followed by optional whitespace, and
    /// returns the whole consumed span.
    pub fn parse_comments(string: &str) -> ParserResult<&str> {
        let comment_then_blank = terminated(Self::parse_comment, Self::parse_whitespaces);
        recognize(Self::many0_(comment_then_blank))(string)
    }

    /// Parses a single `// ...` line comment or `/* ... */` block comment and
    /// returns its body, i.e. the text without the two-character opener.
    ///
    /// A lone `/` is a recoverable error. Once `//` or `/*` has been seen, the body
    /// parser is wrapped in `cut`, so a malformed body (for instance one containing
    /// an unsupported character) is reported as an unrecoverable failure instead of
    /// letting callers backtrack.
    pub fn parse_comment(string: &str) -> ParserResult<&str> {
        let line_body = preceded(char('/'), cut(Self::str_till_eol));
        let block_body = preceded(char('*'), cut(Self::str_till_star_slash));
        preceded(char('/'), alt((line_body, block_body)))(string)
    }

    /// Parses exactly one character, succeeding only if `is_char_supported`
    /// accepts it.
    pub fn parse_safe_char(string: &str) -> ParserResult<char> {
        verify(anychar, |ch: &char| is_char_supported(*ch))(string)
    }
}
impl Sanitizer {
    /// Succeeds, consuming nothing, only when `string` is empty (end of input).
    ///
    /// Otherwise returns a recoverable error of kind `Eof` located at `string`.
    fn eoi(string: &str) -> ParserResult<()> {
        if string.is_empty() {
            Ok((string, ()))
        } else {
            Err(nom::Err::Error(VerboseError { errors: vec![(string, VerboseErrorKind::Nom(ErrorKind::Eof))] }))
        }
    }

    /// Matches the end of a line: either the end of input (consuming nothing) or a
    /// line ending (`\n` or `\r\n`), which is consumed.
    fn eol(string: &str) -> ParserResult<()> {
        alt((Self::eoi, value((), line_ending)))(string)
    }

    /// Builds a parser that repeatedly applies `f` until `g` matches.
    ///
    /// At every position `g` is tried first. When it succeeds, the combined parser
    /// succeeds with the input remaining after `g` (so the terminator is consumed).
    /// Any error from `g` only means "not at the terminator yet" and is discarded.
    /// An error from `f` aborts the loop and is returned unchanged.
    ///
    /// NOTE: if `f` ever succeeds without consuming input while `g` keeps failing,
    /// this loops forever; the callers in this file only pass parsers for `f` that
    /// consume at least one character on success.
    fn till<'a, A, B, F, G>(mut f: F, mut g: G) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
    where
        F: FnMut(&'a str) -> ParserResult<'a, A>,
        G: FnMut(&'a str) -> ParserResult<'a, B>,
    {
        move |mut i| loop {
            if let Ok((i2, _)) = g(i) {
                break Ok((i2, ()));
            }
            let (i2, _) = f(i)?;
            i = i2;
        }
    }

    /// Parses the body of a line comment, up to and including the end of the line.
    ///
    /// A backslash immediately followed by `\n` is consumed as part of the body, so
    /// the comment continues on the next line. Every other character must be
    /// accepted by `parse_safe_char`, otherwise the parser fails.
    ///
    /// The returned body excludes the terminating line ending. `eol` accepts both
    /// `\n` and `\r\n`, so both forms are stripped here; stripping only `\n` would
    /// leave a stray `\r` at the end of comments in CRLF files. A lone `\r` at the
    /// end of input is ordinary content and is kept.
    fn str_till_eol(string: &str) -> ParserResult<&str> {
        map(
            recognize(Self::till(alt((value((), tag("\\\n")), value((), Sanitizer::parse_safe_char))), Self::eol)),
            |body| {
                // Try the two-byte terminator first so that `\r\n` is removed as a unit.
                body.strip_suffix("\r\n").or_else(|| body.strip_suffix('\n')).unwrap_or(body)
            },
        )(string)
    }

    /// Parses the body of a block comment, up to and including the closing `*/`.
    ///
    /// Every character before `*/` must be accepted by `parse_safe_char`; reaching
    /// the end of input without finding `*/` is an error. The returned body
    /// excludes the closing `*/`.
    fn str_till_star_slash(string: &str) -> ParserResult<&str> {
        map(recognize(Self::till(value((), Sanitizer::parse_safe_char), tag("*/"))), |i| {
            // `till` only succeeds right after `tag("*/")` matched, so the recognized
            // span always ends with those two bytes and the subtraction cannot underflow.
            &i[0..i.len() - 2]
        })(string)
    }

    /// Applies `f` zero or more times and discards every output.
    ///
    /// This is `many0` without collecting the results into a `Vec`.
    fn many0_<'a, A, F>(mut f: F) -> impl FnMut(&'a str) -> ParserResult<'a, ()>
    where
        F: FnMut(&'a str) -> ParserResult<'a, A>,
    {
        move |string| fold_many0(&mut f, || (), |_, _| ())(string)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns whether `parse_safe_char` accepts a string made of just `ch`.
    fn is_accepted(ch: char) -> bool {
        Sanitizer::parse_safe_char(&ch.to_string()).is_ok()
    }

    #[test]
    fn test_parse_safe_char() {
        // Supported characters are returned along with the unconsumed remainder.
        let parsed: &[(&str, (&str, char))] = &[
            ("A", ("", 'A')),
            ("A and more", (" and more", 'A')),
            ("\u{4141}", ("", '\u{4141}')),
            ("\u{4141} and more", (" and more", '\u{4141}')),
        ];
        for &(input, expected) in parsed {
            assert_eq!(expected, Sanitizer::parse_safe_char(input).unwrap());
        }

        // Control characters (except tab, line feed and carriage return), DEL and
        // the bidirectional override/isolate code points must be rejected.
        let rejected = ('\x00'..='\x08')
            .chain('\x0b'..='\x0c')
            .chain('\x0e'..='\x1f')
            .chain('\x7f'..='\x7f')
            .chain('\u{202a}'..='\u{202e}')
            .chain('\u{2066}'..='\u{2069}');
        for ch in rejected {
            assert!(!is_accepted(ch));
        }

        // Tab, line feed, carriage return and the code points directly next to the
        // rejected Unicode ranges are accepted.
        let accepted = ['\x09', '\x0a', '\x0d', '\u{2029}', '\u{202f}', '\u{2065}', '\u{206a}'];
        for &ch in accepted.iter() {
            assert!(is_accepted(ch));
        }
    }

    #[test]
    fn test_sanitize() {
        // Each entry is (input, (expected remainder, expected output)).
        let cases: &[(&str, (&str, &str))] = &[
            ("hello world", ("hello world", "")),
            (" hello world", ("hello world", "")),
            (" hello world", ("hello world", "")),
            ("\nhello world", ("hello world", "")),
            (" \nhello world", ("hello world", "")),
            ("hello world ", ("hello world ", "")),
            ("// hello\nhello world", ("hello world", "// hello\n")),
            ("/* hello */hello world", ("hello world", "/* hello */")),
            ("/* hello */\nhello world", ("hello world", "/* hello */\n")),
            ("/** hello */hello world", ("hello world", "/** hello */")),
            ("/** hello */\nhello world", ("hello world", "/** hello */\n")),
            ("/\nhello world", ("/\nhello world", "")),
            (" \n// hello\nhello world", ("hello world", "// hello\n")),
            (" \n /* hello */\nhello world", ("hello world", "/* hello */\n")),
            (" \n\t /** hello */\nhello world", ("hello world", "/** hello */\n")),
            (" /\nhello world", ("/\nhello world", "")),
        ];
        for &(input, expected) in cases {
            assert_eq!(expected, Sanitizer::parse(input).unwrap());
        }
    }

    #[test]
    fn test_whitespaces() {
        // Each entry is (input, (expected remainder, expected consumed whitespace)).
        let cases: &[(&str, (&str, &str))] = &[
            ("hello world", ("hello world", "")),
            (" hello world", ("hello world", " ")),
            (" hello world", ("hello world", " ")),
            ("\nhello world", ("hello world", "\n")),
            (" \nhello world", ("hello world", " \n")),
            ("\thello world", ("hello world", "\t")),
            (" \thello world", ("hello world", " \t")),
            (" \n\thello world", ("hello world", " \n\t")),
            ("hello world ", ("hello world ", "")),
        ];
        for &(input, expected) in cases {
            assert_eq!(expected, Sanitizer::parse_whitespaces(input).unwrap());
        }
    }

    #[test]
    fn test_comments() {
        // Well-formed comments: (input, (expected remainder, expected consumed span)).
        let valid: &[(&str, (&str, &str))] = &[
            ("// hello\nhello world", ("hello world", "// hello\n")),
            ("/* hello */\nhello world", ("hello world", "/* hello */\n")),
            ("/** hello */\nhello world", ("hello world", "/** hello */\n")),
            ("/\nhello world", ("/\nhello world", "")),
            ("// hel\u{4141}lo\nhello world", ("hello world", "// hel\u{4141}lo\n")),
        ];
        for &(input, expected) in valid {
            assert_eq!(expected, Sanitizer::parse_comments(input).unwrap());
        }

        // Comments containing unsupported characters must make the parser fail.
        let invalid = [
            "// hel\x08lo\nhello world",
            "// hel\u{2066}lo\nhello world",
            "/* hel\x7flo */\nhello world",
            "/* hel\u{202d}lo */\nhello world",
            "/** hel\x00lo */\nhello world",
            "/** hel\u{202a}lo */\nhello world",
        ];
        for &input in invalid.iter() {
            assert!(Sanitizer::parse_comments(input).is_err());
        }
    }
}