// lalrpop_util/lexer.rs

#![doc(hidden)]
//! The built-in lalrpop lexer
//!
//! This is the code for the built-in lexer; it is linked by lalrpop-generated parsers to provide
//! lexer support when you don't write a custom lexer.
//!
//! Typically you don't want to use the APIs in this module directly; they are public only so that
//! the generated parser can access them.
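//!
//! As a rough sketch of the flow (the patterns here are made up for
//! illustration; real ones come from the generated parser):
//!
//! ```ignore
//! let builder = MatcherBuilder::new([(r"[0-9]+", false), (r"[ \t]+", true)]).unwrap();
//! // `E` is the parser's user error type; any type works for plain lexing.
//! let tokens: Vec<_> = builder.matcher::<()>("12 345").collect();
//! // tokens == [Ok((0, Token(0, "12"), 2)), Ok((3, Token(0, "345"), 6))]
//! ```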
use alloc::{fmt, vec::Vec};
use core::marker::PhantomData;

use crate::ParseError;

use regex_automata::hybrid::dfa::{Cache, DFA};
use regex_automata::hybrid::{BuildError, LazyStateID};
use regex_automata::nfa::thompson::Config as NfaConfig;
use regex_automata::util::syntax::Config as SyntaxConfig;
use regex_automata::{Anchored, Input, MatchKind};

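/// A token: the index of the pattern that matched, plus the matched slice of
/// the input.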
#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub struct Token<'input>(pub usize, pub &'input str);
impl fmt::Display for Token<'_> {
    fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
        fmt::Display::fmt(self.1, formatter)
    }
}

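/// The compiled lexer: one lazy DFA over all token patterns, plus a parallel
/// flag per pattern marking whether its matches should be skipped.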
pub struct MatcherBuilder {
    dfa: DFA,
    skip_vec: Vec<bool>,
}

impl MatcherBuilder {
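    /// Compiles the `(regex, skip)` pairs into a single lazy DFA, preserving
    /// the iteration order as the pattern-index order.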
    #[allow(clippy::result_large_err)]
    pub fn new<S>(exprs: impl IntoIterator<Item = (S, bool)>) -> Result<MatcherBuilder, BuildError>
    where
        S: AsRef<str>,
    {
        let exprs = exprs.into_iter();
        let mut regex_vec = Vec::with_capacity(exprs.size_hint().0);
        let mut skip_vec = Vec::with_capacity(exprs.size_hint().0);
        for (regex, skip) in exprs {
            regex_vec.push(regex);
            skip_vec.push(skip);
        }

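        // Unicode support in the pattern syntax and the Thompson NFA tracks
        // the crate's `unicode` feature flag.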
        let enable_unicode = cfg!(feature = "unicode");
        let dfa = DFA::builder()
            .configure(DFA::config().match_kind(MatchKind::All))
            .syntax(
                SyntaxConfig::new()
                    .unicode(enable_unicode)
                    .utf8(enable_unicode),
            )
            .thompson(NfaConfig::new().utf8(enable_unicode).shrink(true))
            .build_many(&regex_vec)?;

        Ok(MatcherBuilder { dfa, skip_vec })
    }

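    /// Returns an iterator lexing `text` into `(start, Token, end)` triples,
    /// where `start` and `end` are byte offsets into `text`.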
    pub fn matcher<'input, 'builder, E>(
        &'builder self,
        text: &'input str,
    ) -> Matcher<'input, 'builder, E> {
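        // Every token must start exactly at the current position, so the
        // search is anchored; the forward start state is computed once and
        // reused for each token.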
        let input = Input::new(text).anchored(Anchored::Yes);
        let mut cache = self.dfa.create_cache();
        let start = self.dfa.start_state_forward(&mut cache, &input).unwrap();
        Matcher {
            text,
            consumed: 0,
            cache,
            start,
            dfa: &self.dfa,
            skip_vec: &self.skip_vec,
            _marker: PhantomData,
        }
    }
}

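/// An iterator over the tokens of `text`, driven by the lazy DFA in the
/// originating `MatcherBuilder`; patterns flagged in `skip_vec` are consumed
/// silently.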
pub struct Matcher<'input, 'builder, E> {
    text: &'input str,
    consumed: usize,
    cache: Cache,
    start: LazyStateID,
    dfa: &'builder DFA,
    skip_vec: &'builder [bool],
    _marker: PhantomData<fn() -> E>,
}

impl<'input, E> Iterator for Matcher<'input, '_, E> {
    type Item = Result<(usize, Token<'input>, usize), ParseError<usize, Token<'input>, E>>;

    fn next(&mut self) -> Option<Self::Item> {
        loop {
            let text = self.text;
            let start_offset = self.consumed;
            if text.is_empty() {
                return None;
            }

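            // Walk the lazy DFA over the remaining input, keeping the most
            // recent (i.e. longest) match. regex-automata reports match
            // states one byte after the match ends, so `i` below is already
            // the end offset of the matched token.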
            let mut match_ = None;
            'search: {
                let mut state = self.start;
                for (i, byte) in text.bytes().enumerate() {
                    state = self.dfa.next_state(&mut self.cache, state, byte).unwrap();
                    if state.is_match() {
                        match_ = Some((state, i));
                    } else if state.is_dead() {
                        break 'search;
                    }
                }
                state = self.dfa.next_eoi_state(&mut self.cache, state).unwrap();
                if state.is_match() {
                    match_ = Some((state, text.len()));
                }
            }

            let (match_state, longest_match) = match match_ {
                Some(match_) => match_,
                None => {
                    return Some(Err(ParseError::InvalidToken {
                        location: start_offset,
                    }))
                }
            };
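            // The DFA was built with `MatchKind::All`, so every pattern that
            // matches at this position is reported; the highest pattern index
            // wins, so later patterns take precedence on equal-length matches.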
            let index = (0..self.dfa.match_len(&self.cache, match_state))
                .map(|n| {
                    self.dfa
                        .match_pattern(&self.cache, match_state, n)
                        .as_usize()
                })
                .max()
                .unwrap();

            let result = &text[..longest_match];
            let remaining = &text[longest_match..];
            let end_offset = start_offset + longest_match;
            self.text = remaining;
            self.consumed = end_offset;

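            // Skipped patterns (e.g. whitespace) are consumed without being
            // yielded. A zero-length skip would never advance the input, so
            // it is reported as an invalid token rather than looping forever.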
            if self.skip_vec[index] {
                if longest_match == 0 {
                    return Some(Err(ParseError::InvalidToken {
                        location: start_offset,
                    }));
                }
                continue;
            }

            return Some(Ok((start_offset, Token(index, result), end_offset)));
        }
    }
}