semver_parser/
lexer.rs

//! Lexer for semver ranges.
//!
//! Breaks a string of input into an iterator of tokens that can be used with a parser.
//!
//! This should be used with the [`parser`] module.
//!
//! [`parser`]: ../parser/index.html
//!
//! # Examples
//!
//! Example without errors:
//!
//! ```rust
//! use semver_parser::lexer::{Lexer, Token};
//!
//! let mut l = Lexer::new("foo 123 *");
//!
//! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
//! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
//! assert_eq!(Some(Ok(Token::Numeric(123))), l.next());
//! assert_eq!(Some(Ok(Token::Whitespace(7, 8))), l.next());
//! assert_eq!(Some(Ok(Token::Star)), l.next());
//! assert_eq!(None, l.next());
//! ```
//!
//! Example with error:
//!
//! ```rust
//! use semver_parser::lexer::{Lexer, Token, Error};
//!
//! let mut l = Lexer::new("foo / *");
//!
//! assert_eq!(Some(Ok(Token::AlphaNumeric("foo"))), l.next());
//! assert_eq!(Some(Ok(Token::Whitespace(3, 4))), l.next());
//! assert_eq!(Some(Err(Error::UnexpectedChar('/'))), l.next());
//! ```
37
38use self::Error::*;
39use self::Token::*;
40use std::str;
41
// Scan forward from `$start` while the lexer's lookahead character matches
// one of the given patterns, consuming each matching character via `step()`.
//
// Evaluates to the *exclusive* end byte index of the matched run: the index
// of the first non-matching character, or `input.len()` if the input was
// exhausted.
macro_rules! scan_while {
    ($slf:expr, $start:expr, $first:pat $(| $rest:pat)*) => {{
        // Exclusive end of the span; starts at `$start` so an immediate
        // mismatch yields an empty span.
        let mut __end = $start;

        loop {
            if let Some((idx, c)) = $slf.one() {
                __end = idx;

                match c {
                    // Matching character: consume it and keep scanning.
                    $first $(| $rest)* => $slf.step(),
                    // First non-matching character: `idx` is the exclusive end.
                    _ => break,
                }

                continue;
            } else {
                // Input exhausted: the span runs to the end of the input.
                __end = $slf.input.len();
            }

            break;
        }

        __end
    }}
}
66
/// Semver tokens.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Token<'input> {
    /// `=`
    Eq,
    /// `>`
    Gt,
    /// `<`
    Lt,
    /// `<=`
    LtEq,
    /// `>=`
    GtEq,
    /// `^`
    Caret,
    /// `~`
    Tilde,
    /// `*`
    Star,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// `-`
    Hyphen,
    /// `+`
    Plus,
    /// `||`
    Or,
    /// any number of whitespace (`\t\r\n `) and its span.
    Whitespace(usize, usize),
    /// Numeric component, like `0` or `42`.
    Numeric(u64),
    /// Alphanumeric component, like `alpha1` or `79deadbe`.
    AlphaNumeric(&'input str),
}
103
104impl<'input> Token<'input> {
105    /// Check if the current token is a whitespace token.
106    pub fn is_whitespace(&self) -> bool {
107        match *self {
108            Whitespace(..) => true,
109            _ => false,
110        }
111    }
112
113    /// Check if the current token is a wildcard token.
114    pub fn is_wildcard(&self) -> bool {
115        match *self {
116            Star | AlphaNumeric("X") | AlphaNumeric("x") => true,
117            _ => false,
118        }
119    }
120}
121
/// Errors the lexer can produce.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Error {
    /// Unexpected character.
    UnexpectedChar(char),
}
127
/// Lexer for semver tokens belonging to a range.
#[derive(Debug)]
pub struct Lexer<'input> {
    /// Full input being lexed; used for slicing components and span ends.
    input: &'input str,
    /// Iterator over the characters beyond the lookahead.
    chars: str::CharIndices<'input>,
    // Two-character lookahead, needed to recognize two-character tokens
    // such as `<=`, `>=` and `||` before committing to a one-char token.
    /// First lookahead character and its byte offset, if any.
    c1: Option<(usize, char)>,
    /// Second lookahead character and its byte offset, if any.
    c2: Option<(usize, char)>,
}
137
impl<'input> Lexer<'input> {
    /// Construct a new lexer for the given input.
    pub fn new(input: &str) -> Lexer {
        // Prime the two-character lookahead from the front of the input.
        let mut chars = input.char_indices();
        let c1 = chars.next();
        let c2 = chars.next();

        Lexer {
            input,
            chars,
            c1,
            c2,
        }
    }

    /// Shift all lookahead storage by one.
    fn step(&mut self) {
        self.c1 = self.c2;
        self.c2 = self.chars.next();
    }

    /// Advance the lexer by `n` characters.
    fn step_n(&mut self, n: usize) {
        for _ in 0..n {
            self.step();
        }
    }

    /// Peek at the next character (and its byte offset) without consuming it.
    fn one(&mut self) -> Option<(usize, char)> {
        self.c1
    }

    /// Peek at the next two characters without consuming them.
    ///
    /// Returns `Some` only when both lookahead slots are filled; the
    /// returned offset is that of the first character.
    fn two(&mut self) -> Option<(usize, char, char)> {
        self.c1
            .and_then(|(start, c1)| self.c2.map(|(_, c2)| (start, c1, c2)))
    }

    /// Consume a component.
    ///
    /// A component can either be an alphanumeric or numeric.
    /// Does not permit leading zeroes if numeric.
    fn component(&mut self, start: usize) -> Result<Token<'input>, Error> {
        // `start` is the offset of the first character, which the caller has
        // already consumed; scan the remainder of the alphanumeric run.
        let end = scan_while!(self, start, '0'..='9' | 'A'..='Z' | 'a'..='z');
        let input = &self.input[start..end];

        // Inspect the first two characters to detect a lone zero or a
        // leading zero.
        let mut it = input.chars();
        let (a, b) = (it.next(), it.next());

        // exactly zero
        if a == Some('0') && b.is_none() {
            return Ok(Numeric(0));
        }

        // No leading zero: try numeric. Components with a leading zero
        // (e.g. `01`) or non-digits (e.g. `beta2`) fall through.
        if a != Some('0') {
            if let Ok(numeric) = input.parse::<u64>() {
                return Ok(Numeric(numeric));
            }
        }

        Ok(AlphaNumeric(input))
    }

    /// Consume whitespace, returning its span as `Whitespace(start, end)`.
    fn whitespace(&mut self, start: usize) -> Result<Token<'input>, Error> {
        let end = scan_while!(self, start, ' ' | '\t' | '\n' | '\r');
        Ok(Whitespace(start, end))
    }
}
207
208impl<'input> Iterator for Lexer<'input> {
209    type Item = Result<Token<'input>, Error>;
210
211    fn next(&mut self) -> Option<Self::Item> {
212        #[allow(clippy::never_loop)]
213        loop {
214            // two subsequent char tokens.
215            if let Some((_, a, b)) = self.two() {
216                let two = match (a, b) {
217                    ('<', '=') => Some(LtEq),
218                    ('>', '=') => Some(GtEq),
219                    ('|', '|') => Some(Or),
220                    _ => None,
221                };
222
223                if let Some(two) = two {
224                    self.step_n(2);
225                    return Some(Ok(two));
226                }
227            }
228
229            // single char and start of numeric tokens.
230            if let Some((start, c)) = self.one() {
231                let tok = match c {
232                    ' ' | '\t' | '\n' | '\r' => {
233                        self.step();
234                        return Some(self.whitespace(start));
235                    }
236                    '=' => Eq,
237                    '>' => Gt,
238                    '<' => Lt,
239                    '^' => Caret,
240                    '~' => Tilde,
241                    '*' => Star,
242                    '.' => Dot,
243                    ',' => Comma,
244                    '-' => Hyphen,
245                    '+' => Plus,
246                    '0'..='9' | 'a'..='z' | 'A'..='Z' => {
247                        self.step();
248                        return Some(self.component(start));
249                    }
250                    c => return Some(Err(UnexpectedChar(c))),
251                };
252
253                self.step();
254                return Some(Ok(tok));
255            };
256
257            return None;
258        }
259    }
260}
261
#[cfg(test)]
mod tests {
    use super::*;

    /// Lex `input` fully, panicking on any lexer error.
    fn lex(input: &str) -> Vec<Token> {
        Lexer::new(input).map(Result::unwrap).collect::<Vec<_>>()
    }

    #[test]
    pub fn simple_tokens() {
        assert_eq!(
            lex("=><<=>=^~*.,-+||"),
            vec![Eq, Gt, Lt, LtEq, GtEq, Caret, Tilde, Star, Dot, Comma, Hyphen, Plus, Or,]
        );
    }

    #[test]
    pub fn whitespace() {
        assert_eq!(
            lex("  foo \t\n\rbar"),
            vec![
                Whitespace(0, 2),
                AlphaNumeric("foo"),
                Whitespace(5, 9),
                AlphaNumeric("bar"),
            ]
        );
    }

    #[test]
    pub fn components() {
        assert_eq!(lex("42"), vec![Numeric(42)]);
        assert_eq!(lex("0"), vec![Numeric(0)]);
        // Leading zeroes make a component alphanumeric, not numeric.
        // (The original asserted `lex("01")` twice — the duplicate is
        // replaced with the distinct `"00"` case.)
        assert_eq!(lex("01"), vec![AlphaNumeric("01")]);
        assert_eq!(lex("00"), vec![AlphaNumeric("00")]);
        assert_eq!(lex("5885644aa"), vec![AlphaNumeric("5885644aa")]);
        assert_eq!(lex("beta2"), vec![AlphaNumeric("beta2")]);
        assert_eq!(lex("beta.2"), vec![AlphaNumeric("beta"), Dot, Numeric(2)]);
    }

    #[test]
    pub fn is_wildcard() {
        assert!(Star.is_wildcard());
        assert!(AlphaNumeric("x").is_wildcard());
        assert!(AlphaNumeric("X").is_wildcard());
        assert!(!AlphaNumeric("other").is_wildcard());
    }

    #[test]
    pub fn empty() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    pub fn unexpected_char() {
        // Errors are surfaced through the iterator, not by panicking.
        let mut l = Lexer::new("/");
        assert_eq!(Some(Err(UnexpectedChar('/'))), l.next());
    }

    #[test]
    pub fn numeric_all_numbers() {
        let expected: Vec<Token> = (0..10).map(Numeric).collect();

        let actual: Vec<_> = lex("0 1 2 3 4 5 6 7 8 9")
            .into_iter()
            .filter(|t| !t.is_whitespace())
            .collect();

        assert_eq!(actual, expected);
    }
}