solang_parser/
lexer.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Custom Solidity lexer.
4//!
5//! Solidity needs a custom lexer for two reasons:
6//!  - comments and doc comments
7//!  - pragma value is [^;]+
8
9use crate::pt::{Comment, Loc};
10use itertools::{peek_nth, PeekNth};
11use phf::phf_map;
12use std::{fmt, str::CharIndices};
13use thiserror::Error;
14use unicode_xid::UnicodeXID;
15
16/// A spanned [Token].
17pub type Spanned<'a> = (usize, Token<'a>, usize);
18
19/// [Lexer]'s Result type.
20pub type Result<'a, T = Spanned<'a>, E = LexicalError> = std::result::Result<T, E>;
21
/// A Solidity lexical token. Produced by [Lexer].
///
/// Literal variants borrow their text straight from the source input
/// (lifetime `'input`), so producing a token never allocates.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
#[allow(missing_docs)]
pub enum Token<'input> {
    Identifier(&'input str),
    /// `(unicode, literal)`
    StringLiteral(bool, &'input str),
    AddressLiteral(&'input str),
    HexLiteral(&'input str),
    /// `(number, exponent)`
    Number(&'input str, &'input str),
    /// `(number, fraction, exponent)`
    RationalNumber(&'input str, &'input str, &'input str),
    HexNumber(&'input str),
    Divide,

    // Declaration keywords.
    Contract,
    Library,
    Interface,
    Function,
    Pragma,
    Import,

    Struct,
    Event,
    Enum,
    Type,

    // Data-location keywords.
    Memory,
    Storage,
    Calldata,

    // Visibility keywords.
    Public,
    Private,
    Internal,
    External,

    Constant,

    New,
    Delete,

    // Mutability keywords.
    Pure,
    View,
    Payable,

    // Control-flow keywords.
    Do,
    Continue,
    Break,

    Throw,
    Emit,
    Return,
    Returns,
    Revert,

    // Elementary types. The payload is the declared bit width for
    // `Uint`/`Int` (e.g. `uint256`) and the byte count for `Bytes`.
    Uint(u16),
    Int(u16),
    Bytes(u8),
    // prior to 0.8.0 `byte` used to be an alias for `bytes1`
    Byte,
    DynamicBytes,
    Bool,
    Address,
    String,

    // Punctuation.
    Semicolon,
    Comma,
    OpenParenthesis,
    CloseParenthesis,
    OpenCurlyBrace,
    CloseCurlyBrace,

    // Operators; the `*Assign` variants are the compound-assignment forms.
    BitwiseOr,
    BitwiseOrAssign,
    Or,

    BitwiseXor,
    BitwiseXorAssign,

    BitwiseAnd,
    BitwiseAndAssign,
    And,

    AddAssign,
    Increment,
    Add,

    SubtractAssign,
    Decrement,
    Subtract,

    MulAssign,
    Mul,
    Power,
    DivideAssign,
    ModuloAssign,
    Modulo,

    Equal,
    Assign,
    ColonAssign,

    NotEqual,
    Not,

    True,
    False,
    Else,
    Anonymous,
    For,
    While,
    If,

    ShiftRight,
    ShiftRightAssign,
    Less,
    LessEqual,

    ShiftLeft,
    ShiftLeftAssign,
    More,
    MoreEqual,

    Constructor,
    Indexed,

    Member,
    Colon,
    OpenBracket,
    CloseBracket,
    BitwiseNot,
    Question,

    Mapping,
    Arrow,

    Try,
    Catch,

    Receive,
    Fallback,

    As,
    Is,
    Abstract,
    Virtual,
    Override,
    Using,
    Modifier,
    Immutable,
    Unchecked,

    // Yul (inline assembly) keywords.
    Assembly,
    Let,
    Leave,
    Switch,
    Case,
    Default,
    YulArrow,

    Annotation(&'input str),
}
184
185impl<'input> fmt::Display for Token<'input> {
186    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
187        match self {
188            Token::Identifier(id) => write!(f, "{id}"),
189            Token::StringLiteral(false, s) => write!(f, "\"{s}\""),
190            Token::StringLiteral(true, s) => write!(f, "unicode\"{s}\""),
191            Token::HexLiteral(hex) => write!(f, "{hex}"),
192            Token::AddressLiteral(address) => write!(f, "{address}"),
193            Token::Number(integer, "") => write!(f, "{integer}"),
194            Token::Number(integer, exp) => write!(f, "{integer}e{exp}"),
195            Token::RationalNumber(integer, fraction, "") => {
196                write!(f, "{integer}.{fraction}")
197            }
198            Token::RationalNumber(integer, fraction, exp) => {
199                write!(f, "{integer}.{fraction}e{exp}")
200            }
201            Token::HexNumber(n) => write!(f, "{n}"),
202            Token::Uint(w) => write!(f, "uint{w}"),
203            Token::Int(w) => write!(f, "int{w}"),
204            Token::Bytes(w) => write!(f, "bytes{w}"),
205            Token::Byte => write!(f, "byte"),
206            Token::DynamicBytes => write!(f, "bytes"),
207            Token::Semicolon => write!(f, ";"),
208            Token::Comma => write!(f, ","),
209            Token::OpenParenthesis => write!(f, "("),
210            Token::CloseParenthesis => write!(f, ")"),
211            Token::OpenCurlyBrace => write!(f, "{{"),
212            Token::CloseCurlyBrace => write!(f, "}}"),
213            Token::BitwiseOr => write!(f, "|"),
214            Token::BitwiseOrAssign => write!(f, "|="),
215            Token::Or => write!(f, "||"),
216            Token::BitwiseXor => write!(f, "^"),
217            Token::BitwiseXorAssign => write!(f, "^="),
218            Token::BitwiseAnd => write!(f, "&"),
219            Token::BitwiseAndAssign => write!(f, "&="),
220            Token::And => write!(f, "&&"),
221            Token::AddAssign => write!(f, "+="),
222            Token::Increment => write!(f, "++"),
223            Token::Add => write!(f, "+"),
224            Token::SubtractAssign => write!(f, "-="),
225            Token::Decrement => write!(f, "--"),
226            Token::Subtract => write!(f, "-"),
227            Token::MulAssign => write!(f, "*="),
228            Token::Mul => write!(f, "*"),
229            Token::Power => write!(f, "**"),
230            Token::Divide => write!(f, "/"),
231            Token::DivideAssign => write!(f, "/="),
232            Token::ModuloAssign => write!(f, "%="),
233            Token::Modulo => write!(f, "%"),
234            Token::Equal => write!(f, "=="),
235            Token::Assign => write!(f, "="),
236            Token::ColonAssign => write!(f, ":="),
237            Token::NotEqual => write!(f, "!="),
238            Token::Not => write!(f, "!"),
239            Token::ShiftLeft => write!(f, "<<"),
240            Token::ShiftLeftAssign => write!(f, "<<="),
241            Token::More => write!(f, ">"),
242            Token::MoreEqual => write!(f, ">="),
243            Token::Member => write!(f, "."),
244            Token::Colon => write!(f, ":"),
245            Token::OpenBracket => write!(f, "["),
246            Token::CloseBracket => write!(f, "]"),
247            Token::BitwiseNot => write!(f, "~"),
248            Token::Question => write!(f, "?"),
249            Token::ShiftRightAssign => write!(f, "<<="),
250            Token::ShiftRight => write!(f, "<<"),
251            Token::Less => write!(f, "<"),
252            Token::LessEqual => write!(f, "<="),
253            Token::Bool => write!(f, "bool"),
254            Token::Address => write!(f, "address"),
255            Token::String => write!(f, "string"),
256            Token::Contract => write!(f, "contract"),
257            Token::Library => write!(f, "library"),
258            Token::Interface => write!(f, "interface"),
259            Token::Function => write!(f, "function"),
260            Token::Pragma => write!(f, "pragma"),
261            Token::Import => write!(f, "import"),
262            Token::Struct => write!(f, "struct"),
263            Token::Event => write!(f, "event"),
264            Token::Enum => write!(f, "enum"),
265            Token::Type => write!(f, "type"),
266            Token::Memory => write!(f, "memory"),
267            Token::Storage => write!(f, "storage"),
268            Token::Calldata => write!(f, "calldata"),
269            Token::Public => write!(f, "public"),
270            Token::Private => write!(f, "private"),
271            Token::Internal => write!(f, "internal"),
272            Token::External => write!(f, "external"),
273            Token::Constant => write!(f, "constant"),
274            Token::New => write!(f, "new"),
275            Token::Delete => write!(f, "delete"),
276            Token::Pure => write!(f, "pure"),
277            Token::View => write!(f, "view"),
278            Token::Payable => write!(f, "payable"),
279            Token::Do => write!(f, "do"),
280            Token::Continue => write!(f, "continue"),
281            Token::Break => write!(f, "break"),
282            Token::Throw => write!(f, "throw"),
283            Token::Emit => write!(f, "emit"),
284            Token::Return => write!(f, "return"),
285            Token::Returns => write!(f, "returns"),
286            Token::Revert => write!(f, "revert"),
287            Token::True => write!(f, "true"),
288            Token::False => write!(f, "false"),
289            Token::Else => write!(f, "else"),
290            Token::Anonymous => write!(f, "anonymous"),
291            Token::For => write!(f, "for"),
292            Token::While => write!(f, "while"),
293            Token::If => write!(f, "if"),
294            Token::Constructor => write!(f, "constructor"),
295            Token::Indexed => write!(f, "indexed"),
296            Token::Mapping => write!(f, "mapping"),
297            Token::Arrow => write!(f, "=>"),
298            Token::Try => write!(f, "try"),
299            Token::Catch => write!(f, "catch"),
300            Token::Receive => write!(f, "receive"),
301            Token::Fallback => write!(f, "fallback"),
302            Token::As => write!(f, "as"),
303            Token::Is => write!(f, "is"),
304            Token::Abstract => write!(f, "abstract"),
305            Token::Virtual => write!(f, "virtual"),
306            Token::Override => write!(f, "override"),
307            Token::Using => write!(f, "using"),
308            Token::Modifier => write!(f, "modifier"),
309            Token::Immutable => write!(f, "immutable"),
310            Token::Unchecked => write!(f, "unchecked"),
311            Token::Assembly => write!(f, "assembly"),
312            Token::Let => write!(f, "let"),
313            Token::Leave => write!(f, "leave"),
314            Token::Switch => write!(f, "switch"),
315            Token::Case => write!(f, "case"),
316            Token::Default => write!(f, "default"),
317            Token::YulArrow => write!(f, "->"),
318            Token::Annotation(name) => write!(f, "@{name}"),
319        }
320    }
321}
322
/// Custom Solidity lexer.
///
/// # Examples
///
/// ```
/// use solang_parser::lexer::{Lexer, Token};
///
/// let source = "uint256 number = 0;";
/// let mut comments = Vec::new();
/// let mut errors = Vec::new();
/// let mut lexer = Lexer::new(source, 0, &mut comments, &mut errors);
///
/// let mut next_token = || lexer.next().map(|(_, token, _)| token);
/// assert_eq!(next_token(), Some(Token::Uint(256)));
/// assert_eq!(next_token(), Some(Token::Identifier("number")));
/// assert_eq!(next_token(), Some(Token::Assign));
/// assert_eq!(next_token(), Some(Token::Number("0", "")));
/// assert_eq!(next_token(), Some(Token::Semicolon));
/// assert_eq!(next_token(), None);
/// assert!(errors.is_empty());
/// assert!(comments.is_empty());
/// ```
#[derive(Debug)]
pub struct Lexer<'input> {
    /// The source text being lexed; every token borrows from it.
    input: &'input str,
    /// Char iterator over `input` with multi-character lookahead (`peek_nth`).
    chars: PeekNth<CharIndices<'input>>,
    /// Comments encountered while lexing are appended here instead of being
    /// returned as tokens.
    comments: &'input mut Vec<Comment>,
    /// File number embedded into every [Loc] this lexer produces.
    file_no: usize,
    /// While parsing version semver, do not parse rational numbers
    parse_semver: bool,
    /// The most recently produced tokens.
    // NOTE(review): how these two slots influence lexing (presumably pragma
    // handling) is not visible in this chunk — confirm against `Lexer::next`.
    last_tokens: [Option<Token<'input>>; 2],
    /// The mutable reference to the error vector.
    pub errors: &'input mut Vec<LexicalError>,
}
357
/// An error thrown by [Lexer].
///
/// Every variant carries the [Loc] of the offending source span. The lexer
/// does not stop at the first error; errors accumulate in [Lexer::errors].
#[derive(Debug, Clone, PartialEq, Eq, Error)]
#[allow(missing_docs)]
pub enum LexicalError {
    #[error("end of file found in comment")]
    EndOfFileInComment(Loc),

    #[error("end of file found in string literal")]
    EndOfFileInString(Loc),

    // NOTE(review): lowercase 'of' is inconsistent with the other variants,
    // but renaming would break the public API.
    #[error("end of file found in hex literal string")]
    EndofFileInHex(Loc),

    #[error("missing number")]
    MissingNumber(Loc),

    #[error("invalid character '{1}' in hex literal string")]
    InvalidCharacterInHexLiteral(Loc, char),

    #[error("unrecognised token '{1}'")]
    UnrecognisedToken(Loc, String),

    #[error("missing exponent")]
    MissingExponent(Loc),

    #[error("'{1}' found where 'from' expected")]
    ExpectedFrom(Loc, String),
}
386
387/// Returns whether `word` is a keyword in Solidity.
388pub fn is_keyword(word: &str) -> bool {
389    KEYWORDS.contains_key(word)
390}
391
/// Compile-time perfect-hash map from keyword spelling to its [Token].
///
/// Includes every fixed-width `bytesN`/`intN`/`uintN` type name; the bare
/// `int` and `uint` aliases map to the 256-bit tokens.
static KEYWORDS: phf::Map<&'static str, Token> = phf_map! {
    "address" => Token::Address,
    "anonymous" => Token::Anonymous,
    "bool" => Token::Bool,
    "break" => Token::Break,
    "bytes1" => Token::Bytes(1),
    "bytes2" => Token::Bytes(2),
    "bytes3" => Token::Bytes(3),
    "bytes4" => Token::Bytes(4),
    "bytes5" => Token::Bytes(5),
    "bytes6" => Token::Bytes(6),
    "bytes7" => Token::Bytes(7),
    "bytes8" => Token::Bytes(8),
    "bytes9" => Token::Bytes(9),
    "bytes10" => Token::Bytes(10),
    "bytes11" => Token::Bytes(11),
    "bytes12" => Token::Bytes(12),
    "bytes13" => Token::Bytes(13),
    "bytes14" => Token::Bytes(14),
    "bytes15" => Token::Bytes(15),
    "bytes16" => Token::Bytes(16),
    "bytes17" => Token::Bytes(17),
    "bytes18" => Token::Bytes(18),
    "bytes19" => Token::Bytes(19),
    "bytes20" => Token::Bytes(20),
    "bytes21" => Token::Bytes(21),
    "bytes22" => Token::Bytes(22),
    "bytes23" => Token::Bytes(23),
    "bytes24" => Token::Bytes(24),
    "bytes25" => Token::Bytes(25),
    "bytes26" => Token::Bytes(26),
    "bytes27" => Token::Bytes(27),
    "bytes28" => Token::Bytes(28),
    "bytes29" => Token::Bytes(29),
    "bytes30" => Token::Bytes(30),
    "bytes31" => Token::Bytes(31),
    "bytes32" => Token::Bytes(32),
    "bytes" => Token::DynamicBytes,
    "byte" => Token::Byte,
    "calldata" => Token::Calldata,
    "case" => Token::Case,
    "constant" => Token::Constant,
    "constructor" => Token::Constructor,
    "continue" => Token::Continue,
    "contract" => Token::Contract,
    "default" => Token::Default,
    "delete" => Token::Delete,
    "do" => Token::Do,
    "else" => Token::Else,
    "emit" => Token::Emit,
    "enum" => Token::Enum,
    "event" => Token::Event,
    "external" => Token::External,
    "false" => Token::False,
    "for" => Token::For,
    "function" => Token::Function,
    "if" => Token::If,
    "import" => Token::Import,
    "indexed" => Token::Indexed,
    "int8" => Token::Int(8),
    "int16" => Token::Int(16),
    "int24" => Token::Int(24),
    "int32" => Token::Int(32),
    "int40" => Token::Int(40),
    "int48" => Token::Int(48),
    "int56" => Token::Int(56),
    "int64" => Token::Int(64),
    "int72" => Token::Int(72),
    "int80" => Token::Int(80),
    "int88" => Token::Int(88),
    "int96" => Token::Int(96),
    "int104" => Token::Int(104),
    "int112" => Token::Int(112),
    "int120" => Token::Int(120),
    "int128" => Token::Int(128),
    "int136" => Token::Int(136),
    "int144" => Token::Int(144),
    "int152" => Token::Int(152),
    "int160" => Token::Int(160),
    "int168" => Token::Int(168),
    "int176" => Token::Int(176),
    "int184" => Token::Int(184),
    "int192" => Token::Int(192),
    "int200" => Token::Int(200),
    "int208" => Token::Int(208),
    "int216" => Token::Int(216),
    "int224" => Token::Int(224),
    "int232" => Token::Int(232),
    "int240" => Token::Int(240),
    "int248" => Token::Int(248),
    "int256" => Token::Int(256),
    "interface" => Token::Interface,
    "internal" => Token::Internal,
    "int" => Token::Int(256),
    "leave" => Token::Leave,
    "library" => Token::Library,
    "mapping" => Token::Mapping,
    "memory" => Token::Memory,
    "new" => Token::New,
    "payable" => Token::Payable,
    "pragma" => Token::Pragma,
    "private" => Token::Private,
    "public" => Token::Public,
    "pure" => Token::Pure,
    "returns" => Token::Returns,
    "return" => Token::Return,
    "revert" => Token::Revert,
    "storage" => Token::Storage,
    "string" => Token::String,
    "struct" => Token::Struct,
    "switch" => Token::Switch,
    "throw" => Token::Throw,
    "true" => Token::True,
    "type" => Token::Type,
    "uint8" => Token::Uint(8),
    "uint16" => Token::Uint(16),
    "uint24" => Token::Uint(24),
    "uint32" => Token::Uint(32),
    "uint40" => Token::Uint(40),
    "uint48" => Token::Uint(48),
    "uint56" => Token::Uint(56),
    "uint64" => Token::Uint(64),
    "uint72" => Token::Uint(72),
    "uint80" => Token::Uint(80),
    "uint88" => Token::Uint(88),
    "uint96" => Token::Uint(96),
    "uint104" => Token::Uint(104),
    "uint112" => Token::Uint(112),
    "uint120" => Token::Uint(120),
    "uint128" => Token::Uint(128),
    "uint136" => Token::Uint(136),
    "uint144" => Token::Uint(144),
    "uint152" => Token::Uint(152),
    "uint160" => Token::Uint(160),
    "uint168" => Token::Uint(168),
    "uint176" => Token::Uint(176),
    "uint184" => Token::Uint(184),
    "uint192" => Token::Uint(192),
    "uint200" => Token::Uint(200),
    "uint208" => Token::Uint(208),
    "uint216" => Token::Uint(216),
    "uint224" => Token::Uint(224),
    "uint232" => Token::Uint(232),
    "uint240" => Token::Uint(240),
    "uint248" => Token::Uint(248),
    "uint256" => Token::Uint(256),
    "uint" => Token::Uint(256),
    "view" => Token::View,
    "while" => Token::While,
    "try" => Token::Try,
    "catch" => Token::Catch,
    "receive" => Token::Receive,
    "fallback" => Token::Fallback,
    "as" => Token::As,
    "is" => Token::Is,
    "abstract" => Token::Abstract,
    "virtual" => Token::Virtual,
    "override" => Token::Override,
    "using" => Token::Using,
    "modifier" => Token::Modifier,
    "immutable" => Token::Immutable,
    "unchecked" => Token::Unchecked,
    "assembly" => Token::Assembly,
    "let" => Token::Let,
};
557
558impl<'input> Lexer<'input> {
559    /// Instantiates a new Lexer.
560    ///
561    /// # Examples
562    ///
563    /// ```
564    /// use solang_parser::lexer::Lexer;
565    ///
566    /// let source = "uint256 number = 0;";
567    /// let mut comments = Vec::new();
568    /// let mut errors = Vec::new();
569    /// let mut lexer = Lexer::new(source, 0, &mut comments, &mut errors);
570    /// ```
571    pub fn new(
572        input: &'input str,
573        file_no: usize,
574        comments: &'input mut Vec<Comment>,
575        errors: &'input mut Vec<LexicalError>,
576    ) -> Self {
577        Lexer {
578            input,
579            chars: peek_nth(input.char_indices()),
580            comments,
581            file_no,
582            parse_semver: false,
583            last_tokens: [None, None],
584            errors,
585        }
586    }
587
    /// Lexes a numeric literal whose first character `ch` (already consumed by
    /// the caller) sits at byte offset `start`.
    ///
    /// Recognizes hexadecimal numbers (`0x…`), decimal integers with an
    /// optional exponent, and rational numbers (`1.5`, `1e-2`) — unless
    /// `parse_semver` is set, in which case only plain integers are lexed so
    /// version strings like `0.8.0` are not read as rationals. Underscores are
    /// accepted as digit separators throughout. Errors with
    /// [LexicalError::MissingNumber], [LexicalError::EndofFileInHex] or
    /// [LexicalError::MissingExponent] on malformed input.
    fn parse_number(&mut self, mut start: usize, ch: char) -> Result<'input> {
        let mut is_rational = false;
        if ch == '0' {
            if let Some((_, 'x')) = self.chars.peek() {
                // hex number
                self.chars.next();

                // At least one hex digit must follow "0x".
                let mut end = match self.chars.next() {
                    Some((end, ch)) if ch.is_ascii_hexdigit() => end,
                    Some((..)) => {
                        return Err(LexicalError::MissingNumber(Loc::File(
                            self.file_no,
                            start,
                            start + 1,
                        )));
                    }
                    None => {
                        return Err(LexicalError::EndofFileInHex(Loc::File(
                            self.file_no,
                            start,
                            self.input.len(),
                        )));
                    }
                };

                // Consume the remaining hex digits and `_` separators.
                while let Some((i, ch)) = self.chars.peek() {
                    if !ch.is_ascii_hexdigit() && *ch != '_' {
                        break;
                    }
                    end = *i;
                    self.chars.next();
                }

                return Ok((start, Token::HexNumber(&self.input[start..=end]), end + 1));
            }
        }

        if ch == '.' {
            // Literal starts with '.' (empty integer part), e.g. ".5".
            // NOTE(review): `start -= 1` assumes the caller passed the offset
            // one past the '.' — confirm against the call site in `next`.
            is_rational = true;
            start -= 1;
        }

        // Consume the integer part (digits and `_` separators); `end` tracks
        // the offset of the last byte consumed.
        let mut end = start;
        while let Some((i, ch)) = self.chars.peek() {
            if !ch.is_ascii_digit() && *ch != '_' {
                break;
            }
            end = *i;
            self.chars.next();
        }

        if self.parse_semver {
            // Semver mode: stop here — no fraction or exponent. The empty
            // `exp` slice keeps the `Token::Number` shape uniform.
            let integer = &self.input[start..=end];
            let exp = &self.input[0..0];

            return Ok((start, Token::Number(integer, exp), end + 1));
        }

        // Byte ranges for the fractional part; adjusted below when a '.' with
        // a following digit is found, or already set when `ch` was '.'.
        let mut rational_end = end;
        let mut end_before_rational = end + 1;
        let mut rational_start = end;
        if is_rational {
            end_before_rational = start;
            rational_start = start + 1;
        }

        // A '.' immediately followed by a digit begins the fractional part
        // (two-character lookahead, so "1.foo" stays an integer plus member
        // access).
        if let Some((_, '.')) = self.chars.peek() {
            if let Some((i, ch)) = self.chars.peek_nth(1) {
                if ch.is_ascii_digit() && !is_rational {
                    rational_start = *i;
                    rational_end = *i;
                    is_rational = true;
                    self.chars.next(); // advance over '.'
                    while let Some((i, ch)) = self.chars.peek() {
                        if !ch.is_ascii_digit() && *ch != '_' {
                            break;
                        }
                        rational_end = *i;
                        end = *i;
                        self.chars.next();
                    }
                }
            }
        }

        // `old_end` is the end of the mantissa; `end` keeps advancing over
        // the exponent digits below.
        let old_end = end;
        let mut exp_start = end + 1;

        if let Some((i, 'e' | 'E')) = self.chars.peek() {
            exp_start = *i + 1;
            self.chars.next();
            // Negative exponent
            while matches!(self.chars.peek(), Some((_, '-'))) {
                self.chars.next();
            }
            while let Some((i, ch)) = self.chars.peek() {
                if !ch.is_ascii_digit() && *ch != '_' {
                    break;
                }
                end = *i;
                self.chars.next();
            }

            // No digit moved `end` past the marker: "1e" with nothing after.
            if exp_start > end {
                return Err(LexicalError::MissingExponent(Loc::File(
                    self.file_no,
                    start,
                    self.input.len(),
                )));
            }
        }

        if is_rational {
            let integer = &self.input[start..end_before_rational];
            let fraction = &self.input[rational_start..=rational_end];
            let exp = &self.input[exp_start..=end];

            return Ok((
                start,
                Token::RationalNumber(integer, fraction, exp),
                end + 1,
            ));
        }

        // When no exponent was found, `exp_start == end + 1` and the `exp`
        // slice below is empty.
        let integer = &self.input[start..=old_end];
        let exp = &self.input[exp_start..=end];

        Ok((start, Token::Number(integer, exp), end + 1))
    }
717
718    fn string(
719        &mut self,
720        unicode: bool,
721        token_start: usize,
722        string_start: usize,
723        quote_char: char,
724    ) -> Result<'input> {
725        let mut end;
726
727        let mut last_was_escape = false;
728
729        loop {
730            if let Some((i, ch)) = self.chars.next() {
731                end = i;
732                if !last_was_escape {
733                    if ch == quote_char {
734                        break;
735                    }
736                    last_was_escape = ch == '\\';
737                } else {
738                    last_was_escape = false;
739                }
740            } else {
741                return Err(LexicalError::EndOfFileInString(Loc::File(
742                    self.file_no,
743                    token_start,
744                    self.input.len(),
745                )));
746            }
747        }
748
749        Ok((
750            token_start,
751            Token::StringLiteral(unicode, &self.input[string_start..end]),
752            end + 1,
753        ))
754    }
755
756    fn next(&mut self) -> Option<Spanned<'input>> {
757        'toplevel: loop {
758            match self.chars.next() {
759                Some((start, ch)) if ch == '_' || ch == '$' || UnicodeXID::is_xid_start(ch) => {
760                    let (id, end) = self.match_identifier(start);
761
762                    if id == "unicode" {
763                        match self.chars.peek() {
764                            Some((_, quote_char @ '"')) | Some((_, quote_char @ '\'')) => {
765                                let quote_char = *quote_char;
766
767                                self.chars.next();
768                                let str_res = self.string(true, start, start + 8, quote_char);
769                                match str_res {
770                                    Err(lex_err) => self.errors.push(lex_err),
771                                    Ok(val) => return Some(val),
772                                }
773                            }
774                            _ => (),
775                        }
776                    }
777
778                    if id == "hex" {
779                        match self.chars.peek() {
780                            Some((_, quote_char @ '"')) | Some((_, quote_char @ '\'')) => {
781                                let quote_char = *quote_char;
782
783                                self.chars.next();
784
785                                for (i, ch) in &mut self.chars {
786                                    if ch == quote_char {
787                                        return Some((
788                                            start,
789                                            Token::HexLiteral(&self.input[start..=i]),
790                                            i + 1,
791                                        ));
792                                    }
793
794                                    if !ch.is_ascii_hexdigit() && ch != '_' {
795                                        // Eat up the remainer of the string
796                                        for (_, ch) in &mut self.chars {
797                                            if ch == quote_char {
798                                                break;
799                                            }
800                                        }
801
802                                        self.errors.push(
803                                            LexicalError::InvalidCharacterInHexLiteral(
804                                                Loc::File(self.file_no, i, i + 1),
805                                                ch,
806                                            ),
807                                        );
808                                        continue 'toplevel;
809                                    }
810                                }
811
812                                self.errors.push(LexicalError::EndOfFileInString(Loc::File(
813                                    self.file_no,
814                                    start,
815                                    self.input.len(),
816                                )));
817                                return None;
818                            }
819                            _ => (),
820                        }
821                    }
822
823                    if id == "address" {
824                        match self.chars.peek() {
825                            Some((_, quote_char @ '"')) | Some((_, quote_char @ '\'')) => {
826                                let quote_char = *quote_char;
827
828                                self.chars.next();
829
830                                for (i, ch) in &mut self.chars {
831                                    if ch == quote_char {
832                                        return Some((
833                                            start,
834                                            Token::AddressLiteral(&self.input[start..=i]),
835                                            i + 1,
836                                        ));
837                                    }
838                                }
839
840                                self.errors.push(LexicalError::EndOfFileInString(Loc::File(
841                                    self.file_no,
842                                    start,
843                                    self.input.len(),
844                                )));
845                                return None;
846                            }
847                            _ => (),
848                        }
849                    }
850
851                    return if let Some(w) = KEYWORDS.get(id) {
852                        Some((start, *w, end))
853                    } else {
854                        Some((start, Token::Identifier(id), end))
855                    };
856                }
857                Some((start, quote_char @ '"')) | Some((start, quote_char @ '\'')) => {
858                    let str_res = self.string(false, start, start + 1, quote_char);
859                    match str_res {
860                        Err(lex_err) => self.errors.push(lex_err),
861                        Ok(val) => return Some(val),
862                    }
863                }
864                Some((start, '/')) => {
865                    match self.chars.peek() {
866                        Some((_, '=')) => {
867                            self.chars.next();
868                            return Some((start, Token::DivideAssign, start + 2));
869                        }
870                        Some((_, '/')) => {
871                            // line comment
872                            self.chars.next();
873
874                            let mut newline = false;
875
876                            let doc_comment = match self.chars.next() {
877                                Some((_, '/')) => {
878                                    // ///(/)+ is still a line comment
879                                    !matches!(self.chars.peek(), Some((_, '/')))
880                                }
881                                Some((_, ch)) if ch == '\n' || ch == '\r' => {
882                                    newline = true;
883                                    false
884                                }
885                                _ => false,
886                            };
887
888                            let mut last = start + 3;
889
890                            if !newline {
891                                loop {
892                                    match self.chars.next() {
893                                        None => {
894                                            last = self.input.len();
895                                            break;
896                                        }
897                                        Some((offset, '\n' | '\r')) => {
898                                            last = offset;
899                                            break;
900                                        }
901                                        Some(_) => (),
902                                    }
903                                }
904                            }
905
906                            if doc_comment {
907                                self.comments.push(Comment::DocLine(
908                                    Loc::File(self.file_no, start, last),
909                                    self.input[start..last].to_owned(),
910                                ));
911                            } else {
912                                self.comments.push(Comment::Line(
913                                    Loc::File(self.file_no, start, last),
914                                    self.input[start..last].to_owned(),
915                                ));
916                            }
917                        }
918                        Some((_, '*')) => {
919                            // multiline comment
920                            self.chars.next();
921
922                            let doc_comment_start = matches!(self.chars.peek(), Some((_, '*')));
923
924                            let mut last = start + 3;
925                            let mut seen_star = false;
926
927                            loop {
928                                if let Some((i, ch)) = self.chars.next() {
929                                    if seen_star && ch == '/' {
930                                        break;
931                                    }
932                                    seen_star = ch == '*';
933                                    last = i;
934                                } else {
935                                    self.errors.push(LexicalError::EndOfFileInComment(Loc::File(
936                                        self.file_no,
937                                        start,
938                                        self.input.len(),
939                                    )));
940                                    return None;
941                                }
942                            }
943
944                            // `/**/` is not a doc comment
945                            if doc_comment_start && last > start + 2 {
946                                self.comments.push(Comment::DocBlock(
947                                    Loc::File(self.file_no, start, last + 2),
948                                    self.input[start..last + 2].to_owned(),
949                                ));
950                            } else {
951                                self.comments.push(Comment::Block(
952                                    Loc::File(self.file_no, start, last + 2),
953                                    self.input[start..last + 2].to_owned(),
954                                ));
955                            }
956                        }
957                        _ => {
958                            return Some((start, Token::Divide, start + 1));
959                        }
960                    }
961                }
962                Some((start, ch)) if ch.is_ascii_digit() => {
963                    let parse_result = self.parse_number(start, ch);
964                    match parse_result {
965                        Err(lex_err) => {
966                            self.errors.push(lex_err.clone());
967                            if matches!(lex_err, LexicalError::EndofFileInHex(_)) {
968                                return None;
969                            }
970                        }
971                        Ok(parse_result) => return Some(parse_result),
972                    }
973                }
974                Some((start, '@')) => {
975                    let (id, end) = self.match_identifier(start);
976                    if id.len() == 1 {
977                        self.errors.push(LexicalError::UnrecognisedToken(
978                            Loc::File(self.file_no, start, start + 1),
979                            id.to_owned(),
980                        ));
981                    } else {
982                        return Some((start, Token::Annotation(&id[1..]), end));
983                    };
984                }
985                Some((i, ';')) => {
986                    self.parse_semver = false;
987                    return Some((i, Token::Semicolon, i + 1));
988                }
989                Some((i, ',')) => return Some((i, Token::Comma, i + 1)),
990                Some((i, '(')) => return Some((i, Token::OpenParenthesis, i + 1)),
991                Some((i, ')')) => return Some((i, Token::CloseParenthesis, i + 1)),
992                Some((i, '{')) => return Some((i, Token::OpenCurlyBrace, i + 1)),
993                Some((i, '}')) => return Some((i, Token::CloseCurlyBrace, i + 1)),
994                Some((i, '~')) => return Some((i, Token::BitwiseNot, i + 1)),
995                Some((i, '=')) => {
996                    return match self.chars.peek() {
997                        Some((_, '=')) => {
998                            self.chars.next();
999                            Some((i, Token::Equal, i + 2))
1000                        }
1001                        Some((_, '>')) => {
1002                            self.chars.next();
1003                            Some((i, Token::Arrow, i + 2))
1004                        }
1005                        _ => Some((i, Token::Assign, i + 1)),
1006                    }
1007                }
1008                Some((i, '!')) => {
1009                    return if let Some((_, '=')) = self.chars.peek() {
1010                        self.chars.next();
1011                        Some((i, Token::NotEqual, i + 2))
1012                    } else {
1013                        Some((i, Token::Not, i + 1))
1014                    }
1015                }
1016                Some((i, '|')) => {
1017                    return match self.chars.peek() {
1018                        Some((_, '=')) => {
1019                            self.chars.next();
1020                            Some((i, Token::BitwiseOrAssign, i + 2))
1021                        }
1022                        Some((_, '|')) => {
1023                            self.chars.next();
1024                            Some((i, Token::Or, i + 2))
1025                        }
1026                        _ => Some((i, Token::BitwiseOr, i + 1)),
1027                    };
1028                }
1029                Some((i, '&')) => {
1030                    return match self.chars.peek() {
1031                        Some((_, '=')) => {
1032                            self.chars.next();
1033                            Some((i, Token::BitwiseAndAssign, i + 2))
1034                        }
1035                        Some((_, '&')) => {
1036                            self.chars.next();
1037                            Some((i, Token::And, i + 2))
1038                        }
1039                        _ => Some((i, Token::BitwiseAnd, i + 1)),
1040                    };
1041                }
1042                Some((i, '^')) => {
1043                    return match self.chars.peek() {
1044                        Some((_, '=')) => {
1045                            self.chars.next();
1046                            Some((i, Token::BitwiseXorAssign, i + 2))
1047                        }
1048                        _ => Some((i, Token::BitwiseXor, i + 1)),
1049                    };
1050                }
1051                Some((i, '+')) => {
1052                    return match self.chars.peek() {
1053                        Some((_, '=')) => {
1054                            self.chars.next();
1055                            Some((i, Token::AddAssign, i + 2))
1056                        }
1057                        Some((_, '+')) => {
1058                            self.chars.next();
1059                            Some((i, Token::Increment, i + 2))
1060                        }
1061                        _ => Some((i, Token::Add, i + 1)),
1062                    };
1063                }
1064                Some((i, '-')) => {
1065                    return match self.chars.peek() {
1066                        Some((_, '=')) => {
1067                            self.chars.next();
1068                            Some((i, Token::SubtractAssign, i + 2))
1069                        }
1070                        Some((_, '-')) => {
1071                            self.chars.next();
1072                            Some((i, Token::Decrement, i + 2))
1073                        }
1074                        Some((_, '>')) => {
1075                            self.chars.next();
1076                            Some((i, Token::YulArrow, i + 2))
1077                        }
1078                        _ => Some((i, Token::Subtract, i + 1)),
1079                    };
1080                }
1081                Some((i, '*')) => {
1082                    return match self.chars.peek() {
1083                        Some((_, '=')) => {
1084                            self.chars.next();
1085                            Some((i, Token::MulAssign, i + 2))
1086                        }
1087                        Some((_, '*')) => {
1088                            self.chars.next();
1089                            Some((i, Token::Power, i + 2))
1090                        }
1091                        _ => Some((i, Token::Mul, i + 1)),
1092                    };
1093                }
1094                Some((i, '%')) => {
1095                    return match self.chars.peek() {
1096                        Some((_, '=')) => {
1097                            self.chars.next();
1098                            Some((i, Token::ModuloAssign, i + 2))
1099                        }
1100                        _ => Some((i, Token::Modulo, i + 1)),
1101                    };
1102                }
1103                Some((i, '<')) => {
1104                    return match self.chars.peek() {
1105                        Some((_, '<')) => {
1106                            self.chars.next();
1107                            if let Some((_, '=')) = self.chars.peek() {
1108                                self.chars.next();
1109                                Some((i, Token::ShiftLeftAssign, i + 3))
1110                            } else {
1111                                Some((i, Token::ShiftLeft, i + 2))
1112                            }
1113                        }
1114                        Some((_, '=')) => {
1115                            self.chars.next();
1116                            Some((i, Token::LessEqual, i + 2))
1117                        }
1118                        _ => Some((i, Token::Less, i + 1)),
1119                    };
1120                }
1121                Some((i, '>')) => {
1122                    return match self.chars.peek() {
1123                        Some((_, '>')) => {
1124                            self.chars.next();
1125                            if let Some((_, '=')) = self.chars.peek() {
1126                                self.chars.next();
1127                                Some((i, Token::ShiftRightAssign, i + 3))
1128                            } else {
1129                                Some((i, Token::ShiftRight, i + 2))
1130                            }
1131                        }
1132                        Some((_, '=')) => {
1133                            self.chars.next();
1134                            Some((i, Token::MoreEqual, i + 2))
1135                        }
1136                        _ => Some((i, Token::More, i + 1)),
1137                    };
1138                }
1139                Some((i, '.')) => {
1140                    if let Some((_, a)) = self.chars.peek() {
1141                        if a.is_ascii_digit() && !self.parse_semver {
1142                            return match self.parse_number(i + 1, '.') {
1143                                Err(lex_error) => {
1144                                    self.errors.push(lex_error);
1145                                    None
1146                                }
1147                                Ok(parse_result) => Some(parse_result),
1148                            };
1149                        }
1150                    }
1151                    return Some((i, Token::Member, i + 1));
1152                }
1153                Some((i, '[')) => return Some((i, Token::OpenBracket, i + 1)),
1154                Some((i, ']')) => return Some((i, Token::CloseBracket, i + 1)),
1155                Some((i, ':')) => {
1156                    return match self.chars.peek() {
1157                        Some((_, '=')) => {
1158                            self.chars.next();
1159                            Some((i, Token::ColonAssign, i + 2))
1160                        }
1161                        _ => Some((i, Token::Colon, i + 1)),
1162                    };
1163                }
1164                Some((i, '?')) => return Some((i, Token::Question, i + 1)),
1165                Some((_, ch)) if ch.is_whitespace() => (),
1166                Some((start, _)) => {
1167                    let mut end;
1168
1169                    loop {
1170                        if let Some((i, ch)) = self.chars.next() {
1171                            end = i;
1172
1173                            if ch.is_whitespace() {
1174                                break;
1175                            }
1176                        } else {
1177                            end = self.input.len();
1178                            break;
1179                        }
1180                    }
1181
1182                    self.errors.push(LexicalError::UnrecognisedToken(
1183                        Loc::File(self.file_no, start, end),
1184                        self.input[start..end].to_owned(),
1185                    ));
1186                }
1187                None => return None, // End of file
1188            }
1189        }
1190    }
1191
1192    fn match_identifier(&mut self, start: usize) -> (&'input str, usize) {
1193        let end;
1194        loop {
1195            if let Some((i, ch)) = self.chars.peek() {
1196                if !UnicodeXID::is_xid_continue(*ch) && *ch != '$' {
1197                    end = *i;
1198                    break;
1199                }
1200                self.chars.next();
1201            } else {
1202                end = self.input.len();
1203                break;
1204            }
1205        }
1206
1207        (&self.input[start..end], end)
1208    }
1209}
1210
1211impl<'input> Iterator for Lexer<'input> {
1212    type Item = Spanned<'input>;
1213
1214    fn next(&mut self) -> Option<Self::Item> {
1215        // Lexer should be aware of whether the last two tokens were
1216        // pragma followed by identifier. If this is true, then special parsing should be
1217        // done for the pragma value
1218        if let [Some(Token::Pragma), Some(Token::Identifier(_))] = self.last_tokens {
1219            self.parse_semver = true;
1220        }
1221
1222        let token = self.next();
1223
1224        self.last_tokens = [
1225            self.last_tokens[1],
1226            match token {
1227                Some((_, n, _)) => Some(n),
1228                _ => None,
1229            },
1230        ];
1231
1232        token
1233    }
1234}
1235
1236#[cfg(test)]
1237mod tests {
1238    use super::*;
1239
1240    #[test]
1241    fn test_lexer() {
1242        let mut comments = Vec::new();
1243        let mut errors = Vec::new();
1244
1245        let multiple_errors = r#" 9ea -9e € bool hex uint8 hex"g"   /**  "#;
1246        let tokens = Lexer::new(multiple_errors, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1247        assert_eq!(
1248            tokens,
1249            vec![
1250                (3, Token::Identifier("a"), 4),
1251                (5, Token::Subtract, 6),
1252                (13, Token::Bool, 17),
1253                (18, Token::Identifier("hex"), 21),
1254                (22, Token::Uint(8), 27),
1255            ]
1256        );
1257
1258        assert_eq!(
1259            errors,
1260            vec![
1261                LexicalError::MissingExponent(Loc::File(0, 1, 42)),
1262                LexicalError::MissingExponent(Loc::File(0, 6, 42)),
1263                LexicalError::UnrecognisedToken(Loc::File(0, 9, 12), '€'.to_string()),
1264                LexicalError::InvalidCharacterInHexLiteral(Loc::File(0, 32, 33), 'g'),
1265                LexicalError::EndOfFileInComment(Loc::File(0, 37, 42)),
1266            ]
1267        );
1268
1269        let mut errors = Vec::new();
1270        let tokens = Lexer::new("bool", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1271
1272        assert_eq!(tokens, vec!((0, Token::Bool, 4)));
1273
1274        let tokens = Lexer::new("uint8", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1275
1276        assert_eq!(tokens, vec!((0, Token::Uint(8), 5)));
1277
1278        let tokens = Lexer::new("hex", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1279
1280        assert_eq!(tokens, vec!((0, Token::Identifier("hex"), 3)));
1281
1282        let tokens = Lexer::new(
1283            "hex\"cafe_dead\" /* adad*** */",
1284            0,
1285            &mut comments,
1286            &mut errors,
1287        )
1288        .collect::<Vec<_>>();
1289
1290        assert_eq!(tokens, vec!((0, Token::HexLiteral("hex\"cafe_dead\""), 14)));
1291
1292        let tokens = Lexer::new(
1293            "// foo bar\n0x00fead0_12 00090 0_0",
1294            0,
1295            &mut comments,
1296            &mut errors,
1297        )
1298        .collect::<Vec<_>>();
1299
1300        assert_eq!(
1301            tokens,
1302            vec!(
1303                (11, Token::HexNumber("0x00fead0_12"), 23),
1304                (24, Token::Number("00090", ""), 29),
1305                (30, Token::Number("0_0", ""), 33)
1306            )
1307        );
1308
1309        let tokens = Lexer::new(
1310            "// foo bar\n0x00fead0_12 9.0008 0_0",
1311            0,
1312            &mut comments,
1313            &mut errors,
1314        )
1315        .collect::<Vec<_>>();
1316
1317        assert_eq!(
1318            tokens,
1319            vec!(
1320                (11, Token::HexNumber("0x00fead0_12"), 23),
1321                (24, Token::RationalNumber("9", "0008", ""), 30),
1322                (31, Token::Number("0_0", ""), 34)
1323            )
1324        );
1325
1326        let tokens = Lexer::new(
1327            "// foo bar\n0x00fead0_12 .0008 0.9e2",
1328            0,
1329            &mut comments,
1330            &mut errors,
1331        )
1332        .collect::<Vec<_>>();
1333
1334        assert_eq!(
1335            tokens,
1336            vec!(
1337                (11, Token::HexNumber("0x00fead0_12"), 23),
1338                (24, Token::RationalNumber("", "0008", ""), 29),
1339                (30, Token::RationalNumber("0", "9", "2"), 35)
1340            )
1341        );
1342
1343        let tokens = Lexer::new(
1344            "// foo bar\n0x00fead0_12 .0008 0.9e-2-2",
1345            0,
1346            &mut comments,
1347            &mut errors,
1348        )
1349        .collect::<Vec<_>>();
1350
1351        assert_eq!(
1352            tokens,
1353            vec!(
1354                (11, Token::HexNumber("0x00fead0_12"), 23),
1355                (24, Token::RationalNumber("", "0008", ""), 29),
1356                (30, Token::RationalNumber("0", "9", "-2"), 36),
1357                (36, Token::Subtract, 37),
1358                (37, Token::Number("2", ""), 38)
1359            )
1360        );
1361
1362        let tokens = Lexer::new("1.2_3e2-", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1363
1364        assert_eq!(
1365            tokens,
1366            vec!(
1367                (0, Token::RationalNumber("1", "2_3", "2"), 7),
1368                (7, Token::Subtract, 8)
1369            )
1370        );
1371
1372        let tokens = Lexer::new("\"foo\"", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1373
1374        assert_eq!(tokens, vec!((0, Token::StringLiteral(false, "foo"), 5)));
1375
1376        let tokens = Lexer::new(
1377            "pragma solidity >=0.5.0 <0.7.0;",
1378            0,
1379            &mut comments,
1380            &mut errors,
1381        )
1382        .collect::<Vec<_>>();
1383
1384        assert_eq!(
1385            tokens,
1386            vec!(
1387                (0, Token::Pragma, 6),
1388                (7, Token::Identifier("solidity"), 15),
1389                (16, Token::MoreEqual, 18),
1390                (18, Token::Number("0", ""), 19),
1391                (19, Token::Member, 20),
1392                (20, Token::Number("5", ""), 21),
1393                (21, Token::Member, 22),
1394                (22, Token::Number("0", ""), 23),
1395                (24, Token::Less, 25),
1396                (25, Token::Number("0", ""), 26),
1397                (26, Token::Member, 27),
1398                (27, Token::Number("7", ""), 28),
1399                (28, Token::Member, 29),
1400                (29, Token::Number("0", ""), 30),
1401                (30, Token::Semicolon, 31),
1402            )
1403        );
1404
1405        let tokens = Lexer::new(
1406            "pragma solidity \t>=0.5.0 <0.7.0 \n ;",
1407            0,
1408            &mut comments,
1409            &mut errors,
1410        )
1411        .collect::<Vec<_>>();
1412
1413        assert_eq!(
1414            tokens,
1415            vec!(
1416                (0, Token::Pragma, 6),
1417                (7, Token::Identifier("solidity"), 15),
1418                (17, Token::MoreEqual, 19),
1419                (19, Token::Number("0", ""), 20),
1420                (20, Token::Member, 21),
1421                (21, Token::Number("5", ""), 22),
1422                (22, Token::Member, 23),
1423                (23, Token::Number("0", ""), 24),
1424                (25, Token::Less, 26),
1425                (26, Token::Number("0", ""), 27),
1426                (27, Token::Member, 28),
1427                (28, Token::Number("7", ""), 29),
1428                (29, Token::Member, 30),
1429                (30, Token::Number("0", ""), 31),
1430                (34, Token::Semicolon, 35),
1431            )
1432        );
1433
1434        let tokens =
1435            Lexer::new("pragma solidity 赤;", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1436
1437        assert_eq!(
1438            tokens,
1439            vec!(
1440                (0, Token::Pragma, 6),
1441                (7, Token::Identifier("solidity"), 15),
1442                (16, Token::Identifier("赤"), 19),
1443                (19, Token::Semicolon, 20)
1444            )
1445        );
1446
1447        let tokens = Lexer::new(">>= >> >= >", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1448
1449        assert_eq!(
1450            tokens,
1451            vec!(
1452                (0, Token::ShiftRightAssign, 3),
1453                (4, Token::ShiftRight, 6),
1454                (7, Token::MoreEqual, 9),
1455                (10, Token::More, 11),
1456            )
1457        );
1458
1459        let tokens = Lexer::new("<<= << <= <", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1460
1461        assert_eq!(
1462            tokens,
1463            vec!(
1464                (0, Token::ShiftLeftAssign, 3),
1465                (4, Token::ShiftLeft, 6),
1466                (7, Token::LessEqual, 9),
1467                (10, Token::Less, 11),
1468            )
1469        );
1470
1471        let tokens = Lexer::new("-16 -- - -=", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1472
1473        assert_eq!(
1474            tokens,
1475            vec!(
1476                (0, Token::Subtract, 1),
1477                (1, Token::Number("16", ""), 3),
1478                (4, Token::Decrement, 6),
1479                (7, Token::Subtract, 8),
1480                (9, Token::SubtractAssign, 11),
1481            )
1482        );
1483
1484        let tokens = Lexer::new("-4 ", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1485
1486        assert_eq!(
1487            tokens,
1488            vec!((0, Token::Subtract, 1), (1, Token::Number("4", ""), 2),)
1489        );
1490
1491        let mut errors = Vec::new();
1492        let _ = Lexer::new(r#"hex"abcdefg""#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1493
1494        assert_eq!(
1495            errors,
1496            vec![LexicalError::InvalidCharacterInHexLiteral(
1497                Loc::File(0, 10, 11),
1498                'g'
1499            )]
1500        );
1501
1502        let mut errors = Vec::new();
1503        let _ = Lexer::new(r#" € "#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1504
1505        assert_eq!(
1506            errors,
1507            vec!(LexicalError::UnrecognisedToken(
1508                Loc::File(0, 1, 4),
1509                "€".to_owned()
1510            ))
1511        );
1512
1513        let mut errors = Vec::new();
1514        let _ = Lexer::new(r#"€"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1515
1516        assert_eq!(
1517            errors,
1518            vec!(LexicalError::UnrecognisedToken(
1519                Loc::File(0, 0, 3),
1520                "€".to_owned()
1521            ))
1522        );
1523
1524        let tokens =
1525            Lexer::new(r#"pragma foo bar"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1526
1527        assert_eq!(
1528            tokens,
1529            vec!(
1530                (0, Token::Pragma, 6),
1531                (7, Token::Identifier("foo"), 10),
1532                (11, Token::Identifier("bar"), 14),
1533            )
1534        );
1535
1536        comments.truncate(0);
1537
1538        let tokens = Lexer::new(r#"/// foo"#, 0, &mut comments, &mut errors).count();
1539
1540        assert_eq!(tokens, 0);
1541        assert_eq!(
1542            comments,
1543            vec![Comment::DocLine(Loc::File(0, 0, 7), "/// foo".to_owned())],
1544        );
1545
1546        comments.truncate(0);
1547
1548        let tokens = Lexer::new("/// jadajadadjada\n// bar", 0, &mut comments, &mut errors).count();
1549
1550        assert_eq!(tokens, 0);
1551        assert_eq!(
1552            comments,
1553            vec!(
1554                Comment::DocLine(Loc::File(0, 0, 17), "/// jadajadadjada".to_owned()),
1555                Comment::Line(Loc::File(0, 18, 24), "// bar".to_owned())
1556            )
1557        );
1558
1559        comments.truncate(0);
1560
1561        let tokens = Lexer::new("/**/", 0, &mut comments, &mut errors).count();
1562
1563        assert_eq!(tokens, 0);
1564        assert_eq!(
1565            comments,
1566            vec!(Comment::Block(Loc::File(0, 0, 4), "/**/".to_owned()))
1567        );
1568
1569        comments.truncate(0);
1570
1571        let tokens = Lexer::new(r#"/** foo */"#, 0, &mut comments, &mut errors).count();
1572
1573        assert_eq!(tokens, 0);
1574        assert_eq!(
1575            comments,
1576            vec!(Comment::DocBlock(
1577                Loc::File(0, 0, 10),
1578                "/** foo */".to_owned()
1579            ))
1580        );
1581
1582        comments.truncate(0);
1583
1584        let tokens = Lexer::new(
1585            "/** jadajadadjada */\n/* bar */",
1586            0,
1587            &mut comments,
1588            &mut errors,
1589        )
1590        .count();
1591
1592        assert_eq!(tokens, 0);
1593        assert_eq!(
1594            comments,
1595            vec!(
1596                Comment::DocBlock(Loc::File(0, 0, 20), "/** jadajadadjada */".to_owned()),
1597                Comment::Block(Loc::File(0, 21, 30), "/* bar */".to_owned())
1598            )
1599        );
1600
1601        let tokens = Lexer::new("/************/", 0, &mut comments, &mut errors).next();
1602        assert_eq!(tokens, None);
1603
1604        let mut errors = Vec::new();
1605        let _ = Lexer::new("/**", 0, &mut comments, &mut errors).next();
1606        assert_eq!(
1607            errors,
1608            vec!(LexicalError::EndOfFileInComment(Loc::File(0, 0, 3)))
1609        );
1610
1611        let mut errors = Vec::new();
1612        let tokens = Lexer::new("//////////////", 0, &mut comments, &mut errors).next();
1613        assert_eq!(tokens, None);
1614
1615        // some unicode tests
1616        let tokens = Lexer::new(
1617            ">=\u{a0} . très\u{2028}αβγδεζηθικλμνξοπρστυφχψω\u{85}カラス",
1618            0,
1619            &mut comments,
1620            &mut errors,
1621        )
1622        .collect::<Vec<_>>();
1623
1624        assert_eq!(
1625            tokens,
1626            vec!(
1627                (0, Token::MoreEqual, 2),
1628                (5, Token::Member, 6),
1629                (7, Token::Identifier("très"), 12),
1630                (15, Token::Identifier("αβγδεζηθικλμνξοπρστυφχψω"), 63),
1631                (65, Token::Identifier("カラス"), 74)
1632            )
1633        );
1634
1635        let tokens = Lexer::new(r#"unicode"€""#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1636
1637        assert_eq!(tokens, vec!((0, Token::StringLiteral(true, "€"), 12)));
1638
1639        let tokens =
1640            Lexer::new(r#"unicode "€""#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1641
1642        assert_eq!(
1643            tokens,
1644            vec!(
1645                (0, Token::Identifier("unicode"), 7),
1646                (8, Token::StringLiteral(false, "€"), 13),
1647            )
1648        );
1649
1650        // scientific notation
1651        let tokens = Lexer::new(r#" 1e0 "#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1652
1653        assert_eq!(tokens, vec!((1, Token::Number("1", "0"), 4)));
1654
1655        let tokens = Lexer::new(r#" -9e0123"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1656
1657        assert_eq!(
1658            tokens,
1659            vec!((1, Token::Subtract, 2), (2, Token::Number("9", "0123"), 8),)
1660        );
1661
1662        let mut errors = Vec::new();
1663        let tokens = Lexer::new(r#" -9e"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1664
1665        assert_eq!(tokens, vec!((1, Token::Subtract, 2)));
1666        assert_eq!(
1667            errors,
1668            vec!(LexicalError::MissingExponent(Loc::File(0, 2, 4)))
1669        );
1670
1671        let mut errors = Vec::new();
1672        let tokens = Lexer::new(r#"9ea"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1673
1674        assert_eq!(tokens, vec!((2, Token::Identifier("a"), 3)));
1675        assert_eq!(
1676            errors,
1677            vec!(LexicalError::MissingExponent(Loc::File(0, 0, 3)))
1678        );
1679
1680        let mut errors = Vec::new();
1681        let tokens = Lexer::new(r#"42.a"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1682
1683        assert_eq!(
1684            tokens,
1685            vec!(
1686                (0, Token::Number("42", ""), 2),
1687                (2, Token::Member, 3),
1688                (3, Token::Identifier("a"), 4)
1689            )
1690        );
1691
1692        let tokens = Lexer::new(r#"42..a"#, 0, &mut comments, &mut errors).collect::<Vec<_>>();
1693
1694        assert_eq!(
1695            tokens,
1696            vec!(
1697                (0, Token::Number("42", ""), 2),
1698                (2, Token::Member, 3),
1699                (3, Token::Member, 4),
1700                (4, Token::Identifier("a"), 5)
1701            )
1702        );
1703
1704        comments.truncate(0);
1705
1706        let tokens = Lexer::new("/// jadajadadjada\n// bar", 0, &mut comments, &mut errors).count();
1707
1708        assert_eq!(tokens, 0);
1709        assert_eq!(
1710            comments,
1711            vec!(
1712                Comment::DocLine(Loc::File(0, 0, 17), "/// jadajadadjada".to_owned()),
1713                Comment::Line(Loc::File(0, 18, 24), "// bar".to_owned())
1714            )
1715        );
1716
1717        comments.truncate(0);
1718
1719        let tokens = Lexer::new("/**/", 0, &mut comments, &mut errors).count();
1720
1721        assert_eq!(tokens, 0);
1722        assert_eq!(
1723            comments,
1724            vec!(Comment::Block(Loc::File(0, 0, 4), "/**/".to_owned()))
1725        );
1726
1727        comments.truncate(0);
1728
1729        let tokens = Lexer::new(r#"/** foo */"#, 0, &mut comments, &mut errors).count();
1730
1731        assert_eq!(tokens, 0);
1732        assert_eq!(
1733            comments,
1734            vec!(Comment::DocBlock(
1735                Loc::File(0, 0, 10),
1736                "/** foo */".to_owned()
1737            ))
1738        );
1739
1740        comments.truncate(0);
1741
1742        let tokens = Lexer::new(
1743            "/** jadajadadjada */\n/* bar */",
1744            0,
1745            &mut comments,
1746            &mut errors,
1747        )
1748        .count();
1749
1750        assert_eq!(tokens, 0);
1751        assert_eq!(
1752            comments,
1753            vec!(
1754                Comment::DocBlock(Loc::File(0, 0, 20), "/** jadajadadjada */".to_owned()),
1755                Comment::Block(Loc::File(0, 21, 30), "/* bar */".to_owned())
1756            )
1757        );
1758
1759        let tokens = Lexer::new("/************/", 0, &mut comments, &mut errors).next();
1760        assert_eq!(tokens, None);
1761
1762        let mut errors = Vec::new();
1763        let _ = Lexer::new("/**", 0, &mut comments, &mut errors).next();
1764        assert_eq!(
1765            errors,
1766            vec!(LexicalError::EndOfFileInComment(Loc::File(0, 0, 3)))
1767        );
1768
1769        let mut errors = Vec::new();
1770        let tokens = Lexer::new("//////////////", 0, &mut comments, &mut errors).next();
1771        assert_eq!(tokens, None);
1772
1773        // some unicode tests
1774        let tokens = Lexer::new(
1775            ">=\u{a0} . très\u{2028}αβγδεζηθικλμνξοπρστυφχψω\u{85}カラス",
1776            0,
1777            &mut comments,
1778            &mut errors,
1779        )
1780        .collect::<Vec<(usize, Token, usize)>>();
1781
1782        assert_eq!(
1783            tokens,
1784            vec!(
1785                (0, Token::MoreEqual, 2),
1786                (5, Token::Member, 6),
1787                (7, Token::Identifier("très"), 12),
1788                (15, Token::Identifier("αβγδεζηθικλμνξοπρστυφχψω"), 63),
1789                (65, Token::Identifier("カラス"), 74)
1790            )
1791        );
1792
1793        let tokens =
1794            Lexer::new(r#"unicode"€""#, 0, &mut comments, &mut errors)
1795                .collect::<Vec<(usize, Token, usize)>>();
1796
1797        assert_eq!(tokens, vec!((0, Token::StringLiteral(true, "€"), 12)));
1798
1799        let tokens =
1800            Lexer::new(r#"unicode "€""#, 0, &mut comments, &mut errors)
1801                .collect::<Vec<(usize, Token, usize)>>();
1802
1803        assert_eq!(
1804            tokens,
1805            vec!(
1806                (0, Token::Identifier("unicode"), 7),
1807                (8, Token::StringLiteral(false, "€"), 13),
1808            )
1809        );
1810
1811        // scientific notation
1812        let tokens =
1813            Lexer::new(r#" 1e0 "#, 0, &mut comments, &mut errors)
1814                .collect::<Vec<(usize, Token, usize)>>();
1815
1816        assert_eq!(tokens, vec!((1, Token::Number("1", "0"), 4)));
1817
1818        let tokens =
1819            Lexer::new(r#" -9e0123"#, 0, &mut comments, &mut errors)
1820                .collect::<Vec<(usize, Token, usize)>>();
1821
1822        assert_eq!(
1823            tokens,
1824            vec!((1, Token::Subtract, 2), (2, Token::Number("9", "0123"), 8),)
1825        );
1826
1827        let mut errors = Vec::new();
1828        let tokens = Lexer::new(r#" -9e"#, 0, &mut comments, &mut errors)
1829            .collect::<Vec<(usize, Token, usize)>>();
1830
1831        assert_eq!(tokens, vec!((1, Token::Subtract, 2)));
1832        assert_eq!(
1833            errors,
1834            vec!(LexicalError::MissingExponent(Loc::File(0, 2, 4)))
1835        );
1836
1837        let mut errors = Vec::new();
1838        let tokens = Lexer::new(r#"9ea"#, 0, &mut comments, &mut errors)
1839            .collect::<Vec<(usize, Token, usize)>>();
1840
1841        assert_eq!(tokens, vec!((2, Token::Identifier("a"), 3)));
1842        assert_eq!(
1843            errors,
1844            vec!(LexicalError::MissingExponent(Loc::File(0, 0, 3)))
1845        );
1846
1847        let mut errors = Vec::new();
1848        let tokens = Lexer::new(r#"42.a"#, 0, &mut comments, &mut errors)
1849            .collect::<Vec<(usize, Token, usize)>>();
1850
1851        assert_eq!(
1852            tokens,
1853            vec!(
1854                (0, Token::Number("42", ""), 2),
1855                (2, Token::Member, 3),
1856                (3, Token::Identifier("a"), 4)
1857            )
1858        );
1859
1860        let tokens =
1861            Lexer::new(r#"42..a"#, 0, &mut comments, &mut errors)
1862                .collect::<Vec<(usize, Token, usize)>>();
1863
1864        assert_eq!(
1865            tokens,
1866            vec!(
1867                (0, Token::Number("42", ""), 2),
1868                (2, Token::Member, 3),
1869                (3, Token::Member, 4),
1870                (4, Token::Identifier("a"), 5)
1871            )
1872        );
1873
1874        let mut errors = Vec::new();
1875        let _ = Lexer::new(r#"hex"g""#, 0, &mut comments, &mut errors)
1876            .collect::<Vec<(usize, Token, usize)>>();
1877        assert_eq!(
1878            errors,
1879            vec!(LexicalError::InvalidCharacterInHexLiteral(
1880                Loc::File(0, 4, 5),
1881                'g'
1882            ),)
1883        );
1884
1885        let mut errors = Vec::new();
1886        let tokens =
1887            Lexer::new(".9", 0, &mut comments, &mut errors).collect::<Vec<(usize, Token, usize)>>();
1888
1889        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", ""), 2)));
1890
1891        let mut errors = Vec::new();
1892        let tokens = Lexer::new(".9e10", 0, &mut comments, &mut errors)
1893            .collect::<Vec<(usize, Token, usize)>>();
1894
1895        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", "10"), 5)));
1896
1897        let mut errors = Vec::new();
1898        let tokens = Lexer::new(".9", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1899
1900        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", ""), 2)));
1901
1902        let mut errors = Vec::new();
1903        let tokens = Lexer::new(".9e10", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1904
1905        assert_eq!(tokens, vec!((0, Token::RationalNumber("", "9", "10"), 5)));
1906
1907        errors.clear();
1908        comments.clear();
1909        let tokens =
1910            Lexer::new("@my_annotation", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1911        assert_eq!(tokens, vec![(0, Token::Annotation("my_annotation"), 14)]);
1912        assert!(errors.is_empty());
1913        assert!(comments.is_empty());
1914
1915        errors.clear();
1916        comments.clear();
1917        let tokens =
1918            Lexer::new("@ my_annotation", 0, &mut comments, &mut errors).collect::<Vec<_>>();
1919        assert_eq!(tokens, vec![(2, Token::Identifier("my_annotation"), 15)]);
1920        assert_eq!(
1921            errors,
1922            vec![LexicalError::UnrecognisedToken(
1923                Loc::File(0, 0, 1),
1924                "@".to_string()
1925            )]
1926        );
1927        assert!(comments.is_empty());
1928    }
1929}