rustc_ap_rustc_lexer/lib.rs
//! Low-level Rust lexer.
//!
//! The idea with `rustc_lexer` is to make a reusable library by separating out
//! pure lexing from rustc-specific concerns like spans, error reporting, and
//! interning. So `rustc_lexer` operates directly on `&str`, produces simple
//! tokens which are a pair of a type tag and a bit of original text, and does
//! not report errors, instead storing them as flags on the token.
//!
//! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
//! For that see [`rustc_parse::lexer`], which converts this basic token stream
//! into wide tokens used by the actual parser.
//!
//! The purpose of this crate is to convert raw sources into a labeled sequence
//! of well-known token types, so building an actual Rust token stream will
//! be easier.
//!
//! The main entity of this crate is the [`TokenKind`] enum which represents common
//! lexeme types.
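//!
//! A minimal usage sketch (the crate path `rustc_lexer` is assumed here, so the
//! example is marked `ignore`):
//!
//! ```ignore
//! use rustc_lexer::{tokenize, TokenKind};
//!
//! // Lex a small snippet and collect the token kinds.
//! let kinds: Vec<TokenKind> = tokenize("let x = 5;").map(|token| token.kind).collect();
//! assert_eq!(kinds[0], TokenKind::Ident); // `let` (keywords are plain idents at this stage)
//! assert_eq!(kinds[4], TokenKind::Eq); // `=`
//! assert_eq!(kinds[7], TokenKind::Semi); // `;`
//! ```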
//!
//! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
// We want to be able to build this crate with a stable compiler, so no
// `#![feature]` attributes should be added.

mod cursor;
pub mod unescape;

#[cfg(test)]
mod tests;

use self::LiteralKind::*;
use self::TokenKind::*;
use crate::cursor::{Cursor, EOF_CHAR};
use std::convert::TryFrom;

/// Parsed token.
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
#[derive(Debug)]
pub struct Token {
    pub kind: TokenKind,
    pub len: usize,
}

impl Token {
    fn new(kind: TokenKind, len: usize) -> Token {
        Token { kind, len }
    }
}

/// Enum representing common lexeme types.
// perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    // Multi-char tokens:
    /// `// comment`
    LineComment { doc_style: Option<DocStyle> },
    /// `/* block comment */`
    ///
    /// Block comments can be nested, so a sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
    BlockComment { doc_style: Option<DocStyle>, terminated: bool },
    /// Any whitespace character sequence.
    Whitespace,
    /// `ident` or `continue`
    /// At this step, keywords are also considered identifiers.
    Ident,
    /// `r#ident`
    RawIdent,
    /// An unknown prefix, like `foo#`, `foo'`, `foo"`. Note that only the
    /// prefix (`foo`) is included in the token, not the separator (which is
    /// lexed as its own distinct token). In Rust 2021 and later, reserved
    /// prefixes are reported as errors; in earlier editions, they result in a
    /// (allowed by default) lint, and are treated as regular identifier
    /// tokens.
    UnknownPrefix,
    /// Literals, e.g. `12_u8`, `1.0e-40`, `b"123"`. `suffix_start` is the
    /// offset within the token at which the literal's suffix (e.g. the `u8`
    /// in `12_u8`) begins. See `LiteralKind` for more details.
    Literal { kind: LiteralKind, suffix_start: usize },
    /// `'a`
    Lifetime { starts_with_number: bool },

    // One-char tokens:
    /// `;`
    Semi,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// `(`
    OpenParen,
    /// `)`
    CloseParen,
    /// `{`
    OpenBrace,
    /// `}`
    CloseBrace,
    /// `[`
    OpenBracket,
    /// `]`
    CloseBracket,
    /// `@`
    At,
    /// `#`
    Pound,
    /// `~`
    Tilde,
    /// `?`
    Question,
    /// `:`
    Colon,
    /// `$`
    Dollar,
    /// `=`
    Eq,
    /// `!`
    Bang,
    /// `<`
    Lt,
    /// `>`
    Gt,
    /// `-`
    Minus,
    /// `&`
    And,
    /// `|`
    Or,
    /// `+`
    Plus,
    /// `*`
    Star,
    /// `/`
    Slash,
    /// `^`
    Caret,
    /// `%`
    Percent,

    /// Unknown token, not expected by the lexer, e.g. `№`
    Unknown,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStyle {
    Outer,
    Inner,
}

#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// `12_u8`, `0o100`, `0b120i99`
    Int { base: Base, empty_int: bool },
    /// `12.34f32`, `0b100.100`
    Float { base: Base, empty_exponent: bool },
    /// `'a'`, `'\\'`, `'''`, `';`
    Char { terminated: bool },
    /// `b'a'`, `b'\\'`, `b'''`, `b';`
    Byte { terminated: bool },
    /// `"abc"`, `"abc`
    Str { terminated: bool },
    /// `b"abc"`, `b"abc`
    ByteStr { terminated: bool },
    /// `r"abc"`, `r#"abc"#`, `r####"ab"###"c"####`, `r#"a`
    RawStr { n_hashes: u16, err: Option<RawStrError> },
    /// `br"abc"`, `br#"abc"#`, `br####"ab"###"c"####`, `br#"a`
    RawByteStr { n_hashes: u16, err: Option<RawStrError> },
}

/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `TooManyDelimiters`
// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non-`#` characters exist between `r` and `"`, e.g. `r#~"..`
    InvalidStarter { bad_char: char },
    /// The string was never terminated. `possible_terminator_offset` is the number of characters
    /// after `r` or `br` where they may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s exist.
    TooManyDelimiters { found: usize },
}

/// Base of numeric literal encoding according to its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b".
    Binary,
    /// Literal starts with "0o".
    Octal,
    /// Literal starts with "0x".
    Hexadecimal,
    /// Literal doesn't contain a prefix.
    Decimal,
}

/// `rustc` allows files to have a shebang, e.g. `#!/usr/bin/rustrun`,
/// but a shebang isn't part of Rust syntax.
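///
/// On success, returns the number of bytes in the shebang line (excluding the
/// trailing newline). A usage sketch (the crate path is assumed, so the
/// example is marked `ignore`):
///
/// ```ignore
/// use rustc_lexer::strip_shebang;
///
/// let src = "#!/usr/bin/env run-cargo-script\nfn main() {}";
/// // The returned length covers the shebang line, so lexing can start after it.
/// let shebang_len = strip_shebang(src).unwrap_or(0);
/// assert!(src[shebang_len..].starts_with("\nfn main"));
/// ```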
pub fn strip_shebang(input: &str) -> Option<usize> {
    // Shebang must start with `#!` literally, without any preceding whitespace.
    // For simplicity we consider any line starting with `#!` a shebang,
    // regardless of restrictions put on shebangs by specific platforms.
    if let Some(input_tail) = input.strip_prefix("#!") {
        // Ok, this is a shebang but if the next non-whitespace token is `[`,
        // then it may be valid Rust code, so consider it Rust code.
        let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| {
            !matches!(
                tok,
                TokenKind::Whitespace
                    | TokenKind::LineComment { doc_style: None }
                    | TokenKind::BlockComment { doc_style: None, .. }
            )
        });
        if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
            // No other choice than to consider this a shebang.
            return Some(2 + input_tail.lines().next().unwrap_or_default().len());
        }
    }
    None
}

/// Parses the first token from the provided input string.
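///
/// A minimal check of the contract (crate path assumed, marked `ignore`):
///
/// ```ignore
/// use rustc_lexer::{first_token, TokenKind};
///
/// // Only one token is consumed; `+=` is lexed as two one-char tokens.
/// let token = first_token("+= x");
/// assert_eq!(token.kind, TokenKind::Plus);
/// assert_eq!(token.len, 1);
/// ```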
pub fn first_token(input: &str) -> Token {
    debug_assert!(!input.is_empty());
    Cursor::new(input).advance_token()
}

/// Creates an iterator that produces tokens from the input string.
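///
/// Tokens carry only a kind and a length; a sketch of recovering the matched
/// text by tracking a running offset (crate path assumed, marked `ignore`):
///
/// ```ignore
/// use rustc_lexer::tokenize;
///
/// let src = "let x = 5;";
/// let mut offset = 0;
/// for token in tokenize(src) {
///     // Each token's text is the next `token.len` bytes of the input.
///     let text = &src[offset..offset + token.len];
///     offset += token.len;
///     println!("{:?}: {:?}", token.kind, text);
/// }
/// ```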
pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
    std::iter::from_fn(move || {
        if input.is_empty() {
            return None;
        }
        let token = first_token(input);
        input = &input[token.len..];
        Some(token)
    })
}

/// True if `c` is considered whitespace according to the Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (i.e., it doesn't change with different
    // Unicode versions), so it's OK to just hard-code the values.

    matches!(
        c,
        // Usual ASCII suspects
        '\u{0009}'   // \t
        | '\u{000A}' // \n
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\u{000D}' // \r
        | '\u{0020}' // space

        // NEXT LINE from latin1
        | '\u{0085}'

        // Bidi markers
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK

        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
    )
}

/// True if `c` is valid as a first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_start(c: char) -> bool {
    // This is XID_Start OR '_' (which formally is not an XID_Start).
    // We also add a fast path for ASCII idents.
    ('a'..='z').contains(&c)
        || ('A'..='Z').contains(&c)
        || c == '_'
        || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_start(c))
}

/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
pub fn is_id_continue(c: char) -> bool {
    // This is exactly XID_Continue.
    // We also add a fast path for ASCII idents.
    ('a'..='z').contains(&c)
        || ('A'..='Z').contains(&c)
        || ('0'..='9').contains(&c)
        || c == '_'
        || (c > '\x7f' && unicode_xid::UnicodeXID::is_xid_continue(c))
}

/// True if the passed string is lexically an identifier.
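///
/// A few illustrative checks (crate path assumed, marked `ignore`):
///
/// ```ignore
/// use rustc_lexer::is_ident;
///
/// assert!(is_ident("foo"));
/// assert!(is_ident("_bar")); // `_` counts as a valid start
/// assert!(!is_ident("2fast")); // identifiers can't start with a digit
/// assert!(!is_ident("")); // the empty string is not an identifier
/// ```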
pub fn is_ident(string: &str) -> bool {
    let mut chars = string.chars();
    if let Some(start) = chars.next() {
        is_id_start(start) && chars.all(is_id_continue)
    } else {
        false
    }
}

impl Cursor<'_> {
    /// Parses a token from the input string.
    fn advance_token(&mut self) -> Token {
        let first_char = self.bump().unwrap();
        let token_kind = match first_char {
            // Slash, line comment, or block comment.
            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // Raw identifier, raw string literal, or identifier.
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let (n_hashes, err) = self.raw_double_quoted_string(1);
                    let suffix_start = self.len_consumed();
                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Byte literal, byte string literal, raw byte string literal, or identifier.
            'b' => match (self.first(), self.second()) {
                ('\'', _) => {
                    self.bump();
                    let terminated = self.single_quoted_string();
                    let suffix_start = self.len_consumed();
                    if terminated {
                        self.eat_literal_suffix();
                    }
                    let kind = Byte { terminated };
                    Literal { kind, suffix_start }
                }
                ('"', _) => {
                    self.bump();
                    let terminated = self.double_quoted_string();
                    let suffix_start = self.len_consumed();
                    if terminated {
                        self.eat_literal_suffix();
                    }
                    let kind = ByteStr { terminated };
                    Literal { kind, suffix_start }
                }
                ('r', '"') | ('r', '#') => {
                    self.bump();
                    let (n_hashes, err) = self.raw_double_quoted_string(2);
                    let suffix_start = self.len_consumed();
                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawByteStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Identifier (this should be checked after other variants that can
            // start as an identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),

            // Numeric literal.
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.len_consumed();
                self.eat_literal_suffix();
                TokenKind::Literal { kind: literal_kind, suffix_start }
            }

            // One-symbol tokens.
            ';' => Semi,
            ',' => Comma,
            '.' => Dot,
            '(' => OpenParen,
            ')' => CloseParen,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '@' => At,
            '#' => Pound,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '$' => Dollar,
            '=' => Eq,
            '!' => Bang,
            '<' => Lt,
            '>' => Gt,
            '-' => Minus,
            '&' => And,
            '|' => Or,
            '+' => Plus,
            '*' => Star,
            '^' => Caret,
            '%' => Percent,

            // Lifetime or character literal.
            '\'' => self.lifetime_or_char(),

            // String literal.
            '"' => {
                let terminated = self.double_quoted_string();
                let suffix_start = self.len_consumed();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
            _ => Unknown,
        };
        Token::new(token_kind, self.len_consumed())
    }

    fn line_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '/');
        self.bump();

        let doc_style = match self.first() {
            // `//!` is an inner line doc comment.
            '!' => Some(DocStyle::Inner),
            // `////` (more than 3 slashes) is not considered a doc comment.
            '/' if self.second() != '/' => Some(DocStyle::Outer),
            _ => None,
        };

        self.eat_while(|c| c != '\n');
        LineComment { doc_style }
    }

    fn block_comment(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '/' && self.first() == '*');
        self.bump();

        let doc_style = match self.first() {
            // `/*!` is an inner block doc comment.
            '!' => Some(DocStyle::Inner),
            // `/***` (more than 2 stars) is not considered a doc comment.
            // `/**/` is not considered a doc comment.
            '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
            _ => None,
        };

        let mut depth = 1usize;
        while let Some(c) = self.bump() {
            match c {
                '/' if self.first() == '*' => {
                    self.bump();
                    depth += 1;
                }
                '*' if self.first() == '/' => {
                    self.bump();
                    depth -= 1;
                    if depth == 0 {
                        // This block comment is closed, so for a construction like "/* */ */"
                        // there will be a successfully parsed block comment "/* */"
                        // and " */" will be processed separately.
                        break;
                    }
                }
                _ => (),
            }
        }

        BlockComment { doc_style, terminated: depth == 0 }
    }

    fn whitespace(&mut self) -> TokenKind {
        debug_assert!(is_whitespace(self.prev()));
        self.eat_while(is_whitespace);
        Whitespace
    }

    fn raw_ident(&mut self) -> TokenKind {
        debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
        // Eat "#" symbol.
        self.bump();
        // Eat the identifier part of RawIdent.
        self.eat_identifier();
        RawIdent
    }

    fn ident_or_unknown_prefix(&mut self) -> TokenKind {
        debug_assert!(is_id_start(self.prev()));
        // Start is already eaten, eat the rest of identifier.
        self.eat_while(is_id_continue);
        // Known prefixes must have been handled earlier. So if
        // we see a prefix here, it is definitely an unknown prefix.
        match self.first() {
            '#' | '"' | '\'' => UnknownPrefix,
            _ => Ident,
        }
    }

    fn number(&mut self, first_digit: char) -> LiteralKind {
        debug_assert!('0' <= self.prev() && self.prev() <= '9');
        let mut base = Base::Decimal;
        if first_digit == '0' {
            // Attempt to parse encoding base.
            let has_digits = match self.first() {
                'b' => {
                    base = Base::Binary;
                    self.bump();
                    self.eat_decimal_digits()
                }
                'o' => {
                    base = Base::Octal;
                    self.bump();
                    self.eat_decimal_digits()
                }
                'x' => {
                    base = Base::Hexadecimal;
                    self.bump();
                    self.eat_hexadecimal_digits()
                }
                // Not a base prefix.
                '0'..='9' | '_' | '.' | 'e' | 'E' => {
                    self.eat_decimal_digits();
                    true
                }
                // Just a 0.
                _ => return Int { base, empty_int: false },
            };
            // Base prefix was provided, but there were no digits
            // after it, e.g. "0x".
            if !has_digits {
                return Int { base, empty_int: true };
            }
        } else {
            // No base prefix, parse number in the usual way.
            self.eat_decimal_digits();
        };

        match self.first() {
            // Don't be greedy if this is actually an
            // integer literal followed by field/method access or a range pattern
            // (`0..2` and `12.foo()`)
            '.' if self.second() != '.' && !is_id_start(self.second()) => {
                // might have stuff after the ., and if it does, it needs to start
                // with a number
                self.bump();
                let mut empty_exponent = false;
                if self.first().is_digit(10) {
                    self.eat_decimal_digits();
                    match self.first() {
                        'e' | 'E' => {
                            self.bump();
                            empty_exponent = !self.eat_float_exponent();
                        }
                        _ => (),
                    }
                }
                Float { base, empty_exponent }
            }
            'e' | 'E' => {
                self.bump();
                let empty_exponent = !self.eat_float_exponent();
                Float { base, empty_exponent }
            }
            _ => Int { base, empty_int: false },
        }
    }

    fn lifetime_or_char(&mut self) -> TokenKind {
        debug_assert!(self.prev() == '\'');

        let can_be_a_lifetime = if self.second() == '\'' {
            // It's surely not a lifetime.
            false
        } else {
            // If the first symbol is valid for an identifier, it can be a lifetime.
            // Also check if it's a number, for better error reporting (so '0 will
            // be reported as an invalid lifetime and not as an unterminated char literal).
            is_id_start(self.first()) || self.first().is_digit(10)
        };

        if !can_be_a_lifetime {
            let terminated = self.single_quoted_string();
            let suffix_start = self.len_consumed();
            if terminated {
                self.eat_literal_suffix();
            }
            let kind = Char { terminated };
            return Literal { kind, suffix_start };
        }

        // Either a lifetime or a character literal with
        // length greater than 1.

        let starts_with_number = self.first().is_digit(10);

        // Skip the literal contents.
        // First symbol can be a number (which isn't a valid identifier start),
        // so skip it without any checks.
        self.bump();
        self.eat_while(is_id_continue);

        // Check if after skipping literal contents we've met a closing
        // single quote (which means that the user attempted to create a
        // string with single quotes).
        if self.first() == '\'' {
            self.bump();
            let kind = Char { terminated: true };
            Literal { kind, suffix_start: self.len_consumed() }
        } else {
            Lifetime { starts_with_number }
        }
    }

    fn single_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '\'');
        // Check if it's a one-symbol literal.
        if self.second() == '\'' && self.first() != '\\' {
            self.bump();
            self.bump();
            return true;
        }

        // Literal has more than one symbol.

        // Parse until either quotes are terminated or error is detected.
        loop {
            match self.first() {
                // Quotes are terminated, finish parsing.
                '\'' => {
                    self.bump();
                    return true;
                }
                // Probably the beginning of a comment, which we don't want to include
                // in the error report.
                '/' => break,
                // Newline without a following '\'' means an unclosed quote, stop parsing.
                '\n' if self.second() != '\'' => break,
                // End of file, stop parsing.
                EOF_CHAR if self.is_eof() => break,
                // An escape sequence (backslash plus the escaped character) is
                // considered one character, so bump twice.
                '\\' => {
                    self.bump();
                    self.bump();
                }
                // Skip the character.
                _ => {
                    self.bump();
                }
            }
        }
        // String was not terminated.
        false
    }

    /// Eats a double-quoted string and returns `true`
    /// if the string is terminated.
    fn double_quoted_string(&mut self) -> bool {
        debug_assert!(self.prev() == '"');
        while let Some(c) = self.bump() {
            match c {
                '"' => {
                    return true;
                }
                '\\' if self.first() == '\\' || self.first() == '"' => {
                    // Bump again to skip the escaped character.
                    self.bump();
                }
                _ => (),
            }
        }
        // End of file reached.
        false
    }

    /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
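    ///
    /// For example, after the leading `r` of `r##"abc"##` has been consumed
    /// (`prefix_len == 1`), this returns `(2, None)`; for the unterminated
    /// `r##"abc"#` it returns a `NoTerminator` error alongside the hash count.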
    fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
        // Wrap the actual function to handle the error with too many hashes.
        // This way, it eats the whole raw string.
        let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
        // Only up to 65535 `#`s are allowed in raw strings
        match u16::try_from(n_hashes) {
            Ok(num) => (num, err),
            // We lie about the number of hashes here :P
            Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
        }
    }

    fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
        debug_assert!(self.prev() == 'r');
        let start_pos = self.len_consumed();
        let mut possible_terminator_offset = None;
        let mut max_hashes = 0;

        // Count opening '#' symbols.
        let mut eaten = 0;
        while self.first() == '#' {
            eaten += 1;
            self.bump();
        }
        let n_start_hashes = eaten;

        // Check that the string is started.
        match self.bump() {
            Some('"') => (),
            c => {
                let c = c.unwrap_or(EOF_CHAR);
                return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
            }
        }

        // Skip the string contents and, on each '"' character met, check if this is
        // a raw string termination.
        loop {
            self.eat_while(|c| c != '"');

            if self.is_eof() {
                return (
                    n_start_hashes,
                    Some(RawStrError::NoTerminator {
                        expected: n_start_hashes,
                        found: max_hashes,
                        possible_terminator_offset,
                    }),
                );
            }

            // Eat closing double quote.
            self.bump();

            // Check that the amount of closing '#' symbols
            // is equal to the amount of opening ones.
            // Note that this will not consume extra trailing `#` characters:
            // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
            // followed by a `#` token.
            let mut n_end_hashes = 0;
            while self.first() == '#' && n_end_hashes < n_start_hashes {
                n_end_hashes += 1;
                self.bump();
            }

            if n_end_hashes == n_start_hashes {
                return (n_start_hashes, None);
            } else if n_end_hashes > max_hashes {
                // Keep track of possible terminators to give a hint about
                // where there might be a missing terminator
                possible_terminator_offset =
                    Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
                max_hashes = n_end_hashes;
            }
        }
    }

    fn eat_decimal_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
            match self.first() {
                '_' => {
                    self.bump();
                }
                '0'..='9' => {
                    has_digits = true;
                    self.bump();
                }
                _ => break,
            }
        }
        has_digits
    }

    fn eat_hexadecimal_digits(&mut self) -> bool {
        let mut has_digits = false;
        loop {
            match self.first() {
                '_' => {
                    self.bump();
                }
                '0'..='9' | 'a'..='f' | 'A'..='F' => {
                    has_digits = true;
                    self.bump();
                }
                _ => break,
            }
        }
        has_digits
    }

    /// Eats the float exponent. Returns `true` if at least one digit was met,
    /// and `false` otherwise.
    fn eat_float_exponent(&mut self) -> bool {
        debug_assert!(self.prev() == 'e' || self.prev() == 'E');
        if self.first() == '-' || self.first() == '+' {
            self.bump();
        }
        self.eat_decimal_digits()
    }

    // Eats the suffix of the literal, e.g. "_u8".
    fn eat_literal_suffix(&mut self) {
        self.eat_identifier();
    }

    // Eats the identifier.
    fn eat_identifier(&mut self) {
        if !is_id_start(self.first()) {
            return;
        }
        self.bump();

        self.eat_while(is_id_continue);
    }

    /// Eats symbols while predicate returns true or until the end of file is reached.
    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
        while predicate(self.first()) && !self.is_eof() {
            self.bump();
        }
    }
}