sway_parse/
token.rs

1use core::mem;
2use extension_trait::extension_trait;
3use num_bigint::BigUint;
4use std::sync::Arc;
5use sway_ast::literal::{LitChar, LitInt, LitIntType, LitString, Literal};
6use sway_ast::token::{
7    Comment, CommentKind, CommentedGroup, CommentedTokenStream, CommentedTokenTree, DocComment,
8    DocStyle, GenericTokenTree, Punct, Spacing, TokenStream,
9};
10use sway_error::error::CompileError;
11use sway_error::handler::{ErrorEmitted, Handler};
12use sway_error::lex_error::{LexError, LexErrorKind};
13use sway_types::{
14    ast::{Delimiter, PunctKind},
15    Ident, SourceId, Span, Spanned,
16};
17use unicode_bidi::format_chars::{ALM, FSI, LRE, LRI, LRM, LRO, PDF, PDI, RLE, RLI, RLM, RLO};
18use unicode_xid::UnicodeXID;
19
20#[extension_trait]
21impl CharExt for char {
22    /// Converts the character into an opening delimiter, if any.
23    fn as_open_delimiter(self) -> Option<Delimiter> {
24        match self {
25            '(' => Some(Delimiter::Parenthesis),
26            '{' => Some(Delimiter::Brace),
27            '[' => Some(Delimiter::Bracket),
28            _ => None,
29        }
30    }
31
32    /// Converts the character into a closing delimiter, if any.
33    fn as_close_delimiter(self) -> Option<Delimiter> {
34        match self {
35            ')' => Some(Delimiter::Parenthesis),
36            '}' => Some(Delimiter::Brace),
37            ']' => Some(Delimiter::Bracket),
38            _ => None,
39        }
40    }
41
42    /// Determines what sort of punctuation this character is, if any.
43    fn as_punct_kind(self) -> Option<PunctKind> {
44        match self {
45            ';' => Some(PunctKind::Semicolon),
46            ':' => Some(PunctKind::Colon),
47            '/' => Some(PunctKind::ForwardSlash),
48            ',' => Some(PunctKind::Comma),
49            '*' => Some(PunctKind::Star),
50            '+' => Some(PunctKind::Add),
51            '-' => Some(PunctKind::Sub),
52            '<' => Some(PunctKind::LessThan),
53            '>' => Some(PunctKind::GreaterThan),
54            '=' => Some(PunctKind::Equals),
55            '.' => Some(PunctKind::Dot),
56            '!' => Some(PunctKind::Bang),
57            '%' => Some(PunctKind::Percent),
58            '&' => Some(PunctKind::Ampersand),
59            '^' => Some(PunctKind::Caret),
60            '|' => Some(PunctKind::Pipe),
61            '_' => Some(PunctKind::Underscore),
62            '#' => Some(PunctKind::Sharp),
63            _ => None,
64        }
65    }
66}
67
/// A character iterator over `src` that yields each character together with
/// its *byte* offset in the full string, starting from an arbitrary initial
/// `position` (unlike `str::char_indices`, which always starts at 0).
struct CharIndicesInner<'a> {
    /// The source text being scanned.
    src: &'a str,
    /// Byte offset into `src` of the next character to yield.
    position: usize,
}

impl Iterator for CharIndicesInner<'_> {
    type Item = (usize, char);

    fn next(&mut self) -> Option<(usize, char)> {
        // Read the next character at the current byte offset, if any.
        // (The original built a fresh `char_indices` iterator and peeked a
        // second element just to learn the first character's width; the
        // character's own UTF-8 length gives the same answer directly.)
        let c = self.src[self.position..].chars().next()?;
        let item = (self.position, c);
        // Advance by the character's UTF-8 width. When `c` is the final
        // character, this lands exactly on `src.len()`, matching the old
        // behavior of clamping the position to the end of the string.
        self.position += c.len_utf8();
        Some(item)
    }
}
87
/// `CharIndicesInner` with single-item lookahead.
type CharIndices<'a> = std::iter::Peekable<CharIndicesInner<'a>>;
/// Local result alias: lexer errors are reported through the `Handler`, so
/// `Err` only carries the proof (`ErrorEmitted`) that an error was emitted.
type Result<T> = core::result::Result<T, ErrorEmitted>;

/// Shared state threaded through all of the `lex_*` helper functions.
struct Lexer<'l> {
    // Sink for emitted lexical errors.
    handler: &'l Handler,
    // The full source text being lexed (spans index into this).
    src: &'l Arc<str>,
    // Identifies which source file the produced spans belong to, if any.
    source_id: &'l Option<SourceId>,
    // Peekable character stream over (a slice of) `src`.
    stream: &'l mut CharIndices<'l>,
}
97
98pub fn lex(
99    handler: &Handler,
100    src: &Arc<str>,
101    start: usize,
102    end: usize,
103    source_id: Option<SourceId>,
104) -> Result<TokenStream> {
105    lex_commented(handler, src, start, end, &source_id).map(|stream| stream.strip_comments())
106}
107
/// Lexes `src[start..end]` into a token stream that retains comments and doc
/// comments as token trees.
///
/// Delimited groups (`()`, `{}`, `[]`) are nested into `CommentedGroup`s.
/// Lexical errors are emitted through `handler` and recovered from wherever
/// possible, so a `CommentedTokenStream` is usually still produced.
pub fn lex_commented(
    handler: &Handler,
    src: &Arc<str>,
    start: usize,
    end: usize,
    source_id: &Option<SourceId>,
) -> Result<CommentedTokenStream> {
    let stream = &mut CharIndicesInner {
        src: &src[..end],
        position: start,
    }
    .peekable();
    let mut l = Lexer {
        handler,
        src,
        source_id,
        stream,
    };
    // True while we are still collecting `//!` module-level doc comments.
    let mut gather_module_docs = false;
    let mut file_start_offset: usize = 0;

    // Stack of suspended enclosing groups: the token trees collected so far
    // at each outer level, plus the open delimiter's position and kind.
    let mut parent_token_trees = Vec::new();
    // Token trees collected at the current nesting level.
    let mut token_trees = Vec::new();
    while let Some((mut index, mut character)) = l.stream.next() {
        if character.is_whitespace() {
            // if the beginning of a file starts with whitespace
            // we must keep track to ensure that the module level docs
            // will get inserted into the tree correctly
            if index - file_start_offset == 0 {
                file_start_offset += character.len_utf8();
            }
            continue;
        }
        if character == '/' {
            match l.stream.peek() {
                Some((_, '/')) => {
                    // search_end is the index at which we stop looking backwards for
                    // a newline
                    let search_end = token_trees
                        .last()
                        .map(|tt| {
                            if let CommentedTokenTree::Tree(t) = tt {
                                t.span().end()
                            } else {
                                0
                            }
                        })
                        .unwrap_or_default();

                    // The comment is `Newlined` when only whitespace containing
                    // at least one `\n` separates it from the previous token.
                    let has_newline = src[search_end..index]
                        .chars()
                        .rev()
                        .take_while(|c| c.is_whitespace())
                        .filter(|&c| c == '\n')
                        .count()
                        > 0;
                    // We found a comment at the start of file, which should be accounted for as a Newlined comment.
                    let start_of_file_found = search_end == 0 && index == 0;

                    let comment_kind = if has_newline || start_of_file_found {
                        CommentKind::Newlined
                    } else {
                        CommentKind::Trailing
                    };

                    let ctt = lex_line_comment(
                        &mut l,
                        end,
                        index,
                        comment_kind,
                        file_start_offset,
                        gather_module_docs,
                    );
                    // After the first `//!` inner doc, keep gathering
                    // subsequent ones as module-level docs.
                    if let CommentedTokenTree::Tree(GenericTokenTree::DocComment(DocComment {
                        doc_style: DocStyle::Inner,
                        ..
                    })) = &ctt
                    {
                        gather_module_docs = true;
                    }
                    token_trees.push(ctt);
                    continue;
                }
                Some((_, '*')) => {
                    // `/*` opens a (possibly nested) block comment.
                    if let Some(token) = lex_block_comment(&mut l, index) {
                        token_trees.push(token);
                    }
                    continue;
                }
                Some(_) | None => {}
            }
        } else {
            // Any non-comment token ends the module-doc gathering phase.
            gather_module_docs = false;
        }

        if character.is_xid_start() || character == '_' {
            // Raw identifier, e.g., `r#foo`? Then mark as such, stripping the prefix `r#`.
            let is_raw_ident = character == 'r' && matches!(l.stream.peek(), Some((_, '#')));
            if is_raw_ident {
                l.stream.next();
                if let Some((next_index, next_character)) = l.stream.next() {
                    character = next_character;
                    index = next_index;
                }
                if !(character.is_xid_start() || character == '_') {
                    // `r#` not followed by an identifier start is invalid.
                    let kind = LexErrorKind::InvalidCharacter {
                        position: index,
                        character,
                    };
                    let span = span_one(&l, index, character);
                    error(l.handler, LexError { kind, span });
                    continue;
                }
            }

            // Don't accept just `_` as an identifier.
            let not_is_single_underscore = character != '_'
                || l.stream
                    .peek()
                    .is_some_and(|(_, next)| next.is_xid_continue());
            if not_is_single_underscore {
                // Consume until we hit other than `XID_CONTINUE`.
                while l.stream.next_if(|(_, c)| c.is_xid_continue()).is_some() {}
                let ident = Ident::new_with_raw(span_until(&mut l, index), is_raw_ident);
                token_trees.push(CommentedTokenTree::Tree(ident.into()));
                continue;
            }
        }
        if let Some(delimiter) = character.as_open_delimiter() {
            // Suspend the current level; a fresh `token_trees` collects the
            // group's contents until the matching close delimiter.
            let token_trees = mem::take(&mut token_trees);
            parent_token_trees.push((token_trees, index, delimiter));
            continue;
        }
        if let Some(close_delimiter) = character.as_close_delimiter() {
            match parent_token_trees.pop() {
                None => {
                    // Recover by ignoring the unexpected closing delim,
                    // giving the parser opportunities to realize the need for an opening delim
                    // in e.g., this example:
                    //
                    // fn foo() // <-- Parser expects grouped tokens in `{ ... }` here.
                    //     let x = 0;
                    // } // <- This recovery.
                    let kind = LexErrorKind::UnexpectedCloseDelimiter {
                        position: index,
                        close_delimiter,
                    };
                    let span = span_one(&l, index, character);
                    error(l.handler, LexError { kind, span });
                }
                Some((parent, open_index, open_delimiter)) => {
                    if open_delimiter != close_delimiter {
                        // Recover on e.g., a `{ )` mismatch by having `)` interpreted as `}`.
                        let kind = LexErrorKind::MismatchedDelimiters {
                            open_position: open_index,
                            close_position: index,
                            open_delimiter,
                            close_delimiter,
                        };
                        let span = span_one(&l, index, character);
                        error(l.handler, LexError { kind, span });
                    }
                    token_trees = lex_close_delimiter(
                        &mut l,
                        index,
                        parent,
                        token_trees,
                        open_index,
                        open_delimiter,
                    );
                }
            }
            continue;
        }
        // Literals and punctuation, tried in order; each helper returns
        // `None` when `character` does not start its kind of token.
        if let Some(token) = lex_string(&mut l, index, character)? {
            token_trees.push(token);
            continue;
        }
        if let Some(token) = lex_char(&mut l, index, character)? {
            token_trees.push(token);
            continue;
        }
        if let Some(token) = lex_int_lit(&mut l, index, character)? {
            token_trees.push(token);
            continue;
        }
        if let Some(token) = lex_punctuation(&mut l, index, character) {
            token_trees.push(token);
            continue;
        }

        // Recover by simply ignoring the character.
        // NOTE(Centril): I'm not sure how good of an idea this is... time will tell.
        let kind = LexErrorKind::InvalidCharacter {
            position: index,
            character,
        };
        let span = span_one(&l, index, character);
        error(l.handler, LexError { kind, span });
        continue;
    }

    // Recover all unclosed delimiters.
    while let Some((parent, open_index, open_delimiter)) = parent_token_trees.pop() {
        let kind = LexErrorKind::UnclosedDelimiter {
            open_position: open_index,
            open_delimiter,
        };
        let span = span_one(&l, open_index, open_delimiter.as_open_char());
        error(l.handler, LexError { kind, span });

        // Close the group as if the delimiter appeared at end of source.
        token_trees = lex_close_delimiter(
            &mut l,
            src.len(),
            parent,
            token_trees,
            open_index,
            open_delimiter,
        );
    }
    Ok(CommentedTokenStream {
        token_trees,
        full_span: span(&l, start, end),
    })
}
333
334fn lex_close_delimiter(
335    l: &mut Lexer<'_>,
336    index: usize,
337    mut parent: Vec<CommentedTokenTree>,
338    token_trees: Vec<CommentedTokenTree>,
339    open_index: usize,
340    delimiter: Delimiter,
341) -> Vec<CommentedTokenTree> {
342    let start_index = open_index + delimiter.as_open_char().len_utf8();
343    let full_span = span(l, start_index, index);
344    let group = CommentedGroup {
345        token_stream: CommentedTokenStream {
346            token_trees,
347            full_span,
348        },
349        delimiter,
350        span: span_until(l, open_index),
351    };
352    parent.push(CommentedTokenTree::Tree(group.into()));
353    parent
354}
355
/// Lexes a `//` line comment starting at `index`, deciding whether it is a
/// plain comment or a doc comment (`///` outer, `//!` inner).
///
/// `end` bounds the comment when no terminating newline exists; `offset` is
/// the byte length of leading file whitespace, used to detect a comment at
/// the very start of the file; `gather_module_docs` keeps `//!` comments
/// counting as inner docs after the first one has been seen.
fn lex_line_comment(
    l: &mut Lexer<'_>,
    end: usize,
    index: usize,
    comment_kind: CommentKind,
    offset: usize,
    gather_module_docs: bool,
) -> CommentedTokenTree {
    // Consume the second `/` that the caller only peeked at.
    let _ = l.stream.next();

    // Find end; either at EOF or at `\n`.
    let end = l
        .stream
        .find(|(_, character)| *character == '\n')
        .map_or(end, |(end, _)| end);
    let sp = span(l, index, end);

    // Inspect the characters after the leading `//` to classify the comment.
    let doc_style = match (sp.as_str().chars().nth(2), sp.as_str().chars().nth(3)) {
        // `//!` is an inner line doc comment.
        (Some('!'), _) => {
            if index - offset == 0 || gather_module_docs {
                // TODO(#4112): remove this conditional block to enable
                // inner doc comment attributes for all items
                Some(DocStyle::Inner)
            } else {
                None
            }
        }
        // `////` (more than 3 slashes) is not considered a doc comment.
        (Some('/'), Some('/')) => None,
        // `///` is an outer line doc comment.
        (Some('/'), _) => Some(DocStyle::Outer),
        _ => None,
    };

    if let Some(doc_style) = doc_style {
        let doc_comment = DocComment {
            span: sp,
            doc_style,
            // Content starts after the 3-byte `///` / `//!` prefix.
            content_span: span(l, index + 3, end),
        };
        CommentedTokenTree::Tree(doc_comment.into())
    } else {
        Comment {
            span: sp,
            comment_kind,
        }
        .into()
    }
}
406
/// Lexes a `/* ... */` block comment starting at `index` (the first `/`).
///
/// Nested block comments are supported and collapsed into a single comment
/// token. Returns `None` (after emitting `UnclosedMultilineComment`) when
/// the comment is not closed before end of input.
fn lex_block_comment(l: &mut Lexer<'_>, index: usize) -> Option<CommentedTokenTree> {
    // Lexing a multi-line comment.
    let _ = l.stream.next();
    // Start positions of all currently open `/*`s, innermost last.
    let mut unclosed_indices = vec![index];

    let unclosed_multiline_comment = |l: &Lexer<'_>, unclosed_indices: Vec<_>| {
        let span = span(l, *unclosed_indices.last().unwrap(), l.src.len() - 1);
        let kind = LexErrorKind::UnclosedMultilineComment { unclosed_indices };
        error(l.handler, LexError { kind, span });
        None
    };

    // We first start by assuming that block comments are inlined.
    let mut comment_kind = CommentKind::Inlined;

    loop {
        match l.stream.next() {
            None => return unclosed_multiline_comment(l, unclosed_indices),
            Some((_, '*')) => match l.stream.next() {
                None => return unclosed_multiline_comment(l, unclosed_indices),
                // Matched `*/`, so we're closing some multi-line comment. It could be nested.
                Some((slash_ix, '/')) => {
                    let start = unclosed_indices.pop().unwrap();
                    if unclosed_indices.is_empty() {
                        // For the purposes of lexing,
                        // nested multi-line comments constitute a single multi-line comment.
                        // We could represent them as several ones, but that's unnecessary.
                        let end = slash_ix + '/'.len_utf8();
                        let span = span(l, start, end);
                        return Some(Comment { span, comment_kind }.into());
                    }
                }
                Some(_) => {}
            },
            // Found nested multi-line comment.
            Some((next_index, '/')) => match l.stream.next() {
                None => return unclosed_multiline_comment(l, unclosed_indices),
                Some((_, '*')) => unclosed_indices.push(next_index),
                Some(_) => {}
            },
            Some((_, '\n')) => {
                // If we find a newline character while lexing, this means that the block comment is multiline.
                // Example:
                // /* this is a
                //    multilined block comment */
                comment_kind = CommentKind::Multilined;
            }
            Some(_) => {}
        }
    }
}
458
/// Lexes a `"..."` string literal when `character` is the opening `"`.
///
/// Returns `Ok(None)` when `character` does not start a string. Escape
/// sequences are decoded via `parse_escape_code`, and Unicode text-direction
/// control codepoints are rejected (defends against "trojan source"-style
/// bidirectional-override tricks inside literals).
fn lex_string(
    l: &mut Lexer<'_>,
    index: usize,
    character: char,
) -> Result<Option<CommentedTokenTree>> {
    if character != '"' {
        return Ok(None);
    }
    let mut parsed = String::new();
    loop {
        // Emits `UnclosedStringLiteral` ending at `end`.
        let unclosed_string_lit = |l: &Lexer<'_>, end| {
            error(
                l.handler,
                LexError {
                    kind: LexErrorKind::UnclosedStringLiteral { position: index },
                    span: span(l, index, end),
                },
            )
        };
        let (next_index, next_character) = l.stream.next().ok_or_else(|| {
            // last character may not be a unicode boundary
            let mut end = l.src.len() - 1;
            while !l.src.is_char_boundary(end) {
                end -= 1;
            }
            unclosed_string_lit(l, end)
        })?;
        parsed.push(match next_character {
            // `Err(None)` from the escape parser means EOF mid-escape;
            // report that as an unclosed string literal.
            '\\' => parse_escape_code(l)
                .map_err(|e| e.unwrap_or_else(|| unclosed_string_lit(l, l.src.len())))?,
            '"' => break,
            // do not allow text direction codepoints
            ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO => {
                let kind = LexErrorKind::UnicodeTextDirInLiteral {
                    position: next_index,
                    character: next_character,
                };
                let span = span_one(l, next_index, next_character);
                error(l.handler, LexError { span, kind });
                // Recover by skipping the offending codepoint.
                continue;
            }
            _ => next_character,
        });
    }
    let span = span_until(l, index);
    let literal = Literal::String(LitString { span, parsed });
    Ok(Some(CommentedTokenTree::Tree(literal.into())))
}
507
/// Lexes a `'c'` character literal when `character` is the opening `'`.
///
/// Returns `Ok(None)` when `character` does not start a char literal. When
/// more than one character appears before the closing quote (e.g., `'ab'`),
/// recovers by lexing the contents as a string literal instead, after
/// emitting an `ExpectedCloseQuote` error.
fn lex_char(
    l: &mut Lexer<'_>,
    index: usize,
    character: char,
) -> Result<Option<CommentedTokenTree>> {
    let is_quote = |c| c == '\'';
    if !is_quote(character) {
        return Ok(None);
    }

    // Emits `UnclosedCharLiteral` spanning to the end of the source.
    let unclosed_char_lit = |l: &Lexer<'_>| {
        let err = LexError {
            kind: LexErrorKind::UnclosedCharLiteral { position: index },
            span: span(l, index, l.src.len()),
        };
        error(l.handler, err)
    };
    // Pull the next char, or report the literal as unclosed at EOF.
    let next = |l: &mut Lexer<'_>| l.stream.next().ok_or_else(|| unclosed_char_lit(l));
    // Decode a `\`-escape; any other character stands for itself.
    let escape = |l: &mut Lexer<'_>, next_char| {
        if next_char == '\\' {
            parse_escape_code(l).map_err(|e| e.unwrap_or_else(|| unclosed_char_lit(l)))
        } else {
            Ok(next_char)
        }
    };

    let (next_index, next_char) = next(l)?;
    // do not allow text direction codepoints
    if let ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO = next_char {
        let kind = LexErrorKind::UnicodeTextDirInLiteral {
            position: next_index,
            character: next_char,
        };
        let span = span_one(l, next_index, next_char);
        error(l.handler, LexError { span, kind });
    }

    let parsed = escape(l, next_char)?;

    // Consume the closing `'`.
    let (next_index, next_char) = next(l)?;
    let sp = span_until(l, index);

    // Not a closing quote? Then this is e.g., 'ab'.
    // Most likely the user meant a string literal, so recover as that instead.
    let literal = if !is_quote(next_char) {
        let mut string = String::new();
        string.push(parsed);
        string.push(escape(l, next_char)?);
        // Collect the rest of the would-be string until a closing quote.
        loop {
            let (_, next_char) = next(l)?;
            if is_quote(next_char) {
                break;
            }
            string.push(next_char);
        }

        // Emit the expected closing quote error.
        error(
            l.handler,
            LexError {
                kind: LexErrorKind::ExpectedCloseQuote {
                    position: next_index,
                },
                span: span(l, next_index, next_index + string.len()),
            },
        );

        Literal::String(LitString {
            span: sp,
            parsed: string,
        })
    } else {
        Literal::Char(LitChar { span: sp, parsed })
    };

    Ok(Some(CommentedTokenTree::Tree(literal.into())))
}
586
/// Parses the escape sequence following a `\` in a string or char literal,
/// returning the character it denotes.
///
/// Error protocol:
/// - `Err(None)`: input ended mid-escape; the *caller* decides how to
///   report the unclosed literal.
/// - `Err(Some(_))`: the escape was invalid; the error is already emitted.
fn parse_escape_code(l: &mut Lexer<'_>) -> core::result::Result<char, Option<ErrorEmitted>> {
    // Shadow `error` so each failing arm both emits and returns `Err(Some(..))`.
    let error = |kind, span| Err(Some(error(l.handler, LexError { kind, span })));

    match l.stream.next() {
        None => Err(None),
        Some((_, '"')) => Ok('"'),
        Some((_, '\'')) => Ok('\''),
        Some((_, 'n')) => Ok('\n'),
        Some((_, 'r')) => Ok('\r'),
        Some((_, 't')) => Ok('\t'),
        Some((_, '\\')) => Ok('\\'),
        Some((_, '0')) => Ok('\0'),
        // `\xHH`: exactly two hex digits.
        Some((index, 'x')) => {
            let (high, low) = match (l.stream.next(), l.stream.next()) {
                (Some((_, high)), Some((_, low))) => (high, low),
                _ => return Err(None),
            };
            let (high, low) = match (high.to_digit(16), low.to_digit(16)) {
                (Some(high), Some(low)) => (high, low),
                _ => return error(LexErrorKind::InvalidHexEscape, span_until(l, index)),
            };
            // Max value is 0xFF, which is always a valid `char`.
            let parsed_character = char::from_u32((high << 4) | low).unwrap();
            Ok(parsed_character)
        }
        // `\u{...}`: any number of hex digits between braces.
        Some((index, 'u')) => {
            match l.stream.next() {
                None => return Err(None),
                Some((_, '{')) => (),
                Some((_, unexpected_char)) => {
                    let span = span_one(l, index, unexpected_char);
                    let kind = LexErrorKind::UnicodeEscapeMissingBrace { position: index };
                    return error(kind, span);
                }
            }
            let mut digits_start_position_opt = None;
            // Accumulate in a `BigUint` so arbitrarily long escapes can be
            // reported as out-of-range instead of silently wrapping.
            let mut char_value = BigUint::from(0u32);
            let digits_end_position = loop {
                let (position, digit) = match l.stream.next() {
                    None => return Err(None),
                    Some((position, '}')) => break position,
                    Some((position, digit)) => (position, digit),
                };
                if digits_start_position_opt.is_none() {
                    digits_start_position_opt = Some(position);
                };
                let digit = match digit.to_digit(16) {
                    None => {
                        let span = span_one(l, position, digit);
                        let kind = LexErrorKind::InvalidUnicodeEscapeDigit { position };
                        return error(kind, span);
                    }
                    Some(digit) => digit,
                };
                char_value *= 16u32;
                char_value += digit;
            };
            // `\u{}` (no digits): fall back to the `}` position for the span.
            let digits_start_position = digits_start_position_opt.unwrap_or(digits_end_position);
            let char_value = match u32::try_from(char_value) {
                Err(..) => {
                    let span = span(l, digits_start_position, digits_end_position);
                    let kind = LexErrorKind::UnicodeEscapeOutOfRange { position: index };
                    return error(kind, span);
                }
                Ok(char_value) => char_value,
            };
            let parsed_character = match char::from_u32(char_value) {
                // Fits in `u32` but is not a valid scalar value (e.g., a surrogate).
                None => {
                    let span_all = span_until(l, index);
                    let kind = LexErrorKind::UnicodeEscapeInvalidCharValue { span: span_all };
                    let span = span(l, digits_start_position, digits_end_position);
                    return error(kind, span);
                }
                Some(parsed_character) => parsed_character,
            };
            Ok(parsed_character)
        }
        Some((index, unexpected_char)) => error(
            LexErrorKind::InvalidEscapeCode { position: index },
            span_one(l, index, unexpected_char),
        ),
    }
}
669
/// Lexes an integer literal when `character` is a decimal digit.
///
/// Handles `0x`/`0o`/`0b` radix prefixes, `_` digit separators, and an
/// optional type suffix (e.g., `u64`). Returns `Ok(None)` when `character`
/// does not start an integer literal.
fn lex_int_lit(
    l: &mut Lexer<'_>,
    index: usize,
    character: char,
) -> Result<Option<CommentedTokenTree>> {
    let digit = match character.to_digit(10) {
        None => return Ok(None),
        Some(d) => d,
    };

    // Continue a plain base-10 literal starting with `digit`.
    let decimal_int_lit = |l, digit: u32| {
        let mut big_uint = BigUint::from(digit);
        let end_opt = parse_digits(&mut big_uint, l, 10);
        (big_uint, end_opt)
    };
    // A leading `0` may introduce a radix prefix (`0x`, `0o`, `0b`).
    let (radix, (big_uint, end_opt)) = if digit == 0 {
        let prefixed_int_lit = |l: &mut Lexer<'_>, radix| {
            // Consume the prefix letter, then require at least one digit.
            let _ = l.stream.next();
            let d = l.stream.next();
            let incomplete_int_lit = |end| {
                let kind = match radix {
                    16 => LexErrorKind::IncompleteHexIntLiteral { position: index },
                    8 => LexErrorKind::IncompleteOctalIntLiteral { position: index },
                    2 => LexErrorKind::IncompleteBinaryIntLiteral { position: index },
                    _ => unreachable!(),
                };
                let span = span(l, index, end);
                error(l.handler, LexError { kind, span })
            };
            let (digit_pos, digit) = d.ok_or_else(|| incomplete_int_lit(l.src.len()))?;
            let radix_digit = digit
                .to_digit(radix)
                .ok_or_else(|| incomplete_int_lit(digit_pos))?;
            let mut big_uint = BigUint::from(radix_digit);
            let end_opt = parse_digits(&mut big_uint, l, radix);
            Ok((big_uint, end_opt))
        };

        match l.stream.peek() {
            Some((_, 'x')) => (16, prefixed_int_lit(l, 16)?),
            Some((_, 'o')) => (8, prefixed_int_lit(l, 8)?),
            Some((_, 'b')) => (2, prefixed_int_lit(l, 2)?),
            Some((_, '_' | '0'..='9')) => (10, decimal_int_lit(l, 0)),
            // A bare `0` followed by something else entirely.
            Some(&(next_index, _)) => (10, (BigUint::from(0u32), Some(next_index))),
            None => (10, (BigUint::from(0u32), None)),
        }
    } else {
        (10, decimal_int_lit(l, digit))
    };

    let ty_opt = lex_int_ty_opt(l)?;

    // Only accepts u256 literals in hex form
    if let Some((LitIntType::U256, span)) = &ty_opt {
        if radix != 16 {
            return Err(error(
                l.handler,
                LexError {
                    kind: LexErrorKind::U256NotInHex,
                    span: span.clone(),
                },
            ));
        }
    }

    let literal = Literal::Int(LitInt {
        // `end_opt` is `None` when the literal runs to end of input.
        span: span(l, index, end_opt.unwrap_or(l.src.len())),
        parsed: big_uint,
        ty_opt,
        is_generated_b256: false,
    });

    Ok(Some(CommentedTokenTree::Tree(literal.into())))
}
744
/// Lexes an optional integer type suffix (e.g., `u8`, `i64`) immediately
/// following an integer literal.
///
/// Returns `Ok(None)` when there is no suffix. Unknown suffixes emit an
/// `InvalidIntSuffix` error and are discarded for recovery.
fn lex_int_ty_opt(l: &mut Lexer<'_>) -> Result<Option<(LitIntType, Span)>> {
    // A suffix begins with any identifier-continue character.
    let (suffix_start_position, c) = match l.stream.next_if(|(_, c)| c.is_xid_continue()) {
        None => return Ok(None),
        Some(x) => x,
    };
    let mut suffix = String::from(c);
    // Consume the rest of the suffix, recording where it ends.
    let suffix_end_position = loop {
        match l.stream.peek() {
            Some((_, c)) if c.is_xid_continue() => {
                suffix.push(*c);
                let _ = l.stream.next();
            }
            Some((pos, _)) => break *pos,
            None => break l.src.len(),
        }
    };
    // Parse the suffix to a known one, or if unknown, recover by throwing it away.
    let ty = match parse_int_suffix(&suffix) {
        Some(s) => s,
        None => {
            let span = span(l, suffix_start_position, suffix_end_position);
            let kind = LexErrorKind::InvalidIntSuffix {
                suffix: Ident::new(span.clone()),
            };
            error(l.handler, LexError { kind, span });
            return Ok(None);
        }
    };
    let span = span_until(l, suffix_start_position);
    Ok(Some((ty, span)))
}
776
777/// Interpret the given `suffix` string as a `LitIntType`.
778pub fn parse_int_suffix(suffix: &str) -> Option<LitIntType> {
779    Some(match suffix {
780        "u8" => LitIntType::U8,
781        "u16" => LitIntType::U16,
782        "u32" => LitIntType::U32,
783        "u64" => LitIntType::U64,
784        "u256" => LitIntType::U256,
785        "i8" => LitIntType::I8,
786        "i16" => LitIntType::I16,
787        "i32" => LitIntType::I32,
788        "i64" => LitIntType::I64,
789        _ => return None,
790    })
791}
792
793fn parse_digits(big_uint: &mut BigUint, l: &mut Lexer<'_>, radix: u32) -> Option<usize> {
794    loop {
795        match l.stream.peek() {
796            None => break None,
797            Some((_, '_')) => {
798                let _ = l.stream.next();
799            }
800            Some(&(index, character)) => match character.to_digit(radix) {
801                None => break Some(index),
802                Some(digit) => {
803                    let _ = l.stream.next();
804                    *big_uint *= radix;
805                    *big_uint += digit;
806                }
807            },
808        };
809    }
810}
811
812fn lex_punctuation(l: &mut Lexer<'_>, index: usize, character: char) -> Option<CommentedTokenTree> {
813    let punct = Punct {
814        kind: character.as_punct_kind()?,
815        spacing: match l.stream.peek() {
816            Some((_, next_character)) if next_character.as_punct_kind().is_some() => Spacing::Joint,
817            _ => Spacing::Alone,
818        },
819        span: span_until(l, index),
820    };
821    Some(CommentedTokenTree::Tree(punct.into()))
822}
823
824fn span_until(l: &mut Lexer<'_>, start: usize) -> Span {
825    let end = l.stream.peek().map_or(l.src.len(), |(end, _)| *end);
826    span(l, start, end)
827}
828
829fn span_one(l: &Lexer<'_>, start: usize, c: char) -> Span {
830    span(l, start, start + c.len_utf8())
831}
832
833fn span(l: &Lexer<'_>, start: usize, end: usize) -> Span {
834    Span::new(l.src.clone(), start, end, *l.source_id).unwrap()
835}
836
837/// Emit a lexer error.
838fn error(handler: &Handler, error: LexError) -> ErrorEmitted {
839    handler.emit_err(CompileError::Lex { error })
840}
841
#[cfg(test)]
mod tests {
    use super::*;
    use assert_matches::assert_matches;
    use std::sync::Arc;
    use sway_ast::{
        literal::{LitChar, Literal},
        token::{
            Comment, CommentKind, CommentedTokenTree, CommentedTree, DocComment, DocStyle,
            TokenTree,
        },
    };
    use sway_error::{
        error::CompileError,
        handler::Handler,
        lex_error::{LexError, LexErrorKind},
    };

    // Unicode bidirectional control characters hidden inside string/char
    // literals can visually reorder source text ("Trojan Source" spoofing),
    // so the lexer must reject every occurrence.
    #[test]
    fn lex_bidi() {
        let input = "
            script;
            use std::string::String;
            fn main() {
                let a = String::from_ascii_str(\"fuel\");
                let b = String::from_ascii_str(\"fuel\u{202E}\u{2066}// Same string again\u{2069}\u{2066}\");
                if a.as_bytes() == b.as_bytes() {
                    log(\"same\");
                } else {
                    log(\"different\");
                }
                let lrm = '\u{202E}';
                log(lrm);
            }
        ";
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let _stream = lex_commented(&handler, &Arc::from(input), start, end, &path).unwrap();
        let (errors, warnings) = handler.consume();
        assert_eq!(warnings.len(), 0);
        // 5 bidi control characters in the input: four escapes in the string
        // literal for `b` plus one in the char literal for `lrm`.
        assert_eq!(errors.len(), 5);
        for err in errors {
            // Each error must specifically be `UnicodeTextDirInLiteral`.
            assert_matches!(
                err,
                CompileError::Lex {
                    error: LexError {
                        span: _,
                        kind: LexErrorKind::UnicodeTextDirInLiteral {
                            position: _,
                            character: _
                        }
                    }
                }
            );
        }
    }

    // Verifies that single-line, multi-line, and trailing comments are all
    // preserved in the commented token stream with their exact source spans,
    // and that brace-delimited contents end up nested inside a group.
    #[test]
    fn lex_commented_token_stream() {
        let input = r#"
        //
        // Single-line comment.
        struct Foo {
            /* multi-
             * line-
             * comment */
            bar: i32, // trailing comment
        }
        "#;
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let stream = lex_commented(&handler, &Arc::from(input), start, end, &path).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();
        // Comments appear at the top level, before the struct tokens.
        assert_eq!(tts.next().unwrap().span().as_str(), "//");
        assert_eq!(
            tts.next().unwrap().span().as_str(),
            "// Single-line comment."
        );
        assert_eq!(tts.next().unwrap().span().as_str(), "struct");
        assert_eq!(tts.next().unwrap().span().as_str(), "Foo");
        {
            // The `{ ... }` body is lexed as a nested group.
            let group = match tts.next() {
                Some(CommentedTokenTree::Tree(CommentedTree::Group(group))) => group,
                _ => panic!("expected group"),
            };
            let mut tts = group.token_stream.token_trees().iter();
            // Block comment span includes interior newlines and indentation.
            assert_eq!(
                tts.next().unwrap().span().as_str(),
                "/* multi-\n             * line-\n             * comment */",
            );
            assert_eq!(tts.next().unwrap().span().as_str(), "bar");
            assert_eq!(tts.next().unwrap().span().as_str(), ":");
            assert_eq!(tts.next().unwrap().span().as_str(), "i32");
            assert_eq!(tts.next().unwrap().span().as_str(), ",");
            // A comment on the same line as preceding tokens is `Trailing`.
            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Trailing,
                })) if span.as_str() ==  "// trailing comment"
            );
            assert!(tts.next().is_none());
        }
        assert!(tts.next().is_none());
    }

    // Verifies the Newlined/Trailing classification: a comment on its own
    // line is `Newlined`, one following code on the same line is `Trailing`.
    #[test]
    fn lex_comments_check_comment_kind() {
        let input = r#"
        // CommentKind::Newlined
        abi Foo {
            // CommentKind::Newlined
            fn bar(); // CommentKind::Trailing
            // CommentKind::Newlined
        }
        "#;
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let stream = lex_commented(&handler, &Arc::from(input), start, end, &path).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();

        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "// CommentKind::Newlined"
        );
        assert_eq!(tts.next().unwrap().span().as_str(), "abi");
        assert_eq!(tts.next().unwrap().span().as_str(), "Foo");

        {
            // Inspect the comments nested inside the `{ ... }` group.
            let group = match tts.next() {
                Some(CommentedTokenTree::Tree(CommentedTree::Group(group))) => group,
                _ => panic!("expected group"),
            };
            let mut tts = group.token_stream.token_trees().iter();

            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Newlined,
                })) if span.as_str() ==  "// CommentKind::Newlined"
            );
            assert_eq!(tts.next().unwrap().span().as_str(), "fn");
            assert_eq!(tts.next().unwrap().span().as_str(), "bar");
            assert_eq!(tts.next().unwrap().span().as_str(), "()");
            assert_eq!(tts.next().unwrap().span().as_str(), ";");
            // Same line as `fn bar();` -> Trailing.
            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Trailing,
                })) if span.as_str() ==  "// CommentKind::Trailing"
            );
            assert_matches!(
                tts.next(),
                Some(CommentedTokenTree::Comment(Comment {
                    span,
                    comment_kind: CommentKind::Newlined,
                })) if span.as_str() ==  "// CommentKind::Newlined"
            );
            assert!(tts.next().is_none());
        }
    }

    // Verifies which comment shapes are doc comments: `///` is an outer doc
    // comment (with its content span excluding the slashes), while `//`,
    // `////`, and `//!` are all lexed as ordinary comments here.
    #[test]
    fn lex_doc_comments() {
        let input = r#"
        //none
        ////none
        //!inner
        //! inner
        ///outer
        /// outer
        "#;
        let start = 0;
        let end = input.len();
        let path = None;
        let handler = Handler::default();
        let stream = lex_commented(&handler, &Arc::from(input), start, end, &path).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "//none"
        );
        // Four or more slashes is not a doc comment.
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "////none"
        );
        // `//!` (inner doc style) is treated as a plain comment as well.
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "//!inner"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Comment(Comment {
                span,
                comment_kind: CommentKind::Newlined,
            })) if span.as_str() ==  "//! inner"
        );
        // `///` yields a DocComment whose content span drops the slashes
        // but keeps any leading space.
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Tree(CommentedTree::DocComment(DocComment {
                doc_style: DocStyle::Outer,
                span,
                content_span
            }))) if span.as_str() ==  "///outer" && content_span.as_str() == "outer"
        );
        assert_matches!(
            tts.next(),
            Some(CommentedTokenTree::Tree(CommentedTree::DocComment(DocComment {
                doc_style: DocStyle::Outer,
                span,
                content_span
            }))) if span.as_str() ==  "/// outer" && content_span.as_str() == " outer"
        );
        assert_eq!(tts.next(), None);
    }

    // A backslash-escaped quote inside a char literal must lex to the
    // literal `'` character rather than terminating the literal early.
    #[test]
    fn lex_char_escaped_quote() {
        let input = r"
        '\''
        ";
        let handler = Handler::default();
        let stream = lex(&handler, &Arc::from(input), 0, input.len(), None).unwrap();
        assert!(handler.consume().0.is_empty());
        let mut tts = stream.token_trees().iter();
        assert_matches!(
            tts.next(),
            Some(TokenTree::Literal(Literal::Char(LitChar {
                parsed: '\'',
                ..
            })))
        );
        assert_eq!(tts.next(), None);
    }
}