quil_rs/parser/lexer/mod.rs

// Copyright 2021 Rigetti Computing
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

mod error;
mod quoted_strings;
mod wrapped_parsers;

use std::str::FromStr;

use nom::{
    bytes::complete::{is_a, take_till, take_while, take_while1},
    character::complete::{digit1, one_of},
    combinator::{all_consuming, map, recognize, value},
    multi::many0,
    number::complete::double,
    sequence::{pair, preceded, terminated, tuple},
    Finish, IResult,
};
use nom_locate::LocatedSpan;
use wrapped_parsers::{alt, tag};

pub use super::token::{KeywordToken, Token, TokenWithLocation};
use crate::parser::lexer::wrapped_parsers::expecting;
use crate::parser::token::token_with_location;
pub(crate) use error::InternalLexError;
pub use error::{LexError, LexErrorKind};

#[derive(Debug, Copy, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "SCREAMING-KEBAB-CASE")]
pub enum Command {
    Add,
    And,
    Call,
    Capture,
    Convert,
    Declare,
    #[strum(to_string = "DEFCAL")]
    DefCal,
    #[strum(to_string = "DEFCIRCUIT")]
    DefCircuit,
    #[strum(to_string = "DEFFRAME")]
    DefFrame,
    #[strum(to_string = "DEFGATE")]
    DefGate,
    #[strum(to_string = "DEFWAVEFORM")]
    DefWaveform,
    Delay,
    Div,
    Eq,
    Exchange,
    Fence,
    GE,
    GT,
    Halt,
    Include,
    Ior,
    Jump,
    JumpUnless,
    JumpWhen,
    Label,
    LE,
    Load,
    LT,
    Measure,
    Move,
    Mul,
    Neg,
    Nop,
    Not,
    Pragma,
    Pulse,
    RawCapture,
    Reset,
    SetFrequency,
    SetPhase,
    SetScale,
    ShiftFrequency,
    ShiftPhase,
    SwapPhases,
    Store,
    Sub,
    Wait,
    Xor,
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum DataType {
    Bit,
    Octet,
    Real,
    Integer,
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display, strum::EnumString)]
#[strum(serialize_all = "UPPERCASE")]
pub enum Modifier {
    Controlled,
    Dagger,
    Forked, // Not in the Quil grammar
}

#[derive(Debug, Clone, PartialEq, Eq, strum::Display)]
pub enum Operator {
    #[strum(serialize = "^")]
    Caret,
    #[strum(serialize = "-")]
    Minus,
    #[strum(serialize = "+")]
    Plus,
    #[strum(serialize = "/")]
    Slash,
    #[strum(serialize = "*")]
    Star,
}

pub type LexInput<'a> = LocatedSpan<&'a str>;
pub(crate) type InternalLexResult<'a, T = Token, E = InternalLexError<'a>> =
    IResult<LexInput<'a>, T, E>;
pub type LexResult<'a, T = Token, E = LexError> = IResult<LexInput<'a>, T, E>;
/// Completely lex a string, returning the tokens within. Returns an error if the string cannot be completely lexed.
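///
/// A minimal usage sketch (illustrative only, not a compiled doctest):
///
/// ```ignore
/// let tokens = lex(LexInput::new("X 0"))?;
/// assert_eq!(tokens, vec![Token::Identifier("X".to_string()), Token::Integer(0)]);
/// ```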
pub(crate) fn lex(input: LexInput) -> Result<Vec<TokenWithLocation>, LexError> {
    all_consuming(_lex)(input)
        .finish()
        .map(|(_, tokens)| tokens)
        .map_err(LexError::from)
}

// Lex any number of tokens, each of which is either an indentation token or a
// token optionally preceded by spaces; trailing whitespace is discarded.
fn _lex(input: LexInput) -> InternalLexResult<Vec<TokenWithLocation>> {
    terminated(
        many0(alt(
            "indentation or a token preceded by whitespace",
            (lex_indent, preceded(many0(tag(" ")), lex_token)),
        )),
        many0(one_of("\n\t ")),
    )(input)
}

/// The Quil spec defines an indent as exactly 4 spaces. However, the lexer recognizes tabs as well
/// to allow for more flexible formatting.
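///
/// For example, each of the following inputs begins with a single
/// `Token::Indentation` (an illustrative sketch, not a compiled doctest):
///
/// ```ignore
/// lex(LexInput::new("    1,0"))?; // four spaces
/// lex(LexInput::new("\t1,0"))?; // a tab
/// ```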
fn lex_indent(input: LexInput) -> InternalLexResult<TokenWithLocation> {
    alt(
        "indentation",
        (
            token_with_location(value(Token::Indentation, tag("    "))),
            token_with_location(value(Token::Indentation, tag("\t"))),
        ),
    )(input)
}

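/// Lex a single token, trying each token parser in a deliberate order (see
/// the inline comments below). For example, "-NaN" lexes as a minus operator
/// followed by the identifier "NaN" rather than as a float.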
fn lex_token(input: LexInput) -> InternalLexResult<TokenWithLocation> {
    alt(
        "a token",
        (
            token_with_location(lex_comment),
            token_with_location(lex_punctuation),
            token_with_location(lex_target),
            token_with_location(lex_string),
            // Operator must come before number (or it may be parsed as a prefix)
            token_with_location(lex_operator),
            token_with_location(lex_variable),
            // Identifiers must come before numbers so that `NaN`, `Inf`, and `Infinity` aren't
            // parsed as floats; Nom, as of version 7.1.1, will parse those strings,
            // case-insensitively, as floats
            token_with_location(lex_keyword_or_identifier),
            token_with_location(lex_number),
        ),
    )(input)
}

fn lex_comment(input: LexInput) -> InternalLexResult {
    let (input, _) = tag("#")(input)?;
    let (input, content) = take_till(|c| c == '\n')(input)?;
    Ok((input, Token::Comment(content.to_string())))
}

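/// Resolve an identifier string to a keyword token where possible, trying
/// `KeywordToken`, then `Token::Command`, `Token::DataType`, and
/// `Token::Modifier` in that order, and falling back to `Token::Identifier`.
/// Matching is case-sensitive: "LOAD" becomes `Token::Command(Command::Load)`,
/// while "load" remains a plain identifier.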
fn keyword_or_identifier(identifier: String) -> Token {
    fn parse<T: FromStr>(token: impl Fn(T) -> Token, identifier: &str) -> Result<Token, T::Err> {
        T::from_str(identifier).map(token)
    }

    parse(KeywordToken::into, &identifier)
        .or_else(|_| parse(Token::Command, &identifier))
        .or_else(|_| parse(Token::DataType, &identifier))
        .or_else(|_| parse(Token::Modifier, &identifier))
        .unwrap_or(Token::Identifier(identifier))
}

fn is_valid_identifier_leading_character(chr: char) -> bool {
    chr.is_ascii_alphabetic() || chr == '_'
}

fn is_valid_identifier_end_character(chr: char) -> bool {
    is_valid_identifier_leading_character(chr) || chr.is_ascii_digit()
}

fn is_dash(chr: char) -> bool {
    chr == '-'
}

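/// Lex an identifier: one or more leading ASCII letters or underscores,
/// followed by any number of letters, digits, or underscores, optionally
/// continued by dash-separated segments of the same characters. A trailing
/// dash is not part of the identifier: "a-" lexes as the identifier "a"
/// followed by a minus operator, while "_a-2_b-2_" is a single identifier.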
fn lex_identifier_raw(input: LexInput) -> InternalLexResult<String> {
    expecting(
        "a valid identifier",
        map(
            tuple::<_, _, InternalLexError, _>((
                take_while1(is_valid_identifier_leading_character),
                take_while(is_valid_identifier_end_character),
                recognize(many0(pair(
                    take_while1(is_dash),
                    take_while1(is_valid_identifier_end_character),
                ))),
            )),
            |(leading, middle, trailing_dash_vars)| {
                format!("{leading}{middle}{trailing_dash_vars}")
            },
        ),
    )(input)
}

fn lex_keyword_or_identifier(input: LexInput) -> InternalLexResult {
    let (input, identifier) = lex_identifier_raw(input)?;
    let token = keyword_or_identifier(identifier);
    Ok((input, token))
}

fn lex_target(input: LexInput) -> InternalLexResult {
    let (input, _) = tag("@")(input)?;
    let (input, label) = lex_identifier_raw(input)?;
    Ok((input, Token::Target(label)))
}

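/// Lex a number, distinguishing integers from floats: the text is first
/// recognized as a floating-point literal, then re-parsed as a `u64` integer
/// if it consists solely of digits. For example, "2" lexes as
/// `Token::Integer(2)`, while "2.0" and "2e3" lex as `Token::Float`s.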
fn lex_number(input: LexInput) -> InternalLexResult {
    let (input, float_string): (LexInput, LexInput) = recognize(double)(input)?;
    let integer_parse_result: IResult<LexInput, _> = all_consuming(digit1)(float_string);
    Ok((
        input,
        match integer_parse_result {
            Ok(_) => float_string
                .parse::<u64>()
                .map(Token::Integer)
                .map_err(|e| InternalLexError::from_kind(input, e.into()))
                .map_err(nom::Err::Failure)?,
            Err(_) => Token::Float(double(float_string)?.1),
        },
    ))
}

fn lex_operator(input: LexInput) -> InternalLexResult {
    use Operator::*;
    map(
        alt(
            "an operator",
            (
                value(Caret, tag("^")),
                value(Minus, tag("-")),
                value(Plus, tag("+")),
                value(Slash, tag("/")),
                value(Star, tag("*")),
            ),
        ),
        Token::Operator,
    )(input)
}

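/// Recognize a run of one or more newline characters. Note that nom's `is_a`
/// matches the longest run of characters drawn from the given set, so the
/// second alternative accepts any mix of carriage returns and line feeds.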
fn recognize_newlines(input: LexInput) -> InternalLexResult<LexInput> {
    alt(
        "one or more newlines",
        (
            is_a::<_, _, InternalLexError>("\n"),
            is_a::<_, _, InternalLexError>("\r\n"),
        ),
    )(input)
}

fn lex_punctuation(input: LexInput) -> InternalLexResult {
    use Token::*;
    alt(
        "punctuation",
        (
            value(Colon, tag(":")),
            value(Comma, tag(",")),
            value(
                Indentation,
                alt("four spaces or a tab character", (tag("    "), tag("\t"))),
            ),
            value(LBracket, tag("[")),
            value(LParenthesis, tag("(")),
            value(NewLine, recognize_newlines),
            value(RBracket, tag("]")),
            value(RParenthesis, tag(")")),
            value(Semicolon, tag(";")),
        ),
    )(input)
}

fn lex_string(input: LexInput) -> InternalLexResult {
    map(quoted_strings::unescaped_quoted_string, Token::String)(input)
}

fn lex_variable(input: LexInput) -> InternalLexResult {
    map(preceded(tag("%"), lex_identifier_raw), |ident| {
        Token::Variable(ident)
    })(input)
}

#[cfg(test)]
mod tests {
    use nom_locate::LocatedSpan;
    use rstest::*;

    use crate::parser::{common::tests::KITCHEN_SINK_QUIL, DataType};

    use super::{lex, Command, Operator, Token};

    #[test]
    fn comment() {
        let input = LocatedSpan::new("# hello\n#world\n#\n#");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Comment(" hello".to_owned()),
                Token::NewLine,
                Token::Comment("world".to_owned()),
                Token::NewLine,
                Token::Comment("".to_owned()),
                Token::NewLine,
                Token::Comment("".to_owned())
            ]
        )
    }

    #[test]
    fn keywords() {
        let input = LocatedSpan::new("DEFGATE DEFCIRCUIT JUMP-WHEN MATRIX LOAD load LOAD-MEMORY");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Command(Command::DefGate),
                Token::Command(Command::DefCircuit),
                Token::Command(Command::JumpWhen),
                Token::Matrix,
                Token::Command(Command::Load),
                Token::Identifier(String::from("load")),
                Token::Identifier(String::from("LOAD-MEMORY"))
            ]
        )
    }

    #[test]
    fn number() {
        let input = LocatedSpan::new("2 2i 2.0 2e3 2.0e3 (1+2i)");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Integer(2),
                Token::Integer(2),
                Token::Identifier("i".to_owned()),
                Token::Float(2.0),
                Token::Float(2000f64),
                Token::Float(2000f64),
                Token::LParenthesis,
                Token::Integer(1),
                Token::Operator(Operator::Plus),
                Token::Integer(2),
                Token::Identifier("i".to_owned()),
                Token::RParenthesis
            ]
        )
    }

    #[test]
    fn string() {
        let input = LocatedSpan::new("\"hello\"\n\"world\"");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::String("hello".to_owned()),
                Token::NewLine,
                Token::String("world".to_owned())
            ]
        )
    }

    #[test]
    fn gate_operation() {
        let input = LocatedSpan::new("I 0; RX 1\nCZ 0 1");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Identifier("I".to_owned()),
                Token::Integer(0),
                Token::Semicolon,
                Token::Identifier("RX".to_owned()),
                Token::Integer(1),
                Token::NewLine,
                Token::Identifier("CZ".to_owned()),
                Token::Integer(0),
                Token::Integer(1),
            ]
        )
    }

    #[test]
    fn label() {
        let input = LocatedSpan::new("@hello\n@world");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Target("hello".to_owned()),
                Token::NewLine,
                Token::Target("world".to_owned())
            ]
        )
    }

    #[test]
    fn indentation() {
        let input = LocatedSpan::new("    ");
        let tokens = lex(input).unwrap();
        assert_eq!(tokens, vec![Token::Indentation,])
    }

    #[test]
    fn indented_block() {
        let input = LocatedSpan::new("DEFGATE Name AS PERMUTATION:\n\t1,0\n    0,1");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::Command(Command::DefGate),
                Token::Identifier("Name".to_owned()),
                Token::As,
                Token::Permutation,
                Token::Colon,
                Token::NewLine,
                Token::Indentation,
                Token::Integer(1),
                Token::Comma,
                Token::Integer(0),
                Token::NewLine,
                Token::Indentation,
                Token::Integer(0),
                Token::Comma,
                Token::Integer(1),
            ]
        )
    }

    #[test]
    fn surrounding_whitespace() {
        let input = LocatedSpan::new("\nI 0\n    \n");
        let tokens = lex(input).unwrap();
        assert_eq!(
            tokens,
            vec![
                Token::NewLine,
                Token::Identifier("I".to_owned()),
                Token::Integer(0),
                Token::NewLine,
                Token::Indentation,
                Token::NewLine
            ]
        )
    }

    #[rstest(input, expected,
        case("_", vec![Token::Identifier("_".to_string())]),
        case("a", vec![Token::Identifier("a".to_string())]),
        case("_a-2_b-2_", vec![Token::Identifier("_a-2_b-2_".to_string())]),
        case("a-2-%var", vec![
            Token::Identifier("a-2".to_string()),
            Token::Operator(Operator::Minus),
            Token::Variable("var".to_string())
        ]),
        case("BIT", vec![Token::DataType(DataType::Bit)]),
        case("BITS", vec![Token::Identifier("BITS".to_string())]),
        case("NaN", vec![Token::Identifier("NaN".to_string())]),
        case("nan", vec![Token::Identifier("nan".to_string())]),
        case("NaNa", vec![Token::Identifier("NaNa".to_string())]),
        case("nana", vec![Token::Identifier("nana".to_string())]),
        case("INF", vec![Token::Identifier("INF".to_string())]),
        case("Infinity", vec![Token::Identifier("Infinity".to_string())]),
        case("Inferior", vec![Token::Identifier("Inferior".to_string())]),
        case("-NaN", vec![Token::Operator(Operator::Minus), Token::Identifier("NaN".to_string())]),
        case("-inf", vec![Token::Operator(Operator::Minus), Token::Identifier("inf".to_string())]),
        case("-Infinity", vec![
            Token::Operator(Operator::Minus),
            Token::Identifier("Infinity".to_string())
        ]),
        case("-inferior", vec![
            Token::Operator(Operator::Minus),
            Token::Identifier("inferior".to_string())
        ]),
    )]
    fn it_lexes_identifier(input: &str, expected: Vec<Token>) {
        let input = LocatedSpan::new(input);
        let tokens = lex(input).unwrap();
        assert_eq!(tokens, expected);
    }

    #[rstest(input, not_expected,
        case("a-", vec![Token::Identifier("_-".to_string())]),
        case("-a", vec![Token::Identifier("-a".to_string())]),
        case("a\\", vec![Token::Identifier("_\\".to_string())]),
    )]
    fn it_fails_to_lex_identifier(input: &str, not_expected: Vec<Token>) {
        let input = LocatedSpan::new(input);
        if let Ok(tokens) = lex(input) {
            assert_ne!(tokens, not_expected);
        }
    }

    /// Test that an entire sample program can be lexed without failure.
    #[test]
    fn kitchen_sink() {
        let input = LocatedSpan::new(KITCHEN_SINK_QUIL);

        lex(input).unwrap();
    }
}