tree_sitter_generate/
parse_grammar.rs

1use std::collections::HashSet;
2
3use anyhow::Result;
4use serde::{Deserialize, Serialize};
5use serde_json::{Map, Value};
6use thiserror::Error;
7
8use super::{
9    grammars::{InputGrammar, PrecedenceEntry, Variable, VariableType},
10    rules::{Precedence, Rule},
11};
12use crate::grammars::ReservedWordContext;
13
14#[derive(Deserialize)]
15#[serde(tag = "type")]
16#[allow(non_camel_case_types)]
17#[allow(clippy::upper_case_acronyms)]
18enum RuleJSON {
19    ALIAS {
20        content: Box<RuleJSON>,
21        named: bool,
22        value: String,
23    },
24    BLANK,
25    STRING {
26        value: String,
27    },
28    PATTERN {
29        value: String,
30        flags: Option<String>,
31    },
32    SYMBOL {
33        name: String,
34    },
35    CHOICE {
36        members: Vec<RuleJSON>,
37    },
38    FIELD {
39        name: String,
40        content: Box<RuleJSON>,
41    },
42    SEQ {
43        members: Vec<RuleJSON>,
44    },
45    REPEAT {
46        content: Box<RuleJSON>,
47    },
48    REPEAT1 {
49        content: Box<RuleJSON>,
50    },
51    PREC_DYNAMIC {
52        value: i32,
53        content: Box<RuleJSON>,
54    },
55    PREC_LEFT {
56        value: PrecedenceValueJSON,
57        content: Box<RuleJSON>,
58    },
59    PREC_RIGHT {
60        value: PrecedenceValueJSON,
61        content: Box<RuleJSON>,
62    },
63    PREC {
64        value: PrecedenceValueJSON,
65        content: Box<RuleJSON>,
66    },
67    TOKEN {
68        content: Box<RuleJSON>,
69    },
70    IMMEDIATE_TOKEN {
71        content: Box<RuleJSON>,
72    },
73    RESERVED {
74        context_name: String,
75        content: Box<RuleJSON>,
76    },
77}
78
79#[derive(Deserialize)]
80#[serde(untagged)]
81enum PrecedenceValueJSON {
82    Integer(i32),
83    Name(String),
84}
85
86#[derive(Deserialize)]
87pub struct GrammarJSON {
88    pub name: String,
89    rules: Map<String, Value>,
90    #[serde(default)]
91    precedences: Vec<Vec<RuleJSON>>,
92    #[serde(default)]
93    conflicts: Vec<Vec<String>>,
94    #[serde(default)]
95    externals: Vec<RuleJSON>,
96    #[serde(default)]
97    extras: Vec<RuleJSON>,
98    #[serde(default)]
99    inline: Vec<String>,
100    #[serde(default)]
101    supertypes: Vec<String>,
102    #[serde(default)]
103    word: Option<String>,
104    #[serde(default)]
105    reserved: Map<String, Value>,
106}
107
108pub type ParseGrammarResult<T> = Result<T, ParseGrammarError>;
109
110#[derive(Debug, Error, Serialize)]
111pub enum ParseGrammarError {
112    #[error("{0}")]
113    Serialization(String),
114    #[error("Rules in the `extras` array must not contain empty strings")]
115    InvalidExtra,
116    #[error("Invalid rule in precedences array. Only strings and symbols are allowed")]
117    Unexpected,
118    #[error("Reserved word sets must be arrays")]
119    InvalidReservedWordSet,
120    #[error("Grammar Error: Unexpected rule `{0}` in `token()` call")]
121    UnexpectedRule(String),
122}
123
124impl From<serde_json::Error> for ParseGrammarError {
125    fn from(value: serde_json::Error) -> Self {
126        Self::Serialization(value.to_string())
127    }
128}
129
130/// Check if a rule is referenced by another rule.
131///
132/// This function is used to determine if a variable is used in a given rule,
133/// and `is_other` indicates if the rule is an external, and if it is,
134/// to not assume that a named symbol that is equal to itself means it's being referenced.
135///
136/// For example, if we have an external rule **and** a normal rule both called `foo`,
137/// `foo` should not be thought of as directly used unless it's used within another rule.
138fn rule_is_referenced(rule: &Rule, target: &str, is_external: bool) -> bool {
139    match rule {
140        Rule::NamedSymbol(name) => name == target && !is_external,
141        Rule::Choice(rules) | Rule::Seq(rules) => {
142            rules.iter().any(|r| rule_is_referenced(r, target, false))
143        }
144        Rule::Metadata { rule, .. } | Rule::Reserved { rule, .. } => {
145            rule_is_referenced(rule, target, is_external)
146        }
147        Rule::Repeat(inner) => rule_is_referenced(inner, target, false),
148        Rule::Blank | Rule::String(_) | Rule::Pattern(_, _) | Rule::Symbol(_) => false,
149    }
150}
151
152fn variable_is_used(
153    grammar_rules: &[(String, Rule)],
154    extras: &[Rule],
155    externals: &[Rule],
156    target_name: &str,
157    in_progress: &mut HashSet<String>,
158) -> bool {
159    let root = &grammar_rules.first().unwrap().0;
160    if target_name == root {
161        return true;
162    }
163
164    if extras
165        .iter()
166        .any(|rule| rule_is_referenced(rule, target_name, false))
167    {
168        return true;
169    }
170
171    if externals
172        .iter()
173        .any(|rule| rule_is_referenced(rule, target_name, true))
174    {
175        return true;
176    }
177
178    in_progress.insert(target_name.to_string());
179    let result = grammar_rules
180        .iter()
181        .filter(|(key, _)| *key != target_name)
182        .any(|(name, rule)| {
183            if !rule_is_referenced(rule, target_name, false) || in_progress.contains(name) {
184                return false;
185            }
186            variable_is_used(grammar_rules, extras, externals, name, in_progress)
187        });
188    in_progress.remove(target_name);
189
190    result
191}
192
193pub(crate) fn parse_grammar(input: &str) -> ParseGrammarResult<InputGrammar> {
194    let mut grammar_json = serde_json::from_str::<GrammarJSON>(input)?;
195
196    let mut extra_symbols =
197        grammar_json
198            .extras
199            .into_iter()
200            .try_fold(Vec::<Rule>::new(), |mut acc, item| {
201                let rule = parse_rule(item, false)?;
202                if let Rule::String(ref value) = rule {
203                    if value.is_empty() {
204                        Err(ParseGrammarError::InvalidExtra)?;
205                    }
206                }
207                acc.push(rule);
208                ParseGrammarResult::Ok(acc)
209            })?;
210
211    let mut external_tokens = grammar_json
212        .externals
213        .into_iter()
214        .map(|e| parse_rule(e, false))
215        .collect::<ParseGrammarResult<Vec<_>>>()?;
216
217    let mut precedence_orderings = Vec::with_capacity(grammar_json.precedences.len());
218    for list in grammar_json.precedences {
219        let mut ordering = Vec::with_capacity(list.len());
220        for entry in list {
221            ordering.push(match entry {
222                RuleJSON::STRING { value } => PrecedenceEntry::Name(value),
223                RuleJSON::SYMBOL { name } => PrecedenceEntry::Symbol(name),
224                _ => Err(ParseGrammarError::Unexpected)?,
225            });
226        }
227        precedence_orderings.push(ordering);
228    }
229
230    let mut variables = Vec::with_capacity(grammar_json.rules.len());
231
232    let rules = grammar_json
233        .rules
234        .into_iter()
235        .map(|(n, r)| Ok((n, parse_rule(serde_json::from_value(r)?, false)?)))
236        .collect::<ParseGrammarResult<Vec<_>>>()?;
237
238    let mut in_progress = HashSet::new();
239
240    for (name, rule) in &rules {
241        if !variable_is_used(
242            &rules,
243            &extra_symbols,
244            &external_tokens,
245            name,
246            &mut in_progress,
247        ) && grammar_json.word.as_ref().is_none_or(|w| w != name)
248        {
249            grammar_json.conflicts.retain(|r| !r.contains(name));
250            grammar_json.supertypes.retain(|r| r != name);
251            grammar_json.inline.retain(|r| r != name);
252            extra_symbols.retain(|r| !rule_is_referenced(r, name, true));
253            external_tokens.retain(|r| !rule_is_referenced(r, name, true));
254            precedence_orderings.retain(|r| {
255                !r.iter().any(|e| {
256                    let PrecedenceEntry::Symbol(s) = e else {
257                        return false;
258                    };
259                    s == name
260                })
261            });
262            continue;
263        }
264        variables.push(Variable {
265            name: name.clone(),
266            kind: VariableType::Named,
267            rule: rule.clone(),
268        });
269    }
270
271    let reserved_words = grammar_json
272        .reserved
273        .into_iter()
274        .map(|(name, rule_values)| {
275            let mut reserved_words = Vec::new();
276
277            let Value::Array(rule_values) = rule_values else {
278                Err(ParseGrammarError::InvalidReservedWordSet)?
279            };
280
281            for value in rule_values {
282                reserved_words.push(parse_rule(serde_json::from_value(value)?, false)?);
283            }
284            Ok(ReservedWordContext {
285                name,
286                reserved_words,
287            })
288        })
289        .collect::<ParseGrammarResult<Vec<_>>>()?;
290
291    Ok(InputGrammar {
292        name: grammar_json.name,
293        word_token: grammar_json.word,
294        expected_conflicts: grammar_json.conflicts,
295        supertype_symbols: grammar_json.supertypes,
296        variables_to_inline: grammar_json.inline,
297        precedence_orderings,
298        variables,
299        extra_symbols,
300        external_tokens,
301        reserved_words,
302    })
303}
304
305fn parse_rule(json: RuleJSON, is_token: bool) -> ParseGrammarResult<Rule> {
306    match json {
307        RuleJSON::ALIAS {
308            content,
309            value,
310            named,
311        } => parse_rule(*content, is_token).map(|r| Rule::alias(r, value, named)),
312        RuleJSON::BLANK => Ok(Rule::Blank),
313        RuleJSON::STRING { value } => Ok(Rule::String(value)),
314        RuleJSON::PATTERN { value, flags } => Ok(Rule::Pattern(
315            value,
316            flags.map_or(String::new(), |f| {
317                f.matches(|c| {
318                    if c == 'i' {
319                        true
320                    } else {
321                        // silently ignore unicode flags
322                        if c != 'u' && c != 'v' {
323                            eprintln!("Warning: unsupported flag {c}");
324                        }
325                        false
326                    }
327                })
328                .collect()
329            }),
330        )),
331        RuleJSON::SYMBOL { name } => {
332            if is_token {
333                Err(ParseGrammarError::UnexpectedRule(name))?
334            } else {
335                Ok(Rule::NamedSymbol(name))
336            }
337        }
338        RuleJSON::CHOICE { members } => members
339            .into_iter()
340            .map(|m| parse_rule(m, is_token))
341            .collect::<ParseGrammarResult<Vec<_>>>()
342            .map(Rule::choice),
343        RuleJSON::FIELD { content, name } => {
344            parse_rule(*content, is_token).map(|r| Rule::field(name, r))
345        }
346        RuleJSON::SEQ { members } => members
347            .into_iter()
348            .map(|m| parse_rule(m, is_token))
349            .collect::<ParseGrammarResult<Vec<_>>>()
350            .map(Rule::seq),
351        RuleJSON::REPEAT1 { content } => parse_rule(*content, is_token).map(Rule::repeat),
352        RuleJSON::REPEAT { content } => {
353            parse_rule(*content, is_token).map(|m| Rule::choice(vec![Rule::repeat(m), Rule::Blank]))
354        }
355        RuleJSON::PREC { value, content } => {
356            parse_rule(*content, is_token).map(|r| Rule::prec(value.into(), r))
357        }
358        RuleJSON::PREC_LEFT { value, content } => {
359            parse_rule(*content, is_token).map(|r| Rule::prec_left(value.into(), r))
360        }
361        RuleJSON::PREC_RIGHT { value, content } => {
362            parse_rule(*content, is_token).map(|r| Rule::prec_right(value.into(), r))
363        }
364        RuleJSON::PREC_DYNAMIC { value, content } => {
365            parse_rule(*content, is_token).map(|r| Rule::prec_dynamic(value, r))
366        }
367        RuleJSON::RESERVED {
368            content,
369            context_name,
370        } => parse_rule(*content, is_token).map(|r| Rule::Reserved {
371            rule: Box::new(r),
372            context_name,
373        }),
374        RuleJSON::TOKEN { content } => parse_rule(*content, true).map(Rule::token),
375        RuleJSON::IMMEDIATE_TOKEN { content } => {
376            parse_rule(*content, is_token).map(Rule::immediate_token)
377        }
378    }
379}
380
381impl From<PrecedenceValueJSON> for Precedence {
382    fn from(val: PrecedenceValueJSON) -> Self {
383        match val {
384            PrecedenceValueJSON::Integer(i) => Self::Integer(i),
385            PrecedenceValueJSON::Name(i) => Self::Name(i),
386        }
387    }
388}
389
#[cfg(test)]
mod tests {
    use super::*;

    /// A two-rule grammar where the root `file` repeats `statement`: both
    /// rules must survive parsing, in declaration order.
    #[test]
    fn test_parse_grammar() {
        let input = r#"{
            "name": "my_lang",
            "rules": {
                "file": {
                    "type": "REPEAT1",
                    "content": {
                        "type": "SYMBOL",
                        "name": "statement"
                    }
                },
                "statement": {
                    "type": "STRING",
                    "value": "foo"
                }
            }
        }"#;

        let grammar = parse_grammar(input).unwrap();

        let expected_variables = vec![
            Variable {
                name: "file".to_string(),
                kind: VariableType::Named,
                rule: Rule::repeat(Rule::NamedSymbol("statement".to_string())),
            },
            Variable {
                name: "statement".to_string(),
                kind: VariableType::Named,
                rule: Rule::String("foo".to_string()),
            },
        ];

        assert_eq!(grammar.name, "my_lang");
        assert_eq!(grammar.variables, expected_variables);
    }
}