tree_sitter_generate/
lib.rs

1use std::{
2    env, fs,
3    io::Write,
4    path::{Path, PathBuf},
5    process::{Command, Stdio},
6    sync::LazyLock,
7};
8
9use anyhow::Result;
10use regex::{Regex, RegexBuilder};
11use semver::Version;
12use serde::{Deserialize, Serialize};
13use thiserror::Error;
14
15mod build_tables;
16mod dedup;
17mod grammar_files;
18mod grammars;
19mod nfa;
20mod node_types;
21pub mod parse_grammar;
22mod prepare_grammar;
23mod render;
24mod rules;
25mod tables;
26
27use build_tables::build_tables;
28pub use build_tables::ParseTableBuilderError;
29use grammars::InputGrammar;
30pub use node_types::VariableInfoError;
31use parse_grammar::parse_grammar;
32pub use parse_grammar::ParseGrammarError;
33use prepare_grammar::prepare_grammar;
34pub use prepare_grammar::PrepareGrammarError;
35use render::render_c_code;
36pub use render::{ABI_VERSION_MAX, ABI_VERSION_MIN};
37
38static JSON_COMMENT_REGEX: LazyLock<Regex> = LazyLock::new(|| {
39    RegexBuilder::new("^\\s*//.*")
40        .multi_line(true)
41        .build()
42        .unwrap()
43});
44
/// The two textual artifacts produced by one parser-generation run.
struct GeneratedParser {
    /// Rendered C source of the parser.
    c_code: String,
    /// Pretty-printed JSON describing the grammar's node types.
    node_types_json: String,
}
49
50pub const ALLOC_HEADER: &str = include_str!("templates/alloc.h");
51pub const ARRAY_HEADER: &str = include_str!("templates/array.h");
52
53pub type GenerateResult<T> = Result<T, GenerateError>;
54
55#[derive(Debug, Error, Serialize)]
56pub enum GenerateError {
57    #[error("Error with specified path -- {0}")]
58    GrammarPath(String),
59    #[error("{0}")]
60    IO(String),
61    #[error(transparent)]
62    LoadGrammarFile(#[from] LoadGrammarError),
63    #[error(transparent)]
64    ParseGrammar(#[from] ParseGrammarError),
65    #[error(transparent)]
66    Prepare(#[from] PrepareGrammarError),
67    #[error(transparent)]
68    VariableInfo(#[from] VariableInfoError),
69    #[error(transparent)]
70    BuildTables(#[from] ParseTableBuilderError),
71    #[error(transparent)]
72    ParseVersion(#[from] ParseVersionError),
73}
74
75impl From<std::io::Error> for GenerateError {
76    fn from(value: std::io::Error) -> Self {
77        Self::IO(value.to_string())
78    }
79}
80
81pub type LoadGrammarFileResult<T> = Result<T, LoadGrammarError>;
82
83#[derive(Debug, Error, Serialize)]
84pub enum LoadGrammarError {
85    #[error("Path to a grammar file with `.js` or `.json` extension is required")]
86    InvalidPath,
87    #[error("Failed to load grammar.js -- {0}")]
88    LoadJSGrammarFile(#[from] JSError),
89    #[error("Failed to load grammar.json -- {0}")]
90    IO(String),
91    #[error("Unknown grammar file extension: {0:?}")]
92    FileExtension(PathBuf),
93}
94
95impl From<std::io::Error> for LoadGrammarError {
96    fn from(value: std::io::Error) -> Self {
97        Self::IO(value.to_string())
98    }
99}
100
101#[derive(Debug, Error, Serialize)]
102pub enum ParseVersionError {
103    #[error("{0}")]
104    Version(String),
105    #[error("{0}")]
106    JSON(String),
107    #[error("{0}")]
108    IO(String),
109}
110
111pub type JSResult<T> = Result<T, JSError>;
112
113#[derive(Debug, Error, Serialize)]
114pub enum JSError {
115    #[error("Failed to run `{runtime}` -- {error}")]
116    JSRuntimeSpawn { runtime: String, error: String },
117    #[error("Got invalid UTF8 from `{runtime}` -- {error}")]
118    JSRuntimeUtf8 { runtime: String, error: String },
119    #[error("`{runtime}` process exited with status {code}")]
120    JSRuntimeExit { runtime: String, code: i32 },
121    #[error("{0}")]
122    IO(String),
123    #[error("Could not parse this package's version as semver -- {0}")]
124    Semver(String),
125    #[error("Failed to serialze grammar JSON -- {0}")]
126    Serialzation(String),
127}
128
129impl From<std::io::Error> for JSError {
130    fn from(value: std::io::Error) -> Self {
131        Self::IO(value.to_string())
132    }
133}
134
135impl From<serde_json::Error> for JSError {
136    fn from(value: serde_json::Error) -> Self {
137        Self::Serialzation(value.to_string())
138    }
139}
140
141impl From<semver::Error> for JSError {
142    fn from(value: semver::Error) -> Self {
143        Self::Semver(value.to_string())
144    }
145}
146
147pub fn generate_parser_in_directory(
148    repo_path: &Path,
149    out_path: Option<&str>,
150    grammar_path: Option<&str>,
151    mut abi_version: usize,
152    report_symbol_name: Option<&str>,
153    js_runtime: Option<&str>,
154) -> GenerateResult<()> {
155    let mut repo_path = repo_path.to_owned();
156    let mut grammar_path = grammar_path;
157
158    // Populate a new empty grammar directory.
159    if let Some(path) = grammar_path {
160        let path = PathBuf::from(path);
161        if !path
162            .try_exists()
163            .map_err(|e| GenerateError::GrammarPath(e.to_string()))?
164        {
165            fs::create_dir_all(&path)?;
166            grammar_path = None;
167            repo_path = path;
168        }
169    }
170
171    let grammar_path = grammar_path.map_or_else(|| repo_path.join("grammar.js"), PathBuf::from);
172
173    // Read the grammar file.
174    let grammar_json = load_grammar_file(&grammar_path, js_runtime)?;
175
176    let src_path = out_path.map_or_else(|| repo_path.join("src"), PathBuf::from);
177    let header_path = src_path.join("tree_sitter");
178
179    // Ensure that the output directories exist.
180    fs::create_dir_all(&src_path)?;
181    fs::create_dir_all(&header_path)?;
182
183    if grammar_path.file_name().unwrap() != "grammar.json" {
184        fs::write(src_path.join("grammar.json"), &grammar_json).map_err(|e| {
185            GenerateError::IO(format!(
186                "Failed to write grammar.json to {src_path:?} -- {e}"
187            ))
188        })?;
189    }
190
191    // Parse and preprocess the grammar.
192    let input_grammar = parse_grammar(&grammar_json)?;
193
194    let semantic_version = read_grammar_version(&repo_path)?;
195
196    if semantic_version.is_none() && abi_version > ABI_VERSION_MIN {
197        println!("Warning: No `tree-sitter.json` file found in your grammar, this file is required to generate with ABI {abi_version}. Using ABI version {ABI_VERSION_MIN} instead.");
198        println!("This file can be set up with `tree-sitter init`. For more information, see https://tree-sitter.github.io/tree-sitter/cli/init.");
199        abi_version = ABI_VERSION_MIN;
200    }
201
202    // Generate the parser and related files.
203    let GeneratedParser {
204        c_code,
205        node_types_json,
206    } = generate_parser_for_grammar_with_opts(
207        &input_grammar,
208        abi_version,
209        semantic_version.map(|v| (v.major as u8, v.minor as u8, v.patch as u8)),
210        report_symbol_name,
211    )?;
212
213    write_file(&src_path.join("parser.c"), c_code)?;
214    write_file(&src_path.join("node-types.json"), node_types_json)?;
215    write_file(&header_path.join("alloc.h"), ALLOC_HEADER)?;
216    write_file(&header_path.join("array.h"), ARRAY_HEADER)?;
217    write_file(&header_path.join("parser.h"), tree_sitter::PARSER_HEADER)?;
218
219    Ok(())
220}
221
222pub fn generate_parser_for_grammar(
223    grammar_json: &str,
224    semantic_version: Option<(u8, u8, u8)>,
225) -> GenerateResult<(String, String)> {
226    let grammar_json = JSON_COMMENT_REGEX.replace_all(grammar_json, "\n");
227    let input_grammar = parse_grammar(&grammar_json)?;
228    let parser = generate_parser_for_grammar_with_opts(
229        &input_grammar,
230        tree_sitter::LANGUAGE_VERSION,
231        semantic_version,
232        None,
233    )?;
234    Ok((input_grammar.name, parser.c_code))
235}
236
237fn generate_parser_for_grammar_with_opts(
238    input_grammar: &InputGrammar,
239    abi_version: usize,
240    semantic_version: Option<(u8, u8, u8)>,
241    report_symbol_name: Option<&str>,
242) -> GenerateResult<GeneratedParser> {
243    let (syntax_grammar, lexical_grammar, inlines, simple_aliases) =
244        prepare_grammar(input_grammar)?;
245    let variable_info =
246        node_types::get_variable_info(&syntax_grammar, &lexical_grammar, &simple_aliases)?;
247    let node_types_json = node_types::generate_node_types_json(
248        &syntax_grammar,
249        &lexical_grammar,
250        &simple_aliases,
251        &variable_info,
252    );
253    let supertype_symbol_map =
254        node_types::get_supertype_symbol_map(&syntax_grammar, &simple_aliases, &variable_info);
255    let tables = build_tables(
256        &syntax_grammar,
257        &lexical_grammar,
258        &simple_aliases,
259        &variable_info,
260        &inlines,
261        report_symbol_name,
262    )?;
263    let c_code = render_c_code(
264        &input_grammar.name,
265        tables,
266        syntax_grammar,
267        lexical_grammar,
268        simple_aliases,
269        abi_version,
270        semantic_version,
271        supertype_symbol_map,
272    );
273    Ok(GeneratedParser {
274        c_code,
275        node_types_json: serde_json::to_string_pretty(&node_types_json).unwrap(),
276    })
277}
278
279/// This will read the `tree-sitter.json` config file and attempt to extract the version.
280///
281/// If the file is not found in the current directory or any of its parent directories, this will
282/// return `None` to maintain backwards compatibility. If the file is found but the version cannot
283/// be parsed as semver, this will return an error.
284fn read_grammar_version(repo_path: &Path) -> Result<Option<Version>, ParseVersionError> {
285    #[derive(Deserialize)]
286    struct TreeSitterJson {
287        metadata: Metadata,
288    }
289
290    #[derive(Deserialize)]
291    struct Metadata {
292        version: String,
293    }
294
295    let filename = "tree-sitter.json";
296    let mut path = repo_path.join(filename);
297
298    loop {
299        let json = path
300            .exists()
301            .then(|| {
302                let contents = fs::read_to_string(path.as_path()).map_err(|e| {
303                    ParseVersionError::IO(format!("Failed to read `{}` -- {e}", path.display()))
304                })?;
305                serde_json::from_str::<TreeSitterJson>(&contents).map_err(|e| {
306                    ParseVersionError::JSON(format!("Failed to parse `{}` -- {e}", path.display()))
307                })
308            })
309            .transpose()?;
310        if let Some(json) = json {
311            return Version::parse(&json.metadata.version)
312                .map_err(|e| {
313                    ParseVersionError::Version(format!(
314                        "Failed to parse `{}` version as semver -- {e}",
315                        path.display()
316                    ))
317                })
318                .map(Some);
319        }
320        path.pop(); // filename
321        if !path.pop() {
322            return Ok(None);
323        }
324        path.push(filename);
325    }
326}
327
328pub fn load_grammar_file(
329    grammar_path: &Path,
330    js_runtime: Option<&str>,
331) -> LoadGrammarFileResult<String> {
332    if grammar_path.is_dir() {
333        Err(LoadGrammarError::InvalidPath)?;
334    }
335    match grammar_path.extension().and_then(|e| e.to_str()) {
336        Some("js") => Ok(load_js_grammar_file(grammar_path, js_runtime)?),
337        Some("json") => Ok(fs::read_to_string(grammar_path)?),
338        _ => Err(LoadGrammarError::FileExtension(grammar_path.to_owned()))?,
339    }
340}
341
342fn load_js_grammar_file(grammar_path: &Path, js_runtime: Option<&str>) -> JSResult<String> {
343    let grammar_path = fs::canonicalize(grammar_path)?;
344
345    #[cfg(windows)]
346    let grammar_path = url::Url::from_file_path(grammar_path)
347        .expect("Failed to convert path to URL")
348        .to_string();
349
350    let js_runtime = js_runtime.unwrap_or("node");
351
352    let mut js_command = Command::new(js_runtime);
353    match js_runtime {
354        "node" => {
355            js_command.args(["--input-type=module", "-"]);
356        }
357        "bun" => {
358            js_command.arg("-");
359        }
360        "deno" => {
361            js_command.args(["run", "--allow-all", "-"]);
362        }
363        _ => {}
364    }
365
366    let mut js_process = js_command
367        .env("TREE_SITTER_GRAMMAR_PATH", grammar_path)
368        .stdin(Stdio::piped())
369        .stdout(Stdio::piped())
370        .spawn()
371        .map_err(|e| JSError::JSRuntimeSpawn {
372            runtime: js_runtime.to_string(),
373            error: e.to_string(),
374        })?;
375
376    let mut js_stdin = js_process
377        .stdin
378        .take()
379        .ok_or_else(|| JSError::IO(format!("Failed to open stdin for `{js_runtime}`")))?;
380
381    let cli_version = Version::parse(env!("CARGO_PKG_VERSION"))?;
382    write!(
383        js_stdin,
384        "globalThis.TREE_SITTER_CLI_VERSION_MAJOR = {};
385         globalThis.TREE_SITTER_CLI_VERSION_MINOR = {};
386         globalThis.TREE_SITTER_CLI_VERSION_PATCH = {};",
387        cli_version.major, cli_version.minor, cli_version.patch,
388    )
389    .map_err(|e| {
390        JSError::IO(format!(
391            "Failed to write tree-sitter version to `{js_runtime}`'s stdin -- {e}"
392        ))
393    })?;
394    js_stdin.write(include_bytes!("./dsl.js")).map_err(|e| {
395        JSError::IO(format!(
396            "Failed to write grammar dsl to `{js_runtime}`'s stdin -- {e}"
397        ))
398    })?;
399    drop(js_stdin);
400
401    let output = js_process
402        .wait_with_output()
403        .map_err(|e| JSError::IO(format!("Failed to read output from `{js_runtime}` -- {e}")))?;
404    match output.status.code() {
405        None => panic!("`{js_runtime}` process was killed"),
406        Some(0) => {
407            let stdout = String::from_utf8(output.stdout).map_err(|e| JSError::JSRuntimeUtf8 {
408                runtime: js_runtime.to_string(),
409                error: e.to_string(),
410            })?;
411
412            let mut grammar_json = &stdout[..];
413
414            if let Some(pos) = stdout.rfind('\n') {
415                // If there's a newline, split the last line from the rest of the output
416                let node_output = &stdout[..pos];
417                grammar_json = &stdout[pos + 1..];
418
419                let mut stdout = std::io::stdout().lock();
420                stdout.write_all(node_output.as_bytes())?;
421                stdout.write_all(b"\n")?;
422                stdout.flush()?;
423            }
424
425            Ok(serde_json::to_string_pretty(&serde_json::from_str::<
426                serde_json::Value,
427            >(grammar_json)?)?)
428        }
429        Some(code) => Err(JSError::JSRuntimeExit {
430            runtime: js_runtime.to_string(),
431            code,
432        }),
433    }
434}
435
436pub fn write_file(path: &Path, body: impl AsRef<[u8]>) -> GenerateResult<()> {
437    fs::write(path, body)
438        .map_err(|e| GenerateError::IO(format!("Failed to write {:?} -- {e}", path.file_name())))
439}