clang_ast/
lib.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
//! [![github]](https://github.com/dtolnay/clang-ast) [![crates-io]](https://crates.io/crates/clang-ast) [![docs-rs]](https://docs.rs/clang-ast)
//!
//! [github]: https://img.shields.io/badge/github-8da0cb?style=for-the-badge&labelColor=555555&logo=github
//! [crates-io]: https://img.shields.io/badge/crates.io-fc8d62?style=for-the-badge&labelColor=555555&logo=rust
//! [docs-rs]: https://img.shields.io/badge/docs.rs-66c2a5?style=for-the-badge&labelColor=555555&logo=docs.rs
//!
//! <br>
//!
//! This library provides deserialization logic for efficiently processing
//! Clang's `-ast-dump=json` format.
//!
//! <br>
//!
//! # Format overview
//!
//! An AST dump is generated by a compiler command like:
//!
//! <pre>
//! <code>$  <b>clang++ -Xclang -ast-dump=json -fsyntax-only path/to/source.cc</b></code>
//! </pre>
//!
//! The high-level structure is a tree of nodes, each of which has an `"id"` and
//! a `"kind"`, zero or more further fields depending on what the node kind is,
//! and finally an optional `"inner"` array of child nodes.
//!
//! As an example, for an input file containing just the declaration `class S;`,
//! the AST would be as follows:
//!
//! ```
//! # stringify! {
//! {
//!   "id": "0x1fcea38",                 //<-- root node
//!   "kind": "TranslationUnitDecl",
//!   "inner": [
//!     {
//!       "id": "0xadf3a8",              //<-- first child node
//!       "kind": "CXXRecordDecl",
//!       "loc": {
//!         "offset": 6,
//!         "file": "source.cc",
//!         "line": 1,
//!         "col": 7,
//!         "tokLen": 1
//!       },
//!       "range": {
//!         "begin": {
//!           "offset": 0,
//!           "col": 1,
//!           "tokLen": 5
//!         },
//!         "end": {
//!           "offset": 6,
//!           "col": 7,
//!           "tokLen": 1
//!         }
//!       },
//!       "name": "S",
//!       "tagUsed": "class"
//!     }
//!   ]
//! }
//! # };
//! ```
//!
//! <br><br>
//!
//! # Library design
//!
//! By design, the clang-ast crate *does not* provide a single great big data
//! structure that exhaustively covers every possible field of every possible
//! Clang node type. There are three major reasons:
//!
//! - **Performance** &mdash; these ASTs get quite large. For a reasonable
//!   mid-sized translation unit that includes several platform headers, you can
//!   easily get an AST that is tens to hundreds of megabytes of JSON. To
//!   maintain performance of downstream tooling built on the AST, it's critical
//!   that you deserialize only the few fields which are directly required by
//!   your use case, and allow Serde's deserializer to efficiently ignore all
//!   the rest.
//!
//! - **Stability** &mdash; as Clang is developed, the specific fields
//!   associated with each node kind are expected to change over time in
//!   non-additive ways. This is nonproblematic because the churn on the scale
//!   of individual nodes is minimal (maybe one change every several years).
//!   However, if there were a data structure that promised to be able to
//!   deserialize every possible piece of information in every node, practically
//!   every change to Clang would be a breaking change to some node *somewhere*
//!   despite your tooling not caring anything at all about that node kind. By
//!   deserializing only those fields which are directly relevant to your use
//!   case, you become insulated from the vast majority of syntax tree changes.
//!
//! - **Compile time** &mdash; a typical use case involves inspecting only a
//!   tiny fraction of the possible nodes or fields, on the order of 1%.
//!   Consequently your code will compile 100&times; faster than if you tried to
//!   include everything in the data structure.
//!
//! <br>
//!
//! # Data structures
//!
//! The core data structure of the clang-ast crate is `Node<T>`.
//!
//! ```
//! # use clang_ast::Id;
//! #
//! pub struct Node<T> {
//!     pub id: Id,
//!     pub kind: T,
//!     pub inner: Vec<Node<T>>,
//! }
//! ```
//!
//! The caller must provide their own kind type `T`, which is an enum or struct
//! as described below. `T` determines exactly what information the clang-ast
//! crate will deserialize out of the AST dump.
//!
//! By convention you should name your `T` type `Clang`.
//!
//! <br>
//!
//! # T = enum
//!
//! Most often, you'll want `Clang` to be an enum. In this case your enum must
//! have one variant per node kind that you care about. The name of each variant
//! matches the `"kind"` entry seen in the AST.
//!
//! Additionally there must be a fallback variant, which must be named either
//! `Unknown` or `Other`, into which clang-ast will put all tree nodes not
//! matching one of the expected kinds.
//!
//! ```no_run
//! use serde::Deserialize;
//! # use serde_derive::Deserialize;
//!
//! pub type Node = clang_ast::Node<Clang>;
//!
//! #[derive(Deserialize)]
//! pub enum Clang {
//!     NamespaceDecl { name: Option<String> },
//!     EnumDecl { name: Option<String> },
//!     EnumConstantDecl { name: String },
//!     Other,
//! }
//!
//! fn main() {
//!     let json = std::fs::read_to_string("ast.json").unwrap();
//!     let node: Node = serde_json::from_str(&json).unwrap();
//!
//! }
//! ```
//!
//! The above is a simple example with variants for processing `"kind":
//! "NamespaceDecl"`,&ensp;`"kind": "EnumDecl"`,&ensp;and `"kind":
//! "EnumConstantDecl"` nodes. This is sufficient to extract the set of variants
//! of every enum in the translation unit, and the enums' namespace (possibly
//! anonymous) and enum name (possibly anonymous).
//!
//! Newtype variants are fine too, particularly if you'll be deserializing more
//! than one field for some nodes.
//!
//! ```
//! use serde::Deserialize;
//! # use serde_derive::Deserialize;
//!
//! pub type Node = clang_ast::Node<Clang>;
//!
//! #[derive(Deserialize)]
//! pub enum Clang {
//!     NamespaceDecl(NamespaceDecl),
//!     EnumDecl(EnumDecl),
//!     EnumConstantDecl(EnumConstantDecl),
//!     Other,
//! }
//!
//! #[derive(Deserialize, Debug)]
//! pub struct NamespaceDecl {
//!     pub name: Option<String>,
//! }
//!
//! #[derive(Deserialize, Debug)]
//! pub struct EnumDecl {
//!     pub name: Option<String>,
//! }
//!
//! #[derive(Deserialize, Debug)]
//! pub struct EnumConstantDecl {
//!     pub name: String,
//! }
//! ```
//!
//! <br><br>
//!
//! # T = struct
//!
//! Rarely, it can make sense to instantiate Node with `Clang` being a struct
//! type, instead of an enum. This allows for deserializing a uniform group of
//! data out of *every* node in the syntax tree.
//!
//! The following example struct collects the `"loc"` and `"range"` of every
//! node if present; these fields provide the file name / line / column position
//! of nodes. Not every node kind contains this information, so we use `Option`
//! to collect it for just the nodes that have it.
//!
//! ```
//! use serde::Deserialize;
//! # use serde_derive::Deserialize;
//!
//! pub type Node = clang_ast::Node<Clang>;
//!
//! #[derive(Deserialize)]
//! pub struct Clang {
//!     pub kind: String,  // or clang_ast::Kind
//!     pub loc: Option<clang_ast::SourceLocation>,
//!     pub range: Option<clang_ast::SourceRange>,
//! }
//! ```
//!
//! If you really need, it's also possible to store *every other piece of
//! key/value information about every node* via a weakly typed `Map<String,
//! Value>` and the Serde `flatten` attribute.
//!
//! ```
//! use serde::Deserialize;
//! # use serde_derive::Deserialize;
//! use serde_json::{Map, Value};
//!
//! #[derive(Deserialize)]
//! pub struct Clang {
//!     pub kind: String,  // or clang_ast::Kind
//!     #[serde(flatten)]
//!     pub data: Map<String, Value>,
//! }
//! ```
//!
//! <br><br>
//!
//! # Hybrid approach
//!
//! To deserialize kind-specific information about a fixed set of node kinds you
//! care about, as well as some uniform information about every other kind of
//! node, you can use a hybrid of the two approaches by giving your `Other` /
//! `Unknown` fallback variant some fields.
//!
//! ```
//! use serde::Deserialize;
//! # use serde_derive::Deserialize;
//!
//! pub type Node = clang_ast::Node<Clang>;
//!
//! #[derive(Deserialize)]
//! pub enum Clang {
//!     NamespaceDecl(NamespaceDecl),
//!     EnumDecl(EnumDecl),
//!     Other {
//!         kind: clang_ast::Kind,
//!     },
//! }
//! #
//! # #[derive(Deserialize)]
//! # struct NamespaceDecl;
//! #
//! # #[derive(Deserialize)]
//! # struct EnumDecl;
//! ```
//!
//! <br><br>
//!
//! # Source locations
//!
//! Many node kinds expose the source location of the corresponding source code
//! tokens, which includes:
//!
//! - the filepath at which they're located;
//! - the chain of `#include`s by which that file was brought into the
//!   translation unit;
//! - line/column positions within the source file;
//! - macro expansion trace for tokens constructed by expansion of a C
//!   preprocessor macro.
//!
//! You'll find this information in fields called `"loc"` and/or `"range"` in
//! the JSON representation.
//!
//! ```
//! # stringify! {
//! {
//!   "id": "0x1251428",
//!   "kind": "NamespaceDecl",
//!   "loc": {                           //<--
//!     "offset": 7004,
//!     "file": "/usr/include/x86_64-linux-gnu/c++/10/bits/c++config.h",
//!     "line": 258,
//!     "col": 11,
//!     "tokLen": 3,
//!     "includedFrom": {
//!       "file": "/usr/include/c++/10/utility"
//!     }
//!   },
//!   "range": {                         //<--
//!     "begin": {
//!       "offset": 6994,
//!       "col": 1,
//!       "tokLen": 9
//!     },
//!     "end": {
//!       "offset": 7155,
//!       "line": 266,
//!       "col": 1,
//!       "tokLen": 1
//!     }
//!   },
//!   ...
//! }
//! # };
//! ```
//!
//! The naive deserialization of these structures is challenging to work with
//! because Clang uses field omission to mean "same as previous". So if a
//! `"loc"` is printed without a `"file"` inside, it means the loc is in the
//! same file as the immediately previous loc in serialization order.
//!
//! The clang-ast crate provides types for deserializing this source location
//! information painlessly, producing `Arc<str>` as the type of filepaths which
//! may be shared across multiple source locations.
//!
//! ```
//! use serde::Deserialize;
//! # use serde_derive::Deserialize;
//!
//! pub type Node = clang_ast::Node<Clang>;
//!
//! #[derive(Deserialize)]
//! pub enum Clang {
//!     NamespaceDecl(NamespaceDecl),
//!     Other,
//! }
//!
//! #[derive(Deserialize, Debug)]
//! pub struct NamespaceDecl {
//!     pub name: Option<String>,
//!     pub loc: clang_ast::SourceLocation,    //<--
//!     pub range: clang_ast::SourceRange,     //<--
//! }
//! ```
//!
//! <br><br>
//!
//! # Node identifiers
//!
//! Every syntax tree node has an `"id"`. In JSON it's the memory address of
//! Clang's internal memory allocation for that node, serialized to a hex
//! string.
//!
//! The AST dump uses ids as backreferences in nodes of directed acyclic graph
//! nature. For example the following MemberExpr node is part of the invocation
//! of an `operator bool` conversion, and thus its syntax tree refers to the
//! resolved `operator bool` conversion function declaration:
//!
//! ```
//! # stringify! {
//! {
//!   "id": "0x9918b88",
//!   "kind": "MemberExpr",
//!   "valueCategory": "rvalue",
//!   "referencedMemberDecl": "0x12d8330",     //<--
//!   ...
//! }
//! # };
//! ```
//!
//! The node it references, with memory address 0x12d8330, is found somewhere
//! earlier in the syntax tree:
//!
//! ```
//! # stringify! {
//! {
//!   "id": "0x12d8330",                       //<--
//!   "kind": "CXXConversionDecl",
//!   "name": "operator bool",
//!   "mangledName": "_ZNKSt17integral_constantIbLb1EEcvbEv",
//!   "type": {
//!     "qualType": "std::integral_constant<bool, true>::value_type () const noexcept"
//!   },
//!   "constexpr": true,
//!   ...
//! }
//! # };
//! ```
//!
//! Due to the ubiquitous use of ids for backreferencing, it is valuable to
//! deserialize them not as strings but as a 64-bit integer. The clang-ast crate
//! provides an `Id` type for this purpose, which is cheaply copyable, hashable,
//! and comparable more cheaply than a string. You may find yourself with lots
//! of hashtables keyed on `Id`.

#![doc(html_root_url = "https://docs.rs/clang-ast/0.1.27")]
#![allow(
    clippy::blocks_in_conditions,
    clippy::derivable_impls,
    clippy::doc_markdown,
    clippy::let_underscore_untyped,
    clippy::match_like_matches_macro,
    clippy::must_use_candidate,
    clippy::needless_lifetimes,
    clippy::ptr_arg,
    clippy::uninlined_format_args
)]

mod dedup;
mod deserializer;
mod id;
mod intern;
mod kind;
mod loc;
mod serializer;

extern crate serde;

use crate::deserializer::NodeDeserializer;
use crate::kind::AnyKind;
use crate::serializer::NodeSerializer;
use serde::de::{Deserialize, Deserializer, MapAccess, Visitor};
use serde::ser::{Serialize, SerializeMap, Serializer};
use std::fmt;
use std::marker::PhantomData;

pub use crate::id::Id;
pub use crate::kind::Kind;
pub use crate::loc::{BareSourceLocation, IncludedFrom, SourceLocation, SourceRange};

/// <font style="font-variant:small-caps">syntax tree root</font>
///
/// One node of the syntax tree together with all of its child nodes. The type
/// parameter `T` is supplied by the caller and determines what gets
/// deserialized out of each node.
#[derive(Clone, Eq, PartialEq, Hash, Debug)]
pub struct Node<T> {
    /// The node's `"id"` entry. Deserialization falls back to `Id`'s default
    /// value when the entry is absent.
    pub id: Id,
    /// Caller-defined kind data for this node, deserialized as `T`.
    pub kind: T,
    /// Child nodes from the `"inner"` array. Left empty when the node has no
    /// children; an empty vector is not written back out on serialization.
    pub inner: Vec<Node<T>>,
}

/// Serde visitor that produces a `Node<T>` from a map.
struct NodeVisitor<T> {
    // Carries the type parameter only; no value of `T` is stored here.
    marker: PhantomData<fn() -> T>,
}

impl<'de, T> Visitor<'de> for NodeVisitor<T>
where
    T: Deserialize<'de>,
{
    type Value = Node<T>;

    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("clang syntax tree node")
    }

    /// Builds a `Node<T>` out of the entries of a map.
    ///
    /// Keys are consumed one at a time until `"kind"` is reached (or the map
    /// runs out):
    ///
    /// - `"id"` is recorded; a second `"id"` is a `duplicate_field` error.
    /// - `"inner"` ahead of `"kind"` is reported as a missing `"kind"` field.
    /// - any other key ahead of `"kind"` is an `unknown_field` error.
    ///
    /// Once the kind is known, the rest of the map is handed to
    /// `NodeDeserializer`, which drives `T::deserialize`. If the map ends
    /// without a `"kind"` entry, `Kind::null` is used in its place. A node
    /// without an `"id"` gets the default `Id`.
    fn visit_map<M>(self, mut access: M) -> Result<Self::Value, M::Error>
    where
        M: MapAccess<'de>,
    {
        // The only keys accepted before the node's kind has been determined.
        enum FirstField {
            Id,
            Kind,
            Inner,
        }

        struct FirstFieldVisitor;

        impl<'de> Visitor<'de> for FirstFieldVisitor {
            type Value = FirstField;

            fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
                f.write_str("field identifier")
            }

            fn visit_str<E>(self, field: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                static FIELDS: &[&str] = &["id", "kind", "inner"];
                let recognized = match field {
                    "id" => FirstField::Id,
                    "kind" => FirstField::Kind,
                    "inner" => FirstField::Inner,
                    _ => return Err(E::unknown_field(field, FIELDS)),
                };
                Ok(recognized)
            }
        }

        impl<'de> Deserialize<'de> for FirstField {
            fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
            where
                D: Deserializer<'de>,
            {
                deserializer.deserialize_identifier(FirstFieldVisitor)
            }
        }

        let mut id = None;
        let mut inner = Vec::new();

        let kind = loop {
            // Work out which kind to deserialize with; the `"id"` arm loops
            // around for another key instead of producing one.
            let any_kind: AnyKind = match access.next_key()? {
                Some(FirstField::Id) => {
                    if id.is_some() {
                        return Err(serde::de::Error::duplicate_field("id"));
                    }
                    id = Some(access.next_value()?);
                    continue;
                }
                Some(FirstField::Inner) => {
                    return Err(serde::de::Error::missing_field("kind"));
                }
                Some(FirstField::Kind) => access.next_value()?,
                // Map exhausted without ever seeing a "kind" entry.
                None => AnyKind::Kind(Kind::null),
            };

            // The remaining entries of the map, along with the vector that
            // ends up as the node's children, go to the node deserializer.
            let rest = NodeDeserializer::new(&any_kind, &mut inner, access);
            break T::deserialize(rest)?;
        };

        Ok(Node {
            id: id.unwrap_or_default(),
            kind,
            inner,
        })
    }
}

impl<'de, T> Deserialize<'de> for Node<T>
where
    T: Deserialize<'de>,
{
    /// Deserializes a node (and, through `NodeVisitor`, its subtree) from a
    /// map.
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: Deserializer<'de>,
    {
        // Bound to a name so the value returned by `intern::activate` stays
        // alive until this function returns.
        let _guard = intern::activate();
        deserializer.deserialize_map(NodeVisitor {
            marker: PhantomData,
        })
    }
}

impl<T> Serialize for Node<T>
where
    T: Serialize,
{
    /// Serializes the node as a single map: the `"id"` entry first, then
    /// whatever the kind writes through `NodeSerializer`, then an `"inner"`
    /// entry holding the children, which is left out when there are none.
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: Serializer,
    {
        // Bound to a name so the value returned by `dedup::activate` stays
        // alive until this function returns.
        let _guard = dedup::activate();

        let mut entries = serializer.serialize_map(None)?;
        entries.serialize_entry("id", &self.id)?;
        self.kind.serialize(NodeSerializer::new(&mut entries))?;

        if self.inner.is_empty() {
            return entries.end();
        }
        entries.serialize_entry("inner", &self.inner)?;
        entries.end()
    }
}