jsonc_parser/
parse_to_ast.rs

1use std::borrow::Cow;
2use std::collections::HashMap;
3use std::rc::Rc;
4
5use super::ast::*;
6use super::common::Range;
7use super::errors::*;
8use super::scanner::Scanner;
9use super::tokens::Token;
10use super::tokens::TokenAndRange;
11
/// Map of collected comments keyed by byte position.
///
/// Each run of comments found between two tokens is stored under two keys:
/// the end of the previous token (or the start of the file) and the start of
/// the next token (or the end of the file). Both keys share the same
/// reference-counted collection.
pub type CommentMap<'a> = HashMap<usize, Rc<Vec<Comment<'a>>>>;
16
/// How comments found while parsing should be collected.
#[derive(Default, Debug, PartialEq, Clone)]
pub enum CommentCollectionStrategy {
  /// Do not collect comments (the default).
  #[default]
  Off,
  /// Collect comments into the separate `comments` map of the parse result.
  Separate,
  /// Collect comments into the token collection alongside the other tokens.
  ///
  /// Tokens must also be collected (`tokens: true`) for the comments to be stored.
  AsTokens,
}
24
/// Options for collecting comments and tokens.
///
/// The default collects neither comments nor tokens.
#[derive(Default, Clone)]
pub struct CollectOptions {
  /// How (and whether) to include comments in the result.
  pub comments: CommentCollectionStrategy,
  /// Include tokens in the result.
  pub tokens: bool,
}
33
/// Switches controlling how lenient the parser is.
///
/// Every switch is enabled by [`Default::default`].
#[derive(Clone)]
pub struct ParseOptions {
  /// Whether comments may appear in the text (defaults to `true`).
  pub allow_comments: bool,
  /// Whether words and numbers may be used as object property names (defaults to `true`).
  pub allow_loose_object_property_names: bool,
  /// Whether a trailing comma is accepted in object and array literals (defaults to `true`).
  pub allow_trailing_commas: bool,
}

impl Default for ParseOptions {
  /// Creates the most permissive configuration: every leniency is switched on.
  fn default() -> Self {
    let permitted = true;
    ParseOptions {
      allow_trailing_commas: permitted,
      allow_loose_object_property_names: permitted,
      allow_comments: permitted,
    }
  }
}
54
/// Result of parsing the text.
pub struct ParseResult<'a> {
  /// Collection of comments in the text.
  ///
  /// Provide `comments: CommentCollectionStrategy::Separate` to the
  /// `CollectOptions` for this to have a value.
  ///
  /// Remarks: The key is the start and end position of the tokens.
  pub comments: Option<CommentMap<'a>>,
  /// The JSON value the text contained, or `None` when the text held no value.
  pub value: Option<Value<'a>>,
  /// Collection of tokens.
  ///
  /// Provide `tokens: true` to the `CollectOptions` for this to have a value.
  /// Comments are only included when `CommentCollectionStrategy::AsTokens`
  /// is also provided.
  pub tokens: Option<Vec<TokenAndRange<'a>>>,
}
70
/// Mutable state shared by the parsing functions in this file.
struct Context<'a> {
  /// Scanner producing the tokens of the text being parsed.
  scanner: Scanner<'a>,
  /// Collected comments; `Some` only when comments are collected separately.
  comments: Option<CommentMap<'a>>,
  /// Comments seen since the last `scan` stored its comments in `comments`.
  current_comments: Option<Vec<Comment<'a>>>,
  /// End position recorded by the most recent `scan` (starts at `0`).
  last_token_end: usize,
  /// Ranges of the nodes currently being parsed; pushed by `start_range`
  /// and popped by `end_range`.
  range_stack: Vec<Range>,
  /// Captured tokens; `Some` only when token collection was requested.
  tokens: Option<Vec<TokenAndRange<'a>>>,
  /// Whether comment tokens are captured into `tokens`.
  collect_comments_as_tokens: bool,
  /// Mirrors `ParseOptions::allow_comments`.
  allow_comments: bool,
  /// Mirrors `ParseOptions::allow_trailing_commas`.
  allow_trailing_commas: bool,
  /// Mirrors `ParseOptions::allow_loose_object_property_names`.
  allow_loose_object_property_names: bool,
}
83
84impl<'a> Context<'a> {
85  pub fn scan(&mut self) -> Result<Option<Token<'a>>, ParseError> {
86    let previous_end = self.last_token_end;
87    let token = self.scan_handling_comments()?;
88    self.last_token_end = self.scanner.token_end();
89
90    // store the comment for the previous token end, and current token start
91    if let Some(comments) = self.comments.as_mut() {
92      if let Some(current_comments) = self.current_comments.take() {
93        let current_comments = Rc::new(current_comments);
94        comments.insert(previous_end, current_comments.clone());
95        comments.insert(self.scanner.token_start(), current_comments);
96      }
97    }
98
99    if let Some(token) = &token {
100      if self.tokens.is_some() {
101        self.capture_token(token.clone());
102      }
103    }
104
105    Ok(token)
106  }
107
108  pub fn token(&self) -> Option<Token<'a>> {
109    self.scanner.token()
110  }
111
112  pub fn start_range(&mut self) {
113    self.range_stack.push(Range {
114      start: self.scanner.token_start(),
115      end: 0,
116    });
117  }
118
119  pub fn end_range(&mut self) -> Range {
120    let mut range = self
121      .range_stack
122      .pop()
123      .expect("Range was popped from the stack, but the stack was empty.");
124    range.end = self.scanner.token_end();
125    range
126  }
127
128  pub fn create_range_from_last_token(&self) -> Range {
129    Range {
130      start: self.scanner.token_start(),
131      end: self.scanner.token_end(),
132    }
133  }
134
135  pub fn create_error(&self, kind: ParseErrorKind) -> ParseError {
136    self.scanner.create_error_for_current_token(kind)
137  }
138
139  pub fn create_error_for_current_range(&mut self, kind: ParseErrorKind) -> ParseError {
140    let range = self.end_range();
141    self.create_error_for_range(range, kind)
142  }
143
144  pub fn create_error_for_range(&self, range: Range, kind: ParseErrorKind) -> ParseError {
145    self.scanner.create_error_for_range(range, kind)
146  }
147
148  fn scan_handling_comments(&mut self) -> Result<Option<Token<'a>>, ParseError> {
149    loop {
150      let token = self.scanner.scan()?;
151      match token {
152        Some(token @ Token::CommentLine(_) | token @ Token::CommentBlock(_)) if self.collect_comments_as_tokens => {
153          self.capture_token(token);
154        }
155        Some(Token::CommentLine(text)) => {
156          self.handle_comment(Comment::Line(CommentLine {
157            range: self.create_range_from_last_token(),
158            text,
159          }))?;
160        }
161        Some(Token::CommentBlock(text)) => {
162          self.handle_comment(Comment::Block(CommentBlock {
163            range: self.create_range_from_last_token(),
164            text,
165          }))?;
166        }
167        _ => return Ok(token),
168      }
169    }
170  }
171
172  fn capture_token(&mut self, token: Token<'a>) {
173    let range = self.create_range_from_last_token();
174    if let Some(tokens) = self.tokens.as_mut() {
175      tokens.push(TokenAndRange {
176        token: token.clone(),
177        range,
178      });
179    }
180  }
181
182  fn handle_comment(&mut self, comment: Comment<'a>) -> Result<(), ParseError> {
183    if !self.allow_comments {
184      return Err(self.create_error(ParseErrorKind::CommentsNotAllowed));
185    }
186
187    if self.comments.is_some() {
188      if let Some(comments) = self.current_comments.as_mut() {
189        comments.push(comment);
190      } else {
191        self.current_comments = Some(vec![comment]);
192      }
193    }
194
195    Ok(())
196  }
197}
198
199/// Parses a string containing JSONC to an AST with comments and tokens.
200///
201/// # Example
202///
203/// ```
204/// use jsonc_parser::CollectOptions;
205/// use jsonc_parser::CommentCollectionStrategy;
206/// use jsonc_parser::parse_to_ast;
207/// use jsonc_parser::ParseOptions;
208///
209/// let parse_result = parse_to_ast(r#"{ "test": 5 } // test"#, &CollectOptions {
210///     comments: CommentCollectionStrategy::Separate, // include comments in result
211///     tokens: true, // include tokens in result
212/// }, &Default::default()).expect("Should parse.");
213/// // ...inspect parse_result for value, tokens, and comments here...
214/// ```
215pub fn parse_to_ast<'a>(
216  text: &'a str,
217  collect_options: &CollectOptions,
218  parse_options: &ParseOptions,
219) -> Result<ParseResult<'a>, ParseError> {
220  let mut context = Context {
221    scanner: Scanner::new(text),
222    comments: match collect_options.comments {
223      CommentCollectionStrategy::Separate => Some(Default::default()),
224      CommentCollectionStrategy::Off | CommentCollectionStrategy::AsTokens => None,
225    },
226    current_comments: None,
227    last_token_end: 0,
228    range_stack: Vec::new(),
229    tokens: if collect_options.tokens { Some(Vec::new()) } else { None },
230    collect_comments_as_tokens: collect_options.comments == CommentCollectionStrategy::AsTokens,
231    allow_comments: parse_options.allow_comments,
232    allow_trailing_commas: parse_options.allow_trailing_commas,
233    allow_loose_object_property_names: parse_options.allow_loose_object_property_names,
234  };
235  context.scan()?;
236  let value = parse_value(&mut context)?;
237
238  if context.scan()?.is_some() {
239    return Err(context.create_error(ParseErrorKind::MultipleRootJsonValues));
240  }
241
242  debug_assert!(context.range_stack.is_empty());
243
244  Ok(ParseResult {
245    comments: context.comments,
246    tokens: context.tokens,
247    value,
248  })
249}
250
251fn parse_value<'a>(context: &mut Context<'a>) -> Result<Option<Value<'a>>, ParseError> {
252  match context.token() {
253    None => Ok(None),
254    Some(token) => match token {
255      Token::OpenBrace => Ok(Some(Value::Object(parse_object(context)?))),
256      Token::OpenBracket => Ok(Some(Value::Array(parse_array(context)?))),
257      Token::String(value) => Ok(Some(Value::StringLit(create_string_lit(context, value)))),
258      Token::Boolean(value) => Ok(Some(Value::BooleanLit(create_boolean_lit(context, value)))),
259      Token::Number(value) => Ok(Some(Value::NumberLit(create_number_lit(context, value)))),
260      Token::Null => return Ok(Some(Value::NullKeyword(create_null_keyword(context)))),
261      Token::CloseBracket => Err(context.create_error(ParseErrorKind::UnexpectedCloseBracket)),
262      Token::CloseBrace => Err(context.create_error(ParseErrorKind::UnexpectedCloseBrace)),
263      Token::Comma => Err(context.create_error(ParseErrorKind::UnexpectedComma)),
264      Token::Colon => Err(context.create_error(ParseErrorKind::UnexpectedColon)),
265      Token::Word(_) => Err(context.create_error(ParseErrorKind::UnexpectedWord)),
266      Token::CommentLine(_) => unreachable!(),
267      Token::CommentBlock(_) => unreachable!(),
268    },
269  }
270}
271
272fn parse_object<'a>(context: &mut Context<'a>) -> Result<Object<'a>, ParseError> {
273  debug_assert!(context.token() == Some(Token::OpenBrace));
274  let mut properties = Vec::new();
275
276  context.start_range();
277  context.scan()?;
278
279  loop {
280    match context.token() {
281      Some(Token::CloseBrace) => break,
282      Some(Token::String(prop_name)) => {
283        properties.push(parse_object_property(context, PropName::String(prop_name))?);
284      }
285      Some(Token::Word(prop_name)) | Some(Token::Number(prop_name)) => {
286        properties.push(parse_object_property(context, PropName::Word(prop_name))?);
287      }
288      None => return Err(context.create_error_for_current_range(ParseErrorKind::UnterminatedObject)),
289      _ => return Err(context.create_error(ParseErrorKind::UnexpectedTokenInObject)),
290    }
291
292    // skip the comma
293    if let Some(Token::Comma) = context.scan()? {
294      let comma_range = context.create_range_from_last_token();
295      if let Some(Token::CloseBrace) = context.scan()? {
296        if !context.allow_trailing_commas {
297          return Err(context.create_error_for_range(comma_range, ParseErrorKind::TrailingCommasNotAllowed));
298        }
299      }
300    }
301  }
302
303  Ok(Object {
304    range: context.end_range(),
305    properties,
306  })
307}
308
/// Property name token handed from `parse_object` to `parse_object_property`.
enum PropName<'a> {
  /// Name taken from a string token.
  String(Cow<'a, str>),
  /// Name taken from a word or number token.
  Word(&'a str),
}
313
314fn parse_object_property<'a>(context: &mut Context<'a>, prop_name: PropName<'a>) -> Result<ObjectProp<'a>, ParseError> {
315  context.start_range();
316
317  let name = match prop_name {
318    PropName::String(prop_name) => ObjectPropName::String(create_string_lit(context, prop_name)),
319    PropName::Word(prop_name) => {
320      if context.allow_loose_object_property_names {
321        ObjectPropName::Word(create_word(context, prop_name))
322      } else {
323        return Err(context.create_error(ParseErrorKind::ExpectedStringObjectProperty));
324      }
325    }
326  };
327
328  match context.scan()? {
329    Some(Token::Colon) => {}
330    _ => return Err(context.create_error(ParseErrorKind::ExpectedColonAfterObjectKey)),
331  }
332
333  context.scan()?;
334  let value = parse_value(context)?;
335
336  match value {
337    Some(value) => Ok(ObjectProp {
338      range: context.end_range(),
339      name,
340      value,
341    }),
342    None => Err(context.create_error(ParseErrorKind::ExpectedObjectValue)),
343  }
344}
345
346fn parse_array<'a>(context: &mut Context<'a>) -> Result<Array<'a>, ParseError> {
347  debug_assert!(context.token() == Some(Token::OpenBracket));
348  let mut elements = Vec::new();
349
350  context.start_range();
351  context.scan()?;
352
353  loop {
354    match context.token() {
355      Some(Token::CloseBracket) => break,
356      None => return Err(context.create_error_for_current_range(ParseErrorKind::UnterminatedArray)),
357      _ => match parse_value(context)? {
358        Some(value) => elements.push(value),
359        None => return Err(context.create_error_for_current_range(ParseErrorKind::UnterminatedArray)),
360      },
361    }
362
363    // skip the comma
364    if let Some(Token::Comma) = context.scan()? {
365      let comma_range = context.create_range_from_last_token();
366      if let Some(Token::CloseBracket) = context.scan()? {
367        if !context.allow_trailing_commas {
368          return Err(context.create_error_for_range(comma_range, ParseErrorKind::TrailingCommasNotAllowed));
369        }
370      }
371    }
372  }
373
374  Ok(Array {
375    range: context.end_range(),
376    elements,
377  })
378}
379
380// factory functions
381
382fn create_string_lit<'a>(context: &Context<'a>, value: Cow<'a, str>) -> StringLit<'a> {
383  StringLit {
384    range: context.create_range_from_last_token(),
385    value,
386  }
387}
388
389fn create_word<'a>(context: &Context<'a>, value: &'a str) -> WordLit<'a> {
390  WordLit {
391    range: context.create_range_from_last_token(),
392    value,
393  }
394}
395
396fn create_boolean_lit(context: &Context, value: bool) -> BooleanLit {
397  BooleanLit {
398    range: context.create_range_from_last_token(),
399    value,
400  }
401}
402
403fn create_number_lit<'a>(context: &Context<'a>, value: &'a str) -> NumberLit<'a> {
404  NumberLit {
405    range: context.create_range_from_last_token(),
406    value,
407  }
408}
409
410fn create_null_keyword(context: &Context) -> NullKeyword {
411  NullKeyword {
412    range: context.create_range_from_last_token(),
413  }
414}
415
#[cfg(test)]
mod tests {
  use super::*;
  use pretty_assertions::assert_eq;

  #[test]
  fn it_should_error_when_has_multiple_values() {
    assert_has_error(
      "[][]",
      "Text cannot contain more than one JSON value on line 1 column 3",
    );
  }

  #[test]
  fn it_should_error_when_object_is_not_terminated() {
    assert_has_error("{", "Unterminated object on line 1 column 1");
  }

  #[test]
  fn it_should_error_when_object_has_unexpected_token() {
    assert_has_error("{ [] }", "Unexpected token in object on line 1 column 3");
  }

  #[test]
  fn it_should_error_when_object_has_two_non_string_tokens() {
    assert_has_error(
      "{ asdf asdf: 5 }",
      "Expected colon after the string or word in object property on line 1 column 8",
    );
  }

  #[test]
  fn it_should_error_when_array_is_not_terminated() {
    assert_has_error("[", "Unterminated array on line 1 column 1");
  }

  #[test]
  fn it_should_error_when_array_has_unexpected_token() {
    assert_has_error("[:]", "Unexpected colon on line 1 column 2");
  }

  #[test]
  fn it_should_error_when_comment_block_not_closed() {
    assert_has_error("/* test", "Unterminated comment block on line 1 column 1");
  }

  #[test]
  fn it_should_error_when_string_lit_not_closed() {
    assert_has_error("\" test", "Unterminated string literal on line 1 column 1");
  }

  /// Asserts that parsing `text` with the default (lenient) options fails
  /// with exactly `message`.
  // `track_caller` makes a failure point at the calling test rather than at
  // this helper, matching `assert_has_strict_error`.
  #[track_caller]
  fn assert_has_error(text: &str, message: &str) {
    let result = parse_to_ast(text, &Default::default(), &Default::default());
    match result {
      Ok(_) => panic!("Expected error, but did not find one."),
      Err(err) => assert_eq!(err.to_string(), message),
    }
  }

  #[test]
  fn strict_should_error_object_trailing_comma() {
    assert_has_strict_error(
      r#"{ "test": 5, }"#,
      "Trailing commas are not allowed on line 1 column 12",
    );
  }

  #[test]
  fn strict_should_error_array_trailing_comma() {
    assert_has_strict_error(r#"[ "test", ]"#, "Trailing commas are not allowed on line 1 column 9");
  }

  #[test]
  fn strict_should_error_comment_line() {
    assert_has_strict_error(r#"[ "test" ] // 1"#, "Comments are not allowed on line 1 column 12");
  }

  #[test]
  fn strict_should_error_comment_block() {
    assert_has_strict_error(r#"[ "test" /* 1 */]"#, "Comments are not allowed on line 1 column 10");
  }

  #[test]
  fn strict_should_error_word_property() {
    assert_has_strict_error(
      r#"{ word: 5 }"#,
      "Expected string for object property on line 1 column 3",
    );
  }

  /// Asserts that parsing `text` with every leniency disabled fails with
  /// exactly `message`.
  #[track_caller]
  fn assert_has_strict_error(text: &str, message: &str) {
    let result = parse_to_ast(
      text,
      &Default::default(),
      &ParseOptions {
        allow_comments: false,
        allow_loose_object_property_names: false,
        allow_trailing_commas: false,
      },
    );
    match result {
      Ok(_) => panic!("Expected error, but did not find one."),
      Err(err) => assert_eq!(err.to_string(), message),
    }
  }

  #[test]
  fn it_should_not_include_tokens_by_default() {
    let result = parse_to_ast("{}", &Default::default(), &Default::default()).unwrap();
    assert!(result.tokens.is_none());
  }

  #[test]
  fn it_should_include_tokens_when_specified() {
    let result = parse_to_ast(
      "{}",
      &CollectOptions {
        tokens: true,
        ..Default::default()
      },
      &Default::default(),
    )
    .unwrap();
    let tokens = result.tokens.unwrap();
    assert_eq!(tokens.len(), 2);
  }

  #[test]
  fn it_should_not_include_comments_by_default() {
    let result = parse_to_ast("{}", &Default::default(), &Default::default()).unwrap();
    assert!(result.comments.is_none());
  }

  #[test]
  fn it_should_include_comments_when_specified() {
    let result = parse_to_ast(
      "{} // 2",
      &CollectOptions {
        comments: CommentCollectionStrategy::Separate,
        ..Default::default()
      },
      &Default::default(),
    )
    .unwrap();
    let comments = result.comments.unwrap();
    assert_eq!(comments.len(), 2); // for both positions, but it's the same comment
  }

  // The expected column differs depending on the `error_unicode_width` feature.
  #[cfg(not(feature = "error_unicode_width"))]
  #[test]
  fn error_correct_line_column_unicode_width() {
    assert_has_strict_error(r#"["🧑‍🦰", ["#, "Unterminated array on line 1 column 9");
  }

  #[cfg(feature = "error_unicode_width")]
  #[test]
  fn error_correct_line_column_unicode_width() {
    assert_has_strict_error(r#"["🧑‍🦰", ["#, "Unterminated array on line 1 column 10");
  }
}