// tree-sitter-regex 0.21.0

/**
 * @file Regex grammar for tree-sitter
 * @author Max Brunsfeld <maxbrunsfeld@gmail.com>
 * @author Amaan Qureshi <amaanq12@gmail.com>
 * @license MIT
 */

/// <reference types="tree-sitter-cli/dsl" />
// @ts-check

/**
 * Creates a rule builder for a quantifier that may carry a trailing `?`.
 *
 * The produced rule is a sequence of whatever `prefix` builds, followed by
 * an optional `?` token that is aliased to the `lazy` node.
 *
 * @param {RuleBuilder<string>} prefix - Builder for the quantifier's leading syntax
 *
 * @return {RuleBuilder<string>}
 *
 */
function quantifierRule(prefix) {
  return $ => {
    const quantifier = prefix($);
    const lazyMarker = optional(alias('?', $.lazy));
    return seq(quantifier, lazyMarker);
  };
}

// Individual characters that the grammar treats as regex syntax and
// therefore excludes from plain pattern characters.
const SYNTAX_CHARS = Array.from('^$\\.*+?()[]|');

// Every syntax character prefixed with a backslash, concatenated into one
// string so it can be spliced into a RegExp character class.
const SYNTAX_CHARS_ESCAPED = SYNTAX_CHARS.reduce(
  (escaped, char) => `${escaped}\\${char}`,
  '',
);

module.exports = grammar({
  name: 'regex',

  extras: _ => [/\r?\n/],

  inline: $ => [
    $._character_escape,
    $._class_atom,
  ],

  conflicts: $ => [[$.character_class, $.class_range]],

  rules: {
    pattern: $ => choice(
      $.alternation,
      $.term,
    ),

    alternation: $ => seq(
      optional($.term),
      repeat1(seq('|', optional($.term))),
    ),

    term: $ => repeat1(seq(
      choice(
        $.start_assertion,
        $.end_assertion,
        $.boundary_assertion,
        $.non_boundary_assertion,
        $.lookaround_assertion,
        $.pattern_character,
        $.character_class,
        $.any_character,
        $.decimal_escape,
        $.character_class_escape,
        $._character_escape,
        $.backreference_escape,
        $.anonymous_capturing_group,
        $.named_capturing_group,
        $.non_capturing_group,
      ),
      optional(choice(
        $.zero_or_more,
        $.one_or_more,
        $.optional,
        $.count_quantifier,
      )),
    )),

    any_character: _ => '.',

    start_assertion: _ => '^',
    end_assertion: _ => '$',
    boundary_assertion: _ => '\\b',
    non_boundary_assertion: _ => '\\B',
    lookaround_assertion: $ => choice(
      $._lookahead_assertion,
      $._lookbehind_assertion,
    ),
    _lookahead_assertion: $ => seq(
      '(?',
      choice('=', '!'),
      $.pattern,
      ')',
    ),
    _lookbehind_assertion: $ => seq(
      '(?<',
      choice('=', '!'),
      $.pattern,
      ')',
    ),

    pattern_character: _ => new RegExp(`[^${SYNTAX_CHARS_ESCAPED}\\r?\\n]`),

    character_class: $ => seq(
      '[',
      optional('^'),
      repeat(choice(
        $.class_range,
        $._class_atom,
      )),
      ']',
    ),

    class_range: $ => prec.right(
      seq($._class_atom, '-', $._class_atom),
    ),

    _class_atom: $ => choice(
      alias('-', $.class_character),
      $.class_character,
      alias('\\-', $.identity_escape),
      $.character_class_escape,
      $._character_escape,
    ),

    class_character: _ => // NOT: \ ] or -
      /[^\\\]\-]/,

    anonymous_capturing_group: $ => seq('(', $.pattern, ')'),

    named_capturing_group: $ => seq('(?<', $.group_name, '>', $.pattern, ')'),

    non_capturing_group: $ => seq('(?:', $.pattern, ')'),

    zero_or_more: quantifierRule(_ => '*'),
    one_or_more: quantifierRule(_ => '+'),
    optional: quantifierRule(_ => '?'),
    count_quantifier: quantifierRule($ => seq(
      '{',
      seq($.decimal_digits, optional(seq(',', $.decimal_digits))),
      '}',
    )),

    backreference_escape: $ => seq('\\k', $.group_name),

    decimal_escape: _ => /\\[1-9][0-9]*/,

    character_class_escape: $ => choice(
      /\\[dDsSwW]/,
      seq(/\\[pP]/, '{', $.unicode_property_value_expression, '}'),
    ),

    unicode_property_value_expression: $ => seq(
      optional(seq(alias($.unicode_property, $.unicode_property_name), '=')),
      alias($.unicode_property, $.unicode_property_value),
    ),

    unicode_property: _ => /[a-zA-Z_0-9]+/,

    _character_escape: $ => choice(
      $.control_escape,
      $.control_letter_escape,
      $.identity_escape,
    ),

    // TODO: We should technically not accept \0 unless the
    // lookahead is not also a digit.
    // I think this has little bearing on the highlighting of
    // correct regexes.
    control_escape: _ => /\\[bfnrtv0]/,

    control_letter_escape: _ => /\\c[a-zA-Z]/,

    identity_escape: _ => token(seq('\\', /[^kdDsSpPwWbfnrtv0-9]/)),

    // TODO: This is an approximation of RegExpIdentifierName in the
    // formal grammar, which allows for Unicode names through
    // the following mechanism:
    //
    // RegExpIdentifierName[U]::
    //   RegExpIdentifierStart[?U]
    //   RegExpIdentifierName[?U]RegExpIdentifierPart[?U]
    //
    // RegExpIdentifierStart[U]::
    //   UnicodeIDStart
    //   $
    //   _
    //   \RegExpUnicodeEscapeSequence[?U]
    //
    // RegExpIdentifierPart[U]::
    //   UnicodeIDContinue
    //   $
    //   \RegExpUnicodeEscapeSequence[?U]
    //   <ZWNJ> <ZWJ>
    // RegExpUnicodeEscapeSequence[U]::
    //   [+U]uLeadSurrogate\uTrailSurrogate
    //   [+U]uLeadSurrogate
    //   [+U]uTrailSurrogate
    //   [+U]uNonSurrogate
    //   [~U]uHex4Digits
    //   [+U]u{CodePoint}
    group_name: _ => /[A-Za-z0-9]+/,

    decimal_digits: _ => /\d+/,
  },
});