Skip to content
1 change: 1 addition & 0 deletions internal/cbm/cbm.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ typedef enum {
CBM_LANG_QML, // Qt QML (Qt Modeling Language — declarative UI + embedded JS)
CBM_LANG_CFSCRIPT, // CFML script dialect (.cfc components — Lucee/ColdFusion)
CBM_LANG_CFML, // CFML tag dialect (.cfm templates — Lucee/ColdFusion)
CBM_LANG_ARKTS, // ArkTS (HarmonyOS/OpenHarmony declarative UI language)
CBM_LANG_COUNT
} CBMLanguage;

Expand Down
100 changes: 96 additions & 4 deletions internal/cbm/extract_defs.c
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,30 @@ TSNode cbm_resolve_func_name(TSNode node, CBMLanguage lang) {
}
}

/* ArkTS: function_declaration / decorated_function_declaration
* have no `name` field; the function name is a plain `identifier` child.
* build_method has no name at all — synthesize one from the keyword. */
if (lang == CBM_LANG_ARKTS && (strcmp(kind, "function_declaration") == 0 ||
strcmp(kind, "decorated_function_declaration") == 0)) {
TSNode id = cbm_find_child_by_kind(node, "identifier");
if (!ts_node_is_null(id)) {
return id;
}
}
if (lang == CBM_LANG_ARKTS && strcmp(kind, "build_method") == 0) {
/* build() is always named "build" in ArkTS components. Prefer the
* "build" keyword child as the name node when the grammar exposes one.
* If no such child is found, fall through — extract_func_def synthesizes
* the name "build" for build_method when no name node is resolved. */
TSNode kw = cbm_find_child_by_kind(node, "build");
if (!ts_node_is_null(kw)) {
return kw;
}
/* tree-sitter-arkts may expose "build" only as an anonymous token, in
* which case the lookup above may find nothing. No further lookup is
* attempted here; a def is still emitted because extract_func_def
* special-cases build_method and synthesizes the name. */
}

// PowerShell function_statement has no `name` field; the name is a
// `function_name` child node (#35).
if (lang == CBM_LANG_POWERSHELL && strcmp(kind, "function_statement") == 0) {
Expand Down Expand Up @@ -2607,7 +2631,8 @@ static const char *class_label_for_kind(const char *kind) {
return "Enum";
}
if (strcmp(kind, "type_alias_declaration") == 0 || strcmp(kind, "type_item") == 0 ||
strcmp(kind, "type_alias") == 0 || strcmp(kind, "type_definition") == 0) {
strcmp(kind, "type_alias") == 0 || strcmp(kind, "type_definition") == 0 ||
strcmp(kind, "type_declaration") == 0) {
return "Type";
}
return "Class";
Expand Down Expand Up @@ -3049,13 +3074,21 @@ static char *go_receiver_type_name(CBMArena *a, TSNode recv, const char *source)

static void extract_func_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec *spec) {
CBMArena *a = ctx->arena;
const char *kind = ts_node_type(node);

TSNode name_node = cbm_resolve_func_name(node, ctx->language);
if (ts_node_is_null(name_node)) {

/* ArkTS: build_method has no name child — synthesize "build". */
char *synth_name = NULL;
if (ts_node_is_null(name_node) && ctx->language == CBM_LANG_ARKTS &&
strcmp(kind, "build_method") == 0) {
synth_name = cbm_arena_strdup(a, "build");
}
if (ts_node_is_null(name_node) && !synth_name) {
return;
}

char *name = cbm_func_name_node_text(a, name_node, ctx->source);
char *name = synth_name ? synth_name : cbm_func_name_node_text(a, name_node, ctx->source);
if (!name || !name[0] || strcmp(name, "function") == 0) {
return;
}
Expand Down Expand Up @@ -3482,6 +3515,12 @@ static void extract_class_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
(ctx->language == CBM_LANG_SWIFT || ctx->language == CBM_LANG_KOTLIN)) {
name_node = cbm_find_child_by_kind(node, "type_identifier");
}
// ArkTS: class_declaration / component_declaration / interface_declaration /
// enum_declaration / type_declaration have no `name` field; the name is a
// plain `identifier` child.
if (ts_node_is_null(name_node) && ctx->language == CBM_LANG_ARKTS) {
name_node = cbm_find_child_by_kind(node, "identifier");
}
// Protobuf: service_name / message_name / enum_name children
if (ts_node_is_null(name_node) && ctx->language == CBM_LANG_PROTOBUF) {
name_node = cbm_find_child_by_kind(node, "service_name");
Expand Down Expand Up @@ -3744,6 +3783,16 @@ static void extract_class_def(CBMExtractCtx *ctx, TSNode node, const CBMLangSpec
label = "Interface";
}
}
// ArkTS: `@Component struct Foo` parses to `component_declaration` (and
// `@Component export struct Foo` to `decorated_export_declaration`). These
// are UI components, not OOP classes — emit the precise "Component" label.
// Scoped to ArkTS only so templ's `component_declaration` (a Go templating
// function) keeps its established "Class" labeling and the golden snapshot
// for templ is unaffected.
if (ctx->language == CBM_LANG_ARKTS && (strcmp(kind, "component_declaration") == 0 ||
strcmp(kind, "decorated_export_declaration") == 0)) {
label = "Component";
}
// Rust/Swift/D: a struct is a distinct kind from a class — emit the precise
// "Struct" label rather than collapsing it to "Class". Scoped to these three
// grammar/LSP languages. Rust's struct node is `struct_item`; D's is
Expand Down Expand Up @@ -3941,6 +3990,7 @@ static TSNode find_class_body(TSNode class_node, CBMLanguage lang) {
"block",
"closure",
"implementation_definition",
"component_body",
NULL};
uint32_t count = ts_node_child_count(class_node);
for (uint32_t i = 0; i < count; i++) {
Expand Down Expand Up @@ -4046,6 +4096,24 @@ static TSNode resolve_method_name(TSNode child, CBMLanguage lang) {
return cbm_find_child_by_kind(child, "identifier");
}

// ArkTS: method_declaration / function_declaration / decorated_function_declaration
// have no `name` field; the method name is a plain `identifier` child.
// build_method carries no identifier: use its "build" keyword child when
// present; otherwise the caller synthesizes the name "build".
if (lang == CBM_LANG_ARKTS &&
(strcmp(ck, "method_declaration") == 0 || strcmp(ck, "function_declaration") == 0 ||
strcmp(ck, "decorated_function_declaration") == 0)) {
TSNode id = cbm_find_child_by_kind(child, "identifier");
if (!ts_node_is_null(id)) {
return id;
}
}
if (lang == CBM_LANG_ARKTS && strcmp(ck, "build_method") == 0) {
TSNode kw = cbm_find_child_by_kind(child, "build");
if (!ts_node_is_null(kw)) {
return kw;
}
}

if (strcmp(ck, "arrow_function") == 0) {
return resolve_arrow_func_name(child);
}
Expand Down Expand Up @@ -4204,6 +4272,26 @@ static void extract_class_methods(CBMExtractCtx *ctx, TSNode class_node, const c
}

TSNode name_node = resolve_method_name(method_node, ctx->language);
/* ArkTS: build_method has no name child — synthesize "build" as method name. */
if (ts_node_is_null(name_node) && ctx->language == CBM_LANG_ARKTS &&
strcmp(ts_node_type(method_node), "build_method") == 0) {
/* There is no name node to work with, so build the definition record
* directly here: fixed name "build", qualified name "<class_qn>.build",
* and the method node's line span. Then move on to the next member. */
char *build_name = cbm_arena_strdup(ctx->arena, "build");
const char *build_qn = cbm_arena_sprintf(ctx->arena, "%s.build", class_qn);
CBMDefinition mdef;
memset(&mdef, 0, sizeof(mdef));
mdef.name = build_name;
mdef.qualified_name = build_qn;
mdef.label = "Method";
mdef.file_path = ctx->rel_path;
mdef.start_line = ts_node_start_point(method_node).row + TS_LINE_OFFSET;
mdef.end_line = ts_node_end_point(method_node).row + TS_LINE_OFFSET;
mdef.lines = (int)(mdef.end_line - mdef.start_line + TS_LINE_OFFSET);
mdef.parent_class = class_qn;
cbm_defs_push(&ctx->result->defs, ctx->arena, mdef);
continue;
}
if (ts_node_is_null(name_node)) {
continue;
}
Expand Down Expand Up @@ -5727,6 +5815,9 @@ static const char *compute_class_qn(CBMExtractCtx *ctx, TSNode node, const char
if (ts_node_is_null(name_node) && ctx->language == CBM_LANG_SWIFT) {
name_node = cbm_find_child_by_kind(node, "type_identifier");
}
if (ts_node_is_null(name_node) && ctx->language == CBM_LANG_ARKTS) {
name_node = cbm_find_child_by_kind(node, "identifier");
}
if (!ts_node_is_null(name_node)) {
char *cname = cbm_node_text(ctx->arena, name_node, ctx->source);
if (cname && cname[0]) {
Expand All @@ -5752,6 +5843,7 @@ static void push_class_body_children(TSNode node, const CBMLangSpec *spec, wd_st
if (strcmp(ck, "field_declaration_list") == 0 || strcmp(ck, "class_body") == 0 ||
strcmp(ck, "declaration_list") == 0 || strcmp(ck, "body") == 0 ||
strcmp(ck, "block") == 0 || strcmp(ck, "suite") == 0 ||
strcmp(ck, "component_body") == 0 ||
// Groovy class bodies are a `closure` node; routing through the
// nested-class path keeps methods from being re-walked (and thus
// double-extracted) as top-level functions. Gated to Groovy so other
Expand Down Expand Up @@ -6253,7 +6345,7 @@ static void walk_defs(CBMExtractCtx *ctx, TSNode root, const CBMLangSpec *spec,
bool descend_into_func =
(ctx->language == CBM_LANG_WOLFRAM || ctx->language == CBM_LANG_TYPESCRIPT ||
ctx->language == CBM_LANG_JAVASCRIPT || ctx->language == CBM_LANG_TSX ||
ctx->language == CBM_LANG_ADA);
ctx->language == CBM_LANG_ADA || ctx->language == CBM_LANG_ARKTS);
if (!descend_into_func) {
continue;
}
Expand Down
3 changes: 3 additions & 0 deletions internal/cbm/grammar_arkts.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// Vendored tree-sitter grammar: arkts
// Each grammar compiled as separate unit (conflicting static symbols).
#include "vendored/grammars/arkts/parser.c"
46 changes: 46 additions & 0 deletions internal/cbm/lang_specs.c
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,7 @@ extern const TSLanguage *tree_sitter_apex(void);
extern const TSLanguage *tree_sitter_soql(void);
extern const TSLanguage *tree_sitter_sosl(void);
extern const TSLanguage *tree_sitter_pine(void);
extern const TSLanguage *tree_sitter_arkts(void);

// -- Empty sentinel --
static const char *empty_types[] = {NULL};
Expand Down Expand Up @@ -1596,6 +1597,43 @@ static const char *pine_var_types[] = {"variable_definition_statement",
static const char *pine_branch_types[] = {"if_statement", "switch_statement", "for_statement",
"for_in_statement", "while_statement", NULL};
static const char *pine_assign_types[] = {"reassignment_statement", NULL};
// ==================== ARKTS (HarmonyOS/OpenHarmony) ====================
// ArkTS is a TypeScript superset with declarative UI extensions, so these
// tables reuse TS/JS node kinds where applicable and add the ArkTS-specific
// UI nodes on top.
// Grammar: tree-sitter-arkts (https://github.com/Million-mo/tree-sitter-arkts)
// Every table below is a NULL-terminated list of tree-sitter node kind names.

// Function-like nodes, including the component build() method and the
// decorated / UI-builder variants.
static const char *arkts_func_types[] = {
    "function_declaration",
    "function_expression",
    "arrow_function",
    "method_declaration",
    "constructor_declaration",
    "build_method",
    "decorated_function_declaration",
    "ui_builder_arrow_function",
    NULL,
};

// Class-like nodes: classes, enums, interfaces, type declarations and the
// component forms.
static const char *arkts_class_types[] = {
    "class_declaration",
    "enum_declaration",
    "interface_declaration",
    "type_declaration",
    "component_declaration",
    "decorated_export_declaration",
    NULL,
};

// Field / property nodes.
static const char *arkts_field_types[] = {
    "property_declaration",
    "public_field_definition",
    NULL,
};

// Root node of a parsed file.
static const char *arkts_module_types[] = {
    "source_file",
    NULL,
};

// Call and constructor-invocation nodes.
static const char *arkts_call_types[] = {
    "call_expression",
    "new_expression",
    NULL,
};

// Import nodes.
static const char *arkts_import_types[] = {
    "import_declaration",
    "import",
    NULL,
};

// Branching nodes: the TS/JS control-flow statements plus the ArkTS UI
// iteration and conditional statements.
static const char *arkts_branch_types[] = {
    "if_statement",
    "for_statement",
    "for_in_statement",
    "while_statement",
    "do_statement",
    "switch_statement",
    "switch_case",
    "switch_default",
    "try_statement",
    "catch_clause",
    "for_each_statement",
    "lazy_for_each_statement",
    "ui_if_statement",
    NULL,
};

// Variable declaration nodes.
static const char *arkts_var_types[] = {
    "lexical_declaration",
    "variable_declaration",
    NULL,
};

// Assignment nodes (plain and augmented).
static const char *arkts_assign_types[] = {
    "assignment_expression",
    "augmented_assignment_expression",
    NULL,
};

// Throw nodes.
static const char *arkts_throw_types[] = {
    "throw_statement",
    NULL,
};

// Decorator nodes.
static const char *arkts_decorator_types[] = {
    "decorator",
    NULL,
};

// ==================== SPEC TABLE ====================

static const CBMLangSpec lang_specs[CBM_LANG_COUNT] = {
Expand Down Expand Up @@ -2571,6 +2609,14 @@ static const CBMLangSpec lang_specs[CBM_LANG_COUNT] = {
pine_branch_types, pine_var_types, pine_assign_types, empty_types, NULL,
empty_types, NULL, NULL, tree_sitter_pine, NULL},

// CBM_LANG_ARKTS — ArkTS (HarmonyOS/OpenHarmony declarative UI language).
// TypeScript superset with UI extensions. Uses tree-sitter-arkts grammar.
[CBM_LANG_ARKTS] = {CBM_LANG_ARKTS, arkts_func_types, arkts_class_types, arkts_field_types,
arkts_module_types, arkts_call_types, arkts_import_types,
arkts_import_types, arkts_branch_types, arkts_var_types, arkts_assign_types,
arkts_throw_types, NULL, arkts_decorator_types, NULL, NULL,
tree_sitter_arkts, NULL},

};

_Static_assert(sizeof(lang_specs) / sizeof(lang_specs[0]) == CBM_LANG_COUNT,
Expand Down
5 changes: 4 additions & 1 deletion internal/cbm/vendored/grammars/MANIFEST.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,13 @@ The grammars were originally vendored as bare `parser.c`+`scanner.c` with **no r

## Summary

- Grammars: **159** — vendored-from-upstream: **142**, first-party/self-maintained: **12**, registry-disagreement: **5** (nim removed 2026-06-12; objectscript_udl + objectscript_routine added 2026-06-24; mojo added 2026-07-01 — see notes below)
- Grammars: **160** — vendored-from-upstream: **143**, first-party/self-maintained: **12**, registry-disagreement: **5** (nim removed 2026-06-12; objectscript_udl + objectscript_routine added 2026-06-24; mojo added 2026-07-01; arkts added 2026-07-03 — see notes below)
- ABI distribution: **7×** ABI-13 **85×** ABI-14 **64×** ABI-15 (runtime ceiling is ABI 15; never vendor ABI 16 without a runtime upgrade)
- Vendored copies missing LICENSE: **0** — all upstream LICENSE files restored 2026-06-11 (first-party grammars carry the project MIT license; `move` uses the Helix-listed upstream tzakian/tree-sitter-move MIT text, `zsh` uses georgeharker/tree-sitter-zsh MIT)
- `verdict`: VERIFIED-BOTH = our source matches *both* registries; VERIFIED-NVIM/HELIX = matches one; registry-disagreement = registries name a different repo (listed separately); `vendor-maintained` = the language vendor's own grammar, not in nvim/Helix.
- **objectscript_udl / objectscript_routine** (added 2026-06-24): vendored from [intersystems/tree-sitter-objectscript](https://github.com/intersystems/tree-sitter-objectscript) @ `a7ffcdf` — MIT, the InterSystems-official grammars (a niche vendor language, hence `vendor-maintained`, not in nvim-treesitter/Helix). **Re-vendor note:** each `scanner.c`'s upstream `#include "../../common/scanner.h"` is repointed to a per-directory `objectscript_common.h` (a verbatim copy of upstream `common/scanner.h`), because this repo's shared `vendored/common/scanner.h` belongs to the cfml/fsharp grammars and differs. The generated `parser.c`/`scanner.c` are otherwise byte-for-byte upstream — on re-vendor, re-apply only that single include rename.
- **mojo** (added 2026-07-01): vendored from [lsh/tree-sitter-mojo](https://github.com/lsh/tree-sitter-mojo) @ `33193a99afe6` — MIT, ABI 15. Helix tracks `lsh/tree-sitter-mojo` as its Mojo grammar source, but the Helix-pinned commit (`3d7c53b8038f`) no longer resolves in the upstream repository after a force-push, so this vendor uses current upstream `main` rather than the stale registry SHA. Security review covered only the vendored C surface (`parser.c`, `scanner.c`, `tree_sitter/*.h`) plus upstream license/provenance metadata; no package manager hooks, workflow files, prompt/agent instruction files, or generated lockfiles were vendored.
- **arkts** (added 2026-07-03): vendored from [Million-mo/tree-sitter-arkts](https://github.com/Million-mo/tree-sitter-arkts) @ `2fd0ad75e2d8` — MIT (c) 2024 million, ABI 15. Community grammar for HarmonyOS ArkTS (`.ets` files), not tracked by nvim-treesitter or Helix. Upstream declares MIT in `grammar.js` header (`@license MIT`, `@author million`) and `package.json` (`"license": "MIT"`, `"author": {"name": "million"}`), but ships no standalone LICENSE file — the vendored `LICENSE` is reconstructed from this declaration. The grammar lacks `field('name', ...)` mappings on declaration nodes, so definition extraction uses `cbm_find_child_by_kind` fallbacks and `build_method` name synthesis (see Custom extraction handling table). Security review covered only the vendored C surface (`parser.c`, `tree_sitter/parser.h`) plus upstream license/provenance metadata; the grammar has no external scanner (`EXTERNAL_TOKEN_COUNT 0` in `parser.c`), so no `scanner.c` was vendored; no package manager hooks, workflow files, prompt/agent instruction files, or generated lockfiles were vendored.

> ⚠️ **Pinned commit = the revision nvim-treesitter/Helix vendor** (battle-tested, canonical source), not bleeding-edge HEAD. When re-vendoring, update the pinned commit here.

Expand All @@ -35,6 +36,7 @@ Guarded by the `contract_all_grammars_in_graph` graph-breadth test in
| grammar | custom handling |
|---|---|
| ada | `resolve_func_name`: `subprogram_body`/`subprogram_declaration` → `procedure_specification`/`function_specification` child's `name` field |
| arkts | `resolve_func_name`: `function_declaration`/`decorated_function_declaration` → `identifier` child (no `name` field in grammar); `resolve_method_name`: the same plus `method_declaration` → `identifier` child; `build_method` → `build` keyword child when present, otherwise a synthesized `"build"` name (in `extract_func_def` and `extract_class_methods`); `extract_class_def`/`compute_class_qn`: `identifier` child fallback; `component_body` added to class body traversal; `Component` label for `component_declaration`/`decorated_export_declaration` |
| cairo | `resolve_func_name`: `function_definition`/`function_signature` → `identifier` child |
| clojure | `extract_lisp_def`: `(defn …)` / `(def …)` head-symbol forms in `list_lit` |
| d | `resolve_func_name`: `function_declaration` → `identifier` child |
Expand Down Expand Up @@ -70,6 +72,7 @@ Re-vendoring from upstream must re-apply these.
| ada | 14 | briot/tree-sitter-ada | `6b58259a08b1` | VERIFIED-BOTH | ✅ |
| agda | 14 | tree-sitter/tree-sitter-agda | `e8d47a6987ef` | VERIFIED-BOTH | ✅ |
| apex | 14 | aheber/tree-sitter-sfapex | `3597575a4297` | VERIFIED-NVIM | ✅ |
| arkts | 15 | Million-mo/tree-sitter-arkts | `2fd0ad75e2d8` | COMMUNITY | ✅ |
| astro | 14 | virchau13/tree-sitter-astro | `213f6e6973d9` | VERIFIED-BOTH | ✅ |
| awk | 14 | Beaglefoot/tree-sitter-awk | `34bbdc7cce8e` | VERIFIED-BOTH | ✅ |
| bash | 15 | tree-sitter/tree-sitter-bash | `a06c2e4415e9` | VERIFIED-BOTH | ✅ |
Expand Down
Loading
Loading