diff --git a/bun.lock b/bun.lock index 1f47721..88b6540 100644 --- a/bun.lock +++ b/bun.lock @@ -11,12 +11,15 @@ }, "packages/code-chunk": { "name": "code-chunk", - "version": "0.1.12", + "version": "0.1.13", "dependencies": { + "@tree-sitter-grammars/tree-sitter-toml": "^0.7.0", + "@tree-sitter-grammars/tree-sitter-yaml": "^0.7.1", "effect": "^3.19.12", "tree-sitter-go": "^0.25.0", "tree-sitter-java": "^0.23.5", "tree-sitter-javascript": "^0.25.0", + "tree-sitter-json": "^0.24.0", "tree-sitter-python": "^0.25.0", "tree-sitter-rust": "^0.24.0", "tree-sitter-typescript": "^0.23.2", @@ -229,6 +232,10 @@ "@supermemory/eval": ["@supermemory/eval@workspace:packages/eval"], + "@tree-sitter-grammars/tree-sitter-toml": ["@tree-sitter-grammars/tree-sitter-toml@0.7.0", "", { "dependencies": { "node-addon-api": "^8.3.0", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.22.1" }, "optionalPeers": ["tree-sitter"] }, "sha512-873Kl518Qm5ghbWadISY41rwFjee0v+2bU4twwvAw2VAJcWwj2vBo3F3hOCXfbqHaFiqc4qh6eLhkMl2YZJS0g=="], + + "@tree-sitter-grammars/tree-sitter-yaml": ["@tree-sitter-grammars/tree-sitter-yaml@0.7.1", "", { "dependencies": { "node-addon-api": "^8.3.1", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.22.4" }, "optionalPeers": ["tree-sitter"] }, "sha512-AynBwkIoQCTgjDR33bDUp9Mqq+YTco0is3n5hRApMqG9of/6A4eQsfC1/uSEeHSUyMQSYawcAWamsexnVpIP4Q=="], + "@tybys/wasm-util": ["@tybys/wasm-util@0.10.1", "", { "dependencies": { "tslib": "^2.4.0" } }, "sha512-9tTaPJLSiejZKx+Bmog4uSubteqTvFrVrURwkmHixBo0G4seD0zUxp98E1DzUBJxLQ3NPwXrGKDiVjwx/DpPsg=="], "@types/bun": ["@types/bun@1.3.5", "", { "dependencies": { "bun-types": "1.3.5" } }, "sha512-RnygCqNrd3srIPEWBd5LFeUYG7plCoH2Yw9WaZGyNmdTEei+gWaHqydbaIRkIkcbXwhBT94q78QljxN0Sk838w=="], @@ -363,6 +370,8 @@ "tree-sitter-javascript": ["tree-sitter-javascript@0.25.0", "", { "dependencies": { "node-addon-api": "^8.3.1", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.25.0" }, 
"optionalPeers": ["tree-sitter"] }, "sha512-1fCbmzAskZkxcZzN41sFZ2br2iqTYP3tKls1b/HKGNPQUVOpsUxpmGxdN/wMqAk3jYZnYBR1dd/y/0avMeU7dw=="], + "tree-sitter-json": ["tree-sitter-json@0.24.8", "", { "dependencies": { "node-addon-api": "^8.2.2", "node-gyp-build": "^4.8.2" }, "peerDependencies": { "tree-sitter": "^0.21.1" }, "optionalPeers": ["tree-sitter"] }, "sha512-Tc9ZZYwHyWZ3Tt1VEw7Pa2scu1YO7/d2BCBbKTx5hXwig3UfdQjsOPkPyLpDJOn/m1UBEWYAtSdGAwCSyagBqQ=="], + "tree-sitter-python": ["tree-sitter-python@0.25.0", "", { "dependencies": { "node-addon-api": "^8.5.0", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.25.0" }, "optionalPeers": ["tree-sitter"] }, "sha512-eCmJx6zQa35GxaCtQD+wXHOhYqBxEL+bp71W/s3fcDMu06MrtzkVXR437dRrCrbrDbyLuUDJpAgycs7ncngLXw=="], "tree-sitter-rust": ["tree-sitter-rust@0.24.0", "", { "dependencies": { "node-addon-api": "^8.2.2", "node-gyp-build": "^4.8.4" }, "peerDependencies": { "tree-sitter": "^0.22.1" }, "optionalPeers": ["tree-sitter"] }, "sha512-NWemUDf629Tfc90Y0Z55zuwPCAHkLxWnMf2RznYu4iBkkrQl2o/CHGB7Cr52TyN5F1DAx8FmUnDtCy9iUkXZEQ=="], diff --git a/packages/code-chunk/package.json b/packages/code-chunk/package.json index a072fdd..8f197bd 100644 --- a/packages/code-chunk/package.json +++ b/packages/code-chunk/package.json @@ -64,9 +64,12 @@ "tree-sitter-go": "^0.25.0", "tree-sitter-java": "^0.23.5", "tree-sitter-javascript": "^0.25.0", + "tree-sitter-json": "^0.24.0", "tree-sitter-python": "^0.25.0", "tree-sitter-rust": "^0.24.0", "tree-sitter-typescript": "^0.23.2", - "web-tree-sitter": "^0.26.3" + "web-tree-sitter": "^0.26.3", + "@tree-sitter-grammars/tree-sitter-toml": "^0.7.0", + "@tree-sitter-grammars/tree-sitter-yaml": "^0.7.1" } } diff --git a/packages/code-chunk/src/batch.ts b/packages/code-chunk/src/batch.ts index cc0c5e9..6087b81 100644 --- a/packages/code-chunk/src/batch.ts +++ b/packages/code-chunk/src/batch.ts @@ -9,6 +9,7 @@ import { } from 'effect' import { ChunkingError, UnsupportedLanguageError } from 
'./chunk' import { chunk as chunkInternal } from './chunking' +import { chunkJsonl } from './chunking/jsonl' import { extractEntities } from './extract' import { parseCode } from './parser' import { detectLanguage } from './parser/languages' @@ -168,6 +169,14 @@ const nativeChunkFile: ChunkFileFunction = (filepath, code, options) => { return yield* Effect.fail(new UnsupportedLanguageError(filepath)) } + if (language === 'jsonl') { + return yield* Effect.tryPromise({ + try: () => chunkJsonl(code, options, filepath), + catch: (error: unknown) => + new ChunkingError('Failed to chunk JSONL', error), + }) + } + const parseResult = yield* Effect.tryPromise({ try: () => parseCode(code, language), catch: (error: unknown) => diff --git a/packages/code-chunk/src/chunk.ts b/packages/code-chunk/src/chunk.ts index 719b909..130e5e0 100644 --- a/packages/code-chunk/src/chunk.ts +++ b/packages/code-chunk/src/chunk.ts @@ -3,6 +3,7 @@ import { chunk as chunkInternal, streamChunks as streamChunksInternal, } from './chunking' +import { chunkJsonl } from './chunking/jsonl' import { extractEntities } from './extract' import { parseCode } from './parser' import { detectLanguage } from './parser/languages' @@ -62,6 +63,16 @@ const chunkEffect = ( return yield* Effect.fail(new UnsupportedLanguageError(filepath)) } + // JSONL: line-based chunking without full-file AST + if (language === 'jsonl') { + const jsonlChunks = yield* Effect.tryPromise({ + try: () => chunkJsonl(code, options, filepath), + catch: (error: unknown) => + new ChunkingError('Failed to chunk JSONL', error), + }) + return jsonlChunks + } + // Step 2: Parse the code const parseResult = yield* Effect.tryPromise({ try: () => parseCode(code, language), @@ -220,10 +231,22 @@ export const chunkStreamEffect = ( options?: ChunkOptions, ): Stream.Stream => { return Stream.unwrap( - Effect.map(prepareChunking(filepath, code, options), (prepared) => { - const { parseResult, scopeTree, language } = prepared - - // Create stream from 
the internal generator + Effect.gen(function* () { + const language: Language | null = + options?.language ?? detectLanguage(filepath) + if (!language) { + yield* Effect.fail(new UnsupportedLanguageError(filepath)) + } + if (language === 'jsonl') { + const chunks = yield* Effect.tryPromise({ + try: () => chunkJsonl(code, options ?? {}, filepath), + catch: (error: unknown) => + new ChunkingError('Failed to chunk JSONL', error), + }) + return Stream.fromIterable(chunks) + } + const prepared = yield* prepareChunking(filepath, code, options) + const { parseResult, scopeTree } = prepared return Stream.fromAsyncIterable( streamChunksInternal( parseResult.tree.rootNode, @@ -235,7 +258,6 @@ export const chunkStreamEffect = ( ), (error) => new ChunkingError('Stream iteration failed', error), ).pipe( - // Attach parse error to chunks if present Stream.map((chunk) => parseResult.error ? { @@ -280,12 +302,23 @@ export async function* chunkStream( code: string, options?: ChunkOptions, ): AsyncGenerator { + const language: Language | null = + options?.language ?? detectLanguage(filepath) + if (!language) { + throw new UnsupportedLanguageError(filepath) + } + if (language === 'jsonl') { + const chunks = await chunkJsonl(code, options ?? 
{}, filepath) + yield* chunks + return + } + // Prepare the chunking pipeline const prepared = await Effect.runPromise( prepareChunking(filepath, code, options), ) - const { parseResult, scopeTree, language } = prepared + const { parseResult, scopeTree } = prepared // Stream chunks from the internal generator const chunkGenerator = streamChunksInternal( diff --git a/packages/code-chunk/src/chunking/jsonl.ts b/packages/code-chunk/src/chunking/jsonl.ts new file mode 100644 index 0000000..677df3c --- /dev/null +++ b/packages/code-chunk/src/chunking/jsonl.ts @@ -0,0 +1,192 @@ +import { + getEntitiesInRange, + getRelevantImports, + getScopeForRange, +} from '../context' +import { formatChunkWithContext } from '../context/format' +import { getSiblings } from '../context/siblings' +import { parseCode } from '../parser' +import { buildScopeTreeFromEntities } from '../scope/tree' +import type { + Chunk, + ChunkContext, + ChunkOptions, + ExtractedEntity, + ScopeTree, +} from '../types' +import { DEFAULT_CHUNK_OPTIONS } from './index' +import type { RebuiltText } from './rebuild' + +function getFirstKeyFromJsonLine(line: string): string | null { + try { + const parsed = JSON.parse(line) as Record + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + const firstKey = Object.keys(parsed)[0] + return firstKey ?? null + } + } catch { + // ignore + } + return null +} + +/** + * Build entities for JSONL: one section per line (when line parses as JSON object). + * Uses parseCode(line, 'json') to get a node for each line. + */ +async function extractJsonlEntities( + code: string, + lines: string[], + lineStarts: number[], +): Promise { + const entities: ExtractedEntity[] = [] + for (let i = 0; i < lines.length; i++) { + const line = lines[i] + if (line === undefined) continue + const start = lineStarts[i] ?? 
0 + const end = start + line.length + let name: string + let node: ExtractedEntity['node'] + try { + const parseResult = await parseCode(line, 'json') + node = parseResult.tree.rootNode + const firstKey = getFirstKeyFromJsonLine(line) + name = firstKey ?? `line ${i + 1}` + } catch { + // Skip entity for unparseable lines (chunk will still include the line text) + continue + } + entities.push({ + type: 'section', + name, + signature: line.includes('\n') ? line.slice(0, line.indexOf('\n')) : line, + docstring: null, + byteRange: { start, end }, + lineRange: { start: i, end: i }, + parent: null, + node, + }) + } + return entities +} + +/** + * Chunk JSONL by grouping consecutive lines until maxChunkSize. + */ +export async function chunkJsonl( + code: string, + options: ChunkOptions, + filepath?: string, +): Promise { + const opts: Required> & { language: 'jsonl' } = { + ...DEFAULT_CHUNK_OPTIONS, + ...options, + language: 'jsonl', + } + const maxSize = opts.maxChunkSize + const lines = code.split('\n') + const lineStarts: number[] = [] + let offset = 0 + for (let i = 0; i < lines.length; i++) { + lineStarts[i] = offset + offset += (lines[i]?.length ?? 0) + (i < lines.length - 1 ? 1 : 0) // +1 for \n + } + + const entities = await extractJsonlEntities(code, lines, lineStarts) + const scopeTree: ScopeTree = buildScopeTreeSync(entities) + + // Group lines into chunks by size (NWS or bytes; use bytes for simplicity) + const chunks: Chunk[] = [] + let chunkStartLine = 0 + let chunkSize = 0 + for (let i = 0; i <= lines.length; i++) { + const line = lines[i] + const lineLen = (line?.length ?? 0) + (i < lines.length - 1 ? 1 : 0) + const wouldExceed = i < lines.length && chunkSize + lineLen > maxSize + if (wouldExceed && chunkStartLine < i) { + // Emit chunk [chunkStartLine, i) + const chunkLines = lines.slice(chunkStartLine, i) + const text = chunkLines.join('\n') + const byteStart = lineStarts[chunkStartLine] ?? 
0 + const byteRange = { start: byteStart, end: byteStart + text.length } + const lineRange = { start: chunkStartLine, end: i - 1 } + const context = buildContextForJsonl( + { text, byteRange, lineRange }, + scopeTree, + opts, + filepath, + ) + const contextualizedText = formatChunkWithContext(text, context) + chunks.push({ + text, + contextualizedText, + byteRange, + lineRange, + context, + index: chunks.length, + totalChunks: -1, // set below + }) + chunkStartLine = i + chunkSize = 0 + } + if (i < lines.length) { + chunkSize += lineLen + } + } + if (chunkStartLine < lines.length) { + const chunkLines = lines.slice(chunkStartLine) + const text = chunkLines.join('\n') + const byteStart = lineStarts[chunkStartLine] ?? 0 + const byteRange = { start: byteStart, end: byteStart + text.length } + const lineRange = { start: chunkStartLine, end: lines.length - 1 } + const context = buildContextForJsonl( + { text, byteRange, lineRange }, + scopeTree, + opts, + filepath, + ) + const contextualizedText = formatChunkWithContext(text, context) + chunks.push({ + text, + contextualizedText, + byteRange, + lineRange, + context, + index: chunks.length, + totalChunks: chunks.length, + }) + } + // Set totalChunks on all + for (let j = 0; j < chunks.length; j++) { + chunks[j] = { ...chunks[j], totalChunks: chunks.length } + } + return chunks +} + +function buildScopeTreeSync(entities: ExtractedEntity[]): ScopeTree { + return buildScopeTreeFromEntities(entities) +} + +function buildContextForJsonl( + text: RebuiltText, + scopeTree: ScopeTree, + options: Required, + filepath?: string, +): ChunkContext { + const byteRange = text.byteRange + return { + filepath, + language: 'jsonl', + scope: getScopeForRange(byteRange, scopeTree), + entities: getEntitiesInRange(byteRange, scopeTree), + siblings: getSiblings(byteRange, scopeTree, { + detail: options.siblingDetail, + maxSiblings: 3, + }), + imports: getRelevantImports( + getEntitiesInRange(byteRange, scopeTree), + scopeTree, + 
options.filterImports, + ), + } +} diff --git a/packages/code-chunk/src/extract/docstring.ts b/packages/code-chunk/src/extract/docstring.ts index 1df1f59..436e299 100644 --- a/packages/code-chunk/src/extract/docstring.ts +++ b/packages/code-chunk/src/extract/docstring.ts @@ -11,6 +11,10 @@ export const COMMENT_NODE_TYPES: Record = { rust: ['line_comment', 'block_comment'], go: ['comment'], java: ['line_comment', 'block_comment'], + yaml: ['comment'], + toml: ['comment'], + json: [], + jsonl: [], } /** diff --git a/packages/code-chunk/src/extract/fallback.ts b/packages/code-chunk/src/extract/fallback.ts index 2d6e61f..56c5e02 100644 --- a/packages/code-chunk/src/extract/fallback.ts +++ b/packages/code-chunk/src/extract/fallback.ts @@ -58,6 +58,10 @@ export const ENTITY_NODE_TYPES: Record = { 'enum_declaration', 'import_declaration', ], + yaml: ['block_mapping_pair'], + toml: ['table', 'table_array_element', 'pair'], + json: ['pair'], + jsonl: [], // Entities built from line-by-line parse, not from AST walk } /** @@ -105,6 +109,12 @@ export const NODE_TYPE_TO_ENTITY_TYPE: Record = { // Impl blocks (Rust - treat as class-like) impl_item: 'class', + + // Data formats: top-level sections + block_mapping_pair: 'section', + table: 'section', + table_array_element: 'section', + pair: 'section', } /** @@ -155,6 +165,33 @@ function walkAndExtract( // Check if this node is an entity type if (isEntityNodeType(node.type, language)) { + // YAML: only top-level block_mapping_pair (direct child of document's block_mapping) + if ( + language === 'yaml' && + node.type === 'block_mapping_pair' && + !(node.parent?.type === 'block_mapping' && node.parent?.parent?.type === 'document') + ) { + // Not top-level, continue to children + const children = node.namedChildren + for (let i = children.length - 1; i >= 0; i--) { + const child = children[i] + if (child) stack.push({ node: child, parentName }) + } + continue + } + // JSON: only top-level pair (direct child of document's object) + if 
( + language === 'json' && + node.type === 'pair' && + !(node.parent?.type === 'object' && node.parent?.parent?.type === 'document') + ) { + const children = node.namedChildren + for (let i = children.length - 1; i >= 0; i--) { + const child = children[i] + if (child) stack.push({ node: child, parentName }) + } + continue + } // Skip if we've already processed this node if (entityNodes.has(node.id)) { continue @@ -207,7 +244,8 @@ function walkAndExtract( entityType === 'class' || entityType === 'interface' || entityType === 'function' || - entityType === 'method' + entityType === 'method' || + entityType === 'section' ? name : parentName diff --git a/packages/code-chunk/src/extract/imports.ts b/packages/code-chunk/src/extract/imports.ts index 06cd41b..3b7e69e 100644 --- a/packages/code-chunk/src/extract/imports.ts +++ b/packages/code-chunk/src/extract/imports.ts @@ -183,6 +183,12 @@ export function extractImportSymbols( } break } + + case 'yaml': + case 'toml': + case 'json': + case 'jsonl': + return entities } // If no symbols were extracted, fall back to using source as name diff --git a/packages/code-chunk/src/extract/index.ts b/packages/code-chunk/src/extract/index.ts index 94bc6a1..b97c858 100644 --- a/packages/code-chunk/src/extract/index.ts +++ b/packages/code-chunk/src/extract/index.ts @@ -19,7 +19,7 @@ import { loadQuerySync, type QueryMatch, } from './queries' -import { extractName, extractSignature } from './signature' +import { extractName, extractSignature, stripQuotes } from './signature' /** * Error when entity extraction fails @@ -118,9 +118,14 @@ function matchesToEntities( } // Extract name - prefer name node from query, fallback to extraction - const name = nameNode + // For JSON, key capture is string node with quotes; strip them + const rawName = nameNode ? nameNode.text : (extractName(itemNode, language) ?? '') + const name = + language === 'json' && nameNode + ? 
stripQuotes(rawName) + : rawName // Extract signature const signature = yield* extractSignature( diff --git a/packages/code-chunk/src/extract/queries.ts b/packages/code-chunk/src/extract/queries.ts index c1dcf4c..9b2a8cd 100644 --- a/packages/code-chunk/src/extract/queries.ts +++ b/packages/code-chunk/src/extract/queries.ts @@ -481,6 +481,26 @@ const JAVA_QUERY = `; Java Entity Extraction Queries name: (identifier) @name) @item) ` +const YAML_QUERY = `; YAML top-level keys (block_mapping_pair with key; filter to top-level in fallback if needed) +(block_mapping (block_mapping_pair key: (_) @name) @item) +` + +const TOML_QUERY = `; TOML top-level: root pairs, tables, table_array_elements +(document (pair (dotted_key) @name) @item) +(document (pair (bare_key) @name) @item) +(document (pair (quoted_key) @name) @item) +(table (dotted_key) @name) @item +(table (bare_key) @name) @item +(table (quoted_key) @name) @item +(table_array_element (dotted_key) @name) @item +(table_array_element (bare_key) @name) @item +(table_array_element (quoted_key) @name) @item +` + +const JSON_QUERY = `; JSON top-level object keys +(document (object (pair key: (string) @name) @item)) +` + /** * Query patterns by language - embedded as strings for portability */ @@ -491,6 +511,10 @@ export const QUERY_PATTERNS: Record = { rust: RUST_QUERY, go: GO_QUERY, java: JAVA_QUERY, + yaml: YAML_QUERY, + toml: TOML_QUERY, + json: JSON_QUERY, + jsonl: '', // JSONL: no query, entities built from line-by-line parse } // ============================================================================= @@ -545,9 +569,9 @@ export const loadQuery = ( return cached } - // Get the query pattern for this language + // Get the query pattern for this language (jsonl has no query) const queryPattern = QUERY_PATTERNS[language] - if (!queryPattern) { + if (!queryPattern || language === 'jsonl') { return null } diff --git a/packages/code-chunk/src/extract/signature.ts b/packages/code-chunk/src/extract/signature.ts index 
10ee156..550318d 100644 --- a/packages/code-chunk/src/extract/signature.ts +++ b/packages/code-chunk/src/extract/signature.ts @@ -11,6 +11,10 @@ export const BODY_DELIMITERS: Record = { rust: '{', go: '{', java: '{', + yaml: ':', + toml: '=', + json: ':', + jsonl: ':', } /** @@ -33,8 +37,38 @@ const NAME_NODE_TYPES: readonly string[] = [ */ export const extractName = ( node: SyntaxNode, - _language: Language, + language: Language, ): string | null => { + // YAML: block_mapping_pair has field "key" (block_node or flow_node, often plain_scalar) + if (language === 'yaml') { + const keyNode = node.childForFieldName('key') + if (keyNode) return keyNode.text.trim() + } + + // TOML: table/table_array_element have key in brackets; pair has key before "=" + if (language === 'toml') { + const keyNode = + node.childForFieldName('key') ?? + node.namedChildren.find( + (c) => + c.type === 'dotted_key' || + c.type === 'bare_key' || + c.type === 'quoted_key', + ) + if (keyNode) { + const raw = keyNode.text + return raw.startsWith('"') || raw.startsWith("'") + ? 
stripQuotes(raw) + : raw + } + } + + // JSON: pair has field "key" (string node with quotes) + if (language === 'json') { + const keyNode = node.childForFieldName('key') + if (keyNode?.type === 'string') return stripQuotes(keyNode.text) + } + // Try to find a named child that is an identifier for (const nameType of NAME_NODE_TYPES) { const nameNode = node.childForFieldName(nameType) @@ -334,6 +368,15 @@ export const extractSignature = ( case 'export': return extractImportExportSignature(node, code) + case 'section': { + const nodeText = code.slice(node.startIndex, node.endIndex) + const firstNewline = nodeText.indexOf('\n') + if (firstNewline !== -1) { + return cleanSignature(nodeText.slice(0, firstNewline)) + } + return cleanSignature(nodeText) + } + default: { // Fallback: extract first line const nodeText = code.slice(node.startIndex, node.endIndex) @@ -489,6 +532,12 @@ export const extractImportSource = ( } break } + + case 'yaml': + case 'toml': + case 'json': + case 'jsonl': + return null } // Fallback: look for any string-like child @@ -530,7 +579,7 @@ const extractRustUsePath = (node: SyntaxNode): string => { /** * Strip surrounding quotes from a string */ -const stripQuotes = (str: string): string => { +export const stripQuotes = (str: string): string => { if ( (str.startsWith('"') && str.endsWith('"')) || (str.startsWith("'") && str.endsWith("'")) || diff --git a/packages/code-chunk/src/parser/languages.ts b/packages/code-chunk/src/parser/languages.ts index 2d67d7b..c5346ed 100644 --- a/packages/code-chunk/src/parser/languages.ts +++ b/packages/code-chunk/src/parser/languages.ts @@ -35,6 +35,11 @@ export const LANGUAGE_EXTENSIONS: Record = { '.rs': 'rust', '.go': 'go', '.java': 'java', + '.yaml': 'yaml', + '.yml': 'yaml', + '.toml': 'toml', + '.json': 'json', + '.jsonl': 'jsonl', } /** @@ -69,6 +74,19 @@ function getGrammarPath(language: Language): string { return require.resolve('tree-sitter-go/tree-sitter-go.wasm') case 'java': return 
require.resolve('tree-sitter-java/tree-sitter-java.wasm') + case 'yaml': + return require.resolve( + '@tree-sitter-grammars/tree-sitter-yaml/tree-sitter-yaml.wasm', + ) + case 'toml': + return require.resolve( + '@tree-sitter-grammars/tree-sitter-toml/tree-sitter-toml.wasm', + ) + case 'json': + return require.resolve('tree-sitter-json/tree-sitter-json.wasm') + case 'jsonl': + // JSONL is parsed line-by-line with JSON grammar + return require.resolve('tree-sitter-json/tree-sitter-json.wasm') } } diff --git a/packages/code-chunk/src/types.ts b/packages/code-chunk/src/types.ts index b96c63c..588615a 100644 --- a/packages/code-chunk/src/types.ts +++ b/packages/code-chunk/src/types.ts @@ -10,6 +10,10 @@ export type Language = | 'rust' | 'go' | 'java' + | 'yaml' + | 'toml' + | 'json' + | 'jsonl' /** * Types of entities that can be extracted from source code @@ -23,6 +27,7 @@ export type EntityType = | 'enum' | 'import' | 'export' + | 'section' /** * A range of lines in the source code (0-indexed, inclusive) diff --git a/packages/code-chunk/test/extract.test.ts b/packages/code-chunk/test/extract.test.ts index 41e0fef..a7255a3 100644 --- a/packages/code-chunk/test/extract.test.ts +++ b/packages/code-chunk/test/extract.test.ts @@ -494,11 +494,18 @@ class Bar { 'rust', 'go', 'java', + 'yaml', + 'toml', + 'json', + 'jsonl', ] for (const lang of languages) { expect(ENTITY_NODE_TYPES[lang]).toBeDefined() - expect(ENTITY_NODE_TYPES[lang].length).toBeGreaterThanOrEqual(3) + // jsonl has no AST entity types (line-based); others have at least 1 + if (lang !== 'jsonl') { + expect(ENTITY_NODE_TYPES[lang].length).toBeGreaterThanOrEqual(1) + } } }) }) @@ -1673,3 +1680,70 @@ function third() {}` expect(methodNames).toEqual(['methodA', 'methodB']) }) }) + +// ============================================================================ +// YAML, TOML, JSON section extraction +// ============================================================================ + +describe('YAML, TOML, JSON 
section extraction', () => { + test('extracts YAML top-level keys as section entities', async () => { + const code = `run: + timeout: 10m +output: + format: colored +` + const result = await parseCode(code, 'yaml') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'yaml', + code, + ) + expect(entities.length).toBeGreaterThanOrEqual(2) + const names = entities.map((e) => e.name) + expect(names).toContain('run') + expect(names).toContain('output') + for (const e of entities) { + expect(e.type).toBe('section') + } + }) + + test('extracts TOML tables and root pairs as section entities', async () => { + const code = `name = "app" + +[server] +port = 8080 + +[[items]] +id = 1 +` + const result = await parseCode(code, 'toml') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'toml', + code, + ) + expect(entities.length).toBeGreaterThanOrEqual(2) + const names = entities.map((e) => e.name) + expect(names).toContain('name') + expect(names.some((n) => n === 'server' || n === 'items')).toBe(true) + for (const e of entities) { + expect(e.type).toBe('section') + } + }) + + test('extracts JSON top-level object keys as section entities', async () => { + const code = `{"name": "test", "version": "1.0", "count": 42}` + const result = await parseCode(code, 'json') + const entities = await extractEntitiesAsync( + result.tree.rootNode, + 'json', + code, + ) + expect(entities.length).toBe(3) + const names = entities.map((e) => e.name).sort() + expect(names).toEqual(['count', 'name', 'version']) + for (const e of entities) { + expect(e.type).toBe('section') + } + }) +}) diff --git a/packages/code-chunk/test/integration.test.ts b/packages/code-chunk/test/integration.test.ts index ea77401..4a683af 100644 --- a/packages/code-chunk/test/integration.test.ts +++ b/packages/code-chunk/test/integration.test.ts @@ -1013,6 +1013,10 @@ function finalize(item: TransformedItem): Result { path: 'Test.java', code: 'public class Test { int corge() { return 
6; } }', }, + { path: 'config.yaml', code: 'run:\n timeout: 5m' }, + { path: 'Cargo.toml', code: '[package]\nname = "x"' }, + { path: 'pkg.json', code: '{"name": "x", "version": "1.0"}' }, + { path: 'data.jsonl', code: '{"id": 1}\n{"id": 2}' }, ] for (const sample of samples) { @@ -1082,3 +1086,71 @@ describe('integration: error handling', () => { } }) }) + +describe('integration: YAML, TOML, JSON, JSONL', () => { + test('chunks YAML with section context', async () => { + const code = `run: + concurrency: 8 + timeout: 10m +output: + format: colored +linters: + enable: [errcheck] +` + const chunks = await chunk('config.yaml', code) + expect(chunks.length).toBeGreaterThan(0) + const allEntities = chunks.flatMap((c) => c.context.entities) + const sectionNames = allEntities.map((e) => e.name) + expect(sectionNames).toContain('run') + expect(sectionNames).toContain('output') + expect(sectionNames).toContain('linters') + for (const c of chunks) { + expect(c.context.language).toBe('yaml') + } + }) + + test('chunks TOML with section context', async () => { + const code = `[package] +name = "myapp" +version = "0.1.0" + +[dependencies] +serde = "1.0" +` + const chunks = await chunk('Cargo.toml', code) + expect(chunks.length).toBeGreaterThan(0) + const allEntities = chunks.flatMap((c) => c.context.entities) + expect(allEntities.some((e) => e.type === 'section')).toBe(true) + for (const c of chunks) { + expect(c.context.language).toBe('toml') + } + }) + + test('chunks JSON with section context', async () => { + const code = `{"name": "app", "version": "1.0", "scripts": {"start": "node index.js"}}` + const chunks = await chunk('package.json', code) + expect(chunks.length).toBeGreaterThan(0) + const allEntities = chunks.flatMap((c) => c.context.entities) + const names = allEntities.map((e) => e.name) + expect(names).toContain('name') + expect(names).toContain('version') + expect(names).toContain('scripts') + for (const c of chunks) { + expect(c.context.language).toBe('json') + } 
+ }) + + test('chunks JSONL by lines', async () => { + const code = `{"id": 1, "name": "first"} +{"id": 2, "name": "second"} +{"id": 3, "name": "third"} +` + const chunks = await chunk('data.jsonl', code) + expect(chunks.length).toBeGreaterThan(0) + const fullText = chunks.map((c) => c.text).join('') + expect(fullText.trimEnd()).toBe(code.trimEnd()) + for (const c of chunks) { + expect(c.context.language).toBe('jsonl') + } + }) +}) diff --git a/packages/code-chunk/test/parser.test.ts b/packages/code-chunk/test/parser.test.ts index d72d8ca..08e7cd0 100644 --- a/packages/code-chunk/test/parser.test.ts +++ b/packages/code-chunk/test/parser.test.ts @@ -64,12 +64,29 @@ describe('detectLanguage', () => { expect(detectLanguage('src/Main.java')).toBe('java') }) + test('detects yaml from .yaml and .yml extensions', () => { + expect(detectLanguage('config.yaml')).toBe('yaml') + expect(detectLanguage('config.yml')).toBe('yaml') + }) + + test('detects toml from .toml extension', () => { + expect(detectLanguage('Cargo.toml')).toBe('toml') + }) + + test('detects json from .json extension', () => { + expect(detectLanguage('package.json')).toBe('json') + }) + + test('detects jsonl from .jsonl extension', () => { + expect(detectLanguage('data.jsonl')).toBe('jsonl') + }) + test('returns null for unsupported extension', () => { expect(detectLanguage('README.md')).toBeNull() - expect(detectLanguage('config.yaml')).toBeNull() expect(detectLanguage('Makefile')).toBeNull() - expect(detectLanguage('data.json')).toBeNull() expect(detectLanguage('.env')).toBeNull() + expect(detectLanguage('file.xml')).toBeNull() + expect(detectLanguage('file.txt')).toBeNull() }) test('handles deeply nested paths correctly', () => { @@ -859,6 +876,36 @@ function c() {}` expect(node?.type).toBe('identifier') }) }) + + describe('YAML, TOML, JSON parsing', () => { + test('parses YAML with top-level keys', async () => { + const code = `run: + timeout: 10m +output: + format: colored +` + const result = await 
parseCode(code, 'yaml') + expect(result.error).toBeNull() + expect(result.tree.rootNode).not.toBeNull() + }) + + test('parses TOML with table', async () => { + const code = `[package] +name = "foo" +version = "0.1.0" +` + const result = await parseCode(code, 'toml') + expect(result.error).toBeNull() + expect(result.tree.rootNode).not.toBeNull() + }) + + test('parses JSON object', async () => { + const code = `{"name": "test", "count": 42}` + const result = await parseCode(code, 'json') + expect(result.error).toBeNull() + expect(result.tree.rootNode.type).toBe('document') + }) + }) }) // ============================================================================