Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 4 additions & 1 deletion packages/code-chunk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,12 @@
"tree-sitter-go": "^0.25.0",
"tree-sitter-java": "^0.23.5",
"tree-sitter-javascript": "^0.25.0",
"tree-sitter-json": "^0.24.0",
"tree-sitter-python": "^0.25.0",
"tree-sitter-rust": "^0.24.0",
"tree-sitter-typescript": "^0.23.2",
"web-tree-sitter": "^0.26.3"
"web-tree-sitter": "^0.26.3",
"@tree-sitter-grammars/tree-sitter-toml": "^0.7.0",
"@tree-sitter-grammars/tree-sitter-yaml": "^0.7.1"
}
}
9 changes: 9 additions & 0 deletions packages/code-chunk/src/batch.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import {
} from 'effect'
import { ChunkingError, UnsupportedLanguageError } from './chunk'
import { chunk as chunkInternal } from './chunking'
import { chunkJsonl } from './chunking/jsonl'
import { extractEntities } from './extract'
import { parseCode } from './parser'
import { detectLanguage } from './parser/languages'
Expand Down Expand Up @@ -168,6 +169,14 @@ const nativeChunkFile: ChunkFileFunction = (filepath, code, options) => {
return yield* Effect.fail(new UnsupportedLanguageError(filepath))
}

if (language === 'jsonl') {
return yield* Effect.tryPromise({
try: () => chunkJsonl(code, options, filepath),
catch: (error: unknown) =>
new ChunkingError('Failed to chunk JSONL', error),
})
}

const parseResult = yield* Effect.tryPromise({
try: () => parseCode(code, language),
catch: (error: unknown) =>
Expand Down
45 changes: 39 additions & 6 deletions packages/code-chunk/src/chunk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {
chunk as chunkInternal,
streamChunks as streamChunksInternal,
} from './chunking'
import { chunkJsonl } from './chunking/jsonl'
import { extractEntities } from './extract'
import { parseCode } from './parser'
import { detectLanguage } from './parser/languages'
Expand Down Expand Up @@ -62,6 +63,16 @@ const chunkEffect = (
return yield* Effect.fail(new UnsupportedLanguageError(filepath))
}

// JSONL: line-based chunking without full-file AST
if (language === 'jsonl') {
const jsonlChunks = yield* Effect.tryPromise({
try: () => chunkJsonl(code, options, filepath),
catch: (error: unknown) =>
new ChunkingError('Failed to chunk JSONL', error),
})
return jsonlChunks
}

// Step 2: Parse the code
const parseResult = yield* Effect.tryPromise({
try: () => parseCode(code, language),
Expand Down Expand Up @@ -220,10 +231,22 @@ export const chunkStreamEffect = (
options?: ChunkOptions,
): Stream.Stream<Chunk, ChunkingError | UnsupportedLanguageError> => {
return Stream.unwrap(
Effect.map(prepareChunking(filepath, code, options), (prepared) => {
const { parseResult, scopeTree, language } = prepared

// Create stream from the internal generator
Effect.gen(function* () {
const language: Language | null =
options?.language ?? detectLanguage(filepath)
if (!language) {
yield* Effect.fail(new UnsupportedLanguageError(filepath))
}
if (language === 'jsonl') {
const chunks = yield* Effect.tryPromise({
try: () => chunkJsonl(code, options ?? {}, filepath),
catch: (error: unknown) =>
new ChunkingError('Failed to chunk JSONL', error),
})
return Stream.fromIterable(chunks)
}
const prepared = yield* prepareChunking(filepath, code, options)
const { parseResult, scopeTree } = prepared
return Stream.fromAsyncIterable(
streamChunksInternal(
parseResult.tree.rootNode,
Expand All @@ -235,7 +258,6 @@ export const chunkStreamEffect = (
),
(error) => new ChunkingError('Stream iteration failed', error),
).pipe(
// Attach parse error to chunks if present
Stream.map((chunk) =>
parseResult.error
? {
Expand Down Expand Up @@ -280,12 +302,23 @@ export async function* chunkStream(
code: string,
options?: ChunkOptions,
): AsyncGenerator<Chunk> {
const language: Language | null =
options?.language ?? detectLanguage(filepath)
if (!language) {
throw new UnsupportedLanguageError(filepath)
}
if (language === 'jsonl') {
const chunks = await chunkJsonl(code, options ?? {}, filepath)
yield* chunks
return
}

// Prepare the chunking pipeline
const prepared = await Effect.runPromise(
prepareChunking(filepath, code, options),
)

const { parseResult, scopeTree, language } = prepared
const { parseResult, scopeTree } = prepared

// Stream chunks from the internal generator
const chunkGenerator = streamChunksInternal(
Expand Down
192 changes: 192 additions & 0 deletions packages/code-chunk/src/chunking/jsonl.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
import {
getEntitiesInRange,
getRelevantImports,
getScopeForRange,
} from '../context'
import { formatChunkWithContext } from '../context/format'
import { getSiblings } from '../context/siblings'
import { parseCode } from '../parser'
import { buildScopeTreeFromEntities } from '../scope/tree'
import type {
Chunk,
ChunkContext,
ChunkOptions,
ExtractedEntity,
ScopeTree,
} from '../types'
import { DEFAULT_CHUNK_OPTIONS } from './index'
import type { RebuiltText } from './rebuild'

/**
 * Return the first key of a JSONL line that parses as a plain JSON object.
 *
 * @param line - One raw line of a JSONL file.
 * @returns The first object key, or null when the line is invalid JSON,
 *   not a plain object (null / array / primitive), or an empty object.
 */
function getFirstKeyFromJsonLine(line: string): string | null {
  try {
    // Parse as `unknown` and narrow, rather than asserting a shape that
    // JSON.parse does not guarantee (the value may be null, an array,
    // or a primitive).
    const parsed: unknown = JSON.parse(line)
    if (parsed !== null && typeof parsed === 'object' && !Array.isArray(parsed)) {
      return Object.keys(parsed)[0] ?? null
    }
  } catch {
    // Invalid JSON simply yields no key.
  }
  return null
}

/**
 * Build entities for JSONL: one section per line (when the line parses as a
 * JSON object). Uses parseCode(line, 'json') to get a node for each line.
 *
 * Lines that fail to parse produce no entity; the chunker still emits their
 * raw text, so nothing is dropped from the output.
 *
 * Note: `code` is currently unused (offsets come precomputed via
 * `lineStarts`); kept to preserve the call signature.
 */
async function extractJsonlEntities(
  code: string,
  lines: string[],
  lineStarts: number[],
): Promise<ExtractedEntity[]> {
  const entities: ExtractedEntity[] = []
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i]
    if (line === undefined) continue
    const start = lineStarts[i] ?? 0
    const end = start + line.length
    let name: string
    let node: ExtractedEntity['node']
    try {
      // Parse each line independently as plain JSON. Kept sequential on
      // purpose — NOTE(review): parser may not tolerate concurrent use;
      // confirm before parallelizing with Promise.all.
      const parseResult = await parseCode(line, 'json')
      node = parseResult.tree.rootNode
      name = getFirstKeyFromJsonLine(line) ?? `line ${i + 1}`
    } catch {
      // Skip entity for unparseable lines (chunk will still include the line text)
      continue
    }
    entities.push({
      type: 'section',
      name,
      // `lines` comes from code.split('\n'), so a line can never contain a
      // newline — the whole line is its own signature (the original
      // `line.includes('\n')` branch was unreachable).
      signature: line,
      docstring: null,
      byteRange: { start, end },
      lineRange: { start: i, end: i },
      parent: null,
      node,
    })
  }
  return entities
}

/**
 * Chunk JSONL by grouping consecutive lines until maxChunkSize.
 *
 * @param code - Full JSONL file contents.
 * @param options - Chunking options; `language` is forced to 'jsonl'.
 * @param filepath - Optional path recorded in each chunk's context.
 * @returns Chunks in file order, each carrying context and correct
 *   index/totalChunks fields.
 */
export async function chunkJsonl(
  code: string,
  options: ChunkOptions,
  filepath?: string,
): Promise<Chunk[]> {
  const opts: Required<Omit<ChunkOptions, 'language'>> & { language: 'jsonl' } = {
    ...DEFAULT_CHUNK_OPTIONS,
    ...options,
    language: 'jsonl',
  }
  const maxSize = opts.maxChunkSize
  const lines = code.split('\n')

  // Offset of each line's first character within `code` (string indices,
  // i.e. UTF-16 code units — consistent with how ranges are built below).
  const lineStarts: number[] = []
  let offset = 0
  for (let i = 0; i < lines.length; i++) {
    lineStarts[i] = offset
    offset += (lines[i]?.length ?? 0) + (i < lines.length - 1 ? 1 : 0) // +1 for \n
  }

  const entities = await extractJsonlEntities(code, lines, lineStarts)
  const scopeTree: ScopeTree = buildScopeTreeSync(entities)

  const chunks: Chunk[] = []

  // Build and append one chunk covering lines [startLine, endLine] inclusive.
  // Shared by the mid-loop and tail emits — previously duplicated, with
  // inconsistent totalChunks placeholders.
  const pushChunk = (startLine: number, endLine: number): void => {
    const text = lines.slice(startLine, endLine + 1).join('\n')
    const byteStart = lineStarts[startLine] ?? 0
    const byteRange = { start: byteStart, end: byteStart + text.length }
    const lineRange = { start: startLine, end: endLine }
    const context = buildContextForJsonl(
      { text, byteRange, lineRange },
      scopeTree,
      opts,
      filepath,
    )
    chunks.push({
      text,
      contextualizedText: formatChunkWithContext(text, context),
      byteRange,
      lineRange,
      context,
      index: chunks.length,
      totalChunks: -1, // patched once the final count is known
    })
  }

  // Greedily group consecutive lines; emit a chunk when adding the next line
  // would exceed maxSize. A chunk always contains at least one line, so an
  // oversized single line still gets emitted on its own.
  let chunkStartLine = 0
  let chunkSize = 0
  for (let i = 0; i < lines.length; i++) {
    const lineLen = (lines[i]?.length ?? 0) + (i < lines.length - 1 ? 1 : 0)
    if (chunkSize + lineLen > maxSize && chunkStartLine < i) {
      pushChunk(chunkStartLine, i - 1)
      chunkStartLine = i
      chunkSize = 0
    }
    chunkSize += lineLen
  }
  // Flush the trailing group (also handles the no-split case).
  if (chunkStartLine < lines.length) {
    pushChunk(chunkStartLine, lines.length - 1)
  }

  // Now that the count is final, stamp totalChunks on every chunk.
  return chunks.map((chunk) => ({ ...chunk, totalChunks: chunks.length }))
}

/** Thin synchronous wrapper: JSONL entities map straight to a scope tree. */
const buildScopeTreeSync = (entities: ExtractedEntity[]): ScopeTree =>
  buildScopeTreeFromEntities(entities)

/**
 * Assemble the ChunkContext for one JSONL chunk.
 *
 * @param text - Rebuilt chunk text with its byte/line ranges.
 * @param scopeTree - Scope tree built from the per-line entities.
 * @param options - Fully-resolved chunk options (siblingDetail, filterImports).
 * @param filepath - Optional path recorded on the context.
 */
function buildContextForJsonl(
  text: RebuiltText,
  scopeTree: ScopeTree,
  options: Required<ChunkOptions>,
  filepath?: string,
): ChunkContext {
  const byteRange = text.byteRange
  // Computed once and reused for both `entities` and import filtering
  // (previously called twice with identical arguments).
  const entities = getEntitiesInRange(byteRange, scopeTree)
  return {
    filepath,
    language: 'jsonl',
    scope: getScopeForRange(byteRange, scopeTree),
    entities,
    siblings: getSiblings(byteRange, scopeTree, {
      detail: options.siblingDetail,
      maxSiblings: 3,
    }),
    imports: getRelevantImports(entities, scopeTree, options.filterImports),
  }
}
4 changes: 4 additions & 0 deletions packages/code-chunk/src/extract/docstring.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ export const COMMENT_NODE_TYPES: Record<Language, readonly string[]> = {
rust: ['line_comment', 'block_comment'],
go: ['comment'],
java: ['line_comment', 'block_comment'],
yaml: ['comment'],
toml: ['comment'],
json: [],
jsonl: [],
}

/**
Expand Down
Loading