Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,13 +34,15 @@ Make an efficient learning agent that can do anything.

## Docs

- [`docs/architecture.md`](docs/architecture.md) — Package dependency graph, per-package details, architectural patterns
- [`docs/request-flow.md`](docs/request-flow.md) — Full request lifecycle from CLI through server and back
- [`docs/error-schema.md`](docs/error-schema.md) — Server error response formats and client-side handling
- [`docs/development.md`](docs/development.md) — Dev setup, worktrees, logs, package management, DB migrations
- [`docs/testing.md`](docs/testing.md) — DI over mocking, tmux CLI testing
- [`docs/environment-variables.md`](docs/environment-variables.md) — Env var rules, DI helpers, loading order
- [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions
- [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls
- [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md)
- [docs/patterns/discover-before-implement.md](docs/patterns/discover-before-implement.md)
IMPORTANT: Prefer retrieval-led reasoning over pre-training-led reasoning. Always read the relevant docs below before implementing changes.

- `docs/architecture.md` — Package dependency graph, per-package details, architectural patterns
- `docs/request-flow.md` — Full request lifecycle from CLI through server and back
- `docs/error-schema.md` — Server error response formats and client-side handling
- `docs/development.md` — Dev setup, worktrees, logs, package management, DB migrations
- `docs/testing.md` — DI over mocking, tmux CLI testing
- `docs/environment-variables.md` — Env var rules, DI helpers, loading order
- `docs/agents-and-tools.md` — Agent system, shell shims, tool definitions
- `docs/patterns/handle-steps-generators.md` — handleSteps generator patterns and spawn_agents tool calls
- `docs/evalbuff/interpreting-task-prompts.md`
- `docs/patterns/discover-before-implement.md`
65 changes: 65 additions & 0 deletions evalbuff/src/run-carve-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,40 @@ import type { CarvedFeature, CarveResult, FileOperation } from './carve-features
import type { JudgingResult, ReviewerAgentType } from './judge'
import type { RunnerResult } from './runners/runner'

// --- Doc read stats ---

/**
 * Tally how often documentation files were read in an agent trace.
 *
 * The trace is treated as newline-delimited JSON (JSONL of PrintModeEvents).
 * Only events with `type === 'tool_call'` and `toolName === 'Read'` are
 * considered; their `input.file_path` is counted when it ends in a `docs/...`
 * path, `AGENTS.md`, or `CLAUDE.md`.
 *
 * @param agentTrace Raw trace text, one JSON event per line.
 * @returns Map from the matched path (leading slash removed) to read count.
 */
function extractDocReads(agentTrace: string): Record<string, number> {
  // Matches from the first `docs/` path segment (or a bare AGENTS.md/CLAUDE.md
  // file name) through the end of the path.
  const docPathPattern = /(?:^|\/)(?:docs\/.*|AGENTS\.md|CLAUDE\.md)$/
  const tally: Record<string, number> = {}

  const rows = agentTrace.split('\n').filter((row) => row.trim() !== '')
  for (const row of rows) {
    let parsed: unknown
    try {
      parsed = JSON.parse(row)
    } catch {
      // Line is not JSON; ignore it.
      continue
    }

    // Anything that is not a JSON object cannot be a Read tool call.
    if (typeof parsed !== 'object' || parsed === null) continue
    const evt = parsed as { type?: unknown; toolName?: unknown; input?: unknown }
    if (evt.type !== 'tool_call') continue
    if (evt.toolName !== 'Read') continue

    const args = evt.input
    const rawPath =
      typeof args === 'object' && args !== null
        ? (args as { file_path?: unknown }).file_path
        : undefined
    // Missing or non-string paths never contribute to the tally.
    if (typeof rawPath !== 'string') continue

    const hit = rawPath.match(docPathPattern)
    if (hit === null) continue

    // Drop the separator captured in front of the doc path, if any.
    const matched = hit[0]
    const key = matched.startsWith('/') ? matched.slice(1) : matched
    tally[key] = (tally[key] || 0) + 1
  }

  return tally
}

/**
 * Merge multiple doc-read count maps into one, summing the counts per key.
 *
 * @param maps Count maps to combine; an empty array yields an empty object.
 * @returns A new plain object holding the summed count for every key seen.
 */
function mergeDocReads(maps: Record<string, number>[]): Record<string, number> {
  // Accumulate in a Map rather than a plain object: looking up a key such as
  // `constructor` or `toString` on `{}` returns the inherited Object.prototype
  // member, which would turn the sum into a string concatenation.
  const totals = new Map<string, number>()
  for (const m of maps) {
    for (const [k, v] of Object.entries(m)) {
      totals.set(k, (totals.get(k) ?? 0) + v)
    }
  }
  // Object.fromEntries defines own data properties, so even a `__proto__` key
  // is stored as a regular entry instead of being dropped.
  return Object.fromEntries(totals)
}

// --- Apply carve operations to a repo directory ---

function applyCarveOperations(repoDir: string, operations: FileOperation[]): void {
Expand Down Expand Up @@ -274,6 +308,8 @@ interface CarveEvalResult {
docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
totalCost: number
/** Which doc files agents read and how many times (summed across all parallel runs). */
docsRead: Record<string, number>
}

async function runCarveEval(options: CarveEvalOptions): Promise<void> {
Expand Down Expand Up @@ -357,6 +393,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
docsKept: [],
docsRejected: [],
totalCost,
docsRead: {},
})
continue
}
Expand All @@ -368,6 +405,15 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
` Baseline: ${currentScore.toFixed(1)}/10 (${baselineScores.map((s) => s.toFixed(1)).join(', ')})`,
)

// Track which docs agents read across all runs for this feature
const baselineDocReads = mergeDocReads(validBaseline.map((r) => extractDocReads(r.agentTrace)))
const docReadEntries = Object.entries(baselineDocReads).sort((a, b) => b[1] - a[1])
if (docReadEntries.length > 0) {
console.log(` Docs read (baseline): ${docReadEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`)
} else {
console.log(` Docs read (baseline): none`)
}

const docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = []
const docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = []

Expand Down Expand Up @@ -510,6 +556,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
docsKept,
docsRejected,
totalCost,
docsRead: baselineDocReads,
})
}

Expand All @@ -525,6 +572,12 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
console.log(` Baseline: ${r.baselineScore.toFixed(1)}/10`)
console.log(` Final: ${r.finalScore.toFixed(1)}/10`)
console.log(` Docs kept: ${r.docsKept.length}, rejected: ${r.docsRejected.length}`)
const readEntries = Object.entries(r.docsRead).sort((a, b) => b[1] - a[1])
if (readEntries.length > 0) {
console.log(` Docs read: ${readEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`)
} else {
console.log(` Docs read: none`)
}
console.log(` Cost: $${r.totalCost.toFixed(2)}`)
totalCostAll += r.totalCost
}
Expand All @@ -538,6 +591,18 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
console.log(` Average final: ${avgFinal.toFixed(1)}/10`)
console.log(` Total cost: $${totalCostAll.toFixed(2)}`)

// Aggregate doc read stats across all features
const allDocReads = mergeDocReads(results.map((r) => r.docsRead))
const allReadEntries = Object.entries(allDocReads).sort((a, b) => b[1] - a[1])
if (allReadEntries.length > 0) {
console.log(`\n Doc read stats (all features):`)
for (const [docPath, count] of allReadEntries) {
console.log(` ${docPath}: ${count} reads`)
}
} else {
console.log(`\n No docs were read by any agent.`)
}

// Save results
const outputPath = path.join(
repoPath,
Expand Down
Loading