Skip to content

Commit 05dec5f

Browse files
jahooma and claude committed
evalbuff: Track which docs agents read during carve eval
Parses agent traces for Read tool calls targeting docs/, AGENTS.md, and CLAUDE.md. Reports per-feature and aggregate doc read counts in both console output and the JSON results file. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 72f5f68 commit 05dec5f

File tree

1 file changed

+65
-0
lines changed

1 file changed

+65
-0
lines changed

evalbuff/src/run-carve-eval.ts

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,40 @@ import type { CarvedFeature, CarveResult, FileOperation } from './carve-features
2424
import type { JudgingResult, ReviewerAgentType } from './judge'
2525
import type { RunnerResult } from './runners/runner'
2626

27+
// --- Doc read stats ---
28+
29+
/**
 * Count how often an agent trace reads documentation files.
 *
 * The trace is scanned line by line. A line contributes one read when it
 * parses as JSON, describes a `Read` tool call (`type === 'tool_call'` and
 * `toolName === 'Read'`), and its `input.file_path` is a string ending in
 * `docs/<anything>`, `AGENTS.md`, or `CLAUDE.md`. Blank lines, lines that are
 * not valid JSON, and events of any other shape are skipped.
 *
 * @param agentTrace - Newline-separated JSON events from an agent run.
 * @returns Map from normalized doc path (e.g. `docs/guide.md`, `AGENTS.md`)
 *   to the number of times it was read.
 */
function extractDocReads(agentTrace: string): Record<string, number> {
  // Matches from the first `docs/` path segment through the end of the path,
  // or a trailing AGENTS.md / CLAUDE.md file name. Hoisted out of the loop.
  const docPathPattern = /(?:^|\/)(?:docs\/.*|AGENTS\.md|CLAUDE\.md)$/
  const counts: Record<string, number> = {}

  for (const line of agentTrace.split('\n')) {
    if (!line.trim()) continue

    // Only the parse itself is guarded, so malformed events are rejected by
    // explicit checks below instead of by swallowed TypeErrors.
    let event: unknown
    try {
      event = JSON.parse(line)
    } catch {
      // Not a JSON line; skip it.
      continue
    }

    if (typeof event !== 'object' || event === null) continue
    const { type, toolName, input } = event as Record<string, unknown>
    if (type !== 'tool_call' || toolName !== 'Read') continue

    if (typeof input !== 'object' || input === null) continue
    const filePath = (input as Record<string, unknown>).file_path
    if (typeof filePath !== 'string') continue

    const match = docPathPattern.exec(filePath)
    if (!match) continue

    // Drop the leading slash captured by the pattern so keys are relative paths.
    const relPath = match[0].startsWith('/') ? match[0].slice(1) : match[0]
    counts[relPath] = (counts[relPath] ?? 0) + 1
  }

  return counts
}
49+
50+
/**
 * Merge several doc-read count maps into one by summing the counts per path.
 *
 * Totals are accumulated in a Map rather than a plain object so that keys
 * which collide with Object.prototype members (e.g. `constructor`,
 * `toString`) are summed as numbers instead of picking up the inherited value.
 *
 * @param maps - Count maps to combine; the inputs are not mutated.
 * @returns A new object mapping each path to the sum of its counts.
 */
function mergeDocReads(maps: Record<string, number>[]): Record<string, number> {
  const totals = new Map<string, number>()
  for (const m of maps) {
    for (const [docPath, count] of Object.entries(m)) {
      totals.set(docPath, (totals.get(docPath) ?? 0) + count)
    }
  }
  return Object.fromEntries(totals)
}
60+
2761
// --- Apply carve operations to a repo directory ---
2862

2963
function applyCarveOperations(repoDir: string, operations: FileOperation[]): void {
@@ -274,6 +308,8 @@ interface CarveEvalResult {
274308
docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
275309
docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }>
276310
totalCost: number
311+
/** Which doc files agents read and how many times (summed across all parallel runs). */
312+
docsRead: Record<string, number>
277313
}
278314

279315
async function runCarveEval(options: CarveEvalOptions): Promise<void> {
@@ -357,6 +393,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
357393
docsKept: [],
358394
docsRejected: [],
359395
totalCost,
396+
docsRead: {},
360397
})
361398
continue
362399
}
@@ -368,6 +405,15 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
368405
` Baseline: ${currentScore.toFixed(1)}/10 (${baselineScores.map((s) => s.toFixed(1)).join(', ')})`,
369406
)
370407

408+
// Track which docs agents read across all runs for this feature
409+
const baselineDocReads = mergeDocReads(validBaseline.map((r) => extractDocReads(r.agentTrace)))
410+
const docReadEntries = Object.entries(baselineDocReads).sort((a, b) => b[1] - a[1])
411+
if (docReadEntries.length > 0) {
412+
console.log(` Docs read (baseline): ${docReadEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`)
413+
} else {
414+
console.log(` Docs read (baseline): none`)
415+
}
416+
371417
const docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = []
372418
const docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = []
373419

@@ -510,6 +556,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
510556
docsKept,
511557
docsRejected,
512558
totalCost,
559+
docsRead: baselineDocReads,
513560
})
514561
}
515562

@@ -525,6 +572,12 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
525572
console.log(` Baseline: ${r.baselineScore.toFixed(1)}/10`)
526573
console.log(` Final: ${r.finalScore.toFixed(1)}/10`)
527574
console.log(` Docs kept: ${r.docsKept.length}, rejected: ${r.docsRejected.length}`)
575+
const readEntries = Object.entries(r.docsRead).sort((a, b) => b[1] - a[1])
576+
if (readEntries.length > 0) {
577+
console.log(` Docs read: ${readEntries.map(([p, n]) => `${p} (${n}x)`).join(', ')}`)
578+
} else {
579+
console.log(` Docs read: none`)
580+
}
528581
console.log(` Cost: $${r.totalCost.toFixed(2)}`)
529582
totalCostAll += r.totalCost
530583
}
@@ -538,6 +591,18 @@ async function runCarveEval(options: CarveEvalOptions): Promise<void> {
538591
console.log(` Average final: ${avgFinal.toFixed(1)}/10`)
539592
console.log(` Total cost: $${totalCostAll.toFixed(2)}`)
540593

594+
// Aggregate doc read stats across all features
595+
const allDocReads = mergeDocReads(results.map((r) => r.docsRead))
596+
const allReadEntries = Object.entries(allDocReads).sort((a, b) => b[1] - a[1])
597+
if (allReadEntries.length > 0) {
598+
console.log(`\n Doc read stats (all features):`)
599+
for (const [docPath, count] of allReadEntries) {
600+
console.log(` ${docPath}: ${count} reads`)
601+
}
602+
} else {
603+
console.log(`\n No docs were read by any agent.`)
604+
}
605+
541606
// Save results
542607
const outputPath = path.join(
543608
repoPath,

0 commit comments

Comments
 (0)