diff --git a/AGENTS.md b/AGENTS.md index ca06ab44c3..56320dd6bd 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -43,3 +43,4 @@ Make an efficient learning agent that can do anything. - [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions - [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls - [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) +- [docs/patterns/discover-before-implement.md](docs/patterns/discover-before-implement.md) diff --git a/bun.lock b/bun.lock index 3df586afb9..5c9ce08a53 100644 --- a/bun.lock +++ b/bun.lock @@ -115,6 +115,7 @@ "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", "ai": "^5.0.0", + "openai": "^6.33.0", "zod": "^4.2.1", }, }, @@ -2914,6 +2915,8 @@ "open": ["open@10.2.0", "", { "dependencies": { "default-browser": "^5.2.1", "define-lazy-prop": "^3.0.0", "is-inside-container": "^1.0.0", "wsl-utils": "^0.1.0" } }, "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA=="], + "openai": ["openai@6.33.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-xAYN1W3YsDXJWA5F277135YfkEk6H7D3D6vWwRhJ3OEkzRgcyK8z/P5P9Gyi/wB4N8kK9kM5ZjprfvyHagKmpw=="], + "openid-client": ["openid-client@5.7.1", "", { "dependencies": { "jose": "^4.15.9", "lru-cache": "^6.0.0", "object-hash": "^2.2.0", "oidc-token-hash": "^5.0.3" } }, "sha512-jDBPgSVfTnkIh71Hg9pRvtJc6wTwqjRkN88+gCFtYWrlP4Yx2Dsrow8uPi3qLr/aeymPF3o2+dS+wOpglK04ew=="], "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], diff --git 
a/evalbuff/package.json b/evalbuff/package.json index ac8a55395f..e97a2a3a8e 100644 --- a/evalbuff/package.json +++ b/evalbuff/package.json @@ -18,6 +18,7 @@ "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", "ai": "^5.0.0", + "openai": "^6.33.0", "zod": "^4.2.1" } } diff --git a/evalbuff/src/carve-features.ts b/evalbuff/src/carve-features.ts new file mode 100644 index 0000000000..080f1080ef --- /dev/null +++ b/evalbuff/src/carve-features.ts @@ -0,0 +1,533 @@ +/** + * Feature Carver for evalbuff v2. + * + * Instead of using git commits as evals, this: + * 1. Analyzes a codebase to identify discrete, self-contained features + * 2. Plans how to cleanly delete each feature + * 3. Produces diffs that remove the feature (code, docs, references) + * + * The output can then be used as eval tasks: give agents a simple prompt + * to rebuild the deleted feature, judge against the original code. + */ +import { execSync } from 'child_process' +import fs from 'fs' +import path from 'path' + +import OpenAI from 'openai' + +// --- Types --- + +export interface CarveCandidate { + id: string + name: string + prompt: string // Short, natural prompt to rebuild this feature + description: string // What this feature does + files: string[] // Files involved (to delete or modify) + complexity: 'small' | 'medium' | 'large' +} + +export interface CarvePlan { + candidates: CarveCandidate[] + reasoning: string +} + +export interface FileOperation { + path: string + action: 'delete' | 'modify' + /** For 'modify': the new file content with the feature removed */ + newContent?: string +} + +export interface CarvedFeature { + id: string + prompt: string + description: string + complexity: 'small' | 'medium' | 'large' + /** Files as they exist before carving (the "ground truth" to rebuild) */ + originalFiles: Record + /** Operations to perform to carve the feature out */ + operations: FileOperation[] + /** Unified diff of the carving (deletions) */ + diff: string +} + +export 
/**
 * Read a repo-relative file as UTF-8 text.
 *
 * `filePath` may originate from model output (the `files` list of a carve
 * candidate), so it is treated as untrusted: a path that resolves outside
 * `repoPath` (for example through `../` segments) is rejected instead of read.
 *
 * @param repoPath Root directory of the repository.
 * @param filePath Path of the file, relative to `repoPath`.
 * @returns The file contents, or `null` when the path escapes the repository
 *   or the file cannot be read (missing, a directory, permission error, ...).
 */
function readFile(repoPath: string, filePath: string): string | null {
  const root = path.resolve(repoPath)
  const fullPath = path.resolve(path.join(repoPath, filePath))

  // Containment check: the resolved target must not climb above the root.
  // Comparing against `..` + separator keeps names like `..notes.md` valid.
  const relative = path.relative(root, fullPath)
  const escapesRoot =
    relative === '..' ||
    relative.startsWith(`..${path.sep}`) ||
    path.isAbsolute(relative)
  if (escapesRoot) return null

  try {
    return fs.readFileSync(fullPath, 'utf-8')
  } catch {
    // Best-effort read: callers treat `null` as "not available".
    return null
  }
}
coding evaluation tasks. + +## Your Goal + +Identify 15-25 features in this codebase that could be cleanly removed and then rebuilt by a coding agent. Each feature should: + +1. **Be self-contained** — removing it leaves the rest of the codebase functional (maybe some missing imports/references, but structurally intact) +2. **Be describable in 1-2 sentences** — a developer could ask for it naturally +3. **Be non-trivial but bounded** — not a one-liner, but not "rewrite the whole app" +4. **Cover different aspects** — mix of UI components, API endpoints, utilities, config, tests, etc. +5. **Not overlap** — deleting feature A shouldn't also delete most of feature B + +## What makes a good carve candidate + +- A React component + its usage sites +- An API endpoint (route + handler + types) +- A CLI subcommand or flag +- A utility module used in a few places +- A feature behind a config/flag +- A test suite for a specific module +- A middleware or plugin +- An integration with an external service + +## What makes a BAD candidate + +- Core infrastructure that everything depends on (routing, auth framework, database connection) +- A single function that's called in 50 places +- Trivially small changes (rename, config tweak) +- Auto-generated or boilerplate code + +## Output Format + +Respond with valid JSON matching this schema: +{ + "reasoning": "Your analysis of the codebase and approach to selecting features", + "candidates": [ + { + "id": "short-kebab-id", + "name": "Human readable name", + "prompt": "Natural prompt a developer would use to ask for this feature, 1-2 sentences", + "description": "What this feature does and why it exists", + "files": ["path/to/file1.ts", "path/to/file2.tsx"], + "complexity": "small|medium|large" + } + ] +} + +Be thorough in listing ALL files involved in each feature — missing a file means the carve won't be clean.` + +export async function planFeatures(repoPath: string): Promise { + const client = getClient() + + const fileTree = 
getFileTree(repoPath) + const stats = getRepoStats(repoPath) + + // Read key files for context + const keyFiles = [ + 'package.json', + 'README.md', + 'CLAUDE.md', + 'tsconfig.json', + 'src/index.ts', + 'src/index.tsx', + 'src/app.ts', + 'src/app.tsx', + 'src/main.ts', + 'src/main.tsx', + ] + + let keyFileContents = '' + for (const kf of keyFiles) { + const content = readFile(repoPath, kf) + if (content) { + keyFileContents += `\n### ${kf}\n\`\`\`\n${content.slice(0, 5000)}\n\`\`\`\n` + } + } + + const userPrompt = `## Repository Stats +${stats} + +## File Tree +\`\`\` +${fileTree} +\`\`\` + +## Key Files +${keyFileContents || '(none found)'} + +Please analyze this codebase and identify 15-25 features that can be cleanly carved out for evaluation.` + + console.log('Planning features to carve...') + const response = await client.chat.completions.create({ + model: PLANNING_MODEL, + messages: [ + { role: 'system', content: PLANNING_SYSTEM }, + { role: 'user', content: userPrompt }, + ], + response_format: { type: 'json_object' }, + }) + + const text = response.choices[0]?.message?.content + if (!text) throw new Error('No response from planning model') + + const parsed = JSON.parse(text) as CarvePlan + console.log(`Identified ${parsed.candidates.length} carve candidates`) + return parsed +} + +// --- Phase 2: Execute carving for each feature --- + +const CARVING_SYSTEM = `You are a precise code surgeon. Your job is to cleanly remove a specific feature from a codebase. + +## Rules + +1. **Delete completely** — remove ALL code related to the feature: components, handlers, types, tests, docs, imports, route registrations, etc. +2. **Don't break the rest** — the remaining code should still be structurally valid. Fix imports, remove dead references, etc. +3. **Minimal collateral** — only remove what's necessary. Don't "improve" or refactor surrounding code. +4. **Be thorough** — check for references in other files. 
If file A imports something from the feature, update file A's imports. + +## Output Format + +Respond with valid JSON matching this schema: +{ + "operations": [ + { + "path": "path/to/file.ts", + "action": "delete" + }, + { + "path": "path/to/other-file.ts", + "action": "modify", + "newContent": "...full file content with feature removed..." + } + ] +} + +For "modify" operations, provide the COMPLETE new file content (not a diff). This must be the entire file with only the feature-related code removed. +For "delete" operations, the entire file will be removed. + +Only include files that actually need to change. Don't include files that are unaffected.` + +export async function carveFeature( + repoPath: string, + candidate: CarveCandidate, +): Promise { + const client = getClient() + + // Read all files involved + const fileContents: Record = {} + for (const filePath of candidate.files) { + const content = readFile(repoPath, filePath) + if (content) { + fileContents[filePath] = content + } + } + + if (Object.keys(fileContents).length === 0) { + console.warn(` No readable files for feature ${candidate.id}, skipping`) + return null + } + + // Also read files that might reference the feature's files (importers) + const referenceFiles = findReferencingFiles(repoPath, candidate.files) + for (const refFile of referenceFiles) { + if (!fileContents[refFile]) { + const content = readFile(repoPath, refFile) + if (content) { + fileContents[refFile] = content + } + } + } + + let filesSection = '' + for (const [filePath, content] of Object.entries(fileContents)) { + const isFeatureFile = candidate.files.includes(filePath) + const label = isFeatureFile ? 
'(FEATURE FILE)' : '(REFERENCING FILE)' + filesSection += `\n### ${filePath} ${label}\n\`\`\`\n${content}\n\`\`\`\n` + } + + const userPrompt = `## Feature to Remove +**Name:** ${candidate.name} +**Description:** ${candidate.description} +**Feature files:** ${candidate.files.join(', ')} + +## Current File Contents +${filesSection} + +Remove this feature completely. For files that are entirely part of the feature, use "delete". For files that contain the feature mixed with other code, use "modify" and provide the full updated content.` + + console.log(` Carving feature: ${candidate.id}...`) + const response = await client.chat.completions.create({ + model: CARVING_MODEL, + messages: [ + { role: 'system', content: CARVING_SYSTEM }, + { role: 'user', content: userPrompt }, + ], + response_format: { type: 'json_object' }, + }) + + const text = response.choices[0]?.message?.content + if (!text) { + console.warn(` No response for feature ${candidate.id}`) + return null + } + + const parsed = JSON.parse(text) as { operations: FileOperation[] } + + // Compute diff + const diff = computeDiff(repoPath, parsed.operations) + + // Save original files (only the feature files, for judging) + const originalFiles: Record = {} + for (const filePath of candidate.files) { + if (fileContents[filePath]) { + originalFiles[filePath] = fileContents[filePath] + } + } + + return { + id: candidate.id, + prompt: candidate.prompt, + description: candidate.description, + complexity: candidate.complexity, + originalFiles, + operations: parsed.operations, + diff, + } +} + +// --- Helpers --- + +/** + * Find files that import/reference any of the given files. + * Uses git grep to find import statements. 
/**
 * Find tracked source files that mention any of the given feature files.
 *
 * For each feature file, its base name (extension stripped) is searched as a
 * literal string with `git grep -l` across tracked `.ts`, `.tsx`, `.js` and
 * `.jsx` files. This is a loose heuristic: any textual mention of the base
 * name counts, not only import statements.
 *
 * Feature file names can come from model output, so the search term is
 * single-quoted for the shell and matched with `-F` (fixed string) rather
 * than being interpolated raw and interpreted as a regular expression.
 *
 * @param repoPath Directory in which `git grep` is executed.
 * @param featureFiles Repo-relative paths of the feature's own files; these
 *   are never reported as referencing files.
 * @returns Up to 20 distinct paths, in first-seen order. Empty when nothing
 *   matches or `git grep` cannot run.
 */
function findReferencingFiles(
  repoPath: string,
  featureFiles: string[],
): string[] {
  const maxReferencingFiles = 20

  // POSIX single-quote escaping: close the quote, emit an escaped quote,
  // reopen. The glob arguments below already assume a POSIX shell.
  const shellQuote = (value: string): string =>
    `'${value.replace(/'/g, "'\\''")}'`

  const ownFiles = new Set<string>(featureFiles)
  const referencingFiles = new Set<string>()

  for (const featureFile of featureFiles) {
    // Module name without extension, e.g. "src/foo/bar.ts" -> "bar".
    const basename = path.basename(featureFile).replace(/\.[^.]+$/, '')
    // Dotfiles such as ".gitignore" reduce to "", which would match every file.
    if (basename === '') continue

    try {
      const matches = execSync(
        `git grep -l -F -e ${shellQuote(basename)} -- '*.ts' '*.tsx' '*.js' '*.jsx'`,
        {
          cwd: repoPath,
          encoding: 'utf-8',
          maxBuffer: 10 * 1024 * 1024,
        },
      )
        .trim()
        .split('\n')
        .filter(Boolean)

      for (const match of matches) {
        // Don't include the feature's own files
        if (!ownFiles.has(match)) {
          referencingFiles.add(match)
        }
      }
    } catch {
      // git grep exits with status 1 when there are no matches; any other
      // failure (not a repo, git missing) is also treated as "no references".
    }
  }

  // Limit to a reasonable number so the carving prompt stays bounded.
  return [...referencingFiles].slice(0, maxReferencingFiles)
}
fs.readFileSync(fullPath, 'utf-8') + : '' + + if (op.action === 'delete') { + // Show the full file as deleted + const lines = originalContent.split('\n') + const header = `--- a/${op.path}\n+++ /dev/null` + const hunk = `@@ -1,${lines.length} +0,0 @@\n` + + lines.map((l) => `-${l}`).join('\n') + diffs.push(`${header}\n${hunk}`) + } else if (op.action === 'modify' && op.newContent !== undefined) { + // Compute line-level diff + const oldLines = originalContent.split('\n') + const newLines = op.newContent.split('\n') + // Use a simple diff representation — the full before/after + const header = `--- a/${op.path}\n+++ b/${op.path}` + // For now, show full replacement (not optimal but correct) + const hunk = `@@ -1,${oldLines.length} +1,${newLines.length} @@\n` + + oldLines.map((l) => `-${l}`).join('\n') + '\n' + + newLines.map((l) => `+${l}`).join('\n') + diffs.push(`${header}\n${hunk}`) + } + } + + return diffs.join('\n\n') +} + +// --- Main orchestrator --- + +export async function carveFeatures( + repoPath: string, + options: { + count?: number // Number of features to carve (default: 10) + outputPath?: string + } = {}, +): Promise { + const { count = 10, outputPath } = options + + console.log(`\nCarving features from: ${repoPath}`) + console.log(`Target: ${count} features\n`) + + // Phase 1: Plan + const plan = await planFeatures(repoPath) + + console.log(`\nPlanning complete. 
Reasoning:\n${plan.reasoning}\n`) + console.log('Candidates:') + for (const c of plan.candidates) { + console.log(` ${c.id} (${c.complexity}): ${c.name}`) + console.log(` Prompt: ${c.prompt}`) + console.log(` Files: ${c.files.join(', ')}`) + } + + // Select top N candidates (prefer medium complexity) + const ranked = [...plan.candidates].sort((a, b) => { + const complexityOrder = { medium: 0, small: 1, large: 2 } + return complexityOrder[a.complexity] - complexityOrder[b.complexity] + }) + const selected = ranked.slice(0, count) + + console.log(`\nSelected ${selected.length} features for carving:\n`) + + // Phase 2: Carve each feature + const features: CarvedFeature[] = [] + for (const candidate of selected) { + try { + const carved = await carveFeature(repoPath, candidate) + if (carved) { + features.push(carved) + console.log(` ✓ ${carved.id} — ${carved.operations.length} file operations`) + } + } catch (error) { + console.error(` ✗ ${candidate.id} failed:`, error) + } + } + + const result: CarveResult = { + repoPath, + generationDate: new Date().toISOString(), + features, + } + + // Save output + const outPath = + outputPath || + path.join(repoPath, `carve-${new Date().toISOString().slice(0, 10)}.json`) + fs.writeFileSync(outPath, JSON.stringify(result, null, 2)) + console.log(`\nSaved ${features.length} carved features to: ${outPath}`) + + return result +} + +// --- CLI --- + +if (import.meta.main) { + const args = process.argv.slice(2) + + const getArg = (name: string, defaultValue?: string): string => { + const idx = args.indexOf(`--${name}`) + if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] + if (defaultValue !== undefined) return defaultValue + throw new Error(`Missing required argument: --${name}`) + } + + const repoPath = getArg('repo') + const count = parseInt(getArg('count', '10')) + const outputPath = args.indexOf('--output') >= 0 ? 
/**
 * Apply carve operations to a checked-out repository directory.
 *
 * - `delete` removes the file if it exists (a missing file is not an error).
 * - `modify` overwrites the file with `newContent`, creating parent
 *   directories as needed. A `modify` without `newContent` is ignored.
 *
 * Operation paths come from a carve JSON file, so they are validated first:
 * every path must resolve to a location strictly inside `repoDir`.
 *
 * @param repoDir Root of the repository clone to mutate.
 * @param operations Operations to apply, in order.
 * @throws Error when an operation path resolves to `repoDir` itself or to a
 *   location outside it. Operations before the offending one have already
 *   been applied at that point.
 */
function applyCarveOperations(repoDir: string, operations: FileOperation[]): void {
  const root = path.resolve(repoDir)

  for (const op of operations) {
    const fullPath = path.resolve(path.join(repoDir, op.path))

    // Refuse to touch anything outside the clone (e.g. "../../etc/hosts").
    const relative = path.relative(root, fullPath)
    const outsideRepo =
      relative === '' ||
      relative === '..' ||
      relative.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relative)
    if (outsideRepo) {
      throw new Error(
        `Carve operation path escapes the repository: ${JSON.stringify(op.path)}`,
      )
    }

    if (op.action === 'delete') {
      if (fs.existsSync(fullPath)) {
        fs.rmSync(fullPath)
      }
    } else if (op.action === 'modify' && op.newContent !== undefined) {
      fs.mkdirSync(path.dirname(fullPath), { recursive: true })
      fs.writeFileSync(fullPath, op.newContent)
    }
  }
}
/**
 * Build the "ground truth" diff for a carved feature: the reverse of the
 * carve, i.e. what has to be added back to restore the original code.
 *
 * The output is a simplified unified diff with one whole-file hunk per
 * operation (all carved lines removed, all original lines added), intended
 * for reading rather than for `git apply`.
 *
 * Operations are skipped when:
 * - the file has no non-empty entry in `feature.originalFiles`, since there
 *   is no original content to diff against; or
 * - the operation is a `modify` without `newContent`, which is a no-op when
 *   the carve is applied.
 *
 * @param feature Carved feature with its operations and original file contents.
 * @returns Per-file diffs joined by a blank line; empty string when no
 *   operation contributes.
 */
function computeGroundTruthDiff(feature: CarvedFeature): string {
  // Split into lines without treating the final newline as an extra empty
  // line, so hunk counts match the real number of lines in the file.
  const toLines = (content: string): string[] => {
    if (content === '') return []
    const body = content.endsWith('\n') ? content.slice(0, -1) : content
    return body.split('\n')
  }

  const diffs: string[] = []

  for (const op of feature.operations) {
    const original = feature.originalFiles[op.path]
    if (!original) continue

    if (op.action === 'delete') {
      // File was deleted — ground truth is to recreate it
      const restored = toLines(original).map((line) => `+${line}`)
      diffs.push(
        [
          '--- /dev/null',
          `+++ b/${op.path}`,
          `@@ -0,0 +1,${restored.length} @@`,
          ...restored,
        ].join('\n'),
      )
    } else if (op.action === 'modify' && op.newContent !== undefined) {
      // File was modified — ground truth is the original version
      const removed = toLines(op.newContent).map((line) => `-${line}`)
      const restored = toLines(original).map((line) => `+${line}`)
      // Unified diff convention: an empty side is written as "0,0".
      const oldStart = removed.length === 0 ? 0 : 1
      diffs.push(
        [
          `--- a/${op.path}`,
          `+++ b/${op.path}`,
          `@@ -${oldStart},${removed.length} +1,${restored.length} @@`,
          ...removed,
          ...restored,
        ].join('\n'),
      )
    }
  }

  return diffs.join('\n\n')
}
--allow-empty`, + { cwd: repoDir, stdio: 'ignore' }, + ) + const carveSha = execSync('git rev-parse HEAD', { + cwd: repoDir, + encoding: 'utf-8', + }).trim() + + // Run init command if provided + if (initCommand) { + try { + execSync(initCommand, { cwd: repoDir, stdio: 'ignore' }) + } catch (e) { + console.warn(`Init command failed: ${e}`) + } + } + + return await fn(repoDir, carveSha) + } finally { + try { + fs.rmSync(tempDir, { recursive: true, force: true }) + } catch { + // ignore + } + } +} + +// --- Run a single agent on a carved repo --- + +async function runAgentOnCarve(opts: { + idx: number + total: number + repoPath: string + feature: CarvedFeature + initCommand?: string + model: string + agentTimeoutMs: number + groundTruthDiff: string + reviewerAgents: ReviewerAgentType[] + docsSourcePath: string +}): Promise<{ + score: number + diff: string + agentTrace: string + judging: JudgingResult + costEstimate: number +}> { + const { + idx, + total, + repoPath, + feature, + initCommand, + model, + agentTimeoutMs, + groundTruthDiff, + reviewerAgents, + docsSourcePath, + } = opts + + return withCarvedRepo(repoPath, feature, initCommand, async (repoDir, carveSha) => { + // Copy docs into the carved repo + copyDocsIntoRepo(docsSourcePath, repoDir) + + console.log(` [Run ${idx + 1}/${total}] Running claude (${model}) on carved repo...`) + const runner = new ClaudeRunner(repoDir, {}, model) + + let result: RunnerResult + try { + result = await runner.run(feature.prompt) + } catch (runError) { + const errMsg = + runError instanceof Error ? 
/**
 * Copy the `docs/` directory and `AGENTS.md` from one repo into another and
 * commit them there, so the copied docs are part of the committed state.
 *
 * Only the paths that were actually copied are staged. Passing a path that
 * does not exist to `git add` makes the whole command fail and stage nothing,
 * which would leave the copied docs untracked whenever just one of `docs/` or
 * `AGENTS.md` is present in the source.
 *
 * Staging and committing are best-effort: a failure (for example the target
 * not being a git repository) is logged and otherwise ignored; the files stay
 * copied on disk.
 *
 * @param sourceRepoPath Repo to read `docs/` and `AGENTS.md` from.
 * @param targetRepoPath Repo that receives the copies and the commit.
 */
function copyDocsIntoRepo(sourceRepoPath: string, targetRepoPath: string): void {
  // Fixed, repo-relative names only; nothing user-controlled reaches the shell.
  const copiedPaths: string[] = []

  const sourceDocsDir = path.join(sourceRepoPath, 'docs')
  if (fs.existsSync(sourceDocsDir)) {
    fs.cpSync(sourceDocsDir, path.join(targetRepoPath, 'docs'), { recursive: true })
    copiedPaths.push('docs')
  }

  const sourceAgentsMd = path.join(sourceRepoPath, 'AGENTS.md')
  if (fs.existsSync(sourceAgentsMd)) {
    fs.cpSync(sourceAgentsMd, path.join(targetRepoPath, 'AGENTS.md'))
    copiedPaths.push('AGENTS.md')
  }

  if (copiedPaths.length === 0) return

  try {
    // -A stages new, modified and deleted files under the given paths.
    execSync(`git add -A -- ${copiedPaths.join(' ')}`, {
      cwd: targetRepoPath,
      stdio: 'ignore',
    })
    execSync('git commit -m "evalbuff: pre-load docs" --allow-empty', {
      cwd: targetRepoPath,
      stdio: 'ignore',
    })
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error)
    console.warn(`Failed to commit pre-loaded docs: ${reason.slice(0, 200)}`)
  }
}
string // run only this feature (default: all) + model: string + parallelism: number + agentTimeoutMs: number + reviewerAgents: ReviewerAgentType[] + initCommand?: string + maxImprovementIterations: number +} + +interface CarveEvalResult { + featureId: string + prompt: string + baselineScore: number + finalScore: number + docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> + docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> + totalCost: number +} + +async function runCarveEval(options: CarveEvalOptions): Promise { + const { + repoPath, + carveFile, + featureId, + model, + parallelism, + agentTimeoutMs, + reviewerAgents, + initCommand, + maxImprovementIterations, + } = options + + // Load carve data + const carveData: CarveResult = JSON.parse( + fs.readFileSync(carveFile, 'utf-8'), + ) + + // Select features + let features = carveData.features + if (featureId) { + features = features.filter((f) => f.id === featureId) + if (features.length === 0) { + console.error( + `Feature "${featureId}" not found. 
Available: ${carveData.features.map((f) => f.id).join(', ')}`, + ) + process.exit(1) + } + } + + console.log(`\nCarve Eval:`) + console.log(` Repo: ${repoPath}`) + console.log(` Model: ${model}`) + console.log(` Parallelism: ${parallelism}`) + console.log(` Reviewers: ${reviewerAgents.join(', ')}`) + console.log(` Features: ${features.length}`) + console.log(` Max doc improvement iterations: ${maxImprovementIterations}`) + + const results: CarveEvalResult[] = [] + + for (const feature of features) { + console.log(`\n${'='.repeat(60)}`) + console.log(`Feature: ${feature.id}`) + console.log(`Prompt: ${feature.prompt}`) + console.log(`Operations: ${feature.operations.length} (${feature.operations.filter((o) => o.action === 'delete').length} deletes, ${feature.operations.filter((o) => o.action === 'modify').length} modifies)`) + console.log(`${'='.repeat(60)}`) + + const groundTruthDiff = computeGroundTruthDiff(feature) + + // --- Baseline: run agents in parallel --- + console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) + const baselineResults = await Promise.all( + Array.from({ length: parallelism }, (_, i) => + runAgentOnCarve({ + idx: i, + total: parallelism, + repoPath, + feature, + initCommand, + model, + agentTimeoutMs, + groundTruthDiff, + reviewerAgents, + docsSourcePath: repoPath, + }), + ), + ) + + const validBaseline = baselineResults.filter((r) => r.score >= 0) + let totalCost = baselineResults.reduce((a, r) => a + r.costEstimate, 0) + + if (validBaseline.length === 0) { + console.log(` All agents failed. 
Skipping feature.`) + results.push({ + featureId: feature.id, + prompt: feature.prompt, + baselineScore: 0, + finalScore: 0, + docsKept: [], + docsRejected: [], + totalCost, + }) + continue + } + + const baselineScores = validBaseline.map((r) => r.score) + let currentScore = + baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length + console.log( + ` Baseline: ${currentScore.toFixed(1)}/10 (${baselineScores.map((s) => s.toFixed(1)).join(', ')})`, + ) + + const docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] + const docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] + + // --- Doc improvement loop --- + if (currentScore < 9.0) { + let latestJudgings = validBaseline.map((r) => r.judging) + let latestDiffs = validBaseline.map((r) => r.diff) + let latestTraces = validBaseline.map((r) => r.agentTrace) + + for (let iter = 0; iter < maxImprovementIterations; iter++) { + // Pick worst run for analysis + const worstIdx = latestJudgings.reduce( + (minIdx, j, idx, arr) => + j.overallScore < arr[minIdx].overallScore ? 
idx : minIdx, + 0, + ) + + const currentDocs = readCurrentDocs(repoPath) + const editHistory = [ + ...docsKept.map((d) => ({ ...d, outcome: 'accepted' as const })), + ...docsRejected.map((d) => ({ ...d, outcome: 'rejected' as const })), + ] + + console.log(` Analyzing for doc improvements (iteration ${iter + 1})...`) + const docSuggestion = await analyzeFailure({ + judgeResult: latestJudgings[worstIdx], + taskPrompt: feature.prompt, + agentDiff: latestDiffs[worstIdx], + agentTrace: latestTraces[worstIdx], + groundTruthDiff, + currentDocs, + editHistory, + }) + + if (!docSuggestion) { + console.log(` No doc suggestion — stopping.`) + break + } + + console.log(` Doc suggestion: ${docSuggestion.suggestedDocPath}`) + console.log(` Reasoning: ${docSuggestion.reasoning}`) + + // Save previous content for revert + const docFullPath = path.join(repoPath, 'docs', docSuggestion.suggestedDocPath) + const previousContent = fs.existsSync(docFullPath) + ? fs.readFileSync(docFullPath, 'utf-8') + : null + + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, docSuggestion.suggestedContent) + + // Re-run with new docs + console.log(` Re-running ${parallelism} agents with new docs...`) + const rerunResults = await Promise.all( + Array.from({ length: parallelism }, (_, i) => + runAgentOnCarve({ + idx: i, + total: parallelism, + repoPath, + feature, + initCommand, + model, + agentTimeoutMs, + groundTruthDiff, + reviewerAgents, + docsSourcePath: repoPath, + }), + ), + ) + + const validRerun = rerunResults.filter((r) => r.score >= 0) + totalCost += rerunResults.reduce((a, r) => a + r.costEstimate, 0) + + if (validRerun.length === 0) { + console.log(` Re-run failed. 
Reverting doc.`) + if (previousContent !== null) { + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + break + } + + const rerunScores = validRerun.map((r) => r.score) + const rerunAvg = + rerunScores.reduce((a, b) => a + b, 0) / rerunScores.length + const comparison = compareScores(currentScore, rerunAvg) + console.log( + ` New score: ${rerunAvg.toFixed(1)}/10 (${comparison}) (${rerunScores.map((s) => s.toFixed(1)).join(', ')})`, + ) + + if (comparison === 'improved' || comparison === 'same') { + const reason = comparison === 'improved' ? 'improved' : 'within noise, keeping' + console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) + docsKept.push({ + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + scoreBefore: currentScore, + scoreAfter: rerunAvg, + }) + + // Commit the doc + try { + execSync('git add docs/ AGENTS.md', { cwd: repoPath, stdio: 'ignore' }) + execSync( + `git commit -m "evalbuff: add ${docSuggestion.suggestedDocPath} (carve: ${feature.id})"`, + { cwd: repoPath, stdio: 'ignore' }, + ) + } catch { + console.warn('Failed to commit doc change') + } + + currentScore = rerunAvg + latestJudgings = validRerun.map((r) => r.judging) + latestDiffs = validRerun.map((r) => r.diff) + latestTraces = validRerun.map((r) => r.agentTrace) + } else { + console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath}`) + docsRejected.push({ + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + scoreBefore: currentScore, + scoreAfter: rerunAvg, + }) + + if (previousContent !== null) { + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + break + } + } + } + + results.push({ + featureId: feature.id, + prompt: feature.prompt, + baselineScore: baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length, + 
finalScore: currentScore, + docsKept, + docsRejected, + totalCost, + }) + } + + // --- Summary --- + console.log(`\n${'='.repeat(60)}`) + console.log('CARVE EVAL RESULTS') + console.log(`${'='.repeat(60)}`) + + let totalCostAll = 0 + for (const r of results) { + console.log(`\n ${r.featureId}:`) + console.log(` Prompt: ${r.prompt.slice(0, 80)}...`) + console.log(` Baseline: ${r.baselineScore.toFixed(1)}/10`) + console.log(` Final: ${r.finalScore.toFixed(1)}/10`) + console.log(` Docs kept: ${r.docsKept.length}, rejected: ${r.docsRejected.length}`) + console.log(` Cost: $${r.totalCost.toFixed(2)}`) + totalCostAll += r.totalCost + } + + const avgBaseline = + results.reduce((a, r) => a + r.baselineScore, 0) / results.length + const avgFinal = + results.reduce((a, r) => a + r.finalScore, 0) / results.length + + console.log(`\n Average baseline: ${avgBaseline.toFixed(1)}/10`) + console.log(` Average final: ${avgFinal.toFixed(1)}/10`) + console.log(` Total cost: $${totalCostAll.toFixed(2)}`) + + // Save results + const outputPath = path.join( + repoPath, + `carve-eval-results-${new Date().toISOString().slice(0, 10)}.json`, + ) + fs.writeFileSync(outputPath, JSON.stringify(results, null, 2)) + console.log(`\nResults saved to: ${outputPath}`) +}
+
+// --- CLI ---
+
+/**
+ * Command-line entry point; runs only when this module is executed directly.
+ *
+ * Flags (each takes the following token as its value):
+ *   --repo            required, passed to runCarveEval as repoPath
+ *   --carve-file      required, passed to runCarveEval as carveFile
+ *   --feature         optional, passed as featureId
+ *   --model           default 'sonnet'
+ *   --parallelism     default 3, must be an integer >= 1
+ *   --agent-timeout   default 300000, must be an integer >= 1
+ *   --reviewers       comma-separated list, default claude,codex
+ *   --init-command    optional
+ *   --max-iterations  default 3, must be an integer >= 0
+ *
+ * Malformed arguments throw before any work starts. A rejection from
+ * runCarveEval is logged and the process exits with code 1.
+ */
+if (import.meta.main) {
+  const args = process.argv.slice(2)
+
+  /**
+   * Returns the token following `--name`, or undefined when the flag is absent.
+   * Throws when the flag is present but followed by nothing or by another
+   * `--flag`, so a forgotten value is never silently replaced by a default
+   * or by the next flag's name.
+   */
+  const getOptionalArg = (name: string): string | undefined => {
+    const idx = args.indexOf(`--${name}`)
+    if (idx < 0) return undefined
+    const value = args[idx + 1]
+    if (value === undefined || value.startsWith('--')) {
+      throw new Error(`Missing value for argument: --${name}`)
+    }
+    return value
+  }
+
+  /** Returns the value of `--name`, falling back to `defaultValue`; throws if neither exists. */
+  const getArg = (name: string, defaultValue?: string): string => {
+    const value = getOptionalArg(name) ?? defaultValue
+    if (value === undefined) {
+      throw new Error(`Missing required argument: --${name}`)
+    }
+    return value
+  }
+
+  /**
+   * Parses `--name` as a base-10 integer that is at least `min`.
+   * Only plain digit strings are accepted, so inputs such as "abc", "3.5" or
+   * "10ms" are rejected instead of becoming NaN or a truncated number.
+   */
+  const getIntArg = (name: string, defaultValue: number, min: number): number => {
+    const raw = getArg(name, String(defaultValue))
+    const value = /^\d+$/.test(raw) ? Number.parseInt(raw, 10) : Number.NaN
+    if (!Number.isSafeInteger(value) || value < min) {
+      throw new Error(
+        `Invalid value for --${name}: expected an integer >= ${min}, got "${raw}"`,
+      )
+    }
+    return value
+  }
+
+  const repoPath = getArg('repo')
+  const carveFile = getArg('carve-file')
+  const featureId = getOptionalArg('feature')
+  const model = getArg('model', 'sonnet')
+  const parallelism = getIntArg('parallelism', 3, 1)
+  const agentTimeoutMs = getIntArg('agent-timeout', 300000, 1)
+
+  // Tolerate spaces and stray commas ("claude, codex,"); an empty list falls
+  // back to the defaults, as an empty --reviewers value always did.
+  // NOTE(review): names are cast, not validated; the valid ReviewerAgentType
+  // values are declared outside this block.
+  const requestedReviewers = (getOptionalArg('reviewers') ?? '')
+    .split(',')
+    .map((entry) => entry.trim())
+    .filter((entry) => entry.length > 0)
+  const reviewerAgents: ReviewerAgentType[] =
+    requestedReviewers.length > 0
+      ? (requestedReviewers as ReviewerAgentType[])
+      : ['claude', 'codex']
+
+  const initCommand = getOptionalArg('init-command')
+  const maxImprovementIterations = getIntArg('max-iterations', 3, 0)
+
+  runCarveEval({
+    repoPath,
+    carveFile,
+    featureId,
+    model,
+    parallelism,
+    agentTimeoutMs,
+    reviewerAgents,
+    initCommand,
+    maxImprovementIterations,
+  }).catch((error) => {
+    console.error('Carve eval failed:', error)
+    process.exit(1)
+  })
+}
diff --git a/evalbuff/src/runners/claude.ts b/evalbuff/src/runners/claude.ts index 1ecd200567..2c1f228f51 100644 --- a/evalbuff/src/runners/claude.ts +++ b/evalbuff/src/runners/claude.ts @@ -9,10 +9,16 @@ import type { export class ClaudeRunner implements Runner { private cwd: string private env: Record + private model: string - constructor(cwd: string, env: Record = {}) { + constructor( + cwd: string, + env: Record = {}, + model: string = 'claude-opus-4-5-20251101', + ) { this.cwd = cwd this.env = env + this.model = model } async run(prompt: string): Promise { @@ -28,7 +34,7 @@ export class ClaudeRunner implements Runner { '--verbose', '--dangerously-skip-permissions', '--model', - 'claude-opus-4-5-20251101', + this.model, ] console.log(`[ClaudeRunner] Running: claude ${args.join(' ')}`)