From d1bd4532b8a4a864d24d6514098b79e7ab2f2fd2 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 16:36:09 -0700 Subject: [PATCH 1/4] evalbuff: add patterns/discover-before-implement.md (carve: cli-init-command) --- AGENTS.md | 1 + docs/patterns/discover-before-implement.md | 100 +++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 docs/patterns/discover-before-implement.md diff --git a/AGENTS.md b/AGENTS.md index ca06ab44c..56320dd6b 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -43,3 +43,4 @@ Make an efficient learning agent that can do anything. - [`docs/agents-and-tools.md`](docs/agents-and-tools.md) — Agent system, shell shims, tool definitions - [`docs/patterns/handle-steps-generators.md`](docs/patterns/handle-steps-generators.md) — handleSteps generator patterns and spawn_agents tool calls - [docs/evalbuff/interpreting-task-prompts.md](docs/evalbuff/interpreting-task-prompts.md) +- [docs/patterns/discover-before-implement.md](docs/patterns/discover-before-implement.md) diff --git a/docs/patterns/discover-before-implement.md b/docs/patterns/discover-before-implement.md new file mode 100644 index 000000000..59e661b9e --- /dev/null +++ b/docs/patterns/discover-before-implement.md @@ -0,0 +1,100 @@ +# Discover Before Implement: Find Existing Patterns First + +Before implementing ANY new feature, search for existing utilities, constants, and templates in the codebase. Duplicating content that already exists is the most common source of architectural drift. + +## Key Locations to Check in This Codebase + +### Project Root Utilities +- **`cli/src/project-files.ts`** — Use `getProjectRoot()` (NOT `process.cwd()`) to get the user's project directory. `setProjectRoot()` sets it at startup. +- **`cli/src/utils/analytics.ts`** — Use `trackEvent(AnalyticsEvent.X, {...})` to track user actions. Constants are in `common/src/constants/analytics-events.ts`. 
+ +### Template Files (DO NOT duplicate as strings) +Template files that users receive when scaffolding live in: +``` +common/src/templates/initial-agents-dir/ + types/agent-definition.ts ← import with Bun text import + types/tools.ts ← import with Bun text import + types/util-types.ts ← import with Bun text import + my-custom-agent.ts + package.json +``` + +Import them as text (Bun-specific, requires `@ts-expect-error`): +```typescript +// @ts-expect-error - Bun text import attribute not supported by TypeScript +import agentDefinitionSource from '../../../common/src/templates/initial-agents-dir/types/agent-definition' with { type: 'text' } +``` + +### Named Constants +- **Knowledge file name**: `PRIMARY_KNOWLEDGE_FILE_NAME` from `@codebuff/common/constants/knowledge` — use this, don't hardcode `'knowledge.md'` +- **Brand name**: `IS_FREEBUFF` from `cli/src/utils/constants` → use `const brandName = IS_FREEBUFF ? 'Freebuff' : 'Codebuff'` + +## CLI Command Pattern + +When a command produces its own system messages (generated locally rather than by the AI) and still passes the user's input on through `sendMessage`, the handler returns `{ postUserMessage }` and the command-registry calls `params.sendMessage({ content, agentMode, postUserMessage })`: + +```typescript +// In command-registry.ts: +defineCommand({ + name: 'init', + handler: async (params) => { + const { postUserMessage } = handleInitializationFlowLocally() + // Handle streaming/queue state check... 
+ params.sendMessage({ + content: trimmed, + agentMode: params.agentMode, + postUserMessage, // ← injected into message, NOT setMessages directly + }) + }, +}) +``` + +For commands that only show system messages (no AI response), use `params.setMessages`: +```typescript +params.setMessages((prev) => postUserMessage(prev)) +``` + +## The postUserMessage Contract + +Handlers that produce UI messages return this shape (from `cli/src/types/contracts/send-message.ts`): +```typescript +type PostUserMessageFn = (prev: ChatMessage[]) => ChatMessage[] +// Return: { postUserMessage: PostUserMessageFn } +``` + +Use `getSystemMessage(text)` from `cli/src/utils/message-history.ts` to create each message. + +## Checklist Before Writing New Code + +1. **Is there a constant for this?** Search `common/src/constants/` first +2. **Is there a utility for this path operation?** Check `cli/src/project-files.ts` +3. **Does a template file already exist?** Check `common/src/templates/` +4. **Should I track analytics?** Most user-facing actions should call `trackEvent()` +5. **What is the naming convention?** Look at 2-3 existing similar handlers (e.g., `handleHelpCommand`, `handleUsageCommand`) before naming your function + +## Anti-patterns + +**DON'T** hardcode template content as string literals: +```typescript +// BAD - duplicates content that exists in common/src/templates/ +const TYPES_AGENT_DEFINITION = `export interface AgentDefinition { ... 
}` +``` + +**DO** import from the canonical template location: +```typescript +// GOOD +import agentDefinitionSource from '../../../common/src/templates/initial-agents-dir/types/agent-definition' with { type: 'text' } +``` + +**DON'T** use `process.cwd()` in CLI commands: +```typescript +// BAD +const projectRoot = process.cwd() +``` + +**DO** use the project-files utility: +```typescript +// GOOD +import { getProjectRoot } from '../project-files' +const projectRoot = getProjectRoot() +``` From 9e532f3dea197244a9b7a5e4595846ab5ca828d7 Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 16:56:05 -0700 Subject: [PATCH 2/4] evalbuff: add carve-based eval pipeline (delete & rebuild) New approach to evals that carves features out of the current codebase and has agents rebuild them, instead of replaying git commits. Uses OpenAI SDK (gpt-5.4) to identify and surgically remove features, then runs agents in parallel to rebuild from a natural prompt, judges against original code, and iterates on docs. 
Co-Authored-By: Claude Opus 4.6 --- bun.lock | 3 + evalbuff/package.json | 1 + evalbuff/src/carve-features.ts | 533 ++++++++++++++++++++++++++++ evalbuff/src/run-carve-eval.ts | 614 +++++++++++++++++++++++++++++++++ 4 files changed, 1151 insertions(+) create mode 100644 evalbuff/src/carve-features.ts create mode 100644 evalbuff/src/run-carve-eval.ts diff --git a/bun.lock b/bun.lock index 3df586afb..5c9ce08a5 100644 --- a/bun.lock +++ b/bun.lock @@ -115,6 +115,7 @@ "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", "ai": "^5.0.0", + "openai": "^6.33.0", "zod": "^4.2.1", }, }, @@ -2914,6 +2915,8 @@ "open": ["open@10.2.0", "", { "dependencies": { "default-browser": "^5.2.1", "define-lazy-prop": "^3.0.0", "is-inside-container": "^1.0.0", "wsl-utils": "^0.1.0" } }, "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA=="], + "openai": ["openai@6.33.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-xAYN1W3YsDXJWA5F277135YfkEk6H7D3D6vWwRhJ3OEkzRgcyK8z/P5P9Gyi/wB4N8kK9kM5ZjprfvyHagKmpw=="], + "openid-client": ["openid-client@5.7.1", "", { "dependencies": { "jose": "^4.15.9", "lru-cache": "^6.0.0", "object-hash": "^2.2.0", "oidc-token-hash": "^5.0.3" } }, "sha512-jDBPgSVfTnkIh71Hg9pRvtJc6wTwqjRkN88+gCFtYWrlP4Yx2Dsrow8uPi3qLr/aeymPF3o2+dS+wOpglK04ew=="], "optionator": ["optionator@0.9.4", "", { "dependencies": { "deep-is": "^0.1.3", "fast-levenshtein": "^2.0.6", "levn": "^0.4.1", "prelude-ls": "^1.2.1", "type-check": "^0.4.0", "word-wrap": "^1.2.5" } }, "sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g=="], diff --git a/evalbuff/package.json b/evalbuff/package.json index ac8a55395..e97a2a3a8 100644 --- a/evalbuff/package.json +++ b/evalbuff/package.json @@ -18,6 +18,7 @@ "@codebuff/common": "workspace:*", "@codebuff/sdk": "workspace:*", "ai": "^5.0.0", + "openai": "^6.33.0", 
"zod": "^4.2.1" } } diff --git a/evalbuff/src/carve-features.ts b/evalbuff/src/carve-features.ts new file mode 100644 index 000000000..080f1080e --- /dev/null +++ b/evalbuff/src/carve-features.ts @@ -0,0 +1,533 @@ +/** + * Feature Carver for evalbuff v2. + * + * Instead of using git commits as evals, this: + * 1. Analyzes a codebase to identify discrete, self-contained features + * 2. Plans how to cleanly delete each feature + * 3. Produces diffs that remove the feature (code, docs, references) + * + * The output can then be used as eval tasks: give agents a simple prompt + * to rebuild the deleted feature, judge against the original code. + */ +import { execSync } from 'child_process' +import fs from 'fs' +import path from 'path' + +import OpenAI from 'openai' + +// --- Types --- + +export interface CarveCandidate { + id: string + name: string + prompt: string // Short, natural prompt to rebuild this feature + description: string // What this feature does + files: string[] // Files involved (to delete or modify) + complexity: 'small' | 'medium' | 'large' +} + +export interface CarvePlan { + candidates: CarveCandidate[] + reasoning: string +} + +export interface FileOperation { + path: string + action: 'delete' | 'modify' + /** For 'modify': the new file content with the feature removed */ + newContent?: string +} + +export interface CarvedFeature { + id: string + prompt: string + description: string + complexity: 'small' | 'medium' | 'large' + /** Files as they exist before carving (the "ground truth" to rebuild) */ + originalFiles: Record + /** Operations to perform to carve the feature out */ + operations: FileOperation[] + /** Unified diff of the carving (deletions) */ + diff: string +} + +export interface CarveResult { + repoPath: string + generationDate: string + features: CarvedFeature[] +} + +// --- OpenAI client --- + +function getClient(): OpenAI { + return new OpenAI() // Uses OPENAI_API_KEY from env +} + +const PLANNING_MODEL = 'gpt-5.4' +const 
CARVING_MODEL = 'gpt-5.4' + +// --- Repo analysis helpers --- + +function getFileTree(repoPath: string, maxDepth: number = 4): string { + try { + // Use git ls-files to only get tracked files + const files = execSync('git ls-files', { + cwd: repoPath, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }) + .trim() + .split('\n') + .filter(Boolean) + + // Filter out noise + const filtered = files.filter((f) => { + const parts = f.split('/') + if (parts.length > maxDepth) return false + if (f.endsWith('.lock') || f.endsWith('.lockb')) return false + if (f.includes('node_modules/')) return false + if (f.endsWith('.json') && f.includes('package-lock')) return false + return true + }) + + return filtered.join('\n') + } catch { + return '' + } +} + +function readFile(repoPath: string, filePath: string): string | null { + try { + const fullPath = path.join(repoPath, filePath) + return fs.readFileSync(fullPath, 'utf-8') + } catch { + return null + } +} + +function getRepoStats(repoPath: string): string { + const fileTree = getFileTree(repoPath) + const files = fileTree.split('\n').filter(Boolean) + + const byExtension: Record = {} + for (const f of files) { + const ext = path.extname(f) || '(no ext)' + byExtension[ext] = (byExtension[ext] || 0) + 1 + } + + const sorted = Object.entries(byExtension) + .sort((a, b) => b[1] - a[1]) + .slice(0, 15) + .map(([ext, count]) => ` ${ext}: ${count}`) + .join('\n') + + return `Total tracked files: ${files.length}\nBy extension:\n${sorted}` +} + +// --- Phase 1: Plan features to carve --- + +const PLANNING_SYSTEM = `You are an expert software architect analyzing a codebase to identify discrete, self-contained features that can be cleanly "carved out" (deleted) and used as coding evaluation tasks. + +## Your Goal + +Identify 15-25 features in this codebase that could be cleanly removed and then rebuilt by a coding agent. Each feature should: + +1. 
**Be self-contained** — removing it leaves the rest of the codebase functional (maybe some missing imports/references, but structurally intact) +2. **Be describable in 1-2 sentences** — a developer could ask for it naturally +3. **Be non-trivial but bounded** — not a one-liner, but not "rewrite the whole app" +4. **Cover different aspects** — mix of UI components, API endpoints, utilities, config, tests, etc. +5. **Not overlap** — deleting feature A shouldn't also delete most of feature B + +## What makes a good carve candidate + +- A React component + its usage sites +- An API endpoint (route + handler + types) +- A CLI subcommand or flag +- A utility module used in a few places +- A feature behind a config/flag +- A test suite for a specific module +- A middleware or plugin +- An integration with an external service + +## What makes a BAD candidate + +- Core infrastructure that everything depends on (routing, auth framework, database connection) +- A single function that's called in 50 places +- Trivially small changes (rename, config tweak) +- Auto-generated or boilerplate code + +## Output Format + +Respond with valid JSON matching this schema: +{ + "reasoning": "Your analysis of the codebase and approach to selecting features", + "candidates": [ + { + "id": "short-kebab-id", + "name": "Human readable name", + "prompt": "Natural prompt a developer would use to ask for this feature, 1-2 sentences", + "description": "What this feature does and why it exists", + "files": ["path/to/file1.ts", "path/to/file2.tsx"], + "complexity": "small|medium|large" + } + ] +} + +Be thorough in listing ALL files involved in each feature — missing a file means the carve won't be clean.` + +export async function planFeatures(repoPath: string): Promise { + const client = getClient() + + const fileTree = getFileTree(repoPath) + const stats = getRepoStats(repoPath) + + // Read key files for context + const keyFiles = [ + 'package.json', + 'README.md', + 'CLAUDE.md', + 'tsconfig.json', 
+ 'src/index.ts', + 'src/index.tsx', + 'src/app.ts', + 'src/app.tsx', + 'src/main.ts', + 'src/main.tsx', + ] + + let keyFileContents = '' + for (const kf of keyFiles) { + const content = readFile(repoPath, kf) + if (content) { + keyFileContents += `\n### ${kf}\n\`\`\`\n${content.slice(0, 5000)}\n\`\`\`\n` + } + } + + const userPrompt = `## Repository Stats +${stats} + +## File Tree +\`\`\` +${fileTree} +\`\`\` + +## Key Files +${keyFileContents || '(none found)'} + +Please analyze this codebase and identify 15-25 features that can be cleanly carved out for evaluation.` + + console.log('Planning features to carve...') + const response = await client.chat.completions.create({ + model: PLANNING_MODEL, + messages: [ + { role: 'system', content: PLANNING_SYSTEM }, + { role: 'user', content: userPrompt }, + ], + response_format: { type: 'json_object' }, + }) + + const text = response.choices[0]?.message?.content + if (!text) throw new Error('No response from planning model') + + const parsed = JSON.parse(text) as CarvePlan + console.log(`Identified ${parsed.candidates.length} carve candidates`) + return parsed +} + +// --- Phase 2: Execute carving for each feature --- + +const CARVING_SYSTEM = `You are a precise code surgeon. Your job is to cleanly remove a specific feature from a codebase. + +## Rules + +1. **Delete completely** — remove ALL code related to the feature: components, handlers, types, tests, docs, imports, route registrations, etc. +2. **Don't break the rest** — the remaining code should still be structurally valid. Fix imports, remove dead references, etc. +3. **Minimal collateral** — only remove what's necessary. Don't "improve" or refactor surrounding code. +4. **Be thorough** — check for references in other files. If file A imports something from the feature, update file A's imports. 
+ +## Output Format + +Respond with valid JSON matching this schema: +{ + "operations": [ + { + "path": "path/to/file.ts", + "action": "delete" + }, + { + "path": "path/to/other-file.ts", + "action": "modify", + "newContent": "...full file content with feature removed..." + } + ] +} + +For "modify" operations, provide the COMPLETE new file content (not a diff). This must be the entire file with only the feature-related code removed. +For "delete" operations, the entire file will be removed. + +Only include files that actually need to change. Don't include files that are unaffected.` + +export async function carveFeature( + repoPath: string, + candidate: CarveCandidate, +): Promise { + const client = getClient() + + // Read all files involved + const fileContents: Record = {} + for (const filePath of candidate.files) { + const content = readFile(repoPath, filePath) + if (content) { + fileContents[filePath] = content + } + } + + if (Object.keys(fileContents).length === 0) { + console.warn(` No readable files for feature ${candidate.id}, skipping`) + return null + } + + // Also read files that might reference the feature's files (importers) + const referenceFiles = findReferencingFiles(repoPath, candidate.files) + for (const refFile of referenceFiles) { + if (!fileContents[refFile]) { + const content = readFile(repoPath, refFile) + if (content) { + fileContents[refFile] = content + } + } + } + + let filesSection = '' + for (const [filePath, content] of Object.entries(fileContents)) { + const isFeatureFile = candidate.files.includes(filePath) + const label = isFeatureFile ? '(FEATURE FILE)' : '(REFERENCING FILE)' + filesSection += `\n### ${filePath} ${label}\n\`\`\`\n${content}\n\`\`\`\n` + } + + const userPrompt = `## Feature to Remove +**Name:** ${candidate.name} +**Description:** ${candidate.description} +**Feature files:** ${candidate.files.join(', ')} + +## Current File Contents +${filesSection} + +Remove this feature completely. 
For files that are entirely part of the feature, use "delete". For files that contain the feature mixed with other code, use "modify" and provide the full updated content.` + + console.log(` Carving feature: ${candidate.id}...`) + const response = await client.chat.completions.create({ + model: CARVING_MODEL, + messages: [ + { role: 'system', content: CARVING_SYSTEM }, + { role: 'user', content: userPrompt }, + ], + response_format: { type: 'json_object' }, + }) + + const text = response.choices[0]?.message?.content + if (!text) { + console.warn(` No response for feature ${candidate.id}`) + return null + } + + const parsed = JSON.parse(text) as { operations: FileOperation[] } + + // Compute diff + const diff = computeDiff(repoPath, parsed.operations) + + // Save original files (only the feature files, for judging) + const originalFiles: Record = {} + for (const filePath of candidate.files) { + if (fileContents[filePath]) { + originalFiles[filePath] = fileContents[filePath] + } + } + + return { + id: candidate.id, + prompt: candidate.prompt, + description: candidate.description, + complexity: candidate.complexity, + originalFiles, + operations: parsed.operations, + diff, + } +} + +// --- Helpers --- + +/** + * Find files that import/reference any of the given files. + * Uses git grep to find import statements. 
+ */ +function findReferencingFiles( + repoPath: string, + featureFiles: string[], +): string[] { + const referencingFiles = new Set() + + for (const featureFile of featureFiles) { + // Extract the module name (without extension) for import matching + const basename = path.basename(featureFile).replace(/\.[^.]+$/, '') + const dirname = path.dirname(featureFile) + + // Search tracked JS/TS files for any textual mention of this basename (not only import statements) + try { + const results = execSync( + `git grep -l "${basename}" -- '*.ts' '*.tsx' '*.js' '*.jsx'`, + { + cwd: repoPath, + encoding: 'utf-8', + maxBuffer: 10 * 1024 * 1024, + }, + ) + .trim() + .split('\n') + .filter(Boolean) + + for (const result of results) { + // Don't include the feature's own files + if (!featureFiles.includes(result)) { + referencingFiles.add(result) + } + } + } catch { + // git grep returns exit code 1 when no matches + } + } + + // Limit to reasonable number + const sorted = [...referencingFiles].slice(0, 20) + return sorted +} + +/** + * Compute a unified diff from file operations. + * Reads each file's current content under repoPath and emits one whole-file hunk per operation (no worktree is created and no minimal line-level diff is computed). + */ +function computeDiff( + repoPath: string, + operations: FileOperation[], +): string { + const diffs: string[] = [] + + for (const op of operations) { + const fullPath = path.join(repoPath, op.path) + const originalContent = fs.existsSync(fullPath) + ? 
fs.readFileSync(fullPath, 'utf-8') + : '' + + if (op.action === 'delete') { + // Show the full file as deleted + const lines = originalContent.split('\n') + const header = `--- a/${op.path}\n+++ /dev/null` + const hunk = `@@ -1,${lines.length} +0,0 @@\n` + + lines.map((l) => `-${l}`).join('\n') + diffs.push(`${header}\n${hunk}`) + } else if (op.action === 'modify' && op.newContent !== undefined) { + // Compute line-level diff + const oldLines = originalContent.split('\n') + const newLines = op.newContent.split('\n') + // Use a simple diff representation — the full before/after + const header = `--- a/${op.path}\n+++ b/${op.path}` + // For now, show full replacement (not optimal but correct) + const hunk = `@@ -1,${oldLines.length} +1,${newLines.length} @@\n` + + oldLines.map((l) => `-${l}`).join('\n') + '\n' + + newLines.map((l) => `+${l}`).join('\n') + diffs.push(`${header}\n${hunk}`) + } + } + + return diffs.join('\n\n') +} + +// --- Main orchestrator --- + +export async function carveFeatures( + repoPath: string, + options: { + count?: number // Number of features to carve (default: 10) + outputPath?: string + } = {}, +): Promise { + const { count = 10, outputPath } = options + + console.log(`\nCarving features from: ${repoPath}`) + console.log(`Target: ${count} features\n`) + + // Phase 1: Plan + const plan = await planFeatures(repoPath) + + console.log(`\nPlanning complete. 
Reasoning:\n${plan.reasoning}\n`) + console.log('Candidates:') + for (const c of plan.candidates) { + console.log(` ${c.id} (${c.complexity}): ${c.name}`) + console.log(` Prompt: ${c.prompt}`) + console.log(` Files: ${c.files.join(', ')}`) + } + + // Select top N candidates (prefer medium complexity) + const ranked = [...plan.candidates].sort((a, b) => { + const complexityOrder = { medium: 0, small: 1, large: 2 } + return complexityOrder[a.complexity] - complexityOrder[b.complexity] + }) + const selected = ranked.slice(0, count) + + console.log(`\nSelected ${selected.length} features for carving:\n`) + + // Phase 2: Carve each feature + const features: CarvedFeature[] = [] + for (const candidate of selected) { + try { + const carved = await carveFeature(repoPath, candidate) + if (carved) { + features.push(carved) + console.log(` ✓ ${carved.id} — ${carved.operations.length} file operations`) + } + } catch (error) { + console.error(` ✗ ${candidate.id} failed:`, error) + } + } + + const result: CarveResult = { + repoPath, + generationDate: new Date().toISOString(), + features, + } + + // Save output + const outPath = + outputPath || + path.join(repoPath, `carve-${new Date().toISOString().slice(0, 10)}.json`) + fs.writeFileSync(outPath, JSON.stringify(result, null, 2)) + console.log(`\nSaved ${features.length} carved features to: ${outPath}`) + + return result +} + +// --- CLI --- + +if (import.meta.main) { + const args = process.argv.slice(2) + + const getArg = (name: string, defaultValue?: string): string => { + const idx = args.indexOf(`--${name}`) + if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] + if (defaultValue !== undefined) return defaultValue + throw new Error(`Missing required argument: --${name}`) + } + + const repoPath = getArg('repo') + const count = parseInt(getArg('count', '10')) + const outputPath = args.indexOf('--output') >= 0 ? 
getArg('output') : undefined + + carveFeatures(repoPath, { count, outputPath }) + .then((result) => { + console.log(`\nDone! Carved ${result.features.length} features.`) + }) + .catch((error) => { + console.error('Carving failed:', error) + process.exit(1) + }) +} diff --git a/evalbuff/src/run-carve-eval.ts b/evalbuff/src/run-carve-eval.ts new file mode 100644 index 000000000..8d515ffe7 --- /dev/null +++ b/evalbuff/src/run-carve-eval.ts @@ -0,0 +1,614 @@ +/** + * Run carve-based evals: apply a carve (delete a feature), run agents to rebuild it, + * judge against the original code, then iterate on docs. + * + * Usage: + * bun run evalbuff/src/run-carve-eval.ts --repo /path/to/repo --carve-file carve-2026-03-30.json [--feature cli-init-command] [--parallelism 5] + */ +import { execSync } from 'child_process' +import fs from 'fs' +import os from 'os' +import path from 'path' + +import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk' + +import { + analyzeFailure, + applyDocEdit, + compareScores, + readCurrentDocs, + revertDocEdit, +} from './docs-optimizer' +import { judgeTaskResult } from './judge' +import { CodebuffRunner } from './runners/codebuff' + +import type { CarvedFeature, CarveResult, FileOperation } from './carve-features' +import type { JudgingResult, ReviewerAgentType } from './judge' + +// --- Apply carve operations to a repo directory --- + +function applyCarveOperations(repoDir: string, operations: FileOperation[]): void { + for (const op of operations) { + const fullPath = path.join(repoDir, op.path) + if (op.action === 'delete') { + if (fs.existsSync(fullPath)) { + fs.rmSync(fullPath) + } + } else if (op.action === 'modify' && op.newContent !== undefined) { + fs.mkdirSync(path.dirname(fullPath), { recursive: true }) + fs.writeFileSync(fullPath, op.newContent) + } + } +} + +/** + * Compute a reverse diff (what needs to be added back) from a carve. + * This is the "ground truth" — the original code that was removed. 
+ */ +function computeGroundTruthDiff(feature: CarvedFeature): string { + const diffs: string[] = [] + + for (const op of feature.operations) { + if (op.action === 'delete' && feature.originalFiles[op.path]) { + // File was deleted — ground truth is to recreate it + const lines = feature.originalFiles[op.path].split('\n') + diffs.push( + `--- /dev/null\n+++ b/${op.path}\n@@ -0,0 +1,${lines.length} @@\n` + + lines.map((l) => `+${l}`).join('\n'), + ) + } else if (op.action === 'modify' && feature.originalFiles[op.path]) { + // File was modified — ground truth is the original version + const origLines = feature.originalFiles[op.path].split('\n') + const carvedLines = (op.newContent || '').split('\n') + diffs.push( + `--- a/${op.path}\n+++ b/${op.path}\n@@ -1,${carvedLines.length} +1,${origLines.length} @@\n` + + carvedLines.map((l) => `-${l}`).join('\n') + + '\n' + + origLines.map((l) => `+${l}`).join('\n'), + ) + } + } + + return diffs.join('\n\n') +} + +// --- Clone repo and apply carve --- + +interface TestRepoResult { + result: T + cleanup: () => void +} + +async function withCarvedRepo( + repoPath: string, + feature: CarvedFeature, + initCommand: string | undefined, + fn: (repoDir: string, carveSha: string) => Promise, +): Promise { + const tempDir = fs.mkdtempSync(path.join(os.tmpdir(), 'carve-eval-')) + const repoDir = path.join(tempDir, 'repo') + + try { + // Local clone (fast, uses hardlinks) + execSync(`git clone --no-checkout "${repoPath}" "${repoDir}"`, { + stdio: 'ignore', + }) + const headSha = execSync('git rev-parse HEAD', { + cwd: repoPath, + encoding: 'utf-8', + }).trim() + execSync(`git checkout ${headSha}`, { cwd: repoDir, stdio: 'ignore' }) + + // Apply the carve operations (delete the feature) + applyCarveOperations(repoDir, feature.operations) + + // Commit the carved state so agents start from a clean working tree + execSync('git add -A', { cwd: repoDir, stdio: 'ignore' }) + execSync( + `git commit -m "carve: remove ${feature.id}" 
--allow-empty`, + { cwd: repoDir, stdio: 'ignore' }, + ) + const carveSha = execSync('git rev-parse HEAD', { + cwd: repoDir, + encoding: 'utf-8', + }).trim() + + // Run init command if provided + if (initCommand) { + try { + execSync(initCommand, { cwd: repoDir, stdio: 'ignore' }) + } catch (e) { + console.warn(`Init command failed: ${e}`) + } + } + + return await fn(repoDir, carveSha) + } finally { + try { + fs.rmSync(tempDir, { recursive: true, force: true }) + } catch { + // ignore + } + } +} + +// --- Run a single agent on a carved repo --- + +async function runAgentOnCarve(opts: { + idx: number + total: number + repoPath: string + feature: CarvedFeature + initCommand?: string + client: CodebuffClient + agentId: string + agentDefinitions: any[] + agentTimeoutMs: number + groundTruthDiff: string + reviewerAgents: ReviewerAgentType[] + docsSourcePath: string +}): Promise<{ + score: number + diff: string + agentTrace: string + judging: JudgingResult + costEstimate: number +}> { + const { + idx, + total, + repoPath, + feature, + initCommand, + client, + agentId, + agentDefinitions, + agentTimeoutMs, + groundTruthDiff, + reviewerAgents, + docsSourcePath, + } = opts + + return withCarvedRepo(repoPath, feature, initCommand, async (repoDir, carveSha) => { + // Copy docs into the carved repo + copyDocsIntoRepo(docsSourcePath, repoDir) + + console.log(` [Run ${idx + 1}/${total}] Running agent on carved repo...`) + const runner = new CodebuffRunner({ + cwd: repoDir, + client, + agentId, + localAgentDefinitions: agentDefinitions, + printEvents: false, + commitId: feature.id.slice(0, 8), + parentSha: carveSha, + }) + + let result: Awaited> + try { + result = await runner.run(feature.prompt) + } catch (runError) { + const errMsg = + runError instanceof Error ? 
runError.message : String(runError) + console.warn(` [Run ${idx + 1}/${total}] Agent failed: ${errMsg.slice(0, 200)}`) + return { + score: -1, + diff: '', + agentTrace: `Agent error: ${errMsg}`, + judging: { + analysis: `Agent failed: ${errMsg.slice(0, 500)}`, + strengths: [], + weaknesses: ['Agent failed due to infrastructure error'], + e2eTestsPerformed: [], + completionScore: -1, + codeQualityScore: -1, + e2eScore: -1, + overallScore: -1, + }, + costEstimate: 0, + } + } + + const agentTrace = result.steps + .map((step) => JSON.stringify(step)) + .join('\n') + + console.log(` [Run ${idx + 1}/${total}] Judging...`) + const judging = await judgeTaskResult({ + taskPrompt: feature.prompt, + agentDiff: result.diff, + groundTruthDiff, + repoDir, + error: result.diff === '' ? 'Agent made no changes' : undefined, + reviewerAgents, + }) + + return { + score: judging.overallScore, + diff: result.diff, + agentTrace, + judging, + costEstimate: result.totalCostUsd, + } + }) +} + +function copyDocsIntoRepo(sourceRepoPath: string, targetRepoPath: string): void { + const sourceDocsDir = path.join(sourceRepoPath, 'docs') + const sourceAgentsMd = path.join(sourceRepoPath, 'AGENTS.md') + const targetDocsDir = path.join(targetRepoPath, 'docs') + const targetAgentsMd = path.join(targetRepoPath, 'AGENTS.md') + + let copied = false + if (fs.existsSync(sourceDocsDir)) { + fs.cpSync(sourceDocsDir, targetDocsDir, { recursive: true }) + copied = true + } + if (fs.existsSync(sourceAgentsMd)) { + fs.cpSync(sourceAgentsMd, targetAgentsMd) + copied = true + } + + if (copied) { + try { + execSync( + 'git add docs/ AGENTS.md 2>/dev/null; git add -u docs/ AGENTS.md 2>/dev/null', + { cwd: targetRepoPath, stdio: 'ignore' }, + ) + execSync('git commit -m "evalbuff: pre-load docs" --allow-empty', { + cwd: targetRepoPath, + stdio: 'ignore', + }) + } catch { + // fine + } + } +} + +// --- Main carve eval loop --- + +interface CarveEvalOptions { + repoPath: string + carveFile: string + featureId?: 
string // run only this feature (default: all) + agentId: string + parallelism: number + agentTimeoutMs: number + reviewerAgents: ReviewerAgentType[] + initCommand?: string + maxImprovementIterations: number +} + +interface CarveEvalResult { + featureId: string + prompt: string + baselineScore: number + finalScore: number + docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> + docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> + totalCost: number +} + +async function runCarveEval(options: CarveEvalOptions): Promise { + const { + repoPath, + carveFile, + featureId, + agentId, + parallelism, + agentTimeoutMs, + reviewerAgents, + initCommand, + maxImprovementIterations, + } = options + + // Load carve data + const carveData: CarveResult = JSON.parse( + fs.readFileSync(carveFile, 'utf-8'), + ) + + // Select features + let features = carveData.features + if (featureId) { + features = features.filter((f) => f.id === featureId) + if (features.length === 0) { + console.error( + `Feature "${featureId}" not found. 
Available: ${carveData.features.map((f) => f.id).join(', ')}`, + ) + process.exit(1) + } + } + + // Init SDK client + const client = new CodebuffClient({ cwd: repoPath }) + const agentsDir = path.resolve(__dirname, '../../agents') + const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) + const agentDefinitions = Object.values(loadedAgents) + console.log(`Loaded ${agentDefinitions.length} agent definitions`) + + console.log(`\nCarve Eval:`) + console.log(` Repo: ${repoPath}`) + console.log(` Agent: ${agentId}`) + console.log(` Parallelism: ${parallelism}`) + console.log(` Reviewers: ${reviewerAgents.join(', ')}`) + console.log(` Features: ${features.length}`) + console.log(` Max doc improvement iterations: ${maxImprovementIterations}`) + + const results: CarveEvalResult[] = [] + + for (const feature of features) { + console.log(`\n${'='.repeat(60)}`) + console.log(`Feature: ${feature.id}`) + console.log(`Prompt: ${feature.prompt}`) + console.log(`Operations: ${feature.operations.length} (${feature.operations.filter((o) => o.action === 'delete').length} deletes, ${feature.operations.filter((o) => o.action === 'modify').length} modifies)`) + console.log(`${'='.repeat(60)}`) + + const groundTruthDiff = computeGroundTruthDiff(feature) + + // --- Baseline: run agents in parallel --- + console.log(`\n Running ${parallelism} agents in parallel (baseline)...`) + const baselineResults = await Promise.all( + Array.from({ length: parallelism }, (_, i) => + runAgentOnCarve({ + idx: i, + total: parallelism, + repoPath, + feature, + initCommand, + client, + agentId, + agentDefinitions, + agentTimeoutMs, + groundTruthDiff, + reviewerAgents, + docsSourcePath: repoPath, + }), + ), + ) + + const validBaseline = baselineResults.filter((r) => r.score >= 0) + let totalCost = baselineResults.reduce((a, r) => a + r.costEstimate, 0) + + if (validBaseline.length === 0) { + console.log(` All agents failed. 
Skipping feature.`) + results.push({ + featureId: feature.id, + prompt: feature.prompt, + baselineScore: 0, + finalScore: 0, + docsKept: [], + docsRejected: [], + totalCost, + }) + continue + } + + const baselineScores = validBaseline.map((r) => r.score) + let currentScore = + baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length + console.log( + ` Baseline: ${currentScore.toFixed(1)}/10 (${baselineScores.map((s) => s.toFixed(1)).join(', ')})`, + ) + + const docsKept: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] + const docsRejected: Array<{ path: string; reasoning: string; scoreBefore: number; scoreAfter: number }> = [] + + // --- Doc improvement loop --- + if (currentScore < 9.0) { + let latestJudgings = validBaseline.map((r) => r.judging) + let latestDiffs = validBaseline.map((r) => r.diff) + let latestTraces = validBaseline.map((r) => r.agentTrace) + + for (let iter = 0; iter < maxImprovementIterations; iter++) { + // Pick worst run for analysis + const worstIdx = latestJudgings.reduce( + (minIdx, j, idx, arr) => + j.overallScore < arr[minIdx].overallScore ? 
idx : minIdx, + 0, + ) + + const currentDocs = readCurrentDocs(repoPath) + const editHistory = [ + ...docsKept.map((d) => ({ ...d, outcome: 'accepted' as const })), + ...docsRejected.map((d) => ({ ...d, outcome: 'rejected' as const })), + ] + + console.log(` Analyzing for doc improvements (iteration ${iter + 1})...`) + const docSuggestion = await analyzeFailure({ + judgeResult: latestJudgings[worstIdx], + taskPrompt: feature.prompt, + agentDiff: latestDiffs[worstIdx], + agentTrace: latestTraces[worstIdx], + groundTruthDiff, + currentDocs, + editHistory, + }) + + if (!docSuggestion) { + console.log(` No doc suggestion — stopping.`) + break + } + + console.log(` Doc suggestion: ${docSuggestion.suggestedDocPath}`) + console.log(` Reasoning: ${docSuggestion.reasoning}`) + + // Save previous content for revert + const docFullPath = path.join(repoPath, 'docs', docSuggestion.suggestedDocPath) + const previousContent = fs.existsSync(docFullPath) + ? fs.readFileSync(docFullPath, 'utf-8') + : null + + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, docSuggestion.suggestedContent) + + // Re-run with new docs + console.log(` Re-running ${parallelism} agents with new docs...`) + const rerunResults = await Promise.all( + Array.from({ length: parallelism }, (_, i) => + runAgentOnCarve({ + idx: i, + total: parallelism, + repoPath, + feature, + initCommand, + client, + agentId, + agentDefinitions, + agentTimeoutMs, + groundTruthDiff, + reviewerAgents, + docsSourcePath: repoPath, + }), + ), + ) + + const validRerun = rerunResults.filter((r) => r.score >= 0) + totalCost += rerunResults.reduce((a, r) => a + r.costEstimate, 0) + + if (validRerun.length === 0) { + console.log(` Re-run failed. 
Reverting doc.`) + if (previousContent !== null) { + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + break + } + + const rerunScores = validRerun.map((r) => r.score) + const rerunAvg = + rerunScores.reduce((a, b) => a + b, 0) / rerunScores.length + const comparison = compareScores(currentScore, rerunAvg) + console.log( + ` New score: ${rerunAvg.toFixed(1)}/10 (${comparison}) (${rerunScores.map((s) => s.toFixed(1)).join(', ')})`, + ) + + if (comparison === 'improved' || comparison === 'same') { + const reason = comparison === 'improved' ? 'improved' : 'within noise, keeping' + console.log(` Keeping doc: ${docSuggestion.suggestedDocPath} (${reason})`) + docsKept.push({ + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + scoreBefore: currentScore, + scoreAfter: rerunAvg, + }) + + // Commit the doc + try { + execSync('git add docs/ AGENTS.md', { cwd: repoPath, stdio: 'ignore' }) + execSync( + `git commit -m "evalbuff: add ${docSuggestion.suggestedDocPath} (carve: ${feature.id})"`, + { cwd: repoPath, stdio: 'ignore' }, + ) + } catch { + console.warn('Failed to commit doc change') + } + + currentScore = rerunAvg + latestJudgings = validRerun.map((r) => r.judging) + latestDiffs = validRerun.map((r) => r.diff) + latestTraces = validRerun.map((r) => r.agentTrace) + } else { + console.log(` Rejecting doc: ${docSuggestion.suggestedDocPath}`) + docsRejected.push({ + path: docSuggestion.suggestedDocPath, + reasoning: docSuggestion.reasoning, + scoreBefore: currentScore, + scoreAfter: rerunAvg, + }) + + if (previousContent !== null) { + applyDocEdit(repoPath, docSuggestion.suggestedDocPath, previousContent) + } else { + revertDocEdit(repoPath, docSuggestion.suggestedDocPath) + } + break + } + } + } + + results.push({ + featureId: feature.id, + prompt: feature.prompt, + baselineScore: baselineScores.reduce((a, b) => a + b, 0) / baselineScores.length, + 
finalScore: currentScore, + docsKept, + docsRejected, + totalCost, + }) + } + + // --- Summary --- + console.log(`\n${'='.repeat(60)}`) + console.log('CARVE EVAL RESULTS') + console.log(`${'='.repeat(60)}`) + + let totalCostAll = 0 + for (const r of results) { + console.log(`\n ${r.featureId}:`) + console.log(` Prompt: ${r.prompt.slice(0, 80)}...`) + console.log(` Baseline: ${r.baselineScore.toFixed(1)}/10`) + console.log(` Final: ${r.finalScore.toFixed(1)}/10`) + console.log(` Docs kept: ${r.docsKept.length}, rejected: ${r.docsRejected.length}`) + console.log(` Cost: $${r.totalCost.toFixed(2)}`) + totalCostAll += r.totalCost + } + + const avgBaseline = + results.reduce((a, r) => a + r.baselineScore, 0) / results.length + const avgFinal = + results.reduce((a, r) => a + r.finalScore, 0) / results.length + + console.log(`\n Average baseline: ${avgBaseline.toFixed(1)}/10`) + console.log(` Average final: ${avgFinal.toFixed(1)}/10`) + console.log(` Total cost: $${totalCostAll.toFixed(2)}`) + + // Save results + const outputPath = path.join( + repoPath, + `carve-eval-results-${new Date().toISOString().slice(0, 10)}.json`, + ) + fs.writeFileSync(outputPath, JSON.stringify(results, null, 2)) + console.log(`\nResults saved to: ${outputPath}`) +} + +// --- CLI --- + +if (import.meta.main) { + const args = process.argv.slice(2) + + const getArg = (name: string, defaultValue?: string): string => { + const idx = args.indexOf(`--${name}`) + if (idx >= 0 && idx + 1 < args.length) return args[idx + 1] + if (defaultValue !== undefined) return defaultValue + throw new Error(`Missing required argument: --${name}`) + } + const hasArg = (name: string): boolean => args.includes(`--${name}`) + + const repoPath = getArg('repo') + const carveFile = getArg('carve-file') + const featureId = hasArg('feature') ? 
getArg('feature') : undefined + const agentId = getArg('agent', 'base2-free-evals') + const parallelism = parseInt(getArg('parallelism', '5')) + const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) + const reviewerAgentsArg = hasArg('reviewers') ? getArg('reviewers') : undefined + const reviewerAgents: ReviewerAgentType[] = reviewerAgentsArg + ? (reviewerAgentsArg.split(',') as ReviewerAgentType[]) + : ['claude', 'codex'] + const initCommand = hasArg('init-command') ? getArg('init-command') : undefined + const maxImprovementIterations = parseInt(getArg('max-iterations', '3')) + + runCarveEval({ + repoPath, + carveFile, + featureId, + agentId, + parallelism, + agentTimeoutMs, + reviewerAgents, + initCommand, + maxImprovementIterations, + }).catch((error) => { + console.error('Carve eval failed:', error) + process.exit(1) + }) +} From e4376f9becdc7a035b98bf67f03a57bacc84ea0e Mon Sep 17 00:00:00 2001 From: James Grugett Date: Mon, 30 Mar 2026 17:43:01 -0700 Subject: [PATCH 3/4] evalbuff: add patterns/discover-before-implement.md (carve: cli-init-command) --- docs/patterns/discover-before-implement.md | 79 +++++++++++++++++++--- 1 file changed, 69 insertions(+), 10 deletions(-) diff --git a/docs/patterns/discover-before-implement.md b/docs/patterns/discover-before-implement.md index 59e661b9e..524df880d 100644 --- a/docs/patterns/discover-before-implement.md +++ b/docs/patterns/discover-before-implement.md @@ -2,6 +2,21 @@ Before implementing ANY new feature, search for existing utilities, constants, and templates in the codebase. Duplicating content that already exists is the most common source of architectural drift. +## CRITICAL: Check Test Files First for Expected API Contracts + +If a test file already exists for the module you're creating (check `__tests__/` directories), **read it before writing a single line of implementation**. 
Tests reveal: +- The **exact function name** the codebase expects to export +- The **exact function signature** (arguments and return type) +- The **exact behavior** expected (message format, file creation patterns, etc.) + +Example: `cli/src/commands/__tests__/init.test.ts` imports `{ handleInitializationFlowLocally }` from `'../init'`. This means the function MUST be named `handleInitializationFlowLocally`, not `handleInitCommand`. + +**Decision tree:** +1. Does `__tests__/[filename].test.ts` exist? → Read it FIRST +2. What does it import? → That's your required export name +3. How does it call the function? → That's your required signature +4. What does it assert? → That's your required behavior + ## Key Locations to Check in This Codebase ### Project Root Utilities @@ -31,25 +46,41 @@ import agentDefinitionSource from '../../../common/src/templates/initial-agents- ## CLI Command Pattern -When a command produces system messages (not sending to the AI), the handler returns `{ postUserMessage }` and the command-registry calls `params.sendMessage({ content, agentMode, postUserMessage })`: +Commands that scaffold files AND show messages use the `postUserMessage` + `sendMessage` pattern: ```typescript -// In command-registry.ts: +// In init.ts - the handler returns { postUserMessage }, doesn't call setMessages directly +export function handleInitializationFlowLocally(): { postUserMessage: PostUserMessageFn } { + const messages: string[] = [] + + // ... do file operations, push to messages array ... 
+ + const postUserMessage: PostUserMessageFn = (prev) => [ + ...prev, + ...messages.map((message) => getSystemMessage(message)), + ] + return { postUserMessage } +} + +// In command-registry.ts - the command calls sendMessage with postUserMessage defineCommand({ name: 'init', handler: async (params) => { const { postUserMessage } = handleInitializationFlowLocally() + const trimmed = params.inputValue.trim() + params.saveToHistory(trimmed) + clearInput(params) // Handle streaming/queue state check... params.sendMessage({ content: trimmed, agentMode: params.agentMode, - postUserMessage, // ← injected into message, NOT setMessages directly + postUserMessage, // ← injected, NOT calling setMessages directly }) }, }) ``` -For commands that only show system messages (no AI response), use `params.setMessages`: +For commands that ONLY show system messages (no AI call), use `params.setMessages` directly: ```typescript params.setMessages((prev) => postUserMessage(prev)) ``` @@ -64,20 +95,46 @@ type PostUserMessageFn = (prev: ChatMessage[]) => ChatMessage[] Use `getSystemMessage(text)` from `cli/src/utils/message-history.ts` to create each message. +## Idempotent Initialization Pattern + +When creating files/directories, check each item individually (not the whole directory at once): + +```typescript +// DO - check each file/dir independently, allowing partial init +if (existsSync(knowledgePath)) { + messages.push(`📋 \`knowledge.md\` already exists.`) +} else { + writeFileSync(knowledgePath, INITIAL_KNOWLEDGE_FILE) + messages.push(`✅ Created \`knowledge.md\``) +} + +// DON'T - bail out early if parent dir exists +if (existsSync(agentsDir)) { + return // WRONG: user may be missing sub-items +} +``` + ## Checklist Before Writing New Code -1. **Is there a constant for this?** Search `common/src/constants/` first -2. **Is there a utility for this path operation?** Check `cli/src/project-files.ts` -3. **Does a template file already exist?** Check `common/src/templates/` -4. 
**Should I track analytics?** Most user-facing actions should call `trackEvent()` -5. **What is the naming convention?** Look at 2-3 existing similar handlers (e.g., `handleHelpCommand`, `handleUsageCommand`) before naming your function +1. **Does a test file already exist?** Check `__tests__/[filename].test.ts` — read it FIRST for expected function name and signature +2. **Is there a constant for this?** Search `common/src/constants/` first +3. **Is there a utility for this path operation?** Check `cli/src/project-files.ts` +4. **Does a template file already exist?** Check `common/src/templates/` +5. **Should I track analytics?** Most user-facing actions should call `trackEvent()` +6. **What is the naming convention?** Look at 2-3 existing similar handlers before naming your function ## Anti-patterns +**DON'T** ignore existing test files that reveal expected exports: +```typescript +// BAD - test expects handleInitializationFlowLocally but you created: +export async function handleInitCommand(params: RouterParams): Promise +``` + **DON'T** hardcode template content as string literals: ```typescript // BAD - duplicates content that exists in common/src/templates/ -const TYPES_AGENT_DEFINITION = `export interface AgentDefinition { ... }` +const AGENT_DEFINITION_TEMPLATE = `export interface AgentDefinition { ... }` ``` **DO** import from the canonical template location: @@ -98,3 +155,5 @@ const projectRoot = process.cwd() import { getProjectRoot } from '../project-files' const projectRoot = getProjectRoot() ``` + +**DON'T** add scope-creep files not in the requirements — implement exactly what tests and ground truth specify. 
From 0deec9a5a9953f560a7bab51d83f30b662fc0c7d Mon Sep 17 00:00:00 2001 From: James Grugett Date: Tue, 31 Mar 2026 10:37:14 -0700 Subject: [PATCH 4/4] evalbuff: use Claude SDK runner for carve evals, delete generated doc - Switch carve eval inner agents to Claude SDK (sonnet) with 3 parallel runs - Update carve-features to use gpt-5.4 model - Remove auto-generated discover-before-implement.md (test artifact) Co-Authored-By: Claude Opus 4.6 --- docs/patterns/discover-before-implement.md | 159 --------------------- evalbuff/src/run-carve-eval.ts | 54 ++----- evalbuff/src/runners/claude.ts | 10 +- 3 files changed, 23 insertions(+), 200 deletions(-) delete mode 100644 docs/patterns/discover-before-implement.md diff --git a/docs/patterns/discover-before-implement.md b/docs/patterns/discover-before-implement.md deleted file mode 100644 index 524df880d..000000000 --- a/docs/patterns/discover-before-implement.md +++ /dev/null @@ -1,159 +0,0 @@ -# Discover Before Implement: Find Existing Patterns First - -Before implementing ANY new feature, search for existing utilities, constants, and templates in the codebase. Duplicating content that already exists is the most common source of architectural drift. - -## CRITICAL: Check Test Files First for Expected API Contracts - -If a test file already exists for the module you're creating (check `__tests__/` directories), **read it before writing a single line of implementation**. Tests reveal: -- The **exact function name** the codebase expects to export -- The **exact function signature** (arguments and return type) -- The **exact behavior** expected (message format, file creation patterns, etc.) - -Example: `cli/src/commands/__tests__/init.test.ts` imports `{ handleInitializationFlowLocally }` from `'../init'`. This means the function MUST be named `handleInitializationFlowLocally`, not `handleInitCommand`. - -**Decision tree:** -1. Does `__tests__/[filename].test.ts` exist? → Read it FIRST -2. What does it import? 
→ That's your required export name -3. How does it call the function? → That's your required signature -4. What does it assert? → That's your required behavior - -## Key Locations to Check in This Codebase - -### Project Root Utilities -- **`cli/src/project-files.ts`** — Use `getProjectRoot()` (NOT `process.cwd()`) to get the user's project directory. `setProjectRoot()` sets it at startup. -- **`cli/src/utils/analytics.ts`** — Use `trackEvent(AnalyticsEvent.X, {...})` to track user actions. Constants are in `common/src/constants/analytics-events.ts`. - -### Template Files (DO NOT duplicate as strings) -Template files that users receive when scaffolding live in: -``` -common/src/templates/initial-agents-dir/ - types/agent-definition.ts ← import with Bun text import - types/tools.ts ← import with Bun text import - types/util-types.ts ← import with Bun text import - my-custom-agent.ts - package.json -``` - -Import them as text (Bun-specific, requires `@ts-expect-error`): -```typescript -// @ts-expect-error - Bun text import attribute not supported by TypeScript -import agentDefinitionSource from '../../../common/src/templates/initial-agents-dir/types/agent-definition' with { type: 'text' } -``` - -### Named Constants -- **Knowledge file name**: `PRIMARY_KNOWLEDGE_FILE_NAME` from `@codebuff/common/constants/knowledge` — use this, don't hardcode `'knowledge.md'` -- **Brand name**: `IS_FREEBUFF` from `cli/src/utils/constants` → use `const brandName = IS_FREEBUFF ? 'Freebuff' : 'Codebuff'` - -## CLI Command Pattern - -Commands that scaffold files AND show messages use the `postUserMessage` + `sendMessage` pattern: - -```typescript -// In init.ts - the handler returns { postUserMessage }, doesn't call setMessages directly -export function handleInitializationFlowLocally(): { postUserMessage: PostUserMessageFn } { - const messages: string[] = [] - - // ... do file operations, push to messages array ... 
- - const postUserMessage: PostUserMessageFn = (prev) => [ - ...prev, - ...messages.map((message) => getSystemMessage(message)), - ] - return { postUserMessage } -} - -// In command-registry.ts - the command calls sendMessage with postUserMessage -defineCommand({ - name: 'init', - handler: async (params) => { - const { postUserMessage } = handleInitializationFlowLocally() - const trimmed = params.inputValue.trim() - params.saveToHistory(trimmed) - clearInput(params) - // Handle streaming/queue state check... - params.sendMessage({ - content: trimmed, - agentMode: params.agentMode, - postUserMessage, // ← injected, NOT calling setMessages directly - }) - }, -}) -``` - -For commands that ONLY show system messages (no AI call), use `params.setMessages` directly: -```typescript -params.setMessages((prev) => postUserMessage(prev)) -``` - -## The postUserMessage Contract - -Handlers that produce UI messages return this shape (from `cli/src/types/contracts/send-message.ts`): -```typescript -type PostUserMessageFn = (prev: ChatMessage[]) => ChatMessage[] -// Return: { postUserMessage: PostUserMessageFn } -``` - -Use `getSystemMessage(text)` from `cli/src/utils/message-history.ts` to create each message. - -## Idempotent Initialization Pattern - -When creating files/directories, check each item individually (not the whole directory at once): - -```typescript -// DO - check each file/dir independently, allowing partial init -if (existsSync(knowledgePath)) { - messages.push(`📋 \`knowledge.md\` already exists.`) -} else { - writeFileSync(knowledgePath, INITIAL_KNOWLEDGE_FILE) - messages.push(`✅ Created \`knowledge.md\``) -} - -// DON'T - bail out early if parent dir exists -if (existsSync(agentsDir)) { - return // WRONG: user may be missing sub-items -} -``` - -## Checklist Before Writing New Code - -1. **Does a test file already exist?** Check `__tests__/[filename].test.ts` — read it FIRST for expected function name and signature -2. 
**Is there a constant for this?** Search `common/src/constants/` first -3. **Is there a utility for this path operation?** Check `cli/src/project-files.ts` -4. **Does a template file already exist?** Check `common/src/templates/` -5. **Should I track analytics?** Most user-facing actions should call `trackEvent()` -6. **What is the naming convention?** Look at 2-3 existing similar handlers before naming your function - -## Anti-patterns - -**DON'T** ignore existing test files that reveal expected exports: -```typescript -// BAD - test expects handleInitializationFlowLocally but you created: -export async function handleInitCommand(params: RouterParams): Promise -``` - -**DON'T** hardcode template content as string literals: -```typescript -// BAD - duplicates content that exists in common/src/templates/ -const AGENT_DEFINITION_TEMPLATE = `export interface AgentDefinition { ... }` -``` - -**DO** import from the canonical template location: -```typescript -// GOOD -import agentDefinitionSource from '../../../common/src/templates/initial-agents-dir/types/agent-definition' with { type: 'text' } -``` - -**DON'T** use `process.cwd()` in CLI commands: -```typescript -// BAD -const projectRoot = process.cwd() -``` - -**DO** use the project-files utility: -```typescript -// GOOD -import { getProjectRoot } from '../project-files' -const projectRoot = getProjectRoot() -``` - -**DON'T** add scope-creep files not in the requirements — implement exactly what tests and ground truth specify. 
diff --git a/evalbuff/src/run-carve-eval.ts b/evalbuff/src/run-carve-eval.ts index 8d515ffe7..1d627d87b 100644 --- a/evalbuff/src/run-carve-eval.ts +++ b/evalbuff/src/run-carve-eval.ts @@ -10,8 +10,6 @@ import fs from 'fs' import os from 'os' import path from 'path' -import { CodebuffClient, loadLocalAgents } from '@codebuff/sdk' - import { analyzeFailure, applyDocEdit, @@ -20,10 +18,11 @@ import { revertDocEdit, } from './docs-optimizer' import { judgeTaskResult } from './judge' -import { CodebuffRunner } from './runners/codebuff' +import { ClaudeRunner } from './runners/claude' import type { CarvedFeature, CarveResult, FileOperation } from './carve-features' import type { JudgingResult, ReviewerAgentType } from './judge' +import type { RunnerResult } from './runners/runner' // --- Apply carve operations to a repo directory --- @@ -140,9 +139,7 @@ async function runAgentOnCarve(opts: { repoPath: string feature: CarvedFeature initCommand?: string - client: CodebuffClient - agentId: string - agentDefinitions: any[] + model: string agentTimeoutMs: number groundTruthDiff: string reviewerAgents: ReviewerAgentType[] @@ -160,9 +157,7 @@ async function runAgentOnCarve(opts: { repoPath, feature, initCommand, - client, - agentId, - agentDefinitions, + model, agentTimeoutMs, groundTruthDiff, reviewerAgents, @@ -173,18 +168,10 @@ async function runAgentOnCarve(opts: { // Copy docs into the carved repo copyDocsIntoRepo(docsSourcePath, repoDir) - console.log(` [Run ${idx + 1}/${total}] Running agent on carved repo...`) - const runner = new CodebuffRunner({ - cwd: repoDir, - client, - agentId, - localAgentDefinitions: agentDefinitions, - printEvents: false, - commitId: feature.id.slice(0, 8), - parentSha: carveSha, - }) + console.log(` [Run ${idx + 1}/${total}] Running claude (${model}) on carved repo...`) + const runner = new ClaudeRunner(repoDir, {}, model) - let result: Awaited> + let result: RunnerResult try { result = await runner.run(feature.prompt) } catch (runError) { @@ 
-271,7 +258,7 @@ interface CarveEvalOptions { repoPath: string carveFile: string featureId?: string // run only this feature (default: all) - agentId: string + model: string parallelism: number agentTimeoutMs: number reviewerAgents: ReviewerAgentType[] @@ -294,7 +281,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise { repoPath, carveFile, featureId, - agentId, + model, parallelism, agentTimeoutMs, reviewerAgents, @@ -319,16 +306,9 @@ async function runCarveEval(options: CarveEvalOptions): Promise { } } - // Init SDK client - const client = new CodebuffClient({ cwd: repoPath }) - const agentsDir = path.resolve(__dirname, '../../agents') - const loadedAgents = await loadLocalAgents({ agentsPath: agentsDir }) - const agentDefinitions = Object.values(loadedAgents) - console.log(`Loaded ${agentDefinitions.length} agent definitions`) - console.log(`\nCarve Eval:`) console.log(` Repo: ${repoPath}`) - console.log(` Agent: ${agentId}`) + console.log(` Model: ${model}`) console.log(` Parallelism: ${parallelism}`) console.log(` Reviewers: ${reviewerAgents.join(', ')}`) console.log(` Features: ${features.length}`) @@ -355,9 +335,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise { repoPath, feature, initCommand, - client, - agentId, - agentDefinitions, + model, agentTimeoutMs, groundTruthDiff, reviewerAgents, @@ -450,9 +428,7 @@ async function runCarveEval(options: CarveEvalOptions): Promise { repoPath, feature, initCommand, - client, - agentId, - agentDefinitions, + model, agentTimeoutMs, groundTruthDiff, reviewerAgents, @@ -587,8 +563,8 @@ if (import.meta.main) { const repoPath = getArg('repo') const carveFile = getArg('carve-file') const featureId = hasArg('feature') ? 
getArg('feature') : undefined - const agentId = getArg('agent', 'base2-free-evals') - const parallelism = parseInt(getArg('parallelism', '5')) + const model = getArg('model', 'sonnet') + const parallelism = parseInt(getArg('parallelism', '3')) const agentTimeoutMs = parseInt(getArg('agent-timeout', '300000')) const reviewerAgentsArg = hasArg('reviewers') ? getArg('reviewers') : undefined const reviewerAgents: ReviewerAgentType[] = reviewerAgentsArg @@ -601,7 +577,7 @@ if (import.meta.main) { repoPath, carveFile, featureId, - agentId, + model, parallelism, agentTimeoutMs, reviewerAgents, diff --git a/evalbuff/src/runners/claude.ts b/evalbuff/src/runners/claude.ts index 1ecd20056..2c1f228f5 100644 --- a/evalbuff/src/runners/claude.ts +++ b/evalbuff/src/runners/claude.ts @@ -9,10 +9,16 @@ import type { export class ClaudeRunner implements Runner { private cwd: string private env: Record + private model: string - constructor(cwd: string, env: Record = {}) { + constructor( + cwd: string, + env: Record = {}, + model: string = 'claude-opus-4-5-20251101', + ) { this.cwd = cwd this.env = env + this.model = model } async run(prompt: string): Promise { @@ -28,7 +34,7 @@ export class ClaudeRunner implements Runner { '--verbose', '--dangerously-skip-permissions', '--model', - 'claude-opus-4-5-20251101', + this.model, ] console.log(`[ClaudeRunner] Running: claude ${args.join(' ')}`)