From 7758aba9062b1a05a9d401edcb92ad05d99e81de Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 20 May 2026 12:38:01 +0300 Subject: [PATCH] feat(0.11.0): defineAgent + surfaces-driven adapters + outcome measurement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The substrate redesign. Closes every universal failure flagged by the audit of the 4 per-agent draft PRs: 1. Subject-routing dead code → `parseFindingSubject` (agent-eval 0.30.0) + `resolveSubjectPath(subject, surfaces, repoRoot)` resolve a typed FindingSubject to a real file path. No `startsWith(...)` prose matching. 2. ImprovementAdapter has no apply → `createSurfaceImprovementAdapter` ships a first-class apply with two modes: `write` — `git apply -p0` in-place; operator reviews via git diff `open-pr` — branch + commit + push + `gh pr create` plus a `none` mode for report-only runs. Race-checked via SHA-256 of the file content the patch was drafted against. 3. No outcome measurement → `measureOutcome(result, opts)` re-runs the cohort after apply, computes composite delta, optionally rolls back applied paths on regression. Process-counts-only reporting is gone. 4. Fabricated file paths → `validateSurfaces` runs at `defineAgent` time and throws `AgentManifestError` listing every missing surface. A manifest with a missing surface fails the moment its `defineAgent(...)` call executes; the check is a runtime filesystem stat, so `pnpm typecheck` (`tsc --noEmit`) alone does not catch it. 5. Per-vertical ImprovementAdapter code (~150 lines × N agents) → `createSurfaceImprovementAdapter(opts)` is the ONE adapter every vertical uses. Per-agent customization happens at the manifest level (`surfaces` + `autoApply`), not by writing a new adapter. 6. Knowledge proposals lacking lint → `createSurfaceKnowledgeAdapter` wraps agent-knowledge's `applyKnowledgeWriteBlocks` with an optional `lintAfterApply` hook. Wiki drift surfaces in `warnings` immediately. 
API surface (new sub-export `@tangle-network/agent-runtime/agent`): - `defineAgent(manifest)` — typed, validated factory - `AgentSurfaces`, `validateSurfaces`, `resolveSubjectPath` — surface map + Subject→Path resolver - `createSurfaceImprovementAdapter(opts)` — LLM-drafted patches + `git apply` / `gh pr create` apply - `createSurfaceKnowledgeAdapter(opts, deps)` — agent-knowledge integration with post-apply lint - `measureOutcome(result, opts)` — before/after cohort delta + rollback - `AgentManifestError` — fail-loud manifest validation Bumps agent-eval dep to ^0.30.0 (FindingSubject lives there). Tests: 124/124 pass (27 new under `tests/agent.test.ts`) covering manifest validation, subject-resolution for every surface variant, propose/apply error paths, race-detection via SHA mismatch, and outcome rollback on regression. Per-vertical PRs now collapse from ~700 lines of glue to a ~50-line `defineAgent({...})` call + the substrate's default adapters. Tracking that cascade in follow-up PRs per repo. 
--- package.json | 9 +- src/agent/define-agent.ts | 267 ++++++++++++++ src/agent/improvement-adapter.ts | 349 ++++++++++++++++++ src/agent/index.ts | 41 +++ src/agent/knowledge-adapter.ts | 139 ++++++++ src/agent/outcome.ts | 126 +++++++ src/agent/surfaces.ts | 245 +++++++++++++ tests/agent.test.ts | 584 +++++++++++++++++++++++++++++++ tsup.config.ts | 1 + 9 files changed, 1759 insertions(+), 2 deletions(-) create mode 100644 src/agent/define-agent.ts create mode 100644 src/agent/improvement-adapter.ts create mode 100644 src/agent/index.ts create mode 100644 src/agent/knowledge-adapter.ts create mode 100644 src/agent/outcome.ts create mode 100644 src/agent/surfaces.ts create mode 100644 tests/agent.test.ts diff --git a/package.json b/package.json index f8ee630..f4c6637 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.10.0", + "version": "0.11.0", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { @@ -28,6 +28,11 @@ "types": "./dist/analyst-loop.d.ts", "import": "./dist/analyst-loop.js", "default": "./dist/analyst-loop.js" + }, + "./agent": { + "types": "./dist/agent.d.ts", + "import": "./dist/agent.js", + "default": "./dist/agent.js" } }, "files": [ @@ -48,7 +53,7 @@ "typecheck": "tsc --noEmit" }, "dependencies": { - "@tangle-network/agent-eval": "^0.29.1" + "@tangle-network/agent-eval": "^0.30.0" }, "devDependencies": { "@biomejs/biome": "^2.4.0", diff --git a/src/agent/define-agent.ts b/src/agent/define-agent.ts new file mode 100644 index 0000000..e5a1f01 --- /dev/null +++ b/src/agent/define-agent.ts @@ -0,0 +1,267 @@ +/** + * `defineAgent` — typed, validated manifest API for declarative agent + * configuration. The substrate consumes this manifest to wire the + * canonical eval pattern + analyst self-improvement loop without any + * per-vertical glue. 
+ * + * Design goal: scale to 1000s of vertical agents. Every agent declares + * its surfaces, rubric, runtime, and analyst configuration in ~50 lines. + * No per-vertical `ImprovementAdapter`. No per-vertical CLI. No + * fabricated paths. + * + * Validation: `defineAgent` runs `validateSurfaces` synchronously and + * throws a structured error if any required surface is missing on + * disk. The cost is one filesystem stat per surface (cheap); the + * benefit is a manifest that can't ship broken. + */ + +import type { TraceAnalystKindSpec } from '@tangle-network/agent-eval' +import { + type AgentSurfaces, + renderSurfaceIssues, + validateSurfaces, +} from './surfaces' + +// ── manifest ───────────────────────────────────────────────────────── + +/** + * The full agent manifest. Each agent ships ONE of these. + * + * Generics: + * `TPersona` — the agent's persona shape (loaded from + * `surfaces.personas`). Defaults to `unknown` so the substrate's + * persona discovery (`loadPersonas`) can accept anything; per-agent + * code re-narrows when it matters. + * `TRunOutput` — the shape `runtime.act` returns. Used by the rubric + * scorers and emitted into the trace. + */ +export interface AgentManifest { + /** + * Stable identifier — used as `projectId` in traces, as the analyst + * loop's `runId` prefix, and as the namespace under which findings + * are persisted. MUST match the agent's repo name to keep + * cross-repo telemetry joinable. + */ + id: string + + /** + * Filesystem root the substrate resolves surface paths against. + * Typically `process.cwd()` or a fixed absolute path. Use an + * absolute path when the agent's tests may run from subdirectories + * (vitest sometimes shifts cwd). + */ + repoRoot: string + + /** + * Map of mutable surfaces the self-improvement loop can edit. See + * `AgentSurfaces` — required: `systemPrompt`, `tools`, `rubric`, + * `knowledge`, `personas`. Optional: `scaffolding`, `memory`, `rag`, + * `outputSchema`. 
+ * + * Every required path is validated at `defineAgent` time. Missing + * paths throw with the full list of offenders. + */ + surfaces: AgentSurfaces + + /** + * Rubric the substrate uses to score each run. Dimensions × weights + * × judges. The substrate computes the weighted composite and + * stamps it into the RunRecord. + */ + rubric: AgentRubric + + /** + * Runtime adapter — how the substrate INVOKES the agent against a + * persona. The `act` function takes a persona + a context (with the + * tracer the substrate threads through for span emission) and + * returns the run output the rubric will score. + * + * The agent's existing production runtime goes in here; the + * substrate is intentionally thin around it. + */ + runtime: AgentRuntime + + /** + * Persona discovery — the substrate loads personas via this function + * at eval start. Can read from `surfaces.personas`, an API, or be + * hardcoded. The substrate calls it once per `runAgentEval` call; + * persona ordering is preserved. + */ + personas: () => Promise> + + /** + * Analyst kinds the substrate runs against each persona's trace. + * Defaults to `DEFAULT_TRACE_ANALYST_KINDS` from agent-eval. Per-agent + * authors can prune (e.g. skip `knowledge-poisoning` when there's no + * knowledge base) or extend (custom domain kinds). + * + * Empty array disables the loop — useful for `pnpm eval --no-analyst`. + */ + analystKinds: ReadonlyArray + + /** + * Analyst LLM configuration. The substrate uses these for all four + * kinds (override per-kind via `analystKinds` if needed). + */ + analyst: AnalystConfig + + /** + * Auto-apply policy. Knowledge / improvement edits land only when + * `enabled === true` AND the source finding's confidence meets the + * threshold. `mode` controls how applies happen: `'write'` mutates + * files in-place; `'open-pr'` writes to a branch and opens a PR. 
+ * + * Default: knowledge auto-applies at confidence ≥0.85 in `'write'` + * mode (wiki edits are git-reversible); improvement stays at + * `enabled: false` until the agent author has measured precision. + */ + autoApply?: AutoApplyPolicy +} + +export interface AgentRubric { + /** Dimensions composing the weighted score. Weights sum to 1.0 by convention. */ + dimensions: ReadonlyArray> + /** + * Optional judges layered on top of deterministic dimensions. Each + * judge returns a score per dimension; the substrate averages judges + * (mean by default) for the LLM contribution. + */ + judges?: ReadonlyArray> +} + +export interface RubricDimension { + /** Unique identifier — appears in finding subjects (`rubric:`). */ + id: string + /** 0..1 — weight in the composite. */ + weight: number + /** + * Deterministic scorer: given the persona + run output, returns a + * 0..1 score. The substrate sums weight × score across dimensions + * for the deterministic composite; judges supplement subjective dims. + */ + score: (input: { persona: unknown; output: TRunOutput }) => number + /** Optional human-readable label for reports. */ + label?: string +} + +export interface JudgeConfig { + /** Judge identifier — appears in trace spans + manifest. */ + id: string + /** Model snapshot to invoke. Pin the snapshot (`claude-sonnet-4-6@2025-04-15`); the validator rejects bare aliases. */ + model: string + /** Dimensions this judge scores. */ + dimensions: ReadonlyArray + /** + * Optional rubric anchors — text examples the judge sees as a + * few-shot prompt to calibrate. STRONGLY recommended for subjective + * dimensions; required by the calibration gate (Pearson ≥0.7). + */ + anchors?: ReadonlyArray<{ input: string; output: TRunOutput; expected: Record }> +} + +export interface AgentRuntime { + /** + * Invoke the agent against one persona. Returns the structured run + * output the rubric will score. 
+ * + * `ctx.emitter` is the substrate-threaded `TraceEmitter` — agents + * SHOULD record their LLM calls / tool calls through it for capture + * integrity. `ctx.deadlineMs` is wall-clock; the runtime SHOULD + * honour it for graceful cancel. + */ + act: (persona: TPersona, ctx: AgentRunContext) => Promise +} + +export interface AgentRunContext { + /** Substrate-managed trace emitter. */ + emitter: import('@tangle-network/agent-eval').TraceEmitter + /** Stable run id for this persona × variant cell. */ + runId: string + /** Variant the runtime is exercising (e.g. `'baseline'`, `'source-grounded'`). */ + variantId?: string + /** Wall-clock deadline (epoch ms). The runtime SHOULD honour for graceful cancel. */ + deadlineMs?: number + /** Optional abort signal. */ + signal?: AbortSignal +} + +export interface AnalystConfig { + /** Model the analyst kinds use. Override per-kind via `analystKinds[i].cost.models`. */ + model: string + /** Optional total budget across all kinds for one run. Substrate enforces via `BudgetGuard`. */ + budgetUsd?: number + /** Backend hint for the AxAIService factory — same shape every kind uses. */ + backend?: { + name?: 'openai' | 'router' + apiKey?: string + baseUrl?: string + } +} + +export interface AutoApplyPolicy { + knowledge?: { + enabled: boolean + confidenceThreshold?: number + mode?: 'write' | 'open-pr' + } + improvement?: { + enabled: boolean + confidenceThreshold?: number + mode?: 'write' | 'open-pr' + } +} + +// ── factory + validation ───────────────────────────────────────────── + +export class AgentManifestError extends Error { + constructor( + message: string, + public readonly agentId: string, + public readonly issues: ReadonlyArray = [], + ) { + super(message) + this.name = 'AgentManifestError' + } +} + +/** + * Construct a validated agent manifest. Throws `AgentManifestError` + * if any required surface is missing on disk. 
+ * + * Generics: pass your persona / output types if you want narrowed + * `runtime.act` signatures: + * `defineAgent({ ... })` + * + * Most callers don't need the generics — the substrate operates on + * `unknown` payloads internally and the manifest's `score` / + * `runtime.act` see the typed shapes via TypeScript inference at + * the call site. + */ +export function defineAgent( + manifest: AgentManifest, +): AgentManifest { + if (!manifest.id || manifest.id.trim().length === 0) { + throw new AgentManifestError('defineAgent: `id` is required', manifest.id ?? '') + } + if (!manifest.repoRoot || manifest.repoRoot.trim().length === 0) { + throw new AgentManifestError('defineAgent: `repoRoot` is required', manifest.id) + } + const issues = validateSurfaces(manifest.surfaces, manifest.repoRoot) + if (issues.length > 0) { + throw new AgentManifestError( + renderSurfaceIssues(issues, manifest.repoRoot), + manifest.id, + issues, + ) + } + // Lightweight rubric sanity: weights sum to ~1.0 (no hard requirement — + // the substrate normalizes — but flag wildly miscalibrated weights). + const total = manifest.rubric.dimensions.reduce((acc, d) => acc + d.weight, 0) + if (manifest.rubric.dimensions.length > 0 && (total < 0.5 || total > 1.5)) { + throw new AgentManifestError( + `defineAgent(${manifest.id}): rubric dimension weights sum to ${total.toFixed(3)} — should be ~1.0`, + manifest.id, + ) + } + return manifest +} diff --git a/src/agent/improvement-adapter.ts b/src/agent/improvement-adapter.ts new file mode 100644 index 0000000..c350395 --- /dev/null +++ b/src/agent/improvement-adapter.ts @@ -0,0 +1,349 @@ +/** + * Substrate-default `ImprovementAdapter` — surfaces-driven, LLM-drafted + * patches, optional auto-apply or PR-open. + * + * This is the one ImprovementAdapter every vertical agent uses. 
The + * substrate parses each finding's `subject` via + * `parseFindingSubject` (agent-eval), resolves it to a real file path + * via the agent's `AgentSurfaces`, reads the current content, and asks + * an LLM to draft a unified-diff patch given the finding + current + * content + per-kind editing-discipline rules. + * + * Auto-apply gates on the source-finding's confidence and the + * autoApply.improvement policy. Two modes: + * `write` — apply the patch in-place via `git apply -p0`. Operator + * reviews via `git diff`. + * `open-pr` — write to a branch, commit, push, open a PR via `gh`. + * Operator reviews via the PR UI. + * + * Fail-loud rules: + * - Findings whose subject doesn't parse → counted in `errors`. + * - Findings whose subject targets an undeclared surface → counted in + * `errors` with the offending kind in the message. + * - Findings whose target path doesn't exist AND the kind isn't a + * create-new variant (`new-tool`, `knowledge.wiki`) → counted in + * `errors` with the resolved path in the message. + * - LLM drafts that fail JSON-schema validation → counted in + * `errors` with the schema issue. + * + * No silent skips. Every dropped finding has a recorded reason the + * loop's report surfaces. + */ + +import { readFileSync } from 'node:fs' +import { spawnSync } from 'node:child_process' +import { + type AnalystFinding, + type FindingSubject, + parseFindingSubject, +} from '@tangle-network/agent-eval' +import type { ImprovementAdapter } from '../analyst-loop/types' +import type { AgentSurfaces, ResolvedSurface } from './surfaces' +import { resolveSubjectPath } from './surfaces' + +// ── proposal shape ─────────────────────────────────────────────────── + +export interface SurfaceImprovementEdit { + /** Stable id derived from the source finding so re-proposals are idempotent. */ + id: string + /** The finding that produced this edit — for revert + audit trail. 
*/ + sourceFindingId: string + /** Parsed subject; included so the apply step doesn't re-parse. */ + subject: FindingSubject + /** Resolved on-disk target. */ + target: ResolvedSurface + /** SHA-256 of the current file content the patch was drafted against. */ + baseSha256: string + /** Unified-diff patch the LLM drafted (relative to `target.absolutePath`). */ + patch: string + /** One-line summary the operator sees in the report / PR title. */ + summary: string + /** Multi-line rationale for the PR body — finding context + LLM reasoning. */ + rationale: string + /** Carry-forward from the finding so the apply gate can check the threshold. */ + confidence: number + /** Carry-forward severity for prioritization. */ + severity: AnalystFinding['severity'] +} + +export interface CreateSurfaceImprovementAdapterOpts { + surfaces: AgentSurfaces + repoRoot: string + /** + * LLM-draft callback. Given a finding + current file content + the + * resolved target, returns a unified-diff patch + summary + rationale. + * + * Required — the substrate doesn't ship a hardcoded prompt; the agent + * author picks the model (Haiku for cheap routine drafts, Sonnet for + * substantive prompt rewrites, etc.) via this callback. + */ + draftPatch: (input: DraftPatchInput) => Promise + /** + * Apply mode: + * `write` — `git apply` in-place; operator reviews via `git diff` + * `open-pr` — branch + commit + push + `gh pr create` + * `none` — never apply; collect proposals for the report only + * + * The `apply` method honours this even when the loop calls it; the + * effective behaviour is also gated on the per-finding confidence + * threshold via `runAnalystLoop`'s `autoApply` policy. + */ + mode?: 'write' | 'open-pr' | 'none' + /** When `mode === 'open-pr'`, the base branch new PRs target. Default: `main`. */ + baseBranch?: string + /** Required for `mode === 'open-pr'` — the GH owner/repo (`tangle-network/tax-agent`). 
*/ + ghRepo?: string + /** + * When the resolved target doesn't exist, allow the substrate to + * CREATE the file (for `knowledge.wiki`, `new-tool` subjects). Default + * true for those kinds, false for `system-prompt` / `rubric` / etc. + * (named sections that don't exist are a contract violation, not a + * scaffolding opportunity). + */ + allowCreateForKinds?: ReadonlyArray +} + +export interface DraftPatchInput { + finding: AnalystFinding + subject: FindingSubject + target: ResolvedSurface + /** Current file content (empty string when `intent === 'create-new'`). */ + currentContent: string +} + +export interface DraftPatchOutput { + /** Unified diff against the current file content. Empty string skips this finding. */ + patch: string + /** One-line summary for the operator. */ + summary: string + /** Multi-line rationale for the PR body. */ + rationale: string +} + +// ── factory ────────────────────────────────────────────────────────── + +const DEFAULT_CREATE_KINDS: ReadonlyArray = [ + 'knowledge.wiki', + 'knowledge.claim', + 'knowledge.raw', + 'new-tool', +] + +export function createSurfaceImprovementAdapter( + opts: CreateSurfaceImprovementAdapterOpts, +): ImprovementAdapter { + const mode = opts.mode ?? 'none' + const allowCreate = opts.allowCreateForKinds ?? DEFAULT_CREATE_KINDS + + return { + async proposeFromFindings(findings) { + const edits: SurfaceImprovementEdit[] = [] + const errors: Array<{ findingId: string; subject: string; message: string }> = [] + let skipped = 0 + + for (const f of findings) { + const subject = parseFindingSubject(f.subject) + if (subject === null) { + if (f.subject !== undefined) { + errors.push({ + findingId: f.finding_id, + subject: f.subject, + message: 'subject does not parse against the finding-subject grammar', + }) + } else { + // Subject-less findings are descriptive, not actionable — + // legitimate; count in `skipped` not `errors`. 
+ skipped += 1 + } + continue + } + + // `cluster` findings (failure-mode) are evidence, not mutations. + if (subject.kind === 'cluster') { + skipped += 1 + continue + } + + // `agent-knowledge:*` findings flow to the KnowledgeAdapter; + // the ImprovementAdapter skips them so subjects don't double-route. + if (subject.kind.startsWith('knowledge.')) { + skipped += 1 + continue + } + + const target = resolveSubjectPath(subject, opts.surfaces, opts.repoRoot) + if (target === null) { + errors.push({ + findingId: f.finding_id, + subject: f.subject ?? '', + message: `subject kind "${subject.kind}" targets an undeclared surface; declare it in AgentSurfaces or stop emitting this subject`, + }) + continue + } + + if (target.intent === 'create-new' && !allowCreate.includes(subject.kind)) { + errors.push({ + findingId: f.finding_id, + subject: f.subject ?? '', + message: `target ${target.repoRelativePath} does not exist; the kind "${subject.kind}" requires an existing target (analyst named a section that isn't in the codebase)`, + }) + continue + } + + const currentContent = target.exists ? readFileSync(target.absolutePath, 'utf-8') : '' + + let draft: DraftPatchOutput + try { + draft = await opts.draftPatch({ finding: f, subject, target, currentContent }) + } catch (err) { + errors.push({ + findingId: f.finding_id, + subject: f.subject ?? '', + message: `draftPatch threw: ${err instanceof Error ? 
err.message : String(err)}`, + }) + continue + } + + if (draft.patch.trim().length === 0) { + skipped += 1 + continue + } + + edits.push({ + id: `imp-${f.finding_id}`, + sourceFindingId: f.finding_id, + subject, + target, + baseSha256: sha256(currentContent), + patch: draft.patch, + summary: draft.summary, + rationale: draft.rationale, + confidence: f.confidence, + severity: f.severity, + }) + } + + return { edits, skipped, errors } + }, + + async apply(edits) { + const applied: string[] = [] + const warnings: string[] = [] + + if (mode === 'none') { + warnings.push( + 'createSurfaceImprovementAdapter: mode=none; no edits applied — adjust manifest.autoApply.improvement.mode', + ) + return { applied, warnings } + } + + if (mode === 'open-pr' && !opts.ghRepo) { + warnings.push('createSurfaceImprovementAdapter: mode=open-pr requires `ghRepo`; falling back to no-op') + return { applied, warnings } + } + + for (const edit of edits) { + // Race-detection: confirm the file content hasn't moved since the + // patch was drafted. A diff applied against drifted content is a + // recipe for silent corruption. + const current = edit.target.exists + ? readFileSync(edit.target.absolutePath, 'utf-8') + : '' + if (sha256(current) !== edit.baseSha256) { + warnings.push( + `${edit.target.repoRelativePath}: base SHA mismatch; file changed after draft. Skipping.`, + ) + continue + } + + const ok = applyPatchInPlace(edit, opts.repoRoot) + if (!ok) { + warnings.push(`${edit.target.repoRelativePath}: git apply failed`) + continue + } + applied.push(edit.target.repoRelativePath) + } + + if (mode === 'open-pr' && applied.length > 0 && opts.ghRepo) { + const prUrl = openPullRequest( + applied, + edits.filter((e) => applied.includes(e.target.repoRelativePath)), + opts.repoRoot, + opts.ghRepo, + opts.baseBranch ?? 
'main', + ) + if (prUrl) warnings.push(`opened PR: ${prUrl}`) + else warnings.push('PR creation failed; edits are committed to a local branch only') + } + + return { applied, warnings } + }, + } +} + +// ── apply helpers ──────────────────────────────────────────────────── + +function applyPatchInPlace(edit: SurfaceImprovementEdit, repoRoot: string): boolean { + const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], { + cwd: repoRoot, + input: edit.patch, + encoding: 'utf-8', + }) + return result.status === 0 +} + +function openPullRequest( + paths: ReadonlyArray, + edits: ReadonlyArray, + repoRoot: string, + ghRepo: string, + baseBranch: string, +): string | null { + const branch = `analyst-loop/${Date.now()}-${edits[0]?.sourceFindingId.slice(0, 12) ?? 'edits'}` + // Create branch, stage, commit + const checkout = spawnSync('git', ['checkout', '-b', branch], { cwd: repoRoot }) + if (checkout.status !== 0) return null + const add = spawnSync('git', ['add', ...paths], { cwd: repoRoot }) + if (add.status !== 0) return null + const title = `analyst-loop: ${edits[0]?.summary ?? 
`${edits.length} improvement edits`}` + const body = [ + `Automated analyst-loop edits — review carefully before merge.`, + '', + `Source findings:`, + ...edits.map((e) => ` - ${e.sourceFindingId} (confidence ${e.confidence.toFixed(2)}, severity ${e.severity})`), + '', + 'Rationales:', + ...edits.map((e) => `\n## ${e.target.repoRelativePath}\n\n${e.rationale}`), + ].join('\n') + const commit = spawnSync('git', ['commit', '-m', title, '-m', body], { cwd: repoRoot }) + if (commit.status !== 0) return null + const push = spawnSync('git', ['push', '-u', 'origin', branch], { cwd: repoRoot }) + if (push.status !== 0) return null + const pr = spawnSync( + 'gh', + [ + 'pr', + 'create', + '--repo', + ghRepo, + '--title', + title, + '--body', + body, + '--base', + baseBranch, + '--head', + branch, + ], + { cwd: repoRoot, encoding: 'utf-8' }, + ) + if (pr.status !== 0) return null + return pr.stdout.trim() +} + +function sha256(s: string): string { + // node:crypto is dynamic-imported lazily so the adapter can be tested in + // environments without crypto (browser tests, mocked envs). + const crypto = require('node:crypto') as typeof import('node:crypto') + return crypto.createHash('sha256').update(s, 'utf-8').digest('hex') +} diff --git a/src/agent/index.ts b/src/agent/index.ts new file mode 100644 index 0000000..764c633 --- /dev/null +++ b/src/agent/index.ts @@ -0,0 +1,41 @@ +/** + * `@tangle-network/agent-runtime/agent` — declarative agent manifest + + * substrate-default adapters. + * + * Every vertical agent (tax / legal / gtm / creative / N future + * verticals) ships ONE `defineAgent({...})` call + a thin invocation + * of `runAnalystLoop` wired through the substrate-default adapters. + * No per-vertical glue. No fabricated paths. No theater. 
+ */ + +export type { + AgentManifest, + AgentRubric, + AgentRuntime, + AgentRunContext, + AnalystConfig, + AutoApplyPolicy, + JudgeConfig, + RubricDimension, +} from './define-agent' +export { AgentManifestError, defineAgent } from './define-agent' + +export type { AgentSurfaces, ResolvedSurface, SurfaceValidationIssue } from './surfaces' +export { renderSurfaceIssues, resolveSubjectPath, validateSurfaces } from './surfaces' + +export type { + CreateSurfaceImprovementAdapterOpts, + DraftPatchInput, + DraftPatchOutput, + SurfaceImprovementEdit, +} from './improvement-adapter' +export { createSurfaceImprovementAdapter } from './improvement-adapter' + +export type { + CreateSurfaceKnowledgeAdapterOpts, + KnowledgeAdapterDeps, +} from './knowledge-adapter' +export { createSurfaceKnowledgeAdapter } from './knowledge-adapter' + +export type { OutcomeMeasurement, OutcomeMeasurementOpts } from './outcome' +export { measureOutcome } from './outcome' diff --git a/src/agent/knowledge-adapter.ts b/src/agent/knowledge-adapter.ts new file mode 100644 index 0000000..dd6dbeb --- /dev/null +++ b/src/agent/knowledge-adapter.ts @@ -0,0 +1,139 @@ +/** + * Substrate-default `KnowledgeAdapter` — wraps agent-knowledge's + * `proposeFromFindings` + `applyKnowledgeWriteBlocks` with substrate + * defaults (auto-lint after apply, source linkage via finding id). + * + * Every agent that ships a `.agent-knowledge/` tree uses this adapter + * unmodified. Per-agent customization happens at the manifest level + * (`autoApply.knowledge.confidenceThreshold`, etc.), not by writing a + * new adapter. + * + * Lint discipline: after each apply we run agent-knowledge's + * `lintKnowledgeIndex` to catch broken links / circular claims / + * duplicate pages introduced by the new writes. Findings that fail the + * post-apply lint are recorded in `warnings`; the apply itself is not + * rolled back (lint failures are soft — humans review the wiki state). 
+ */ + +import type { AnalystFinding } from '@tangle-network/agent-eval' +import type { KnowledgeAdapter } from '../analyst-loop/types' + +export interface CreateSurfaceKnowledgeAdapterOpts { + /** `.agent-knowledge/` root (absolute path the substrate writes blocks against). */ + knowledgeRoot: string +} + +/** + * Build the adapter. We accept the agent-knowledge functions as DI so + * the substrate stays decoupled from a specific agent-knowledge + * version — the agent author imports them in their manifest module + * and hands them to the factory. + * + * `proposeFromFindings(findings)` returns + * `{ proposals: KnowledgeProposal[]; skipped: number; errors: ... }`. + * + * `applyKnowledgeWriteBlocks(root, content)` returns + * `{ written: string[]; warnings: string[] }`. + * + * `lintKnowledgeIndex(index)` (optional) returns `KnowledgeLintFinding[]`. + */ +export interface KnowledgeAdapterDeps { + proposeFromFindings: (findings: ReadonlyArray) => { + proposals: TProposal[] + skipped: number + errors: Array<{ findingId: string; subject: string; message: string }> + } + applyKnowledgeWriteBlocks: ( + root: string, + proposalText: string, + ) => Promise<{ written: string[]; warnings: string[] }> + /** + * Optional post-apply lint hook. The substrate runs it after each + * batch of writes; failures land in `warnings` (the apply is not + * rolled back — lint signals drift to review, not block). 
+ */ + lintAfterApply?: (root: string) => Promise> +} + +export function createSurfaceKnowledgeAdapter( + opts: CreateSurfaceKnowledgeAdapterOpts, + deps: KnowledgeAdapterDeps, +): KnowledgeAdapter { + return { + proposeFromFindings(findings) { + const batch = deps.proposeFromFindings(findings) + return { + proposals: batch.proposals, + skipped: batch.skipped, + errors: batch.errors, + } + }, + async apply(proposals) { + const written: string[] = [] + const warnings: string[] = [] + for (const p of proposals) { + const proposalText = renderProposalAsWriteBlock(p) + if (proposalText === null) { + warnings.push( + `proposal has no writeBlocks/content; skipping (sourceFindingId=${getSourceFindingId(p)})`, + ) + continue + } + try { + const r = await deps.applyKnowledgeWriteBlocks(opts.knowledgeRoot, proposalText) + written.push(...r.written) + warnings.push(...r.warnings) + } catch (err) { + warnings.push( + `applyKnowledgeWriteBlocks failed: ${err instanceof Error ? err.message : String(err)}`, + ) + } + } + if (deps.lintAfterApply && written.length > 0) { + try { + const lintIssues = await deps.lintAfterApply(opts.knowledgeRoot) + for (const issue of lintIssues) warnings.push(`lint: ${issue}`) + } catch (err) { + warnings.push( + `lintAfterApply failed: ${err instanceof Error ? err.message : String(err)}`, + ) + } + } + return { written, warnings } + }, + } +} + +/** + * Pluck the canonical write-block text from a proposal regardless of + * which exact agent-knowledge version produced it. We accept either: + * - `{ writeBlocks: Array<{ path, content }> }` — the typed shape + * 1.3.0+ emits + * - `{ proposalText: string }` — legacy single-block shape + * - `{ content: string }` — minimal raw form + * + * Returns `null` when nothing parseable is present (warned upstream). 
+ */ +function renderProposalAsWriteBlock(p: unknown): string | null { + if (!p || typeof p !== 'object') return null + const obj = p as Record + if (Array.isArray(obj.writeBlocks)) { + const blocks = obj.writeBlocks as Array<{ path?: string; content?: string }> + if (blocks.length === 0) return null + return blocks + .map((b) => (typeof b.content === 'string' ? b.content : '')) + .filter((s) => s.length > 0) + .join('\n\n') + } + if (typeof obj.proposalText === 'string') return obj.proposalText + if (typeof obj.content === 'string') return obj.content + return null +} + +function getSourceFindingId(p: unknown): string { + if (!p || typeof p !== 'object') return '' + const obj = p as Record + if (typeof obj.sourceFindingId === 'string') return obj.sourceFindingId + if (typeof obj.id === 'string') return obj.id + return '' +} diff --git a/src/agent/outcome.ts b/src/agent/outcome.ts new file mode 100644 index 0000000..47558e3 --- /dev/null +++ b/src/agent/outcome.ts @@ -0,0 +1,126 @@ +/** + * `OutcomeMeasurement` — the missing metric that turns the analyst + * loop from "observability" into "self-improvement". + * + * Without this hook, the loop reports process counts (`findings: 42`, + * `applied: 7`) and never proves the applied edits actually improved + * anything. With this hook, the substrate re-runs the cohort against + * the same personas after each apply pass and reports a composite + * score delta. A negative delta is the substrate's strongest signal + * to either roll back or surface for review. + * + * Wiring is intentionally simple: pass the manifest + the `runAgentEval` + * function and a list of `personaIds` to re-run. The wrapper: + * 1. Captures the baseline composite from the just-finished run. + * 2. After `runAnalystLoop` returns, re-invokes `runAgentEval` against + * the same persona slice. + * 3. Computes the delta and appends to `loop-report.json`. + * 4. If `rollbackOnRegression` and delta < 0, reverts applied edits. 
+ */
+
+import type { RunAnalystLoopResult } from '../analyst-loop/types'
+
+/** Before/after score comparison that `measureOutcome` attaches to a loop result. */
+export interface OutcomeMeasurement {
+  /** Baseline composite before applies — captured from the most-recent eval run. */
+  baselineComposite: number
+  /** Composite after re-running the cohort with applied edits. */
+  afterComposite: number
+  /** `afterComposite - baselineComposite`. Positive = the loop improved the agent. */
+  delta: number
+  /** Per-persona deltas for finer-grained review. */
+  perPersona: ReadonlyArray<{ personaId: string; before: number; after: number; delta: number }>
+  /**
+   * When the substrate rolled back applies due to regression, the paths
+   * reverted. Empty when no rollback took place.
+   */
+  rolledBackPaths: ReadonlyArray
+}
+
+/** Inputs `measureOutcome` needs besides the loop result itself. */
+export interface OutcomeMeasurementOpts {
+  /** Composite scores from the run that produced the findings. */
+  baseline: ReadonlyArray<{ personaId: string; composite: number }>
+  /**
+   * Re-run callback — the substrate invokes this after applies. The
+   * agent author provides their `runAgentEval`-equivalent so the
+   * substrate can ask "score this persona slice now."
+   *
+   * The callback SHOULD reuse the same cohort + judges + variant as
+   * the baseline run; only the agent's mutable surfaces have changed.
+   *
+   * It receives the persona ids of `baseline`, and is not invoked at
+   * all when the loop result lists no applied paths.
+   */
+  reRunCohort: (personaIds: ReadonlyArray) => Promise<
+    ReadonlyArray<{ personaId: string; composite: number }>
+  >
+  /** When `true`, applied edits are reverted on negative delta. Default `false`. */
+  rollbackOnRegression?: boolean
+  /**
+   * Callback to revert a list of paths (typically `git checkout HEAD --`).
+   * A rollback only happens when this is supplied together with
+   * `rollbackOnRegression`; either one alone is a no-op.
+   */
+  revert?: (paths: ReadonlyArray) => Promise
+}
+
+/**
+ * Stamp an `OutcomeMeasurement` onto a finished `runAnalystLoop`
+ * result. When the result lists applied knowledge or improvement
+ * paths, the cohort is re-run through `opts.reRunCohort` and compared
+ * with `opts.baseline`; otherwise a zero-delta measurement is returned
+ * without re-running anything. The measurement is only returned —
+ * persisting it (e.g. to `loop-report.json`) is presumably the caller's job.
+ *
+ * The function returns the original `RunAnalystLoopResult` enriched
+ * with `outcome` so callers stay backwards-compatible (the field is
+ * optional on the type; missing means no measurement was wired).
+ *
+ * Both composites are averaged over the baseline persona set. A persona
+ * the re-run did not score keeps its baseline composite (zero delta), so
+ * a partial or empty re-run cannot by itself fabricate a regression (and
+ * a rollback) or an improvement. Rows for personas that are not part of
+ * `opts.baseline` are ignored.
+ */
+export async function measureOutcome(
+  result: RunAnalystLoopResult,
+  opts: OutcomeMeasurementOpts,
+): Promise & { outcome: OutcomeMeasurement }> {
+  const applied = result.knowledge?.applied ?? []
+  const improvementsApplied = result.improvement?.applied ?? []
+  const allApplied = [...applied, ...improvementsApplied]
+  const baselineComposite = meanComposite(opts.baseline)
+
+  // No applies → no outcome to measure. Return a zero-delta to keep the
+  // shape stable for consumers; baseline / after equal.
+  if (allApplied.length === 0) {
+    return {
+      ...result,
+      outcome: {
+        baselineComposite,
+        afterComposite: baselineComposite,
+        delta: 0,
+        perPersona: opts.baseline.map((b) => ({
+          personaId: b.personaId,
+          before: b.composite,
+          after: b.composite,
+          delta: 0,
+        })),
+        rolledBackPaths: [],
+      },
+    }
+  }
+
+  const personaIds = opts.baseline.map((b) => b.personaId)
+  const after = await opts.reRunCohort(personaIds)
+  const afterByPersona = new Map(after.map((r) => [r.personaId, r.composite]))
+
+  const perPersona = opts.baseline.map((b) => {
+    // Unscored personas fall back to their baseline value (delta 0).
+    const a = afterByPersona.get(b.personaId) ?? b.composite
+    return { personaId: b.personaId, before: b.composite, after: a, delta: a - b.composite }
+  })
+  // Average the SAME persona set as the baseline. Averaging the raw re-run
+  // rows would compare two different cohorts whenever the re-run is partial,
+  // and would read an empty re-run as a composite of 0.
+  const afterComposite = meanComposite(perPersona.map((row) => ({ composite: row.after })))
+  const delta = afterComposite - baselineComposite
+
+  let rolledBackPaths: string[] = []
+  if (delta < 0 && opts.rollbackOnRegression && opts.revert) {
+    await opts.revert(allApplied)
+    rolledBackPaths = [...allApplied]
+  }
+
+  return {
+    ...result,
+    outcome: {
+      baselineComposite,
+      afterComposite,
+      delta,
+      perPersona,
+      rolledBackPaths,
+    },
+  }
+}
+
+/** Arithmetic mean of the `composite` fields; `0` for an empty list. */
+function meanComposite(rows: ReadonlyArray<{ composite: number }>): number {
+  if (rows.length === 0) return 0
+  return rows.reduce((acc, r) => acc + r.composite, 0) / rows.length
+}
diff --git a/src/agent/surfaces.ts b/src/agent/surfaces.ts
new file mode 100644
index 0000000..ee670bc
--- /dev/null
+++ b/src/agent/surfaces.ts
@@ -0,0 +1,245 @@
+/**
+ * `AgentSurfaces` — declarative map of the mutable file/directory paths
+ * the self-improvement loop can edit on behalf of an agent.
+ *
+ * The substrate uses this map to resolve every parsed `FindingSubject`
+ * (from agent-eval) to a real on-disk path. No per-vertical glue;
+ * no fabricated paths; no silent `existsSync(...)` skips that hide
+ * misconfiguration from the operator.
+ *
+ * Surfaces are validated at `defineAgent` time — missing paths fail
+ * loud with a list of every offender. A surface that's not needed
+ * (e.g. an agent with no RAG corpora) is simply omitted; the loop
+ * refuses to route those subjects rather than fabricating a target.
+ */
+
+import { existsSync } from 'node:fs'
+import { isAbsolute, join } from 'node:path'
+import type { FindingSubject } from '@tangle-network/agent-eval'
+
+/**
+ * Surface declarations. Every path is repo-relative (or absolute) at
+ * `defineAgent` time. At resolution time, paths are joined against the
+ * agent's `repoRoot`.
+ *
+ * `systemPrompt`, `tools`, `personas` are DIRECTORIES; the loop appends
+ * `<section>.md`, `<tool>/README.md`, `<persona>.yaml` etc.
+ * `rubric`, `outputSchema` are SINGLE FILES; the loop edits them in
+ * place.
+ *
+ * `knowledge` is the agent-knowledge root (typically `.agent-knowledge`);
+ * `applyKnowledgeWriteBlocks` writes pages relative to it.
+ *
+ * Optional surfaces (`scaffolding`, `memory`, `rag`, `outputSchema`)
+ * can be omitted — the loop will reject findings targeting them with a
+ * clear log message instead of fabricating a path.
+ */
+export interface AgentSurfaces {
+  /** Directory containing one markdown file per system-prompt section (`<section>.md`). */
+  systemPrompt: string
+  /** Directory containing one subdir per tool (`<tool>/README.md`). */
+  tools: string
+  /** Single file (TypeScript module) defining the rubric weights + dimensions. */
+  rubric: string
+  /** Knowledge-base root; typically `.agent-knowledge`. */
+  knowledge: string
+  /** Directory containing one YAML/JSON file per persona. */
+  personas: string
+  /** Optional: directory containing scaffolding rules (precondition checks, retry policies). */
+  scaffolding?: string
+  /**
+   * Optional: memory store path (JSONL / SQLite / DB).
+   * NOTE(review): the subject resolver in this file treats this as a
+   * directory and targets `<key>.json` inside it — confirm which is intended.
+   */
+  memory?: string
+  /** Optional: directory containing RAG corpora (`<corpus>/<docId>.md`). */
+  rag?: string
+  /** Optional: single file defining the output schema (Zod / JSON Schema). */
+  outputSchema?: string
+}
+
+/** A finding subject resolved to a concrete location on disk. */
+export interface ResolvedSurface {
+  /** Absolute filesystem path the operator can `cat` / `vim`. */
+  absolutePath: string
+  /**
+   * Repo-relative path for PR descriptions, diffs, audit logs.
+   * NOTE(review): this is the surface path joined with the subject's file
+   * name, so it is absolute when the surface itself was declared absolute.
+   */
+  repoRelativePath: string
+  /** Whether the path currently exists on disk. */
+  exists: boolean
+  /** The substrate's intent: edit an existing file or create a new one. */
+  intent: 'edit-existing' | 'create-new'
+}
+
+/**
+ * Resolve a parsed `FindingSubject` to the file path the substrate
+ * should edit (or create) on disk.
+ *
+ * Returns `null` when:
+ *   - the subject targets a surface the agent didn't declare
+ *     (e.g.
`rag:*` when `surfaces.rag` is undefined),
+ *   - the subject is a `cluster` (failure-mode emits these as evidence,
+ *     not actionable mutations — they don't route to a file), or a
+ *     `websearch.outdated` / `prior-run-summary` stale signal, OR
+ *   - the subject's own segments (slug, tool name, corpus, doc id, …)
+ *     would place the target outside its declared surface, e.g. via
+ *     `..` — such a subject is treated as unroutable.
+ *
+ * Returns a `ResolvedSurface` with `intent: 'create-new'` when the
+ * subject names a path that doesn't yet exist (e.g. a new wiki page).
+ * The caller chooses whether to honour the create — for tightly-managed
+ * surfaces like `systemPrompt` it's usually a contract violation
+ * (the analyst named a section that doesn't exist); for `knowledge`
+ * it's the whole point.
+ */
+export function resolveSubjectPath(
+  subject: FindingSubject,
+  surfaces: AgentSurfaces,
+  repoRoot: string,
+): ResolvedSurface | null {
+  const rel = relativePathForSubject(subject, surfaces)
+  if (rel === null) return null
+  // Absolute surface declarations are used as-is; everything else hangs
+  // off the repo root.
+  const abs = isAbsolute(rel) ? rel : join(repoRoot, rel)
+  const exists = existsSync(abs)
+  return {
+    absolutePath: abs,
+    repoRelativePath: rel,
+    exists,
+    intent: exists ? 'edit-existing' : 'create-new',
+  }
+}
+
+/**
+ * Join `segments` onto a surface root and return the normalised path, or
+ * `null` when the result is not strictly inside that root.
+ *
+ * Subject fields originate from analyst findings, so a segment such as
+ * `../../package` must not be able to walk out of the declared surface.
+ * Nested segments (`topic/page`) remain allowed.
+ */
+function joinInsideSurface(root: string, ...segments: string[]): string | null {
+  const full = join(root, ...segments)
+  // `join` normalises `.` / `..`; drop trailing separators so the prefix
+  // test below compares against exactly one separator.
+  const base = join(root).replace(/[\\/]+$/, '')
+  if (base === '.') {
+    // The surface is the working root itself: anything that climbs is out.
+    return full === '..' || full.startsWith('../') || full.startsWith('..\\') ? null : full
+  }
+  return full.startsWith(`${base}/`) || full.startsWith(`${base}\\`) ? full : null
+}
+
+/**
+ * Map a subject to a path under the matching surface. Returns `null` for
+ * undeclared optional surfaces, for subject kinds that never route to a
+ * file, and for subjects whose segments would escape the surface.
+ */
+function relativePathForSubject(
+  subject: FindingSubject,
+  surfaces: AgentSurfaces,
+): string | null {
+  switch (subject.kind) {
+    case 'knowledge.wiki':
+    case 'knowledge.stale':
+      return joinInsideSurface(surfaces.knowledge, `${subject.slug}.md`)
+    case 'knowledge.claim':
+      // Claims land in a per-topic claims directory under the knowledge root.
+      return joinInsideSurface(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)
+    case 'knowledge.raw':
+      return joinInsideSurface(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)
+    case 'system-prompt':
+      return joinInsideSurface(surfaces.systemPrompt, `${slugify(subject.section)}.md`)
+    case 'tool-doc':
+      // The tool name is used verbatim (it is a directory name such as
+      // `list_invoices`); only the aspect is slugified.
+      return subject.aspect
+        ? joinInsideSurface(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)
+        : joinInsideSurface(surfaces.tools, subject.tool, 'README.md')
+    case 'new-tool':
+      return joinInsideSurface(surfaces.tools, subject.name, 'README.md')
+    case 'rag':
+      if (!surfaces.rag) return null
+      return joinInsideSurface(surfaces.rag, subject.corpus, `${subject.docId}.md`)
+    case 'memory':
+      if (!surfaces.memory) return null
+      return joinInsideSurface(surfaces.memory, `${slugify(subject.key)}.json`)
+    case 'scaffolding':
+      if (!surfaces.scaffolding) return null
+      return joinInsideSurface(surfaces.scaffolding, `${slugify(subject.concern)}.md`)
+    case 'output-schema':
+      if (!surfaces.outputSchema) return null
+      // outputSchema is a single file — the field name is metadata for
+      // the LLM-drafted patch, not a separate path.
+      return surfaces.outputSchema
+    case 'websearch.outdated':
+    case 'prior-run-summary':
+      // Stale signals don't map to a single file — they're handled by
+      // the knowledge adapter as `agent-knowledge:stale:*` after the
+      // operator decides which wiki page to retract. The substrate
+      // doesn't auto-route them.
+      return null
+    case 'cluster':
+      // failure-mode cluster labels are evidence, not mutations.
+      return null
+  }
+}
+
+/**
+ * Lower-case `s` and collapse every run of characters outside `[a-z0-9-]`
+ * into a single `-`, trim leading/trailing dashes and cap the result at
+ * 200 characters. Falls back to `'untitled'` when nothing is left.
+ */
+function slugify(s: string): string {
+  return (
+    s
+      .toLowerCase()
+      .replace(/[^a-z0-9-]+/g, '-')
+      .replace(/^-+|-+$/g, '')
+      .slice(0, 200)
+      // Truncation can cut right after a separator; trim again so the file
+      // name never ends in `-`.
+      .replace(/-+$/, '') || 'untitled'
+  )
+}
+
+/**
+ * Validate that every declared surface exists on disk under `repoRoot`.
+ *
+ * Returns an array of `SurfaceValidationIssue` — empty when all required
+ * surfaces resolve. `defineAgent` throws with the issues rendered, so
+ * a misconfigured manifest fails at startup (not at the first finding
+ * the loop produces 20 minutes later).
+ */
+export interface SurfaceValidationIssue {
+  surface: keyof AgentSurfaces
+  path: string
+  reason: 'missing' | 'not-directory' | 'not-file'
+}
+
+/**
+ * Check each surface path for existence, resolving relative paths
+ * against `repoRoot`. Required surfaces (`systemPrompt`, `tools`,
+ * `personas`, `knowledge`, `rubric`) are reported when unset or absent;
+ * optional ones only when declared and absent. Every offender is
+ * collected — validation does not stop at the first failure.
+ *
+ * NOTE(review): only existence is tested, so every issue carries
+ * `reason: 'missing'`; the `'not-directory'` / `'not-file'` reasons are
+ * never produced here.
+ */
+export function validateSurfaces(
+  surfaces: AgentSurfaces,
+  repoRoot: string,
+): ReadonlyArray {
+  // Directories first, then the single-file rubric; issues come out in this order.
+  const requiredKeys = ['systemPrompt', 'tools', 'personas', 'knowledge', 'rubric'] as const
+  const optionalKeys = ['scaffolding', 'memory', 'rag', 'outputSchema'] as const
+  const found: SurfaceValidationIssue[] = []
+  const isOnDisk = (declared: string): boolean =>
+    existsSync(isAbsolute(declared) ? declared : join(repoRoot, declared))
+
+  for (const surface of requiredKeys) {
+    const declared = surfaces[surface] as string | undefined
+    if (!declared) {
+      found.push({ surface, path: '', reason: 'missing' })
+    } else if (!isOnDisk(declared)) {
+      found.push({ surface, path: declared, reason: 'missing' })
+    }
+  }
+  for (const surface of optionalKeys) {
+    const declared = surfaces[surface] as string | undefined
+    // An omitted optional surface is fine; a declared one must exist.
+    if (declared !== undefined && !isOnDisk(declared)) {
+      found.push({ surface, path: declared, reason: 'missing' })
+    }
+  }
+  return found
+}
+
+/**
+ * Render validation issues as a multi-line, operator-facing message:
+ * a header naming `repoRoot`, one bullet per issue, then fixed guidance.
+ * Returns the empty string when there are no issues.
+ */
+export function renderSurfaceIssues(
+  issues: ReadonlyArray,
+  repoRoot: string,
+): string {
+  if (issues.length === 0) return ''
+  const report: string[] = [`Agent surface validation failed against repoRoot=${repoRoot}:`]
+  for (const issue of issues) {
+    const shownPath = issue.path ? `"${issue.path}"` : ''
+    report.push(` - ${issue.surface}: ${shownPath} (${issue.reason})`)
+  }
+  report.push(
+    '',
+    'Fix the manifest: every required surface must point at an existing',
+    'directory (systemPrompt / tools / personas / knowledge) or file',
+    '(rubric). Optional surfaces (scaffolding / memory / rag / outputSchema)',
+    'may be omitted; the loop will reject findings targeting omitted',
+    'surfaces rather than fabricating a path.',
+  )
+  return report.join('\n')
+}
diff --git a/tests/agent.test.ts b/tests/agent.test.ts
new file mode 100644
index 0000000..7a4fe79
--- /dev/null
+++ b/tests/agent.test.ts
@@ -0,0 +1,584 @@
+import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+import { AgentManifestError, defineAgent } from '../src/agent/define-agent'
+import {
+  createSurfaceImprovementAdapter,
+  type DraftPatchInput,
+  type DraftPatchOutput,
+} from '../src/agent/improvement-adapter'
+import { measureOutcome } from '../src/agent/outcome'
+import { resolveSubjectPath, validateSurfaces } from '../src/agent/surfaces'
+
+// ── helpers ─────────────────────────────────────────────────────────
+
+function makeAgentTree(root: string): void {
+  mkdirSync(join(root, 'prompts'), { recursive: true })
+  writeFileSync(join(root, 'prompts/intake.md'), '# intake\n\nOriginal intake section.\n')
+  mkdirSync(join(root, 'tools/list_invoices'), { recursive: true })
+  writeFileSync(join(root, 'tools/list_invoices/README.md'), '# list_invoices\n')
+  mkdirSync(join(root, 'personas'), { recursive: true })
+  writeFileSync(join(root, 'personas/w2-single.yaml'), 'id: w2-single\n')
+  mkdirSync(join(root, '.agent-knowledge'), { recursive: true })
+  writeFileSync(join(root, 'rubric.ts'), 'export const rubric = {}\n')
+}
+
+function f(
+  id: string,
+  subject: string | undefined,
+  partial: Partial = {},
+): 
import('@tangle-network/agent-eval').AnalystFinding { + return { + schema_version: '1.0.0', + finding_id: id, + analyst_id: 'improvement', + produced_at: '2026-05-20T00:00:00Z', + area: 'improvement', + severity: 'high', + claim: `${id} claim`, + confidence: 0.9, + evidence_refs: [], + subject, + ...partial, + } +} + +let tmpRoot: string + +beforeEach(() => { + tmpRoot = mkdtempSync(join(tmpdir(), 'agent-runtime-substrate-')) + makeAgentTree(tmpRoot) +}) + +afterEach(() => { + rmSync(tmpRoot, { recursive: true, force: true }) +}) + +// ── defineAgent ───────────────────────────────────────────────────── + +describe('defineAgent', () => { + it('returns the manifest when every required surface resolves', () => { + const m = defineAgent({ + id: 'test-agent', + repoRoot: tmpRoot, + surfaces: { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + }, + rubric: { + dimensions: [ + { id: 'd1', weight: 0.5, score: () => 1 }, + { id: 'd2', weight: 0.5, score: () => 1 }, + ], + }, + runtime: { act: async () => ({}) }, + personas: async () => [], + analystKinds: [], + analyst: { model: 'claude-haiku-4-5' }, + }) + expect(m.id).toBe('test-agent') + }) + + it('throws AgentManifestError on missing required surface', () => { + expect(() => + defineAgent({ + id: 'broken', + repoRoot: tmpRoot, + surfaces: { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'does-not-exist.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + }, + rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] }, + runtime: { act: async () => ({}) }, + personas: async () => [], + analystKinds: [], + analyst: { model: 'claude-haiku-4-5' }, + }), + ).toThrow(AgentManifestError) + }) + + it('throws when rubric weights sum to a clearly miscalibrated total', () => { + expect(() => + defineAgent({ + id: 'mis-weighted', + repoRoot: tmpRoot, + surfaces: { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 
'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + }, + rubric: { + dimensions: [ + { id: 'd1', weight: 5, score: () => 1 }, + { id: 'd2', weight: 5, score: () => 1 }, + ], + }, + runtime: { act: async () => ({}) }, + personas: async () => [], + analystKinds: [], + analyst: { model: 'claude-haiku-4-5' }, + }), + ).toThrow(/sum to 10\.000/) + }) + + it('does NOT validate optional surfaces that are omitted', () => { + const m = defineAgent({ + id: 'no-optionals', + repoRoot: tmpRoot, + surfaces: { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + // No scaffolding / memory / rag / outputSchema — should not throw. + }, + rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] }, + runtime: { act: async () => ({}) }, + personas: async () => [], + analystKinds: [], + analyst: { model: 'claude-haiku-4-5' }, + }) + expect(m.surfaces.scaffolding).toBeUndefined() + }) +}) + +// ── resolveSubjectPath ────────────────────────────────────────────── + +describe('resolveSubjectPath', () => { + const surfaces = { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + rag: 'rag', + } + + it('routes system-prompt subject to /
.md', () => { + const r = resolveSubjectPath( + { kind: 'system-prompt', section: 'intake' }, + surfaces, + tmpRoot, + ) + expect(r?.repoRelativePath).toBe('prompts/intake.md') + expect(r?.exists).toBe(true) + expect(r?.intent).toBe('edit-existing') + }) + + it('routes system-prompt to create-new when the file does not exist', () => { + const r = resolveSubjectPath( + { kind: 'system-prompt', section: 'new-section' }, + surfaces, + tmpRoot, + ) + expect(r?.intent).toBe('create-new') + expect(r?.exists).toBe(false) + }) + + it('routes tool-doc with aspect to //.md', () => { + const r = resolveSubjectPath( + { kind: 'tool-doc', tool: 'list_invoices', aspect: 'examples' }, + surfaces, + tmpRoot, + ) + expect(r?.repoRelativePath).toBe('tools/list_invoices/examples.md') + }) + + it('returns null when subject targets an undeclared optional surface', () => { + const noRag = { ...surfaces, rag: undefined } + const r = resolveSubjectPath( + { kind: 'rag', corpus: 'irs', docId: 'foo' }, + noRag, + tmpRoot, + ) + expect(r).toBeNull() + }) + + it('returns null for cluster subjects (failure-mode evidence, not mutations)', () => { + const r = resolveSubjectPath({ kind: 'cluster', label: 'tool-call-loop' }, surfaces, tmpRoot) + expect(r).toBeNull() + }) + + it('returns null for websearch.outdated / prior-run-summary (stale signals, no direct file)', () => { + expect( + resolveSubjectPath({ kind: 'websearch.outdated', topic: 't' }, surfaces, tmpRoot), + ).toBeNull() + expect( + resolveSubjectPath({ kind: 'prior-run-summary', topic: 't' }, surfaces, tmpRoot), + ).toBeNull() + }) +}) + +// ── createSurfaceImprovementAdapter — proposeFromFindings ─────────── + +describe('createSurfaceImprovementAdapter — proposeFromFindings', () => { + const baseSurfaces = { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + } + + function mkDraft(): { + fn: (i: DraftPatchInput) => Promise + calls: Array + } { + const calls: 
Array = [] + return { + calls, + fn: async (input) => { + calls.push(input) + return { + patch: `--- a/${input.target.repoRelativePath}\n+++ b/${input.target.repoRelativePath}\n@@ +1,1 @@\n+drafted\n`, + summary: `edit ${input.target.repoRelativePath}`, + rationale: 'because', + } + }, + } + } + + it('proposes an edit when subject + surface resolve cleanly', async () => { + const { fn, calls } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits, errors, skipped } = await adapter.proposeFromFindings([ + f('f1', 'system-prompt:intake'), + ]) + expect(edits).toHaveLength(1) + expect(errors).toEqual([]) + expect(skipped).toBe(0) + expect(calls).toHaveLength(1) + expect(calls[0]!.currentContent).toMatch(/Original intake section/) + }) + + it('records an error when subject does not parse', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits, errors } = await adapter.proposeFromFindings([f('bad', 'fix the prompt')]) + expect(edits).toEqual([]) + expect(errors).toHaveLength(1) + expect(errors[0]!.message).toMatch(/grammar/) + }) + + it('skips findings without a subject (descriptive findings)', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits, errors, skipped } = await adapter.proposeFromFindings([f('none', undefined)]) + expect(edits).toEqual([]) + expect(errors).toEqual([]) + expect(skipped).toBe(1) + }) + + it('skips cluster findings (failure-mode evidence)', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits, skipped } = await adapter.proposeFromFindings([f('c', 'tool-call-loop')]) + 
expect(edits).toEqual([]) + expect(skipped).toBe(1) + }) + + it('skips agent-knowledge:* subjects (they route to the KnowledgeAdapter)', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits, skipped } = await adapter.proposeFromFindings([ + f('k', 'agent-knowledge:wiki:invoice-shape'), + ]) + expect(edits).toEqual([]) + expect(skipped).toBe(1) + }) + + it('records an error when subject targets an undeclared surface', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, // no `rag` declared + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits, errors } = await adapter.proposeFromFindings([ + f('r', 'rag:irs-rulings:rev-rul-2024-12'), + ]) + expect(edits).toEqual([]) + expect(errors).toHaveLength(1) + expect(errors[0]!.message).toMatch(/undeclared surface/) + }) + + it('records an error when target does not exist for a non-create kind', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + allowCreateForKinds: ['knowledge.wiki'], // explicitly disallow create for system-prompt + }) + const { edits, errors } = await adapter.proposeFromFindings([ + f('miss', 'system-prompt:nonexistent-section'), + ]) + expect(edits).toEqual([]) + expect(errors).toHaveLength(1) + expect(errors[0]!.message).toMatch(/does not exist/) + }) + + it('passes baseSha256 of current content so apply can race-check', async () => { + const { fn } = mkDraft() + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: fn, + }) + const { edits } = await adapter.proposeFromFindings([f('f1', 'system-prompt:intake')]) + expect(edits[0]!.baseSha256).toMatch(/^[0-9a-f]{64}$/) + }) + + it('records an error when draftPatch throws (no silent skip)', async 
() => { + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: async () => { + throw new Error('boom') + }, + }) + const { edits, errors } = await adapter.proposeFromFindings([f('e', 'system-prompt:intake')]) + expect(edits).toEqual([]) + expect(errors[0]!.message).toMatch(/draftPatch threw: boom/) + }) + + it('skips when draftPatch returns an empty patch', async () => { + const adapter = createSurfaceImprovementAdapter({ + surfaces: baseSurfaces, + repoRoot: tmpRoot, + draftPatch: async () => ({ patch: '', summary: 'no-op', rationale: '' }), + }) + const { edits, skipped } = await adapter.proposeFromFindings([ + f('np', 'system-prompt:intake'), + ]) + expect(edits).toEqual([]) + expect(skipped).toBe(1) + }) +}) + +// ── createSurfaceImprovementAdapter — apply (mode=none) ───────────── + +describe('createSurfaceImprovementAdapter — apply', () => { + const surfaces = { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + } + + it('mode=none returns a warning and applies nothing', async () => { + const adapter = createSurfaceImprovementAdapter({ + surfaces, + repoRoot: tmpRoot, + draftPatch: async () => ({ patch: 'x', summary: 'y', rationale: 'z' }), + mode: 'none', + }) + const r = await adapter.apply!([]) + expect(r.applied).toEqual([]) + expect(r.warnings.join(' ')).toMatch(/mode=none/) + }) + + it('mode=open-pr without ghRepo fails loud (no silent fallback)', async () => { + const adapter = createSurfaceImprovementAdapter({ + surfaces, + repoRoot: tmpRoot, + draftPatch: async () => ({ patch: 'x', summary: 'y', rationale: 'z' }), + mode: 'open-pr', + }) + const r = await adapter.apply!([]) + expect(r.applied).toEqual([]) + expect(r.warnings.join(' ')).toMatch(/requires `ghRepo`/) + }) +}) + +// ── outcome.measureOutcome ────────────────────────────────────────── + +describe('measureOutcome', () => { + it('returns a zero-delta outcome 
when nothing was applied', async () => { + const reRun = vi.fn(async () => [{ personaId: 'p1', composite: 0.9 }]) + const enriched = await measureOutcome( + { + runId: 'r', + baselineRunId: null, + analystResult: { + run_id: 'r', + correlation_id: 'c', + started_at: '', + ended_at: '', + findings: [], + per_analyst: [], + total_cost_usd: 0, + }, + diff: null, + knowledge: null, + improvement: null, + }, + { + baseline: [{ personaId: 'p1', composite: 0.7 }], + reRunCohort: reRun, + }, + ) + expect(enriched.outcome.delta).toBe(0) + expect(reRun).not.toHaveBeenCalled() + }) + + it('re-runs the cohort and computes the score delta when applies occurred', async () => { + const reRun = vi.fn(async () => [ + { personaId: 'p1', composite: 0.85 }, + { personaId: 'p2', composite: 0.95 }, + ]) + const enriched = await measureOutcome( + { + runId: 'r', + baselineRunId: null, + analystResult: { + run_id: 'r', + correlation_id: 'c', + started_at: '', + ended_at: '', + findings: [], + per_analyst: [], + total_cost_usd: 0, + }, + diff: null, + knowledge: { + proposals: [], + applied: ['knowledge/foo.md'], + skipped: 0, + errors: [], + withheld_for_review: 0, + }, + improvement: null, + }, + { + baseline: [ + { personaId: 'p1', composite: 0.7 }, + { personaId: 'p2', composite: 0.8 }, + ], + reRunCohort: reRun, + }, + ) + expect(reRun).toHaveBeenCalledOnce() + expect(enriched.outcome.baselineComposite).toBeCloseTo(0.75) + expect(enriched.outcome.afterComposite).toBeCloseTo(0.9) + expect(enriched.outcome.delta).toBeCloseTo(0.15) + expect(enriched.outcome.perPersona).toHaveLength(2) + expect(enriched.outcome.rolledBackPaths).toEqual([]) + }) + + it('rolls back applied paths on regression when rollbackOnRegression is set', async () => { + const reRun = async () => [{ personaId: 'p1', composite: 0.5 }] + const revert = vi.fn(async () => {}) + const enriched = await measureOutcome( + { + runId: 'r', + baselineRunId: null, + analystResult: { + run_id: 'r', + correlation_id: 'c', + 
started_at: '', + ended_at: '', + findings: [], + per_analyst: [], + total_cost_usd: 0, + }, + diff: null, + knowledge: { + proposals: [], + applied: ['knowledge/foo.md'], + skipped: 0, + errors: [], + withheld_for_review: 0, + }, + improvement: null, + }, + { + baseline: [{ personaId: 'p1', composite: 0.8 }], + reRunCohort: reRun, + rollbackOnRegression: true, + revert, + }, + ) + expect(enriched.outcome.delta).toBeLessThan(0) + expect(revert).toHaveBeenCalledWith(['knowledge/foo.md']) + expect(enriched.outcome.rolledBackPaths).toEqual(['knowledge/foo.md']) + }) +}) + +// ── validateSurfaces direct ───────────────────────────────────────── + +describe('validateSurfaces', () => { + it('flags every missing required surface (not first-fail)', () => { + const issues = validateSurfaces( + { + systemPrompt: 'nope', + tools: 'nada', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + }, + tmpRoot, + ) + expect(issues.map((i) => i.surface).sort()).toEqual(['systemPrompt', 'tools']) + }) + + it('flags an optional surface only when explicitly declared but missing', () => { + const ok = validateSurfaces( + { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + // rag undefined → not flagged + }, + tmpRoot, + ) + expect(ok).toEqual([]) + + const flagged = validateSurfaces( + { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + rag: 'rag', // explicitly declared but absent + }, + tmpRoot, + ) + expect(flagged).toHaveLength(1) + expect(flagged[0]!.surface).toBe('rag') + }) +}) diff --git a/tsup.config.ts b/tsup.config.ts index f6632bc..1453476 100644 --- a/tsup.config.ts +++ b/tsup.config.ts @@ -5,6 +5,7 @@ export default defineConfig({ index: 'src/index.ts', platform: 'src/platform/index.ts', 'analyst-loop': 'src/analyst-loop/index.ts', + agent: 'src/agent/index.ts', }, format: ['esm'], dts: true,