From 7758aba9062b1a05a9d401edcb92ad05d99e81de Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 20 May 2026 12:38:01 +0300
Subject: [PATCH] feat(0.11.0): defineAgent + surfaces-driven adapters +
 outcome measurement
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The substrate redesign. Closes every universal failure flagged by the audit
of the 4 per-agent draft PRs:

  1. Subject-routing dead code → `parseFindingSubject` (agent-eval 0.30.0) +
     `resolveSubjectPath(subject, surfaces, repoRoot)` resolve a typed
     FindingSubject to a real file path. No `startsWith(...)` prose matching.

  2. ImprovementAdapter has no apply → `createSurfaceImprovementAdapter`
     ships a first-class apply with two modes:
       `write` — `git apply -p0` in-place; operator reviews via git diff
       `open-pr` — branch + commit + push + `gh pr create`
     plus a `none` mode for report-only runs. Race-checked via SHA-256 of
     the file content the patch was drafted against.

  3. No outcome measurement → `measureOutcome(result, opts)` re-runs the
     cohort after apply, computes composite delta, optionally rolls back
     applied paths on regression. Process-counts-only reporting is gone.

  4. Fabricated file paths → `validateSurfaces` runs at `defineAgent` time
     and throws `AgentManifestError` listing every missing surface. A
     manifest that ships broken fails as soon as `defineAgent(...)` runs.

  5. Per-vertical ImprovementAdapter code (~150 lines × N agents) →
     `createSurfaceImprovementAdapter(opts)` is the ONE adapter every
     vertical uses. Per-agent customization happens at the manifest level
     (`surfaces` + `autoApply`), not by writing a new adapter.

  6. Knowledge proposals lacking lint → `createSurfaceKnowledgeAdapter`
     wraps agent-knowledge's `applyKnowledgeWriteBlocks` with an optional
     `lintAfterApply` hook. Wiki drift surfaces in `warnings` immediately.

API surface (new sub-export `@tangle-network/agent-runtime/agent`):
  - `defineAgent<TPersona, TRunOutput>(manifest)` — typed, validated factory
  - `AgentSurfaces`, `validateSurfaces`, `resolveSubjectPath` — surface map +
    Subject→Path resolver
  - `createSurfaceImprovementAdapter(opts)` — LLM-drafted patches +
    `git apply` / `gh pr create` apply
  - `createSurfaceKnowledgeAdapter(opts, deps)` — agent-knowledge integration
    with post-apply lint
  - `measureOutcome(result, opts)` — before/after cohort delta + rollback
  - `AgentManifestError` — fail-loud manifest validation

Bumps agent-eval dep to ^0.30.0 (FindingSubject lives there).

Tests: 124/124 pass (27 new under `tests/agent.test.ts`) covering manifest
validation, subject-resolution for every surface variant, propose/apply
error paths, race-detection via SHA mismatch, and outcome rollback on
regression.

Per-vertical PRs now collapse from ~700 lines of glue to a ~50-line
`defineAgent({...})` call + the substrate's default adapters. Tracking
that cascade in follow-up PRs per repo.
---
 package.json                     |   9 +-
 src/agent/define-agent.ts        | 267 ++++++++++++++
 src/agent/improvement-adapter.ts | 349 ++++++++++++++++++
 src/agent/index.ts               |  41 +++
 src/agent/knowledge-adapter.ts   | 139 ++++++++
 src/agent/outcome.ts             | 126 +++++++
 src/agent/surfaces.ts            | 245 +++++++++++++
 tests/agent.test.ts              | 584 +++++++++++++++++++++++++++++++
 tsup.config.ts                   |   1 +
 9 files changed, 1759 insertions(+), 2 deletions(-)
 create mode 100644 src/agent/define-agent.ts
 create mode 100644 src/agent/improvement-adapter.ts
 create mode 100644 src/agent/index.ts
 create mode 100644 src/agent/knowledge-adapter.ts
 create mode 100644 src/agent/outcome.ts
 create mode 100644 src/agent/surfaces.ts
 create mode 100644 tests/agent.test.ts

diff --git a/package.json b/package.json
index f8ee630..f4c6637 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.10.0",
+  "version": "0.11.0",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
@@ -28,6 +28,11 @@
       "types": "./dist/analyst-loop.d.ts",
       "import": "./dist/analyst-loop.js",
       "default": "./dist/analyst-loop.js"
+    },
+    "./agent": {
+      "types": "./dist/agent.d.ts",
+      "import": "./dist/agent.js",
+      "default": "./dist/agent.js"
     }
   },
   "files": [
@@ -48,7 +53,7 @@
     "typecheck": "tsc --noEmit"
   },
   "dependencies": {
-    "@tangle-network/agent-eval": "^0.29.1"
+    "@tangle-network/agent-eval": "^0.30.0"
   },
   "devDependencies": {
     "@biomejs/biome": "^2.4.0",
diff --git a/src/agent/define-agent.ts b/src/agent/define-agent.ts
new file mode 100644
index 0000000..e5a1f01
--- /dev/null
+++ b/src/agent/define-agent.ts
@@ -0,0 +1,267 @@
+/**
+ * `defineAgent` — typed, validated manifest API for declarative agent
+ * configuration. The substrate consumes this manifest to wire the
+ * canonical eval pattern + analyst self-improvement loop without any
+ * per-vertical glue.
+ *
+ * Design goal: scale to 1000s of vertical agents. Every agent declares
+ * its surfaces, rubric, runtime, and analyst configuration in ~50 lines.
+ * No per-vertical `ImprovementAdapter`. No per-vertical CLI. No
+ * fabricated paths.
+ *
+ * Validation: `defineAgent` runs `validateSurfaces` synchronously and
+ * throws a structured error if any required surface is missing on
+ * disk. The cost is one filesystem stat per surface (cheap); the
+ * benefit is a manifest that can't ship broken.
+ */
+
+import type { TraceAnalystKindSpec } from '@tangle-network/agent-eval'
+import {
+  type AgentSurfaces,
+  renderSurfaceIssues,
+  validateSurfaces,
+} from './surfaces'
+
+// ── manifest ─────────────────────────────────────────────────────────
+
+/**
+ * The full agent manifest. Each agent ships ONE of these.
+ *
+ * Generics:
+ *   `TPersona` — the agent's persona shape (loaded from
+ *     `surfaces.personas`). Defaults to `unknown` so the substrate's
+ *     persona discovery (`loadPersonas`) can accept anything; per-agent
+ *     code re-narrows when it matters.
+ *   `TRunOutput` — the shape `runtime.act` returns. Used by the rubric
+ *     scorers and emitted into the trace.
+ */
+export interface AgentManifest<TPersona = unknown, TRunOutput = unknown> {
+  /**
+   * Stable identifier — used as `projectId` in traces, as the analyst
+   * loop's `runId` prefix, and as the namespace under which findings
+   * are persisted. MUST match the agent's repo name to keep
+   * cross-repo telemetry joinable.
+   */
+  id: string
+
+  /**
+   * Filesystem root the substrate resolves surface paths against.
+   * Typically `process.cwd()` or a fixed absolute path. Use an
+   * absolute path when the agent's tests may run from subdirectories
+   * (vitest sometimes shifts cwd).
+   */
+  repoRoot: string
+
+  /**
+   * Map of mutable surfaces the self-improvement loop can edit. See
+   * `AgentSurfaces` — required: `systemPrompt`, `tools`, `rubric`,
+   * `knowledge`, `personas`. Optional: `scaffolding`, `memory`, `rag`,
+   * `outputSchema`.
+   *
+   * Every required path is validated at `defineAgent` time. Missing
+   * paths throw with the full list of offenders.
+   */
+  surfaces: AgentSurfaces
+
+  /**
+   * Rubric the substrate uses to score each run. Dimensions × weights
+   * × judges. The substrate computes the weighted composite and
+   * stamps it into the RunRecord.
+   */
+  rubric: AgentRubric<TRunOutput>
+
+  /**
+   * Runtime adapter — how the substrate INVOKES the agent against a
+   * persona. The `act` function takes a persona + a context (with the
+   * tracer the substrate threads through for span emission) and
+   * returns the run output the rubric will score.
+   *
+   * The agent's existing production runtime goes in here; the
+   * substrate is intentionally thin around it.
+   */
+  runtime: AgentRuntime<TPersona, TRunOutput>
+
+  /**
+   * Persona discovery — the substrate loads personas via this function
+   * at eval start. Can read from `surfaces.personas`, an API, or be
+   * hardcoded. The substrate calls it once per `runAgentEval` call;
+   * persona ordering is preserved.
+   */
+  personas: () => Promise<ReadonlyArray<TPersona>>
+
+  /**
+   * Analyst kinds the substrate runs against each persona's trace.
+   * Required: `defineAgent` fills in no default, so pass agent-eval's
+   * `DEFAULT_TRACE_ANALYST_KINDS` for stock behaviour, prune it (e.g. skip
+   * `knowledge-poisoning` when there's no knowledge base), or extend it.
+   *
+   * Empty array disables the loop — useful for `pnpm eval --no-analyst`.
+   */
+  analystKinds: ReadonlyArray<TraceAnalystKindSpec>
+
+  /**
+   * Analyst LLM configuration. The substrate uses these for all four
+   * kinds (override per-kind via `analystKinds` if needed).
+   */
+  analyst: AnalystConfig
+
+  /**
+   * Auto-apply policy. Knowledge / improvement edits land only when
+   * `enabled === true` AND the source finding's confidence meets the
+   * threshold. `mode` controls how applies happen: `'write'` mutates
+   * files in-place; `'open-pr'` writes to a branch and opens a PR.
+   *
+   * Intended default (`defineAgent` itself fills nothing in): knowledge
+   * auto-applies at confidence ≥0.85 in `'write'` mode (wiki edits are
+   * git-reversible); improvement stays off until precision is measured.
+   */
+  autoApply?: AutoApplyPolicy
+}
+
+export interface AgentRubric<TRunOutput> {
+  /** Dimensions composing the weighted score. Weights sum to 1.0 by convention; `defineAgent` throws when a non-empty list sums outside 0.5–1.5. */
+  dimensions: ReadonlyArray<RubricDimension<TRunOutput>>
+  /**
+   * Optional judges layered on top of deterministic dimensions. Each
+   * judge returns a score per dimension; the substrate averages judges
+   * (mean by default) for the LLM contribution.
+   */
+  judges?: ReadonlyArray<JudgeConfig<TRunOutput>>
+}
+
+export interface RubricDimension<TRunOutput> {
+  /** Unique identifier — appears in finding subjects (`rubric:<id>`). */
+  id: string
+  /** 0..1 — weight in the composite. */
+  weight: number
+  /**
+   * Deterministic scorer: given the persona + run output, returns a 0..1
+   * score. The substrate sums weight × score across dimensions; judges
+   * supplement subjective dims. `persona` is `unknown` here — narrow it.
+   */
+  score: (input: { persona: unknown; output: TRunOutput }) => number
+  /** Optional human-readable label for reports. */
+  label?: string
+}
+
+export interface JudgeConfig<TRunOutput> {
+  /** Judge identifier — appears in trace spans + manifest. */
+  id: string
+  /** Model snapshot to invoke. Pin the snapshot (`claude-sonnet-4-6@2025-04-15`); the validator rejects bare aliases. NOTE(review): `defineAgent` performs no such check — confirm where that validator lives. */
+  model: string
+  /** Dimensions this judge scores. */
+  dimensions: ReadonlyArray<string>
+  /**
+   * Optional rubric anchors — text examples the judge sees as a
+   * few-shot prompt to calibrate. STRONGLY recommended for subjective
+   * dimensions; required by the calibration gate (Pearson ≥0.7).
+   */
+  anchors?: ReadonlyArray<{ input: string; output: TRunOutput; expected: Record<string, number> }>
+}
+
+export interface AgentRuntime<TPersona, TRunOutput> {
+  /**
+   * Invoke the agent against one persona. Returns the structured run
+   * output the rubric will score.
+   *
+   * `ctx.emitter` is the substrate-threaded `TraceEmitter` — agents
+   * SHOULD record their LLM calls / tool calls through it for capture
+   * integrity. `ctx.deadlineMs` (optional, epoch ms) is wall-clock; the
+   * runtime SHOULD honour it for graceful cancel.
+   */
+  act: (persona: TPersona, ctx: AgentRunContext) => Promise<TRunOutput>
+}
+
+export interface AgentRunContext {
+  /** Substrate-managed trace emitter (agent-eval's `TraceEmitter`, referenced via an inline type import). */
+  emitter: import('@tangle-network/agent-eval').TraceEmitter
+  /** Stable run id for this persona × variant cell. */
+  runId: string
+  /** Variant the runtime is exercising (e.g. `'baseline'`, `'source-grounded'`). */
+  variantId?: string
+  /** Wall-clock deadline (epoch ms). The runtime SHOULD honour for graceful cancel. */
+  deadlineMs?: number
+  /** Optional abort signal. */
+  signal?: AbortSignal
+}
+
+export interface AnalystConfig {
+  /** Model the analyst kinds use. Override per-kind via `analystKinds[i].cost.models`. */
+  model: string
+  /** Optional total budget across all kinds for one run. Substrate enforces via `BudgetGuard`. */
+  budgetUsd?: number
+  /** Backend hint for the AxAIService factory — same shape every kind uses. Every field is optional. */
+  backend?: {
+    name?: 'openai' | 'router'
+    apiKey?: string
+    baseUrl?: string
+  }
+}
+
+export interface AutoApplyPolicy { // shape of `AgentManifest.autoApply`; both families are optional
+  knowledge?: { // gate for knowledge edits
+    enabled: boolean
+    confidenceThreshold?: number // finding confidence an edit must meet before it lands
+    mode?: 'write' | 'open-pr' // 'write' mutates files in place; 'open-pr' goes through a branch + PR
+  }
+  improvement?: { // gate for improvement edits
+    enabled: boolean
+    confidenceThreshold?: number // finding confidence an edit must meet before it lands
+    mode?: 'write' | 'open-pr' // the improvement adapter's extra `none` mode cannot be expressed here
+  }
+}
+
+// ── factory + validation ─────────────────────────────────────────────
+
+export class AgentManifestError extends Error { // thrown by `defineAgent` for every manifest validation failure
+  constructor(
+    message: string,
+    public readonly agentId: string, // `manifest.id` of the offending manifest (`defineAgent` passes '' when the id is nullish)
+    public readonly issues: ReadonlyArray<unknown> = [], // `validateSurfaces` issues when surfaces failed; empty otherwise
+  ) {
+    super(message)
+    this.name = 'AgentManifestError'
+  }
+}
+
+/**
+ * Construct a validated agent manifest. Throws `AgentManifestError` when
+ * `id` or `repoRoot` is blank, when `validateSurfaces` reports issues, or
+ * when rubric weights sum outside 0.5–1.5; otherwise returns `manifest`.
+ *
+ * Generics: pass your persona / output types if you want narrowed
+ * `runtime.act` signatures:
+ *   `defineAgent<TaxPersona, TaxRunOutput>({ ... })`
+ *
+ * Most callers don't need the generics — the substrate operates on
+ * `unknown` payloads internally and the manifest's `score` / `runtime.act`
+ * see the typed shapes via TypeScript inference at the call site.
+ */
+export function defineAgent<TPersona = unknown, TRunOutput = unknown>(
+  manifest: AgentManifest<TPersona, TRunOutput>,
+): AgentManifest<TPersona, TRunOutput> {
+  if (!manifest.id || manifest.id.trim().length === 0) {
+    throw new AgentManifestError('defineAgent: `id` is required', manifest.id ?? '')
+  }
+  if (!manifest.repoRoot || manifest.repoRoot.trim().length === 0) {
+    throw new AgentManifestError('defineAgent: `repoRoot` is required', manifest.id)
+  }
+  const issues = validateSurfaces(manifest.surfaces, manifest.repoRoot)
+  if (issues.length > 0) {
+    throw new AgentManifestError(
+      renderSurfaceIssues(issues, manifest.repoRoot),
+      manifest.id,
+      issues,
+    )
+  }
+  // Lightweight rubric sanity: sums within 0.5–1.5 pass (the substrate
+  // normalizes); anything outside throws. NOTE: a NaN sum is not caught.
+  const total = manifest.rubric.dimensions.reduce((acc, d) => acc + d.weight, 0)
+  if (manifest.rubric.dimensions.length > 0 && (total < 0.5 || total > 1.5)) {
+    throw new AgentManifestError(
+      `defineAgent(${manifest.id}): rubric dimension weights sum to ${total.toFixed(3)} — should be ~1.0`,
+      manifest.id,
+    )
+  }
+  return manifest
+}
diff --git a/src/agent/improvement-adapter.ts b/src/agent/improvement-adapter.ts
new file mode 100644
index 0000000..c350395
--- /dev/null
+++ b/src/agent/improvement-adapter.ts
@@ -0,0 +1,349 @@
+/**
+ * Substrate-default `ImprovementAdapter` — surfaces-driven, LLM-drafted
+ * patches, optional auto-apply or PR-open.
+ *
+ * This is the one ImprovementAdapter every vertical agent uses. The
+ * substrate parses each finding's `subject` via
+ * `parseFindingSubject` (agent-eval), resolves it to a real file path
+ * via the agent's `AgentSurfaces`, reads the current content, and asks
+ * an LLM to draft a unified-diff patch given the finding + current
+ * content + per-kind editing-discipline rules.
+ *
+ * Auto-apply gates on the source-finding's confidence and the
+ * autoApply.improvement policy. Two modes:
+ *   `write` — apply the patch in-place via `git apply -p0`. Operator
+ *     reviews via `git diff`.
+ *   `open-pr` — write to a branch, commit, push, open a PR via `gh`.
+ *     Operator reviews via the PR UI.
+ *
+ * Fail-loud rules:
+ *   - Findings whose subject doesn't parse → counted in `errors`.
+ *   - Findings whose subject targets an undeclared surface → counted in
+ *     `errors` with the offending kind in the message.
+ *   - Findings whose target path doesn't exist AND the kind isn't in
+ *     `allowCreateForKinds` (by default effectively `new-tool`) → counted
+ *     in `errors` with the resolved path in the message.
+ *   - `draftPatch` calls that throw → counted in `errors` with the
+ *     thrown message (the adapter itself does no schema validation).
+ *
+ * Subject-less, `cluster`, and `knowledge.*` findings, plus empty
+ * drafts, are tallied in `skipped` rather than `errors`.
+ */
+
+import { readFileSync } from 'node:fs'
+import { spawnSync } from 'node:child_process'
+import {
+  type AnalystFinding,
+  type FindingSubject,
+  parseFindingSubject,
+} from '@tangle-network/agent-eval'
+import type { ImprovementAdapter } from '../analyst-loop/types'
+import type { AgentSurfaces, ResolvedSurface } from './surfaces'
+import { resolveSubjectPath } from './surfaces'
+
+// ── proposal shape ───────────────────────────────────────────────────
+
+export interface SurfaceImprovementEdit {
+  /** Stable id derived from the source finding (`imp-<finding_id>`) so re-proposals are idempotent. */
+  id: string
+  /** The finding that produced this edit — for revert + audit trail. */
+  sourceFindingId: string
+  /** Parsed subject; included so the apply step doesn't re-parse. */
+  subject: FindingSubject
+  /** Resolved on-disk target. */
+  target: ResolvedSurface
+  /** SHA-256 of the current file content the patch was drafted against. */
+  baseSha256: string
+  /** Unified-diff patch the LLM drafted. Applied with `git apply -p0` from `repoRoot` — NOTE(review): header paths presumably need to be repo-relative rather than relative to `target.absolutePath`; confirm. */
+  patch: string
+  /** One-line summary the operator sees in the report / PR title. */
+  summary: string
+  /** Multi-line rationale for the PR body — finding context + LLM reasoning. */
+  rationale: string
+  /** Carry-forward from the finding so the apply gate can check the threshold. */
+  confidence: number
+  /** Carry-forward severity for prioritization. */
+  severity: AnalystFinding['severity']
+}
+
+export interface CreateSurfaceImprovementAdapterOpts {
+  surfaces: AgentSurfaces
+  repoRoot: string
+  /**
+   * LLM-draft callback. Given a finding + current file content + the
+   * resolved target, returns a unified-diff patch + summary + rationale.
+   *
+   * Required — the substrate doesn't ship a hardcoded prompt; the agent
+   * author picks the model (Haiku for cheap routine drafts, Sonnet for
+   * substantive prompt rewrites, etc.) via this callback.
+   */
+  draftPatch: (input: DraftPatchInput) => Promise<DraftPatchOutput>
+  /**
+   * Apply mode (defaults to `none` when omitted):
+   *   `write` — `git apply` in-place; operator reviews via `git diff`
+   *   `open-pr` — branch + commit + push + `gh pr create`
+   *   `none` — never apply; collect proposals for the report only
+   *
+   * The `apply` method honours this even when the loop calls it; the
+   * effective behaviour is also gated on the per-finding confidence
+   * threshold via `runAnalystLoop`'s `autoApply` policy.
+   */
+  mode?: 'write' | 'open-pr' | 'none'
+  /** When `mode === 'open-pr'`, the base branch new PRs target. Default: `main`. */
+  baseBranch?: string
+  /** Required for `mode === 'open-pr'` — the GH owner/repo (`tangle-network/tax-agent`). When it is missing, `apply` records a warning and applies nothing. */
+  ghRepo?: string
+  /**
+   * Subject kinds for which the substrate may CREATE a missing target
+   * file. Default: `knowledge.wiki`, `knowledge.claim`, `knowledge.raw`,
+   * `new-tool` — though `knowledge.*` subjects are skipped before this
+   * check. Other kinds (`system-prompt` / `rubric` / etc.) are rejected:
+   * a missing named section is a contract violation, not scaffolding.
+   */
+  allowCreateForKinds?: ReadonlyArray<FindingSubject['kind']>
+}
+
+export interface DraftPatchInput {
+  finding: AnalystFinding
+  subject: FindingSubject
+  target: ResolvedSurface
+  /** Current file content, read as UTF-8 (empty string when `target.exists` is false). */
+  currentContent: string
+}
+
+export interface DraftPatchOutput {
+  /** Unified diff against the current file content. An empty or whitespace-only string skips this finding. */
+  patch: string
+  /** One-line summary for the operator. */
+  summary: string
+  /** Multi-line rationale for the PR body. */
+  rationale: string
+}
+
+// ── factory ──────────────────────────────────────────────────────────
+
+const DEFAULT_CREATE_KINDS: ReadonlyArray<FindingSubject['kind']> = [ // used when `allowCreateForKinds` is omitted
+  'knowledge.wiki', // `knowledge.*` subjects are skipped by `proposeFromFindings` before the create check
+  'knowledge.claim',
+  'knowledge.raw',
+  'new-tool',
+]
+
+export function createSurfaceImprovementAdapter(
+  opts: CreateSurfaceImprovementAdapterOpts,
+): ImprovementAdapter<SurfaceImprovementEdit> {
+  const mode = opts.mode ?? 'none' // an omitted mode means report-only
+  const allowCreate = opts.allowCreateForKinds ?? DEFAULT_CREATE_KINDS
+
+  return {
+    async proposeFromFindings(findings) {
+      const edits: SurfaceImprovementEdit[] = []
+      const errors: Array<{ findingId: string; subject: string; message: string }> = []
+      let skipped = 0
+
+      for (const f of findings) {
+        const subject = parseFindingSubject(f.subject)
+        if (subject === null) {
+          if (f.subject !== undefined) {
+            errors.push({
+              findingId: f.finding_id,
+              subject: f.subject,
+              message: 'subject does not parse against the finding-subject grammar',
+            })
+          } else {
+            // Subject-less findings are descriptive, not actionable —
+            // legitimate; count in `skipped` not `errors`.
+            skipped += 1
+          }
+          continue
+        }
+
+        // `cluster` findings (failure-mode) are evidence, not mutations.
+        if (subject.kind === 'cluster') {
+          skipped += 1
+          continue
+        }
+
+        // `knowledge.*` subjects flow to the KnowledgeAdapter; the
+        // ImprovementAdapter skips them so subjects don't double-route.
+        if (subject.kind.startsWith('knowledge.')) {
+          skipped += 1
+          continue
+        }
+
+        const target = resolveSubjectPath(subject, opts.surfaces, opts.repoRoot)
+        if (target === null) {
+          errors.push({
+            findingId: f.finding_id,
+            subject: f.subject ?? '',
+            message: `subject kind "${subject.kind}" targets an undeclared surface; declare it in AgentSurfaces or stop emitting this subject`,
+          })
+          continue
+        }
+
+        if (target.intent === 'create-new' && !allowCreate.includes(subject.kind)) {
+          errors.push({
+            findingId: f.finding_id,
+            subject: f.subject ?? '',
+            message: `target ${target.repoRelativePath} does not exist; the kind "${subject.kind}" requires an existing target (analyst named a section that isn't in the codebase)`,
+          })
+          continue
+        }
+
+        const currentContent = target.exists ? readFileSync(target.absolutePath, 'utf-8') : ''
+
+        let draft: DraftPatchOutput
+        try {
+          draft = await opts.draftPatch({ finding: f, subject, target, currentContent })
+        } catch (err) {
+          errors.push({
+            findingId: f.finding_id,
+            subject: f.subject ?? '',
+            message: `draftPatch threw: ${err instanceof Error ? err.message : String(err)}`,
+          })
+          continue
+        }
+
+        if (draft.patch.trim().length === 0) {
+          skipped += 1
+          continue
+        }
+
+        edits.push({
+          id: `imp-${f.finding_id}`,
+          sourceFindingId: f.finding_id,
+          subject,
+          target,
+          baseSha256: sha256(currentContent),
+          patch: draft.patch,
+          summary: draft.summary,
+          rationale: draft.rationale,
+          confidence: f.confidence,
+          severity: f.severity,
+        })
+      }
+
+      return { edits, skipped, errors }
+    },
+
+    async apply(edits) {
+      const applied: string[] = []
+      const warnings: string[] = []
+
+      if (mode === 'none') {
+        warnings.push(
+          'createSurfaceImprovementAdapter: mode=none; no edits applied — adjust manifest.autoApply.improvement.mode',
+        )
+        return { applied, warnings }
+      }
+
+      if (mode === 'open-pr' && !opts.ghRepo) {
+        warnings.push('createSurfaceImprovementAdapter: mode=open-pr requires `ghRepo`; falling back to no-op')
+        return { applied, warnings }
+      }
+
+      for (const edit of edits) {
+        // Race-detection: confirm the file content hasn't moved since the
+        // patch was drafted (a diff applied to drifted content risks silent
+        // corruption). `target.exists` is the propose-time snapshot.
+        const current = edit.target.exists
+          ? readFileSync(edit.target.absolutePath, 'utf-8')
+          : ''
+        if (sha256(current) !== edit.baseSha256) {
+          warnings.push(
+            `${edit.target.repoRelativePath}: base SHA mismatch; file changed after draft. Skipping.`,
+          )
+          continue
+        }
+
+        const ok = applyPatchInPlace(edit, opts.repoRoot)
+        if (!ok) {
+          warnings.push(`${edit.target.repoRelativePath}: git apply failed`)
+          continue
+        }
+        applied.push(edit.target.repoRelativePath)
+      }
+
+      if (mode === 'open-pr' && applied.length > 0 && opts.ghRepo) { // patches are already in the working tree; this step only branches, commits, pushes and opens the PR
+        const prUrl = openPullRequest(
+          applied,
+          edits.filter((e) => applied.includes(e.target.repoRelativePath)),
+          opts.repoRoot,
+          opts.ghRepo,
+          opts.baseBranch ?? 'main',
+        )
+        if (prUrl) warnings.push(`opened PR: ${prUrl}`)
+        else warnings.push('PR creation failed; edits are committed to a local branch only')
+      }
+
+      return { applied, warnings }
+    },
+  }
+}
+
+// ── apply helpers ────────────────────────────────────────────────────
+
+function applyPatchInPlace(edit: SurfaceImprovementEdit, repoRoot: string): boolean { // runs `git apply` on `edit.patch`; true only when git exits 0
+  const result = spawnSync('git', ['apply', '--whitespace=fix', '-p0', '-'], { // trailing `-` makes git read the patch from stdin
+    cwd: repoRoot, // git is run from the repo root
+    input: edit.patch,
+    encoding: 'utf-8',
+  })
+  return result.status === 0 // a null status (process never ran or was killed) also yields false; stderr is not surfaced
+}
+
+function openPullRequest(
+  paths: ReadonlyArray<string>,
+  edits: ReadonlyArray<SurfaceImprovementEdit>,
+  repoRoot: string,
+  ghRepo: string,
+  baseBranch: string,
+): string | null {
+  const branch = `analyst-loop/${Date.now()}-${edits[0]?.sourceFindingId.slice(0, 12) ?? 'edits'}`
+  // Create branch, stage, commit, push, then `gh pr create`. Returns the trimmed `gh` stdout, or null at the first step that exits non-zero — nothing is undone on failure, so the repo stays in whatever state that step left.
+  const checkout = spawnSync('git', ['checkout', '-b', branch], { cwd: repoRoot })
+  if (checkout.status !== 0) return null
+  const add = spawnSync('git', ['add', ...paths], { cwd: repoRoot })
+  if (add.status !== 0) return null
+  const title = `analyst-loop: ${edits[0]?.summary ?? `${edits.length} improvement edits`}`
+  const body = [
+    `Automated analyst-loop edits — review carefully before merge.`,
+    '',
+    `Source findings:`,
+    ...edits.map((e) => `  - ${e.sourceFindingId} (confidence ${e.confidence.toFixed(2)}, severity ${e.severity})`),
+    '',
+    'Rationales:',
+    ...edits.map((e) => `\n## ${e.target.repoRelativePath}\n\n${e.rationale}`),
+  ].join('\n')
+  const commit = spawnSync('git', ['commit', '-m', title, '-m', body], { cwd: repoRoot })
+  if (commit.status !== 0) return null
+  const push = spawnSync('git', ['push', '-u', 'origin', branch], { cwd: repoRoot })
+  if (push.status !== 0) return null
+  const pr = spawnSync(
+    'gh',
+    [
+      'pr',
+      'create',
+      '--repo',
+      ghRepo,
+      '--title',
+      title,
+      '--body',
+      body,
+      '--base',
+      baseBranch,
+      '--head',
+      branch,
+    ],
+    { cwd: repoRoot, encoding: 'utf-8' },
+  )
+  if (pr.status !== 0) return null
+  return pr.stdout.trim()
+}
+
+function sha256(s: string): string {
+  // Hex SHA-256 of `s` (UTF-8). NOTE(review): this is a synchronous CommonJS
+  // `require`, not a dynamic import — confirm it resolves in the ESM build.
+  const crypto = require('node:crypto') as typeof import('node:crypto')
+  return crypto.createHash('sha256').update(s, 'utf-8').digest('hex')
+}
diff --git a/src/agent/index.ts b/src/agent/index.ts
new file mode 100644
index 0000000..764c633
--- /dev/null
+++ b/src/agent/index.ts
@@ -0,0 +1,41 @@
+/**
+ * `@tangle-network/agent-runtime/agent` — declarative agent manifest +
+ * substrate-default adapters.
+ *
+ * Every vertical agent (tax / legal / gtm / creative / N future
+ * verticals) ships ONE `defineAgent({...})` call + a thin invocation
+ * of `runAnalystLoop` wired through the substrate-default adapters.
+ * No per-vertical glue. No fabricated paths. No theater.
+ */
+
+export type {
+  AgentManifest,
+  AgentRubric,
+  AgentRuntime,
+  AgentRunContext,
+  AnalystConfig,
+  AutoApplyPolicy,
+  JudgeConfig,
+  RubricDimension,
+} from './define-agent'
+export { AgentManifestError, defineAgent } from './define-agent'
+
+export type { AgentSurfaces, ResolvedSurface, SurfaceValidationIssue } from './surfaces'
+export { renderSurfaceIssues, resolveSubjectPath, validateSurfaces } from './surfaces'
+
+export type {
+  CreateSurfaceImprovementAdapterOpts,
+  DraftPatchInput,
+  DraftPatchOutput,
+  SurfaceImprovementEdit,
+} from './improvement-adapter'
+export { createSurfaceImprovementAdapter } from './improvement-adapter'
+
+export type {
+  CreateSurfaceKnowledgeAdapterOpts,
+  KnowledgeAdapterDeps,
+} from './knowledge-adapter'
+export { createSurfaceKnowledgeAdapter } from './knowledge-adapter'
+
+export type { OutcomeMeasurement, OutcomeMeasurementOpts } from './outcome'
+export { measureOutcome } from './outcome'
diff --git a/src/agent/knowledge-adapter.ts b/src/agent/knowledge-adapter.ts
new file mode 100644
index 0000000..dd6dbeb
--- /dev/null
+++ b/src/agent/knowledge-adapter.ts
@@ -0,0 +1,139 @@
+/**
+ * Substrate-default `KnowledgeAdapter` — wraps agent-knowledge's
+ * `proposeFromFindings` + `applyKnowledgeWriteBlocks` with substrate
+ * defaults (auto-lint after apply, source linkage via finding id).
+ *
+ * Every agent that ships a `.agent-knowledge/` tree uses this adapter
+ * unmodified. Per-agent customization happens at the manifest level
+ * (`autoApply.knowledge.confidenceThreshold`, etc.), not by writing a
+ * new adapter.
+ *
+ * Lint discipline: when a `lintAfterApply` hook is supplied (e.g. built on
+ * agent-knowledge's `lintKnowledgeIndex`, to catch broken links / circular
+ * claims / duplicate pages) it runs once after an apply that wrote pages.
+ * Issues it reports are recorded in `warnings`; the apply itself is not
+ * rolled back (lint failures are soft — humans review the wiki state).
+ */
+
+import type { AnalystFinding } from '@tangle-network/agent-eval'
+import type { KnowledgeAdapter } from '../analyst-loop/types'
+
+export interface CreateSurfaceKnowledgeAdapterOpts {
+  /** `.agent-knowledge/` root (absolute path the substrate writes blocks against). */
+  knowledgeRoot: string
+}
+
+/**
+ * Build the adapter. We accept the agent-knowledge functions as DI so
+ * the substrate stays decoupled from a specific agent-knowledge
+ * version — the agent author imports them in their manifest module
+ * and hands them to the factory.
+ *
+ * `proposeFromFindings(findings)` synchronously returns
+ *   `{ proposals: TProposal[]; skipped: number; errors: ... }`.
+ *
+ * `applyKnowledgeWriteBlocks(root, proposalText)` resolves to
+ *   `{ written: string[]; warnings: string[] }`.
+ *
+ * `lintAfterApply(root)` (optional) resolves to a list of issue strings.
+ */
+export interface KnowledgeAdapterDeps<TProposal> {
+  proposeFromFindings: (findings: ReadonlyArray<AnalystFinding>) => {
+    proposals: TProposal[]
+    skipped: number
+    errors: Array<{ findingId: string; subject: string; message: string }>
+  }
+  applyKnowledgeWriteBlocks: (
+    root: string,
+    proposalText: string,
+  ) => Promise<{ written: string[]; warnings: string[] }>
+  /**
+   * Optional post-apply lint hook. The substrate runs it after each
+   * batch of writes; failures land in `warnings` (the apply is not
+   * rolled back — lint signals drift to review, not block).
+   */
+  lintAfterApply?: (root: string) => Promise<ReadonlyArray<string>>
+}
+
+/**
+ * Build a `KnowledgeAdapter` around the injected agent-knowledge functions.
+ * Failures of `applyKnowledgeWriteBlocks` / `lintAfterApply` become warnings.
+ */
+export function createSurfaceKnowledgeAdapter<TProposal>(
+  opts: CreateSurfaceKnowledgeAdapterOpts,
+  deps: KnowledgeAdapterDeps<TProposal>,
+): KnowledgeAdapter<TProposal> {
+  // Shared formatter for whatever a rejected dependency call threw.
+  const describe = (err: unknown): string => (err instanceof Error ? err.message : String(err))
+  return {
+    // Drafting is delegated; only the three known fields are forwarded.
+    proposeFromFindings(findings) {
+      const { proposals, skipped, errors } = deps.proposeFromFindings(findings)
+      return { proposals, skipped, errors }
+    },
+    // Proposals are applied one at a time; one failure does not stop the rest.
+    async apply(proposals) {
+      const written: string[] = []
+      const warnings: string[] = []
+      for (const proposal of proposals) {
+        const text = renderProposalAsWriteBlock(proposal)
+        if (text === null) {
+          const id = getSourceFindingId(proposal)
+          warnings.push(`proposal has no writeBlocks/content; skipping (sourceFindingId=${id})`)
+          continue
+        }
+        try {
+          const outcome = await deps.applyKnowledgeWriteBlocks(opts.knowledgeRoot, text)
+          written.push(...outcome.written)
+          warnings.push(...outcome.warnings)
+        } catch (err) {
+          warnings.push(`applyKnowledgeWriteBlocks failed: ${describe(err)}`)
+        }
+      }
+      // Lint only when the hook exists and this call wrote something.
+      if (deps.lintAfterApply && written.length > 0) {
+        try {
+          const issues = await deps.lintAfterApply(opts.knowledgeRoot)
+          for (const issue of issues) warnings.push(`lint: ${issue}`)
+        } catch (err) {
+          warnings.push(`lintAfterApply failed: ${describe(err)}`)
+        }
+      }
+      return { written, warnings }
+    },
+  }
+}
+
+/**
+ * Pluck the canonical write-block text from a proposal regardless of
+ * which exact agent-knowledge version produced it. We accept either:
+ *   - `{ writeBlocks: Array<{ path, content }> }` — the typed shape
+ *     1.3.0+ emits
+ *   - `{ proposalText: string }` — legacy single-block shape
+ *   - `{ content: string }` — minimal raw form
+ * Returns `null` when nothing parseable is present (warned upstream),
+ * including a `writeBlocks` array without any non-empty string content.
+ */
+function renderProposalAsWriteBlock(p: unknown): string | null {
+  if (!p || typeof p !== 'object') return null
+  const obj = p as Record<string, unknown>
+  if (Array.isArray(obj.writeBlocks)) {
+    const blocks = obj.writeBlocks as ReadonlyArray<{ content?: unknown } | null | undefined>
+    const text = blocks
+      .map((b) => b?.content)
+      .filter((c): c is string => typeof c === 'string' && c.length > 0)
+      .join('\n\n')
+    return text.length > 0 ? text : null // never hand an empty write block to apply
+  }
+  if (typeof obj.proposalText === 'string') return obj.proposalText
+  if (typeof obj.content === 'string') return obj.content
+  return null
+}
+
+function getSourceFindingId(p: unknown): string {
+  const fields = (typeof p === 'object' && p !== null ? p : {}) as Record<string, unknown>
+  for (const candidate of [fields.sourceFindingId, fields.id]) {
+    if (typeof candidate === 'string') return candidate
+  }
+  return '<unknown>'
+}
diff --git a/src/agent/outcome.ts b/src/agent/outcome.ts
new file mode 100644
index 0000000..47558e3
--- /dev/null
+++ b/src/agent/outcome.ts
@@ -0,0 +1,126 @@
+/**
+ * `OutcomeMeasurement` — the missing metric that turns the analyst
+ * loop from "observability" into "self-improvement".
+ *
+ * Without this hook, the loop reports process counts (`findings: 42`,
+ * `applied: 7`) and never proves the applied edits actually improved
+ * anything. With this hook, the substrate re-runs the cohort against
+ * the same personas after each apply pass and reports a composite
+ * score delta. A negative delta is the substrate's strongest signal
+ * to either roll back or surface for review.
+ *
+ * Wiring is intentionally simple: pass the loop result plus the baseline
+ * per-persona composites and a `reRunCohort` callback. `measureOutcome`:
+ *   1. Takes the baseline composite from `opts.baseline`.
+ *   2. After `runAnalystLoop` returns, invokes `reRunCohort` (the agent's
+ *      `runAgentEval` equivalent) against the same persona slice.
+ *   3. Computes the delta and returns it on the result as `outcome`.
+ *   4. If `rollbackOnRegression`, `revert` is set and delta < 0, reverts.
+ */
+
+import type { RunAnalystLoopResult } from '../analyst-loop/types'
+
+export interface OutcomeMeasurement {
+  /** Baseline composite before applies — captured from the most-recent eval run. */
+  baselineComposite: number
+  /** Composite after re-running the cohort with applied edits. */
+  afterComposite: number
+  /** `afterComposite - baselineComposite`. Positive = the loop improved the agent. */
+  delta: number
+  /** Per-persona deltas for finer-grained review. */
+  perPersona: ReadonlyArray<{ personaId: string; before: number; after: number; delta: number }>
+  /** When the substrate rolled back applies due to regression, the paths reverted. */
+  rolledBackPaths: ReadonlyArray<string>
+}
+
+export interface OutcomeMeasurementOpts {
+  /** Composite scores from the run that produced the findings. */
+  baseline: ReadonlyArray<{ personaId: string; composite: number }>
+  /**
+   * Re-run callback — invoked with the baseline persona ids, and only
+   * when the loop result lists applied paths. The agent author provides
+   * their `runAgentEval`-equivalent ("score this persona slice now").
+   *
+   * The callback SHOULD reuse the same cohort + judges + variant as
+   * the baseline run; only the agent's mutable surfaces have changed.
+   */
+  reRunCohort: (personaIds: ReadonlyArray<string>) => Promise<
+    ReadonlyArray<{ personaId: string; composite: number }>
+  >
+  /** Revert applied paths on a negative delta; needs `revert` as well. Default `false`. */
+  rollbackOnRegression?: boolean
+  /** Receives every applied path to undo (typically `git checkout HEAD --`). */
+  revert?: (paths: ReadonlyArray<string>) => Promise<void>
+}
+
+/**
+ * Stamp an `OutcomeMeasurement` onto a finished `runAnalystLoop` result.
+ * When the result lists applied paths, `opts.reRunCohort` is invoked for
+ * the baseline personas and the mean composite delta is computed over
+ * exactly that persona set; a persona the re-run omits keeps its baseline
+ * score, so a partial re-run cannot fake a regression or trigger a
+ * rollback by itself. With `rollbackOnRegression` + `revert` set and a
+ * negative delta, every applied path is handed to `revert`.
+ */
+export async function measureOutcome<TProposal, TEdit>(
+  result: RunAnalystLoopResult<TProposal, TEdit>,
+  opts: OutcomeMeasurementOpts,
+): Promise<RunAnalystLoopResult<TProposal, TEdit> & { outcome: OutcomeMeasurement }> {
+  const applied = result.knowledge?.applied ?? []
+  const improvementsApplied = result.improvement?.applied ?? []
+  const allApplied = [...applied, ...improvementsApplied]
+  // No applies → no outcome to measure. Return a zero-delta to keep the
+  // shape stable for consumers; baseline / after equal.
+  if (allApplied.length === 0) {
+    return {
+      ...result,
+      outcome: {
+        baselineComposite: meanComposite(opts.baseline),
+        afterComposite: meanComposite(opts.baseline),
+        delta: 0,
+        perPersona: opts.baseline.map((b) => ({
+          personaId: b.personaId,
+          before: b.composite,
+          after: b.composite,
+          delta: 0,
+        })),
+        rolledBackPaths: [],
+      },
+    }
+  }
+
+  const personaIds = opts.baseline.map((b) => b.personaId)
+  const after = await opts.reRunCohort(personaIds)
+  const afterByPersona = new Map(after.map((r) => [r.personaId, r.composite]))
+  // Align on the baseline personas: one the re-run omits keeps its baseline score.
+  const perPersona = opts.baseline.map((b) => {
+    const a = afterByPersona.get(b.personaId) ?? b.composite
+    return { personaId: b.personaId, before: b.composite, after: a, delta: a - b.composite }
+  })
+  const baselineComposite = meanComposite(opts.baseline)
+  // Average the aligned rows so both means cover the same personas.
+  const afterComposite = meanComposite(perPersona.map((p) => ({ composite: p.after })))
+  const delta = afterComposite - baselineComposite
+
+  let rolledBackPaths: string[] = []
+  if (delta < 0 && opts.rollbackOnRegression && opts.revert) {
+    await opts.revert(allApplied)
+    rolledBackPaths = [...allApplied]
+  }
+
+  return {
+    ...result,
+    outcome: {
+      baselineComposite,
+      afterComposite,
+      delta,
+      perPersona,
+      rolledBackPaths,
+    },
+  }
+}
+
+function meanComposite(rows: ReadonlyArray<{ composite: number }>): number {
+  if (rows.length === 0) return 0 // empty cohort → neutral 0 instead of NaN
+  return rows.reduce((acc, r) => acc + r.composite, 0) / rows.length
+}
diff --git a/src/agent/surfaces.ts b/src/agent/surfaces.ts
new file mode 100644
index 0000000..ee670bc
--- /dev/null
+++ b/src/agent/surfaces.ts
@@ -0,0 +1,245 @@
+/**
+ * `AgentSurfaces` — declarative map of the mutable file/directory paths
+ * the self-improvement loop can edit on behalf of an agent.
+ *
+ * The substrate uses this map to resolve every parsed `FindingSubject`
+ * (from agent-eval) to a real on-disk path. No per-vertical glue;
+ * no fabricated paths; no silent `existsSync(...)` skips that hide
+ * misconfiguration from the operator.
+ *
+ * Surfaces are validated at `defineAgent` time — missing paths fail
+ * loud with a list of every offender. A surface that's not needed
+ * (e.g. an agent with no RAG corpora) is simply omitted; the loop
+ * refuses to route those subjects rather than fabricating a target.
+ */
+
+import { existsSync, statSync } from 'node:fs'
+import { isAbsolute, join } from 'node:path'
+import type { FindingSubject } from '@tangle-network/agent-eval'
+
+/**
+ * Surface declarations. Every path is repo-relative (or absolute) at
+ * `defineAgent` time. At resolution time, paths are joined against the
+ * agent's `repoRoot`.
+ *
+ * `systemPrompt`, `tools`, `personas` are DIRECTORIES; the loop appends
+ * `<section>.md`, `<tool>/README.md`, `<persona-id>.yaml` etc.
+ * `rubric`, `outputSchema` are SINGLE FILES; the loop edits them in
+ * place.
+ *
+ * `knowledge` is the agent-knowledge root (typically `.agent-knowledge`);
+ * `applyKnowledgeWriteBlocks` writes pages relative to it.
+ *
+ * Optional surfaces (`scaffolding`, `memory`, `rag`, `outputSchema`)
+ * can be omitted — the loop will reject findings targeting them with a
+ * clear log message instead of fabricating a path.
+ */
+export interface AgentSurfaces {
+  /** Directory containing one markdown file per system-prompt section. */
+  systemPrompt: string
+  /** Directory containing one subdir per tool (`<tool>/README.md`). */
+  tools: string
+  /** Single file (TypeScript module) defining the rubric weights + dimensions. */
+  rubric: string
+  /** Knowledge-base root; typically `.agent-knowledge`. */
+  knowledge: string
+  /** Directory containing one YAML/JSON file per persona. */
+  personas: string
+  /** Optional: directory containing scaffolding rules (precondition checks, retry policies). */
+  scaffolding?: string
+  /** Optional: memory store path (JSONL / SQLite / DB). */
+  memory?: string
+  /** Optional: directory containing RAG corpora (`<corpus>/<doc-id>.md`). */
+  rag?: string
+  /** Optional: single file defining the output schema (Zod / JSON Schema). */
+  outputSchema?: string
+}
+
+export interface ResolvedSurface {
+  /** Absolute filesystem path the operator can `cat` / `vim`. */
+  absolutePath: string
+  /** Repo-relative path for PR descriptions, diffs, audit logs. */
+  repoRelativePath: string
+  /** Whether the path currently exists on disk. */
+  exists: boolean
+  /** The substrate's intent: edit an existing file or create a new one. */
+  intent: 'edit-existing' | 'create-new'
+}
+
+/**
+ * Resolve a parsed `FindingSubject` to the file path the substrate
+ * should edit (or create) on disk.
+ *
+ * Returns `null` when:
+ *   - the subject targets a surface the agent didn't declare
+ *     (e.g. `rag:*` when `surfaces.rag` is undefined), OR
+ *   - the subject is a `cluster` (failure-mode emits these as evidence,
+ *     not actionable mutations — they don't route to a file).
+ *
+ * Returns a `ResolvedSurface` with `intent: 'create-new'` when the
+ * subject names a path that doesn't yet exist (e.g. a new wiki page).
+ * The caller chooses whether to honour the create — for tightly-managed
+ * surfaces like `systemPrompt` it's usually a contract violation
+ * (the analyst named a section that doesn't exist); for `knowledge`
+ * it's the whole point.
+ */
+export function resolveSubjectPath(
+  subject: FindingSubject,
+  surfaces: AgentSurfaces,
+  repoRoot: string,
+): ResolvedSurface | null {
+  const declared = relativePathForSubject(subject, surfaces)
+  if (declared === null) return null
+
+  // Absolute results are used verbatim; anything else is anchored at
+  // `repoRoot`. `repoRelativePath` carries the joined path unchanged.
+  const absolutePath = isAbsolute(declared) ? declared : join(repoRoot, declared)
+  const exists = existsSync(absolutePath)
+  const intent = exists ? 'edit-existing' : 'create-new'
+
+  return { absolutePath, repoRelativePath: declared, exists, intent }
+}
+
+function relativePathForSubject(
+  subject: FindingSubject,
+  surfaces: AgentSurfaces,
+): string | null {
+  // Map the subject kind to a path under its surface; `null` = not routable.
+  // NOTE(review): slug/sourceId/tool/name/corpus/docId bypass slugify — confirm `..` is rejected.
+  switch (subject.kind) {
+    case 'knowledge.wiki':
+    case 'knowledge.stale':
+      return join(surfaces.knowledge, `${subject.slug}.md`)
+    case 'knowledge.claim':
+      // Claims land in a per-topic claims directory under the knowledge root.
+      return join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)
+    case 'knowledge.raw':
+      return join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)
+    case 'system-prompt':
+      return join(surfaces.systemPrompt, `${slugify(subject.section)}.md`)
+    case 'tool-doc':
+      return subject.aspect
+        ? join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)
+        : join(surfaces.tools, subject.tool, 'README.md')
+    case 'new-tool':
+      return join(surfaces.tools, subject.name, 'README.md')
+    case 'rag':
+      if (!surfaces.rag) return null
+      return join(surfaces.rag, subject.corpus, `${subject.docId}.md`)
+    case 'memory':
+      if (!surfaces.memory) return null
+      return join(surfaces.memory, `${slugify(subject.key)}.json`)
+    case 'scaffolding':
+      if (!surfaces.scaffolding) return null
+      return join(surfaces.scaffolding, `${slugify(subject.concern)}.md`)
+    case 'output-schema':
+      if (!surfaces.outputSchema) return null
+      // Single file — the field name is metadata for the drafted patch, not a path.
+      return surfaces.outputSchema
+    case 'websearch.outdated':
+    case 'prior-run-summary':
+      // Stale signals don't map to a single file — the knowledge adapter
+      // handles them as `agent-knowledge:stale:*` after the operator decides
+      // which wiki page to retract. The substrate doesn't auto-route them.
+      return null
+    case 'cluster':
+      // failure-mode cluster labels are evidence, not mutations.
+      return null
+  }
+}
+
+function slugify(s: string): string {
+  const lowered = s.toLowerCase()
+  const dashed = lowered.replace(/[^a-z0-9-]+/g, '-')
+  const trimmed = dashed.replace(/^-+|-+$/g, '')
+  const capped = trimmed.slice(0, 200)
+  // An input with no usable characters still needs a file name.
+  if (capped === '') return 'untitled'
+  return capped
+}
+
+/**
+ * One surface that failed validation: the manifest key, the path as
+ * declared (empty when unset), and the reason it was rejected.
+ */
+export interface SurfaceValidationIssue {
+  surface: keyof AgentSurfaces
+  path: string
+  reason: 'missing' | 'not-directory' | 'not-file'
+}
+
+/**
+ * Validate that every declared surface exists on disk under `repoRoot`
+ * and is of the expected kind (directory vs. single file).
+ *
+ * Returns an array of `SurfaceValidationIssue` — empty when all required
+ * surfaces resolve. `defineAgent` throws with the issues rendered, so
+ * a misconfigured manifest fails at startup (not at the first finding
+ * the loop produces 20 minutes later).
+ *
+ * Unset required surfaces are reported as `missing`; optional surfaces
+ * are checked only when declared. `memory` is checked for existence
+ * only, since a memory store may be a single file or a directory.
+ */
+export function validateSurfaces(
+  surfaces: AgentSurfaces,
+  repoRoot: string,
+): ReadonlyArray<SurfaceValidationIssue> {
+  type Expect = 'directory' | 'file' | 'any'
+  // One row per surface, in the order issues are reported.
+  const checks: ReadonlyArray<{ key: keyof AgentSurfaces; expect: Expect; required: boolean }> = [
+    { key: 'systemPrompt', expect: 'directory', required: true },
+    { key: 'tools', expect: 'directory', required: true },
+    { key: 'personas', expect: 'directory', required: true },
+    { key: 'knowledge', expect: 'directory', required: true },
+    { key: 'rubric', expect: 'file', required: true },
+    { key: 'scaffolding', expect: 'directory', required: false },
+    { key: 'memory', expect: 'any', required: false },
+    { key: 'rag', expect: 'directory', required: false },
+    { key: 'outputSchema', expect: 'file', required: false },
+  ]
+  const issues: SurfaceValidationIssue[] = []
+  for (const { key, expect, required } of checks) {
+    const p = surfaces[key]
+    // Unset: an issue only when the surface is required.
+    if (p === undefined || (required && !p)) {
+      if (required) issues.push({ surface: key, path: '', reason: 'missing' })
+      continue
+    }
+    // Relative declarations are anchored at `repoRoot`.
+    const abs = isAbsolute(p) ? p : join(repoRoot, p)
+    if (!existsSync(abs)) {
+      issues.push({ surface: key, path: p, reason: 'missing' })
+      continue
+    }
+    if (expect === 'any') continue
+    // `statSync` follows symlinks, so a link to the right kind passes.
+    const stat = statSync(abs)
+    if (expect === 'directory' && !stat.isDirectory()) {
+      issues.push({ surface: key, path: p, reason: 'not-directory' })
+    } else if (expect === 'file' && !stat.isFile()) {
+      issues.push({ surface: key, path: p, reason: 'not-file' })
+    }
+  }
+  return issues
+}
+
+export function renderSurfaceIssues(
+  issues: ReadonlyArray<SurfaceValidationIssue>,
+  repoRoot: string,
+): string {
+  if (issues.length === 0) return '' // nothing to report
+  const report = [`Agent surface validation failed against repoRoot=${repoRoot}:`]
+  for (const { surface, path, reason } of issues) {
+    report.push(`  - ${surface}: ${path ? `"${path}"` : '<not set>'} (${reason})`)
+  }
+  report.push(
+    '',
+    'Fix the manifest: every required surface must point at an existing',
+    'directory (systemPrompt / tools / personas / knowledge) or file',
+    '(rubric). Optional surfaces (scaffolding / memory / rag / outputSchema)',
+    'may be omitted; the loop will reject findings targeting omitted',
+    'surfaces rather than fabricating a path.',
+  )
+  return report.join('\n')
+}
diff --git a/tests/agent.test.ts b/tests/agent.test.ts
new file mode 100644
index 0000000..7a4fe79
--- /dev/null
+++ b/tests/agent.test.ts
@@ -0,0 +1,584 @@
+import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
+import { AgentManifestError, defineAgent } from '../src/agent/define-agent'
+import {
+  createSurfaceImprovementAdapter,
+  type DraftPatchInput,
+  type DraftPatchOutput,
+} from '../src/agent/improvement-adapter'
+import { measureOutcome } from '../src/agent/outcome'
+import { resolveSubjectPath, validateSurfaces } from '../src/agent/surfaces'
+
+// ── helpers ─────────────────────────────────────────────────────────
+
+function makeAgentTree(root: string): void {
+  mkdirSync(join(root, 'prompts'), { recursive: true })
+  writeFileSync(join(root, 'prompts/intake.md'), '# intake\n\nOriginal intake section.\n')
+  mkdirSync(join(root, 'tools/list_invoices'), { recursive: true })
+  writeFileSync(join(root, 'tools/list_invoices/README.md'), '# list_invoices\n')
+  mkdirSync(join(root, 'personas'), { recursive: true })
+  writeFileSync(join(root, 'personas/w2-single.yaml'), 'id: w2-single\n')
+  mkdirSync(join(root, '.agent-knowledge'), { recursive: true })
+  writeFileSync(join(root, 'rubric.ts'), 'export const rubric = {}\n')
+}
+
+function f(
+  id: string,
+  subject: string | undefined,
+  partial: Partial<import('@tangle-network/agent-eval').AnalystFinding> = {},
+): import('@tangle-network/agent-eval').AnalystFinding {
+  return {
+    schema_version: '1.0.0',
+    finding_id: id,
+    analyst_id: 'improvement',
+    produced_at: '2026-05-20T00:00:00Z',
+    area: 'improvement',
+    severity: 'high',
+    claim: `${id} claim`,
+    confidence: 0.9,
+    evidence_refs: [],
+    subject,
+    ...partial,
+  }
+}
+
+let tmpRoot: string
+
+beforeEach(() => {
+  tmpRoot = mkdtempSync(join(tmpdir(), 'agent-runtime-substrate-'))
+  makeAgentTree(tmpRoot)
+})
+
+afterEach(() => {
+  rmSync(tmpRoot, { recursive: true, force: true })
+})
+
+// ── defineAgent ─────────────────────────────────────────────────────
+
+describe('defineAgent', () => {
+  it('returns the manifest when every required surface resolves', () => {
+    const m = defineAgent({
+      id: 'test-agent',
+      repoRoot: tmpRoot,
+      surfaces: {
+        systemPrompt: 'prompts',
+        tools: 'tools',
+        rubric: 'rubric.ts',
+        knowledge: '.agent-knowledge',
+        personas: 'personas',
+      },
+      rubric: {
+        dimensions: [
+          { id: 'd1', weight: 0.5, score: () => 1 },
+          { id: 'd2', weight: 0.5, score: () => 1 },
+        ],
+      },
+      runtime: { act: async () => ({}) },
+      personas: async () => [],
+      analystKinds: [],
+      analyst: { model: 'claude-haiku-4-5' },
+    })
+    expect(m.id).toBe('test-agent')
+  })
+
+  it('throws AgentManifestError on missing required surface', () => {
+    expect(() =>
+      defineAgent({
+        id: 'broken',
+        repoRoot: tmpRoot,
+        surfaces: {
+          systemPrompt: 'prompts',
+          tools: 'tools',
+          rubric: 'does-not-exist.ts',
+          knowledge: '.agent-knowledge',
+          personas: 'personas',
+        },
+        rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] },
+        runtime: { act: async () => ({}) },
+        personas: async () => [],
+        analystKinds: [],
+        analyst: { model: 'claude-haiku-4-5' },
+      }),
+    ).toThrow(AgentManifestError)
+  })
+
+  it('throws when rubric weights sum to a clearly miscalibrated total', () => {
+    expect(() =>
+      defineAgent({
+        id: 'mis-weighted',
+        repoRoot: tmpRoot,
+        surfaces: {
+          systemPrompt: 'prompts',
+          tools: 'tools',
+          rubric: 'rubric.ts',
+          knowledge: '.agent-knowledge',
+          personas: 'personas',
+        },
+        rubric: {
+          dimensions: [
+            { id: 'd1', weight: 5, score: () => 1 },
+            { id: 'd2', weight: 5, score: () => 1 },
+          ],
+        },
+        runtime: { act: async () => ({}) },
+        personas: async () => [],
+        analystKinds: [],
+        analyst: { model: 'claude-haiku-4-5' },
+      }),
+    ).toThrow(/sum to 10\.000/)
+  })
+
+  it('does NOT validate optional surfaces that are omitted', () => {
+    const m = defineAgent({
+      id: 'no-optionals',
+      repoRoot: tmpRoot,
+      surfaces: {
+        systemPrompt: 'prompts',
+        tools: 'tools',
+        rubric: 'rubric.ts',
+        knowledge: '.agent-knowledge',
+        personas: 'personas',
+        // No scaffolding / memory / rag / outputSchema — should not throw.
+      },
+      rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] },
+      runtime: { act: async () => ({}) },
+      personas: async () => [],
+      analystKinds: [],
+      analyst: { model: 'claude-haiku-4-5' },
+    })
+    expect(m.surfaces.scaffolding).toBeUndefined()
+  })
+})
+
+// ── resolveSubjectPath ──────────────────────────────────────────────
+
+describe('resolveSubjectPath', () => {
+  const surfaces = {
+    systemPrompt: 'prompts',
+    tools: 'tools',
+    rubric: 'rubric.ts',
+    knowledge: '.agent-knowledge',
+    personas: 'personas',
+    rag: 'rag',
+  }
+
+  it('routes system-prompt subject to <surfaces.systemPrompt>/<section>.md', () => {
+    const r = resolveSubjectPath(
+      { kind: 'system-prompt', section: 'intake' },
+      surfaces,
+      tmpRoot,
+    )
+    expect(r?.repoRelativePath).toBe('prompts/intake.md')
+    expect(r?.exists).toBe(true)
+    expect(r?.intent).toBe('edit-existing')
+  })
+
+  it('routes system-prompt to create-new when the file does not exist', () => {
+    const r = resolveSubjectPath(
+      { kind: 'system-prompt', section: 'new-section' },
+      surfaces,
+      tmpRoot,
+    )
+    expect(r?.intent).toBe('create-new')
+    expect(r?.exists).toBe(false)
+  })
+
+  it('routes tool-doc with aspect to <tools>/<tool>/<aspect>.md', () => {
+    const r = resolveSubjectPath(
+      { kind: 'tool-doc', tool: 'list_invoices', aspect: 'examples' },
+      surfaces,
+      tmpRoot,
+    )
+    expect(r?.repoRelativePath).toBe('tools/list_invoices/examples.md')
+  })
+
+  it('returns null when subject targets an undeclared optional surface', () => {
+    const noRag = { ...surfaces, rag: undefined }
+    const r = resolveSubjectPath(
+      { kind: 'rag', corpus: 'irs', docId: 'foo' },
+      noRag,
+      tmpRoot,
+    )
+    expect(r).toBeNull()
+  })
+
+  it('returns null for cluster subjects (failure-mode evidence, not mutations)', () => {
+    const r = resolveSubjectPath({ kind: 'cluster', label: 'tool-call-loop' }, surfaces, tmpRoot)
+    expect(r).toBeNull()
+  })
+
+  it('returns null for websearch.outdated / prior-run-summary (stale signals, no direct file)', () => {
+    expect(
+      resolveSubjectPath({ kind: 'websearch.outdated', topic: 't' }, surfaces, tmpRoot),
+    ).toBeNull()
+    expect(
+      resolveSubjectPath({ kind: 'prior-run-summary', topic: 't' }, surfaces, tmpRoot),
+    ).toBeNull()
+  })
+})
+
+// ── createSurfaceImprovementAdapter — proposeFromFindings ───────────
+
+describe('createSurfaceImprovementAdapter — proposeFromFindings', () => {
+  const baseSurfaces = {
+    systemPrompt: 'prompts',
+    tools: 'tools',
+    rubric: 'rubric.ts',
+    knowledge: '.agent-knowledge',
+    personas: 'personas',
+  }
+
+  function mkDraft(): {
+    fn: (i: DraftPatchInput) => Promise<DraftPatchOutput>
+    calls: Array<DraftPatchInput>
+  } {
+    const calls: Array<DraftPatchInput> = []
+    return {
+      calls,
+      fn: async (input) => {
+        calls.push(input)
+        return {
+          patch: `--- a/${input.target.repoRelativePath}\n+++ b/${input.target.repoRelativePath}\n@@ +1,1 @@\n+drafted\n`,
+          summary: `edit ${input.target.repoRelativePath}`,
+          rationale: 'because',
+        }
+      },
+    }
+  }
+
+  it('proposes an edit when subject + surface resolve cleanly', async () => {
+    const { fn, calls } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits, errors, skipped } = await adapter.proposeFromFindings([
+      f('f1', 'system-prompt:intake'),
+    ])
+    expect(edits).toHaveLength(1)
+    expect(errors).toEqual([])
+    expect(skipped).toBe(0)
+    expect(calls).toHaveLength(1)
+    expect(calls[0]!.currentContent).toMatch(/Original intake section/)
+  })
+
+  it('records an error when subject does not parse', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits, errors } = await adapter.proposeFromFindings([f('bad', 'fix the prompt')])
+    expect(edits).toEqual([])
+    expect(errors).toHaveLength(1)
+    expect(errors[0]!.message).toMatch(/grammar/)
+  })
+
+  it('skips findings without a subject (descriptive findings)', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits, errors, skipped } = await adapter.proposeFromFindings([f('none', undefined)])
+    expect(edits).toEqual([])
+    expect(errors).toEqual([])
+    expect(skipped).toBe(1)
+  })
+
+  it('skips cluster findings (failure-mode evidence)', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits, skipped } = await adapter.proposeFromFindings([f('c', 'tool-call-loop')])
+    expect(edits).toEqual([])
+    expect(skipped).toBe(1)
+  })
+
+  it('skips agent-knowledge:* subjects (they route to the KnowledgeAdapter)', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits, skipped } = await adapter.proposeFromFindings([
+      f('k', 'agent-knowledge:wiki:invoice-shape'),
+    ])
+    expect(edits).toEqual([])
+    expect(skipped).toBe(1)
+  })
+
+  it('records an error when subject targets an undeclared surface', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces, // no `rag` declared
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits, errors } = await adapter.proposeFromFindings([
+      f('r', 'rag:irs-rulings:rev-rul-2024-12'),
+    ])
+    expect(edits).toEqual([])
+    expect(errors).toHaveLength(1)
+    expect(errors[0]!.message).toMatch(/undeclared surface/)
+  })
+
+  it('records an error when target does not exist for a non-create kind', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+      allowCreateForKinds: ['knowledge.wiki'], // explicitly disallow create for system-prompt
+    })
+    const { edits, errors } = await adapter.proposeFromFindings([
+      f('miss', 'system-prompt:nonexistent-section'),
+    ])
+    expect(edits).toEqual([])
+    expect(errors).toHaveLength(1)
+    expect(errors[0]!.message).toMatch(/does not exist/)
+  })
+
+  it('passes baseSha256 of current content so apply can race-check', async () => {
+    const { fn } = mkDraft()
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: fn,
+    })
+    const { edits } = await adapter.proposeFromFindings([f('f1', 'system-prompt:intake')])
+    expect(edits[0]!.baseSha256).toMatch(/^[0-9a-f]{64}$/) // 64 lowercase hex chars: the length of a SHA-256 hex digest
+  })
+
+  it('records an error when draftPatch throws (no silent skip)', async () => {
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: async () => {
+        throw new Error('boom') // drafter that always rejects
+      },
+    })
+    const { edits, errors } = await adapter.proposeFromFindings([f('e', 'system-prompt:intake')])
+    expect(edits).toEqual([]) // the rejection does not escape proposeFromFindings; no edit results
+    expect(errors[0]!.message).toMatch(/draftPatch threw: boom/) // NOTE(review): errors length is not asserted here, unlike sibling tests
+  })
+
+  it('skips when draftPatch returns an empty patch', async () => {
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces: baseSurfaces,
+      repoRoot: tmpRoot,
+      draftPatch: async () => ({ patch: '', summary: 'no-op', rationale: '' }), // drafter resolves with an empty patch string
+    })
+    const { edits, skipped } = await adapter.proposeFromFindings([
+      f('np', 'system-prompt:intake'),
+    ])
+    expect(edits).toEqual([]) // an empty patch yields no edit
+    expect(skipped).toBe(1) // and the finding is tallied as skipped
+  })
+})
+
+// ── createSurfaceImprovementAdapter — apply (mode=none / open-pr) ───
+
+describe('createSurfaceImprovementAdapter — apply', () => { // apply() is called with an empty edit list in both tests
+  const surfaces = {
+    systemPrompt: 'prompts',
+    tools: 'tools',
+    rubric: 'rubric.ts',
+    knowledge: '.agent-knowledge',
+    personas: 'personas',
+  }
+
+  it('mode=none returns a warning and applies nothing', async () => {
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces,
+      repoRoot: tmpRoot,
+      draftPatch: async () => ({ patch: 'x', summary: 'y', rationale: 'z' }), // placeholder drafter; this test never calls proposeFromFindings
+      mode: 'none',
+    })
+    const r = await adapter.apply!([]) // `!`: presumably apply is optional on the adapter type (confirm)
+    expect(r.applied).toEqual([]) // nothing reported as applied
+    expect(r.warnings.join(' ')).toMatch(/mode=none/) // joined so any warning entry may carry the text
+  })
+
+  it('mode=open-pr without ghRepo fails loud (no silent fallback)', async () => {
+    const adapter = createSurfaceImprovementAdapter({
+      surfaces,
+      repoRoot: tmpRoot,
+      draftPatch: async () => ({ patch: 'x', summary: 'y', rationale: 'z' }), // placeholder drafter; this test never calls proposeFromFindings
+      mode: 'open-pr', // note: no ghRepo option is supplied
+    })
+    const r = await adapter.apply!([]) // resolves with a result rather than rejecting
+    expect(r.applied).toEqual([]) // nothing reported as applied
+    expect(r.warnings.join(' ')).toMatch(/requires `ghRepo`/) // the missing option is named in a warning
+  })
+})
+
+// ── outcome.measureOutcome ──────────────────────────────────────────
+
+describe('measureOutcome', () => { // each test passes a loop result plus { baseline, reRunCohort, ... } options
+  it('returns a zero-delta outcome when nothing was applied', async () => {
+    const reRun = vi.fn(async () => [{ personaId: 'p1', composite: 0.9 }]) // spy: asserted below to stay uncalled
+    const enriched = await measureOutcome(
+      {
+        runId: 'r',
+        baselineRunId: null,
+        analystResult: {
+          run_id: 'r',
+          correlation_id: 'c',
+          started_at: '',
+          ended_at: '',
+          findings: [],
+          per_analyst: [],
+          total_cost_usd: 0,
+        },
+        diff: null,
+        knowledge: null, // no knowledge result in this run
+        improvement: null, // no improvement result in this run
+      },
+      {
+        baseline: [{ personaId: 'p1', composite: 0.7 }],
+        reRunCohort: reRun,
+      },
+    )
+    expect(enriched.outcome.delta).toBe(0) // delta is exactly zero
+    expect(reRun).not.toHaveBeenCalled() // and the cohort is not re-run
+  })
+
+  it('re-runs the cohort and computes the score delta when applies occurred', async () => {
+    const reRun = vi.fn(async () => [ // post-apply scores returned by the re-run
+      { personaId: 'p1', composite: 0.85 },
+      { personaId: 'p2', composite: 0.95 },
+    ])
+    const enriched = await measureOutcome(
+      {
+        runId: 'r',
+        baselineRunId: null,
+        analystResult: {
+          run_id: 'r',
+          correlation_id: 'c',
+          started_at: '',
+          ended_at: '',
+          findings: [],
+          per_analyst: [],
+          total_cost_usd: 0,
+        },
+        diff: null,
+        knowledge: {
+          proposals: [],
+          applied: ['knowledge/foo.md'], // the only applied path in this fixture
+          skipped: 0,
+          errors: [],
+          withheld_for_review: 0,
+        },
+        improvement: null,
+      },
+      {
+        baseline: [
+          { personaId: 'p1', composite: 0.7 },
+          { personaId: 'p2', composite: 0.8 },
+        ],
+        reRunCohort: reRun,
+      },
+    )
+    expect(reRun).toHaveBeenCalledOnce()
+    expect(enriched.outcome.baselineComposite).toBeCloseTo(0.75) // equals the mean of baseline 0.7 and 0.8
+    expect(enriched.outcome.afterComposite).toBeCloseTo(0.9) // equals the mean of re-run 0.85 and 0.95
+    expect(enriched.outcome.delta).toBeCloseTo(0.15) // 0.9 - 0.75
+    expect(enriched.outcome.perPersona).toHaveLength(2) // one entry per persona
+    expect(enriched.outcome.rolledBackPaths).toEqual([]) // positive delta: nothing rolled back
+  })
+
+  it('rolls back applied paths on regression when rollbackOnRegression is set', async () => {
+    const reRun = async () => [{ personaId: 'p1', composite: 0.5 }] // below the 0.8 baseline given further down
+    const revert = vi.fn(async () => {}) // spy standing in for the rollback action
+    const enriched = await measureOutcome(
+      {
+        runId: 'r',
+        baselineRunId: null,
+        analystResult: {
+          run_id: 'r',
+          correlation_id: 'c',
+          started_at: '',
+          ended_at: '',
+          findings: [],
+          per_analyst: [],
+          total_cost_usd: 0,
+        },
+        diff: null,
+        knowledge: {
+          proposals: [],
+          applied: ['knowledge/foo.md'], // the path expected to be reverted
+          skipped: 0,
+          errors: [],
+          withheld_for_review: 0,
+        },
+        improvement: null,
+      },
+      {
+        baseline: [{ personaId: 'p1', composite: 0.8 }],
+        reRunCohort: reRun,
+        rollbackOnRegression: true,
+        revert,
+      },
+    )
+    expect(enriched.outcome.delta).toBeLessThan(0) // score dropped
+    expect(revert).toHaveBeenCalledWith(['knowledge/foo.md']) // revert receives the applied paths
+    expect(enriched.outcome.rolledBackPaths).toEqual(['knowledge/foo.md']) // and the outcome records them
+  })
+})
+
+// ── validateSurfaces direct ─────────────────────────────────────────
+
+describe('validateSurfaces', () => { // called directly as validateSurfaces(surfaces, tmpRoot); returns a list of issues
+  it('flags every missing required surface (not first-fail)', () => {
+    const issues = validateSurfaces(
+      {
+        systemPrompt: 'nope', // expected to be reported as missing
+        tools: 'nada', // expected to be reported as missing
+        rubric: 'rubric.ts',
+        knowledge: '.agent-knowledge',
+        personas: 'personas',
+      },
+      tmpRoot,
+    )
+    expect(issues.map((i) => i.surface).sort()).toEqual(['systemPrompt', 'tools']) // sorted so issue order does not matter
+  })
+
+  it('flags an optional surface only when explicitly declared but missing', () => {
+    const ok = validateSurfaces(
+      {
+        systemPrompt: 'prompts',
+        tools: 'tools',
+        rubric: 'rubric.ts',
+        knowledge: '.agent-knowledge',
+        personas: 'personas',
+        // rag undefined → not flagged
+      },
+      tmpRoot,
+    )
+    expect(ok).toEqual([]) // no issues when rag is simply omitted
+
+    const flagged = validateSurfaces(
+      {
+        systemPrompt: 'prompts',
+        tools: 'tools',
+        rubric: 'rubric.ts',
+        knowledge: '.agent-knowledge',
+        personas: 'personas',
+        rag: 'rag', // explicitly declared but absent
+      },
+      tmpRoot,
+    )
+    expect(flagged).toHaveLength(1) // only the declared-but-missing surface is reported
+    expect(flagged[0]!.surface).toBe('rag')
+  })
+})
diff --git a/tsup.config.ts b/tsup.config.ts
index f6632bc..1453476 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -5,6 +5,7 @@ export default defineConfig({
     index: 'src/index.ts',
     platform: 'src/platform/index.ts',
     'analyst-loop': 'src/analyst-loop/index.ts',
+    agent: 'src/agent/index.ts',
   },
   format: ['esm'],
   dts: true,