From 9bedda4d3c2d7a92d05040e54b7bfb68d0398349 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 20 May 2026 11:49:05 +0300 Subject: [PATCH] docs(skill): canonical analyst-loop wiring pattern --- .claude/skills/agent-eval/SKILL.md | 79 ++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/.claude/skills/agent-eval/SKILL.md b/.claude/skills/agent-eval/SKILL.md index ca04b1f..6c74c81 100644 --- a/.claude/skills/agent-eval/SKILL.md +++ b/.claude/skills/agent-eval/SKILL.md @@ -599,6 +599,85 @@ If you're skipping any of the four for a reason that isn't "this is a unit test, --- +## Closing the loop — analyst-loop wiring pattern + +Capture is half the system. The other half is consumption: every run must produce *durable findings* (not prose), diff against the prior run, and propose mutations to either the knowledge base or the agent's mutable surfaces (system prompt, tool docs, rubric, personas). This is what separates "we have eval logs" from "the agent gets better". + +The wiring is identical across vertical agents (tax / legal / gtm / creative). Copy this shape into each repo at `tests/eval/lib/analyst-loop.ts`; only the `ImprovementAdapter` is per-vertical. 
+ +```ts +import { + AnalystRegistry, + FindingsStore, + createTraceAnalystKind, + DEFAULT_TRACE_ANALYST_KINDS, +} from '@tangle-network/agent-eval' +import { OtlpFileTraceStore } from '@tangle-network/agent-eval/traces' +import { runAnalystLoop } from '@tangle-network/agent-runtime/analyst-loop' +import { + proposeFromFindings, + applyKnowledgeWriteBlocks, +} from '@tangle-network/agent-knowledge' + +const registry = new AnalystRegistry() +for (const spec of DEFAULT_TRACE_ANALYST_KINDS) { + registry.register(createTraceAnalystKind(spec, { ai, model })) +} + +const findingsStore = new FindingsStore(`${findingsDir}/findings.jsonl`) +const traceStore = new OtlpFileTraceStore({ path: otlpPath }) + +const result = await runAnalystLoop({ + runId, + registry, + inputs: { traceStore }, + findingsStore, + knowledgeAdapter: { + proposeFromFindings, + apply: async (proposals) => { + // write each proposal's blocks via applyKnowledgeWriteBlocks(knowledgeRoot, block.content) + }, + }, + improvementAdapter: createVerticalImprovementAdapter({ repoRoot }), + autoApply: { + knowledge: true, // wiki writes are git-reversible + knowledgeConfidenceThreshold: 0.85, + improvement: false, // prompt/tool/rubric edits stay in the report until reviewed + improvementConfidenceThreshold: 0.9, + }, +}) +``` + +The four analyst kinds (`failure-mode`, `knowledge-gap`, `knowledge-poisoning`, `improvement`) emit findings with a stable `subject` field that the adapters route on: + +| Subject prefix | Adapter | Action | +| ------------------------------------ | ------------------------ | -------------------------------------------------------------- | +| `agent-knowledge:wiki:<page>[#h]` | `KnowledgeAdapter` | create/update `.agent-knowledge/<page>.md` | +| `agent-knowledge:claim:` | `KnowledgeAdapter` | draft a claim row | +| `agent-knowledge:raw:` | `KnowledgeAdapter` | lift raw → curated | +| `agent-knowledge:stale:` | `KnowledgeAdapter` | mark superseded | +| `system-prompt:<section>` | per-vertical `ImprovementAdapter` | propose edit to the agent's system prompt | +| `tool-doc:` | per-vertical `ImprovementAdapter` | propose edit to a tool's README/description | +| `rubric:` | per-vertical `ImprovementAdapter` | propose edit to scoring weights/rules | +| `persona:` | per-vertical `ImprovementAdapter` | propose addition/correction to an eval persona | +| anything else | counted in `skipped` | not this loop's concern | + +**Directives**: + +1. **One ledger per agent, in the repo.** `.evolve/findings/findings.jsonl` is the canonical location. Cross-run diffs (`appeared` / `disappeared` / `persisted` / `changed`) compute against the previous `run_id` automatically. Don't park findings in markdown — they have to be machine-queryable for the diff to fire. + +2. **`subject` is load-bearing.** Analyst kind prompts MUST stamp the subject in the documented grammar. A finding without a recognised prefix falls into `skipped` and never produces a mutation. Audit the actor prompts when you bump a kind's version. + +3. **Auto-apply knowledge, withhold improvement.** Wiki/claim edits are content the operator can `git revert`. Prompt/tool/rubric edits change agent behaviour — operator review is the default. Flip via `autoApply.improvement = true` only for verticals where you've measured the edit producer's precision. + +4. **Forward the registry stream.** Pass `onEvent` to `runAnalystLoop` and forward `event.event` when `event.type === 'analyst'`. This is how UIs render per-analyst progress (skip/start/complete) without a second wire. + +5. **Fail loud on missing surfaces.** The `ImprovementAdapter` should throw or return an `Error` when an analyst names a file/section that doesn't exist. A silent skip lets the analyst prompt drift away from the prompt tree. + +**Reference implementation**: `tax-agent/tests/eval/lib/{analyst-loop,improvement-adapter}.ts` (PR `tangle-network/tax-agent#67`). 
Copy the orchestrator; only the per-vertical subject routing in `improvement-adapter.ts` changes. + +--- + ## Pitfalls 1. **Pin the model snapshot.** `validateRunRecord` rejects bare aliases like `claude-sonnet-4-6`. Record `claude-sonnet-4-6@2025-04-15`. Aliases re-map silently; a bare-alias row can't be re-evaluated.