From 599db2042fc0f4babb5f331021de2d46fa319386 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 20 May 2026 13:45:03 +0300 Subject: [PATCH] feat(0.11.1): streaming-first runtime + skill-dir layout probing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two follow-ups missed by the #16 merge window — both required by the per-agent manifest PRs (tax #69, legal #70, gtm #122, creative #98) to edit existing system-prompt skill directories rather than fabricating flat
<section>.md siblings. Multi-candidate path probing (surfaces.ts): resolveSubjectPath now probes candidates in order rather than picking a single path. system-prompt:<section> resolves to:
<systemPrompt>/<section>.md (canonical create-new)
<systemPrompt>/<section>/SKILL.md (tax/legal/gtm/creative)
<systemPrompt>/<section>
/index.md (flat-md repos) tool-doc: probes /README.md then .md. First hit wins for edit-existing; first candidate is the canonical create-new target. Streaming-first AgentRuntime.act (define-agent.ts): Returns AgentRunInvocation { events, output } instead of Promise. Preserves the chat-centric product's streaming surface through the substrate: - events: AsyncIterable — chat UX consumes verbatim (SSE / WebSocket / inline render). runChatTurn plugs in directly. - output: Promise — resolves after stream drains; eval substrate awaits this; chat UX ignores (already rendered). Helpers: unimplementedAgentRun(reason?) — stub manifests use until their eval path is fully wired; yields no events, rejects output loudly. collectAgentRun(invocation) — eval-path drain helper; chat UX MUST NOT call this (defeats streaming). 133/133 tests pass (6 new: streaming contract + multi-candidate probing). Bumps to 0.11.1. --- package.json | 2 +- src/agent/define-agent.ts | 74 ++++++++++++++++++++++--- src/agent/index.ts | 8 ++- src/agent/surfaces.ts | 93 +++++++++++++++++++------------ tests/agent.test.ts | 112 ++++++++++++++++++++++++++++++++++++-- 5 files changed, 241 insertions(+), 48 deletions(-) diff --git a/package.json b/package.json index f4c6637..c728f1f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-runtime", - "version": "0.11.0", + "version": "0.11.1", "description": "Reusable runtime lifecycle for domain-specific agents.", "homepage": "https://github.com/tangle-network/agent-runtime#readme", "repository": { diff --git a/src/agent/define-agent.ts b/src/agent/define-agent.ts index 8f5f280..e99837a 100644 --- a/src/agent/define-agent.ts +++ b/src/agent/define-agent.ts @@ -16,6 +16,7 @@ */ import type { TraceAnalystKindSpec } from '@tangle-network/agent-eval' +import type { RuntimeStreamEvent } from '../types' import { type AgentSurfaces, renderSurfaceIssues, validateSurfaces } from './surfaces' // ── manifest 
───────────────────────────────────────────────────────── @@ -157,15 +158,74 @@ export interface JudgeConfig { export interface AgentRuntime { /** - * Invoke the agent against one persona. Returns the structured run - * output the rubric will score. + * Invoke the agent against one persona. Returns BOTH: + * - `events`: an `AsyncIterable` the chat-centric + * product consumes verbatim (SSE / WebSocket / inline render). + * **Streaming is mandatory — never collapse this to a single Promise.** + * The agent's existing `runChatTurn` (or equivalent async generator) + * plugs in here directly. + * - `output`: a `Promise` resolved AFTER the event stream + * drains. The eval substrate awaits this for rubric scoring; chat + * products usually ignore it (they already rendered incrementally). * - * `ctx.emitter` is the substrate-threaded `TraceEmitter` — agents - * SHOULD record their LLM calls / tool calls through it for capture - * integrity. `ctx.deadlineMs` is wall-clock; the runtime SHOULD - * honour it for graceful cancel. + * Implementation contract: + * 1. `act` MUST return immediately (synchronous construction of the + * `events` iterator + the `output` promise). + * 2. Iterating `events` drives the underlying LLM/tool calls — the + * caller chooses when to consume. + * 3. `output` resolves only after the iterator yields its terminal + * event (typically `task_end`); see `collectAgentRun` helper. + * + * `ctx.emitter` is the substrate-threaded `TraceEmitter` — runtimes + * SHOULD record LLM/tool spans through it for capture integrity. + * `ctx.deadlineMs` is wall-clock; the runtime SHOULD honour for graceful + * cancel. `ctx.signal` is the standard abort signal. */ - act: (persona: TPersona, ctx: AgentRunContext) => Promise + act: (persona: TPersona, ctx: AgentRunContext) => AgentRunInvocation +} + +export interface AgentRunInvocation { + /** Live stream of typed runtime events. Consumed by chat UX directly. 
*/ + events: AsyncIterable + /** Final structured output the rubric scores. Resolves after `events` drains. */ + output: Promise +} + +/** + * Stub for agents whose `runtime.act` is not yet wired to the substrate's + * eval path. Preserves the streaming contract (empty event stream + a + * rejected `output` promise that tells the caller exactly what to fix). + * + * Per-vertical manifests usually start with this stub and replace it with + * the agent's real streaming runtime (`runChatTurn` or equivalent) once + * the eval path consumes the manifest end-to-end. + */ +export function unimplementedAgentRun( + reason = 'AgentRuntime.act is not yet wired for this manifest', +): AgentRunInvocation { + return { + events: (async function* empty(): AsyncIterable {})(), + output: Promise.reject(new Error(reason)), + } +} + +/** + * Drain `act`'s `events` into an array AND await its `output`. Useful for + * eval / outcome-measurement code paths that don't care about live + * rendering. The events array is preserved so the substrate can inspect + * tool calls / readiness / questions retrospectively. + * + * IMPORTANT: chat-centric UX MUST NOT call this — it defeats streaming + * (no incremental render). Use `for await (const ev of invocation.events)` + * directly in the chat surface. 
+ */ +export async function collectAgentRun( + invocation: AgentRunInvocation, +): Promise<{ events: ReadonlyArray; output: TRunOutput }> { + const events: RuntimeStreamEvent[] = [] + for await (const ev of invocation.events) events.push(ev) + const output = await invocation.output + return { events, output } } export interface AgentRunContext { diff --git a/src/agent/index.ts b/src/agent/index.ts index 3292dec..96fd519 100644 --- a/src/agent/index.ts +++ b/src/agent/index.ts @@ -12,13 +12,19 @@ export type { AgentManifest, AgentRubric, AgentRunContext, + AgentRunInvocation, AgentRuntime, AnalystConfig, AutoApplyPolicy, JudgeConfig, RubricDimension, } from './define-agent' -export { AgentManifestError, defineAgent } from './define-agent' +export { + AgentManifestError, + collectAgentRun, + defineAgent, + unimplementedAgentRun, +} from './define-agent' export type { CreateSurfaceImprovementAdapterOpts, DraftPatchInput, diff --git a/src/agent/surfaces.ts b/src/agent/surfaces.ts index 57cdb4f..38c10bc 100644 --- a/src/agent/surfaces.ts +++ b/src/agent/surfaces.ts @@ -88,60 +88,85 @@ export function resolveSubjectPath( surfaces: AgentSurfaces, repoRoot: string, ): ResolvedSurface | null { - const rel = relativePathForSubject(subject, surfaces) - if (rel === null) return null - const abs = isAbsolute(rel) ? rel : join(repoRoot, rel) - const exists = existsSync(abs) + const candidates = candidatePathsForSubject(subject, surfaces) + if (candidates.length === 0) return null + + // Probe candidates in order, preferring the first one that exists on disk. + // Lets the substrate accept both the flat `
<section>.md` convention and + // the skill-dir `<section>
/SKILL.md` convention without forcing one layout. + // When none exists, fall back to the first candidate (canonical create-new). + for (const rel of candidates) { + const abs = isAbsolute(rel) ? rel : join(repoRoot, rel) + if (existsSync(abs)) { + return { absolutePath: abs, repoRelativePath: rel, exists: true, intent: 'edit-existing' } + } + } + const fallback = candidates[0]! + const fallbackAbs = isAbsolute(fallback) ? fallback : join(repoRoot, fallback) return { - absolutePath: abs, - repoRelativePath: rel, - exists, - intent: exists ? 'edit-existing' : 'create-new', + absolutePath: fallbackAbs, + repoRelativePath: fallback, + exists: false, + intent: 'create-new', } } -function relativePathForSubject(subject: FindingSubject, surfaces: AgentSurfaces): string | null { +function candidatePathsForSubject( + subject: FindingSubject, + surfaces: AgentSurfaces, +): ReadonlyArray { switch (subject.kind) { case 'knowledge.wiki': case 'knowledge.stale': - return join(surfaces.knowledge, `${subject.slug}.md`) + return [join(surfaces.knowledge, `${subject.slug}.md`)] case 'knowledge.claim': // Claims land in a per-topic claims directory under the knowledge root. - return join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`) + return [join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)] case 'knowledge.raw': - return join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`) - case 'system-prompt': - return join(surfaces.systemPrompt, `${slugify(subject.section)}.md`) + return [join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)] + case 'system-prompt': { + const slug = slugify(subject.section) + // Prefer flat layout for create-new (canonical); probe skill-dir layout + // in case the existing repo (tax/legal/gtm/creative) uses + // `
/SKILL.md` already. + return [ + join(surfaces.systemPrompt, `${slug}.md`), + join(surfaces.systemPrompt, slug, 'SKILL.md'), + join(surfaces.systemPrompt, slug, 'index.md'), + ] + } case 'tool-doc': - return subject.aspect - ? join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`) - : join(surfaces.tools, subject.tool, 'README.md') + if (subject.aspect) { + return [join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)] + } + // tool-doc default: `/README.md`; also probe `.md` for flat + // tool-list repos. + return [ + join(surfaces.tools, subject.tool, 'README.md'), + join(surfaces.tools, `${subject.tool}.md`), + ] case 'new-tool': - return join(surfaces.tools, subject.name, 'README.md') + return [join(surfaces.tools, subject.name, 'README.md')] case 'rag': - if (!surfaces.rag) return null - return join(surfaces.rag, subject.corpus, `${subject.docId}.md`) + if (!surfaces.rag) return [] + return [join(surfaces.rag, subject.corpus, `${subject.docId}.md`)] case 'memory': - if (!surfaces.memory) return null - return join(surfaces.memory, `${slugify(subject.key)}.json`) + if (!surfaces.memory) return [] + return [join(surfaces.memory, `${slugify(subject.key)}.json`)] case 'scaffolding': - if (!surfaces.scaffolding) return null - return join(surfaces.scaffolding, `${slugify(subject.concern)}.md`) + if (!surfaces.scaffolding) return [] + return [join(surfaces.scaffolding, `${slugify(subject.concern)}.md`)] case 'output-schema': - if (!surfaces.outputSchema) return null - // outputSchema is a single file — the field name is metadata for - // the LLM-drafted patch, not a separate path. - return surfaces.outputSchema + if (!surfaces.outputSchema) return [] + return [surfaces.outputSchema] case 'websearch.outdated': case 'prior-run-summary': - // Stale signals don't map to a single file — they're handled by - // the knowledge adapter as `agent-knowledge:stale:*` after the - // operator decides which wiki page to retract. 
The substrate - // doesn't auto-route them. - return null + // Stale signals don't map to a single file — handled by the knowledge + // adapter as `agent-knowledge:stale:*` after operator review. + return [] case 'cluster': // failure-mode cluster labels are evidence, not mutations. - return null + return [] } } diff --git a/tests/agent.test.ts b/tests/agent.test.ts index ec04383..e109942 100644 --- a/tests/agent.test.ts +++ b/tests/agent.test.ts @@ -2,7 +2,12 @@ import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'nod import { tmpdir } from 'node:os' import { join } from 'node:path' import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest' -import { AgentManifestError, defineAgent } from '../src/agent/define-agent' +import { + AgentManifestError, + collectAgentRun, + defineAgent, + unimplementedAgentRun, +} from '../src/agent/define-agent' import { createSurfaceImprovementAdapter, type DraftPatchInput, @@ -75,7 +80,7 @@ describe('defineAgent', () => { { id: 'd2', weight: 0.5, score: () => 1 }, ], }, - runtime: { act: async () => ({}) }, + runtime: { act: () => unimplementedAgentRun() }, personas: async () => [], analystKinds: [], analyst: { model: 'claude-haiku-4-5' }, @@ -96,7 +101,7 @@ describe('defineAgent', () => { personas: 'personas', }, rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] }, - runtime: { act: async () => ({}) }, + runtime: { act: () => unimplementedAgentRun() }, personas: async () => [], analystKinds: [], analyst: { model: 'claude-haiku-4-5' }, @@ -122,7 +127,7 @@ describe('defineAgent', () => { { id: 'd2', weight: 5, score: () => 1 }, ], }, - runtime: { act: async () => ({}) }, + runtime: { act: () => unimplementedAgentRun() }, personas: async () => [], analystKinds: [], analyst: { model: 'claude-haiku-4-5' }, @@ -143,7 +148,7 @@ describe('defineAgent', () => { // No scaffolding / memory / rag / outputSchema — should not throw. 
}, rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] }, - runtime: { act: async () => ({}) }, + runtime: { act: () => unimplementedAgentRun() }, personas: async () => [], analystKinds: [], analyst: { model: 'claude-haiku-4-5' }, @@ -572,3 +577,100 @@ describe('validateSurfaces', () => { expect(flagged[0]!.surface).toBe('rag') }) }) + +describe('AgentRunInvocation streaming contract', () => { + it('unimplementedAgentRun yields no events and rejects output with a clear message', async () => { + const invocation = unimplementedAgentRun<{ score: number }>() + const events: unknown[] = [] + for await (const ev of invocation.events) events.push(ev) + expect(events).toEqual([]) + await expect(invocation.output).rejects.toThrow(/not yet wired/) + }) + + it('collectAgentRun drains events AND awaits output', async () => { + const invocation = { + events: (async function* yielder() { + yield { type: 'task_start', task: { id: 't' }, timestamp: 'now' } as never + yield { type: 'task_end', task: { id: 't' }, ok: true, timestamp: 'now' } as never + })(), + output: Promise.resolve({ score: 0.9 }), + } + const result = await collectAgentRun(invocation) + expect(result.events.length).toBe(2) + expect(result.output).toEqual({ score: 0.9 }) + }) + + it('preserves chat-UX streaming — events consumed incrementally', async () => { + const yielded: string[] = [] + const invocation = { + events: (async function* tokens() { + yielded.push('start') + yield { type: 'task_start', task: { id: 't' }, timestamp: 'now' } as never + yielded.push('mid') + yield { type: 'task_end', task: { id: 't' }, ok: true, timestamp: 'now' } as never + yielded.push('end') + })(), + output: Promise.resolve({ score: 1 }), + } + for await (const _ev of invocation.events) { + /* incremental render */ + } + expect(yielded).toEqual(['start', 'mid', 'end']) + }) +}) + +describe('multi-candidate path probing', () => { + it('probes
/SKILL.md skill-dir layout when present', () => { + const surfaces = { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + } + mkdirSync(join(tmpRoot, 'prompts/expense-categorization'), { recursive: true }) + writeFileSync( + join(tmpRoot, 'prompts/expense-categorization/SKILL.md'), + '# expense-categorization\n', + ) + const r = resolveSubjectPath( + { kind: 'system-prompt', section: 'expense-categorization' }, + surfaces, + tmpRoot, + ) + expect(r?.repoRelativePath).toBe('prompts/expense-categorization/SKILL.md') + expect(r?.intent).toBe('edit-existing') + }) + + it('prefers flat
.md when both layouts exist', () => { + const surfaces = { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + } + mkdirSync(join(tmpRoot, 'prompts/both-layouts'), { recursive: true }) + writeFileSync(join(tmpRoot, 'prompts/both-layouts.md'), '# flat\n') + writeFileSync(join(tmpRoot, 'prompts/both-layouts/SKILL.md'), '# skill\n') + const r = resolveSubjectPath( + { kind: 'system-prompt', section: 'both-layouts' }, + surfaces, + tmpRoot, + ) + expect(r?.repoRelativePath).toBe('prompts/both-layouts.md') + }) + + it('falls back to flat tool-doc .md when /README.md is absent', () => { + const surfaces = { + systemPrompt: 'prompts', + tools: 'tools', + rubric: 'rubric.ts', + knowledge: '.agent-knowledge', + personas: 'personas', + } + writeFileSync(join(tmpRoot, 'tools/flat-tool.md'), '# flat-tool\n') + const r = resolveSubjectPath({ kind: 'tool-doc', tool: 'flat-tool' }, surfaces, tmpRoot) + expect(r?.repoRelativePath).toBe('tools/flat-tool.md') + }) +})