From 599db2042fc0f4babb5f331021de2d46fa319386 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 20 May 2026 13:45:03 +0300
Subject: [PATCH] feat(0.11.1): streaming-first runtime + skill-dir layout
 probing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups missed by the #16 merge window — both required by the
per-agent manifest PRs (tax #69, legal #70, gtm #122, creative #98) to
edit existing system-prompt skill directories rather than fabricating
flat <section>.md siblings.

Multi-candidate path probing (surfaces.ts):
  resolveSubjectPath now probes candidates in order rather than picking
  a single path. system-prompt:<section> resolves to:
    <surfaces.systemPrompt>/<section>.md          (canonical create-new)
    <surfaces.systemPrompt>/<section>/SKILL.md    (tax/legal/gtm/creative)
    <surfaces.systemPrompt>/<section>/index.md    (flat-md repos)
  tool-doc:<tool> probes <tool>/README.md then <tool>.md. First hit wins
  for edit-existing; first candidate is the canonical create-new target.

Streaming-first AgentRuntime.act (define-agent.ts):
  Returns AgentRunInvocation { events, output } instead of Promise<T>.
  Preserves the chat-centric product's streaming surface through the
  substrate:
    - events: AsyncIterable<RuntimeStreamEvent> — chat UX consumes
      verbatim (SSE / WebSocket / inline render). runChatTurn plugs
      in directly.
    - output: Promise<TRunOutput> — resolves after stream drains; eval
      substrate awaits this; chat UX ignores (already rendered).

Helpers:
  unimplementedAgentRun(reason?) — stub manifests use until their eval
    path is fully wired; yields no events, rejects output loudly.
  collectAgentRun(invocation) — eval-path drain helper; chat UX MUST
    NOT call this (defeats streaming).

133/133 tests pass (6 new: streaming contract + multi-candidate probing).

Bumps to 0.11.1.
---
 package.json              |   2 +-
 src/agent/define-agent.ts |  74 ++++++++++++++++++++++---
 src/agent/index.ts        |   8 ++-
 src/agent/surfaces.ts     |  93 +++++++++++++++++++------------
 tests/agent.test.ts       | 112 ++++++++++++++++++++++++++++++++++++--
 5 files changed, 241 insertions(+), 48 deletions(-)
diff --git a/package.json b/package.json
index f4c6637..c728f1f 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-runtime",
-  "version": "0.11.0",
+  "version": "0.11.1",
   "description": "Reusable runtime lifecycle for domain-specific agents.",
   "homepage": "https://github.com/tangle-network/agent-runtime#readme",
   "repository": {
diff --git a/src/agent/define-agent.ts b/src/agent/define-agent.ts
index 8f5f280..e99837a 100644
--- a/src/agent/define-agent.ts
+++ b/src/agent/define-agent.ts
@@ -16,6 +16,7 @@
  */
 
 import type { TraceAnalystKindSpec } from '@tangle-network/agent-eval'
+import type { RuntimeStreamEvent } from '../types'
 import { type AgentSurfaces, renderSurfaceIssues, validateSurfaces } from './surfaces'
 
 // ── manifest ─────────────────────────────────────────────────────────
@@ -157,15 +158,74 @@ export interface JudgeConfig<TRunOutput> {
 
 export interface AgentRuntime<TPersona, TRunOutput> {
   /**
-   * Invoke the agent against one persona. Returns the structured run
-   * output the rubric will score.
+   * Invoke the agent against one persona. Returns BOTH:
+   *   - `events`: an `AsyncIterable<RuntimeStreamEvent>` the chat-centric
+   *     product consumes verbatim (SSE / WebSocket / inline render).
+   *     **Streaming is mandatory — never collapse this to a single Promise.**
+   *     The agent's existing `runChatTurn` (or equivalent async generator)
+   *     plugs in here directly.
+   *   - `output`: a `Promise<TRunOutput>` resolved AFTER the event stream
+   *     drains. The eval substrate awaits this for rubric scoring; chat
+   *     products usually ignore it (they already rendered incrementally).
    *
-   * `ctx.emitter` is the substrate-threaded `TraceEmitter` — agents
-   * SHOULD record their LLM calls / tool calls through it for capture
-   * integrity. `ctx.deadlineMs` is wall-clock; the runtime SHOULD
-   * honour it for graceful cancel.
+   * Implementation contract:
+   *   1. `act` MUST return immediately (synchronous construction of the
+   *      `events` iterator + the `output` promise).
+   *   2. Iterating `events` drives the underlying LLM/tool calls — the
+   *      caller chooses when to consume.
+   *   3. `output` resolves only after the iterator yields its terminal
+   *      event (typically `task_end`); see `collectAgentRun` helper.
+   *
+   * `ctx.emitter` is the substrate-threaded `TraceEmitter` — runtimes
+   * SHOULD record LLM/tool spans through it for capture integrity.
+   * `ctx.deadlineMs` is wall-clock; the runtime SHOULD honour for graceful
+   * cancel. `ctx.signal` is the standard abort signal.
    */
-  act: (persona: TPersona, ctx: AgentRunContext) => Promise<TRunOutput>
+  act: (persona: TPersona, ctx: AgentRunContext) => AgentRunInvocation<TRunOutput>
+}
+
+export interface AgentRunInvocation<TRunOutput> {
+  /** Live stream of typed runtime events. Consumed by chat UX directly. */
+  events: AsyncIterable<RuntimeStreamEvent>
+  /** Final structured output the rubric scores. Resolves after `events` drains. */
+  output: Promise<TRunOutput>
+}
+
+/**
+ * Stub for agents whose `runtime.act` is not yet wired to the substrate's
+ * eval path. Preserves the streaming contract (empty event stream + a
+ * rejected `output` promise that tells the caller exactly what to fix).
+ *
+ * Per-vertical manifests usually start with this stub and replace it with
+ * the agent's real streaming runtime (`runChatTurn` or equivalent) once
+ * the eval path consumes the manifest end-to-end.
+ */
+export function unimplementedAgentRun<TRunOutput = unknown>(
+  reason = 'AgentRuntime.act is not yet wired for this manifest',
+): AgentRunInvocation<TRunOutput> {
+  return {
+    events: (async function* empty(): AsyncIterable<RuntimeStreamEvent> {})(),
+    output: Promise.reject(new Error(reason)),
+  }
+}
+
+/**
+ * Drain `act`'s `events` into an array AND await its `output`. Useful for
+ * eval / outcome-measurement code paths that don't care about live
+ * rendering. The events array is preserved so the substrate can inspect
+ * tool calls / readiness / questions retrospectively.
+ *
+ * IMPORTANT: chat-centric UX MUST NOT call this — it defeats streaming
+ * (no incremental render). Use `for await (const ev of invocation.events)`
+ * directly in the chat surface.
+ */
+export async function collectAgentRun<TRunOutput>(
+  invocation: AgentRunInvocation<TRunOutput>,
+): Promise<{ events: ReadonlyArray<RuntimeStreamEvent>; output: TRunOutput }> {
+  const events: RuntimeStreamEvent[] = []
+  for await (const ev of invocation.events) events.push(ev)
+  const output = await invocation.output
+  return { events, output }
 }
 
 export interface AgentRunContext {
diff --git a/src/agent/index.ts b/src/agent/index.ts
index 3292dec..96fd519 100644
--- a/src/agent/index.ts
+++ b/src/agent/index.ts
@@ -12,13 +12,19 @@ export type {
   AgentManifest,
   AgentRubric,
   AgentRunContext,
+  AgentRunInvocation,
   AgentRuntime,
   AnalystConfig,
   AutoApplyPolicy,
   JudgeConfig,
   RubricDimension,
 } from './define-agent'
-export { AgentManifestError, defineAgent } from './define-agent'
+export {
+  AgentManifestError,
+  collectAgentRun,
+  defineAgent,
+  unimplementedAgentRun,
+} from './define-agent'
 export type {
   CreateSurfaceImprovementAdapterOpts,
   DraftPatchInput,
diff --git a/src/agent/surfaces.ts b/src/agent/surfaces.ts
index 57cdb4f..38c10bc 100644
--- a/src/agent/surfaces.ts
+++ b/src/agent/surfaces.ts
@@ -88,60 +88,85 @@ export function resolveSubjectPath(
   surfaces: AgentSurfaces,
   repoRoot: string,
 ): ResolvedSurface | null {
-  const rel = relativePathForSubject(subject, surfaces)
-  if (rel === null) return null
-  const abs = isAbsolute(rel) ? rel : join(repoRoot, rel)
-  const exists = existsSync(abs)
+  const candidates = candidatePathsForSubject(subject, surfaces)
+  if (candidates.length === 0) return null
+
+  // Probe candidates in order, preferring the first one that exists on disk.
+  // Lets the substrate accept both the flat `<section>.md` convention and
+  // the skill-dir `<section>/SKILL.md` convention without forcing one layout.
+  // When none exists, fall back to the first candidate (canonical create-new).
+  for (const rel of candidates) {
+    const abs = isAbsolute(rel) ? rel : join(repoRoot, rel)
+    if (existsSync(abs)) {
+      return { absolutePath: abs, repoRelativePath: rel, exists: true, intent: 'edit-existing' }
+    }
+  }
+  const fallback = candidates[0]!
+  const fallbackAbs = isAbsolute(fallback) ? fallback : join(repoRoot, fallback)
   return {
-    absolutePath: abs,
-    repoRelativePath: rel,
-    exists,
-    intent: exists ? 'edit-existing' : 'create-new',
+    absolutePath: fallbackAbs,
+    repoRelativePath: fallback,
+    exists: false,
+    intent: 'create-new',
   }
 }
 
-function relativePathForSubject(subject: FindingSubject, surfaces: AgentSurfaces): string | null {
+function candidatePathsForSubject(
+  subject: FindingSubject,
+  surfaces: AgentSurfaces,
+): ReadonlyArray<string> {
   switch (subject.kind) {
     case 'knowledge.wiki':
     case 'knowledge.stale':
-      return join(surfaces.knowledge, `${subject.slug}.md`)
+      return [join(surfaces.knowledge, `${subject.slug}.md`)]
     case 'knowledge.claim':
       // Claims land in a per-topic claims directory under the knowledge root.
-      return join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)
+      return [join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)]
     case 'knowledge.raw':
-      return join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)
-    case 'system-prompt':
-      return join(surfaces.systemPrompt, `${slugify(subject.section)}.md`)
+      return [join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)]
+    case 'system-prompt': {
+      const slug = slugify(subject.section)
+      // Prefer flat layout for create-new (canonical); probe skill-dir layout
+      // in case the existing repo (tax/legal/gtm/creative) uses
+      // `<section>/SKILL.md` already.
+      return [
+        join(surfaces.systemPrompt, `${slug}.md`),
+        join(surfaces.systemPrompt, slug, 'SKILL.md'),
+        join(surfaces.systemPrompt, slug, 'index.md'),
+      ]
+    }
     case 'tool-doc':
-      return subject.aspect
-        ? join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)
-        : join(surfaces.tools, subject.tool, 'README.md')
+      if (subject.aspect) {
+        return [join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)]
+      }
+      // tool-doc default: `<tool>/README.md`; also probe `<tool>.md` for flat
+      // tool-list repos.
+      return [
+        join(surfaces.tools, subject.tool, 'README.md'),
+        join(surfaces.tools, `${subject.tool}.md`),
+      ]
     case 'new-tool':
-      return join(surfaces.tools, subject.name, 'README.md')
+      return [join(surfaces.tools, subject.name, 'README.md')]
     case 'rag':
-      if (!surfaces.rag) return null
-      return join(surfaces.rag, subject.corpus, `${subject.docId}.md`)
+      if (!surfaces.rag) return []
+      return [join(surfaces.rag, subject.corpus, `${subject.docId}.md`)]
     case 'memory':
-      if (!surfaces.memory) return null
-      return join(surfaces.memory, `${slugify(subject.key)}.json`)
+      if (!surfaces.memory) return []
+      return [join(surfaces.memory, `${slugify(subject.key)}.json`)]
     case 'scaffolding':
-      if (!surfaces.scaffolding) return null
-      return join(surfaces.scaffolding, `${slugify(subject.concern)}.md`)
+      if (!surfaces.scaffolding) return []
+      return [join(surfaces.scaffolding, `${slugify(subject.concern)}.md`)]
     case 'output-schema':
-      if (!surfaces.outputSchema) return null
-      // outputSchema is a single file — the field name is metadata for
-      // the LLM-drafted patch, not a separate path.
-      return surfaces.outputSchema
+      if (!surfaces.outputSchema) return []
+      return [surfaces.outputSchema]
     case 'websearch.outdated':
     case 'prior-run-summary':
-      // Stale signals don't map to a single file — they're handled by
-      // the knowledge adapter as `agent-knowledge:stale:*` after the
-      // operator decides which wiki page to retract. The substrate
-      // doesn't auto-route them.
-      return null
+      // Stale signals don't map to a single file — handled by the knowledge
+      // adapter as `agent-knowledge:stale:*` after operator review.
+      return []
     case 'cluster':
       // failure-mode cluster labels are evidence, not mutations.
-      return null
+      return []
   }
 }
 
diff --git a/tests/agent.test.ts b/tests/agent.test.ts
index ec04383..e109942 100644
--- a/tests/agent.test.ts
+++ b/tests/agent.test.ts
@@ -2,7 +2,12 @@ import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'nod
 import { tmpdir } from 'node:os'
 import { join } from 'node:path'
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
-import { AgentManifestError, defineAgent } from '../src/agent/define-agent'
+import {
+  AgentManifestError,
+  collectAgentRun,
+  defineAgent,
+  unimplementedAgentRun,
+} from '../src/agent/define-agent'
 import {
   createSurfaceImprovementAdapter,
   type DraftPatchInput,
@@ -75,7 +80,7 @@ describe('defineAgent', () => {
           { id: 'd2', weight: 0.5, score: () => 1 },
         ],
       },
-      runtime: { act: async () => ({}) },
+      runtime: { act: () => unimplementedAgentRun() },
       personas: async () => [],
       analystKinds: [],
       analyst: { model: 'claude-haiku-4-5' },
@@ -96,7 +101,7 @@ describe('defineAgent', () => {
           personas: 'personas',
         },
         rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] },
-        runtime: { act: async () => ({}) },
+        runtime: { act: () => unimplementedAgentRun() },
         personas: async () => [],
         analystKinds: [],
         analyst: { model: 'claude-haiku-4-5' },
@@ -122,7 +127,7 @@ describe('defineAgent', () => {
             { id: 'd2', weight: 5, score: () => 1 },
           ],
         },
-        runtime: { act: async () => ({}) },
+        runtime: { act: () => unimplementedAgentRun() },
         personas: async () => [],
         analystKinds: [],
         analyst: { model: 'claude-haiku-4-5' },
@@ -143,7 +148,7 @@ describe('defineAgent', () => {
         // No scaffolding / memory / rag / outputSchema — should not throw.
       },
       rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] },
-      runtime: { act: async () => ({}) },
+      runtime: { act: () => unimplementedAgentRun() },
       personas: async () => [],
       analystKinds: [],
       analyst: { model: 'claude-haiku-4-5' },
@@ -572,3 +577,100 @@ describe('validateSurfaces', () => {
     expect(flagged[0]!.surface).toBe('rag')
   })
 })
+
+describe('AgentRunInvocation streaming contract', () => {
+  it('unimplementedAgentRun yields no events and rejects output with a clear message', async () => {
+    const invocation = unimplementedAgentRun<{ score: number }>()
+    const events: unknown[] = []
+    for await (const ev of invocation.events) events.push(ev)
+    expect(events).toEqual([])
+    await expect(invocation.output).rejects.toThrow(/not yet wired/)
+  })
+
+  it('collectAgentRun drains events AND awaits output', async () => {
+    const invocation = {
+      events: (async function* yielder() {
+        yield { type: 'task_start', task: { id: 't' }, timestamp: 'now' } as never
+        yield { type: 'task_end', task: { id: 't' }, ok: true, timestamp: 'now' } as never
+      })(),
+      output: Promise.resolve({ score: 0.9 }),
+    }
+    const result = await collectAgentRun(invocation)
+    expect(result.events.length).toBe(2)
+    expect(result.output).toEqual({ score: 0.9 })
+  })
+
+  it('preserves chat-UX streaming — events consumed incrementally', async () => {
+    const yielded: string[] = []
+    const invocation = {
+      events: (async function* tokens() {
+        yielded.push('start')
+        yield { type: 'task_start', task: { id: 't' }, timestamp: 'now' } as never
+        yielded.push('mid')
+        yield { type: 'task_end', task: { id: 't' }, ok: true, timestamp: 'now' } as never
+        yielded.push('end')
+      })(),
+      output: Promise.resolve({ score: 1 }),
+    }
+    for await (const _ev of invocation.events) {
+      /* incremental render */
+    }
+    expect(yielded).toEqual(['start', 'mid', 'end'])
+  })
+})
+
+describe('multi-candidate path probing', () => {
+  it('probes <section>/SKILL.md skill-dir layout when present', () => {
+    const surfaces = {
+      systemPrompt: 'prompts',
+      tools: 'tools',
+      rubric: 'rubric.ts',
+      knowledge: '.agent-knowledge',
+      personas: 'personas',
+    }
+    mkdirSync(join(tmpRoot, 'prompts/expense-categorization'), { recursive: true })
+    writeFileSync(
+      join(tmpRoot, 'prompts/expense-categorization/SKILL.md'),
+      '# expense-categorization\n',
+    )
+    const r = resolveSubjectPath(
+      { kind: 'system-prompt', section: 'expense-categorization' },
+      surfaces,
+      tmpRoot,
+    )
+    expect(r?.repoRelativePath).toBe('prompts/expense-categorization/SKILL.md')
+    expect(r?.intent).toBe('edit-existing')
+  })
+
+  it('prefers flat <section>.md when both layouts exist', () => {
+    const surfaces = {
+      systemPrompt: 'prompts',
+      tools: 'tools',
+      rubric: 'rubric.ts',
+      knowledge: '.agent-knowledge',
+      personas: 'personas',
+    }
+    mkdirSync(join(tmpRoot, 'prompts/both-layouts'), { recursive: true })
+    writeFileSync(join(tmpRoot, 'prompts/both-layouts.md'), '# flat\n')
+    writeFileSync(join(tmpRoot, 'prompts/both-layouts/SKILL.md'), '# skill\n')
+    const r = resolveSubjectPath(
+      { kind: 'system-prompt', section: 'both-layouts' },
+      surfaces,
+      tmpRoot,
+    )
+    expect(r?.repoRelativePath).toBe('prompts/both-layouts.md')
+  })
+
+  it('falls back to flat tool-doc <tool>.md when <tool>/README.md is absent', () => {
+    const surfaces = {
+      systemPrompt: 'prompts',
+      tools: 'tools',
+      rubric: 'rubric.ts',
+      knowledge: '.agent-knowledge',
+      personas: 'personas',
+    }
+    writeFileSync(join(tmpRoot, 'tools/flat-tool.md'), '# flat-tool\n')
+    const r = resolveSubjectPath({ kind: 'tool-doc', tool: 'flat-tool' }, surfaces, tmpRoot)
+    expect(r?.repoRelativePath).toBe('tools/flat-tool.md')
+  })
+})