Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@tangle-network/agent-runtime",
"version": "0.11.0",
"version": "0.11.1",
"description": "Reusable runtime lifecycle for domain-specific agents.",
"homepage": "https://github.com/tangle-network/agent-runtime#readme",
"repository": {
Expand Down
74 changes: 67 additions & 7 deletions src/agent/define-agent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/

import type { TraceAnalystKindSpec } from '@tangle-network/agent-eval'
import type { RuntimeStreamEvent } from '../types'
import { type AgentSurfaces, renderSurfaceIssues, validateSurfaces } from './surfaces'

// ── manifest ─────────────────────────────────────────────────────────
Expand Down Expand Up @@ -157,15 +158,74 @@ export interface JudgeConfig<TRunOutput> {

export interface AgentRuntime<TPersona, TRunOutput> {
/**
* Invoke the agent against one persona. Returns the structured run
* output the rubric will score.
* Invoke the agent against one persona. Returns BOTH:
* - `events`: an `AsyncIterable<RuntimeStreamEvent>` the chat-centric
* product consumes verbatim (SSE / WebSocket / inline render).
* **Streaming is mandatory — never collapse this to a single Promise.**
* The agent's existing `runChatTurn` (or equivalent async generator)
* plugs in here directly.
* - `output`: a `Promise<TRunOutput>` resolved AFTER the event stream
* drains. The eval substrate awaits this for rubric scoring; chat
* products usually ignore it (they already rendered incrementally).
*
* `ctx.emitter` is the substrate-threaded `TraceEmitter` — agents
* SHOULD record their LLM calls / tool calls through it for capture
* integrity. `ctx.deadlineMs` is wall-clock; the runtime SHOULD
* honour it for graceful cancel.
* Implementation contract:
* 1. `act` MUST return immediately (synchronous construction of the
* `events` iterator + the `output` promise).
* 2. Iterating `events` drives the underlying LLM/tool calls — the
* caller chooses when to consume.
* 3. `output` resolves only after the iterator yields its terminal
* event (typically `task_end`); see `collectAgentRun` helper.
*
* `ctx.emitter` is the substrate-threaded `TraceEmitter` — runtimes
* SHOULD record LLM/tool spans through it for capture integrity.
* `ctx.deadlineMs` is wall-clock; the runtime SHOULD honour for graceful
* cancel. `ctx.signal` is the standard abort signal.
*/
act: (persona: TPersona, ctx: AgentRunContext) => Promise<TRunOutput>
act: (persona: TPersona, ctx: AgentRunContext) => AgentRunInvocation<TRunOutput>
}

export interface AgentRunInvocation<TRunOutput> {
/** Live stream of typed runtime events. Consumed by chat UX directly. */
events: AsyncIterable<RuntimeStreamEvent>
/** Final structured output the rubric scores. Resolves after `events` drains. */
output: Promise<TRunOutput>
}

/**
* Stub for agents whose `runtime.act` is not yet wired to the substrate's
* eval path. Preserves the streaming contract (empty event stream + a
* rejected `output` promise that tells the caller exactly what to fix).
*
* Per-vertical manifests usually start with this stub and replace it with
* the agent's real streaming runtime (`runChatTurn` or equivalent) once
* the eval path consumes the manifest end-to-end.
*/
export function unimplementedAgentRun<TRunOutput = unknown>(
reason = 'AgentRuntime.act is not yet wired for this manifest',
): AgentRunInvocation<TRunOutput> {
return {
events: (async function* empty(): AsyncIterable<RuntimeStreamEvent> {})(),
output: Promise.reject(new Error(reason)),
}
}

/**
* Drain `act`'s `events` into an array AND await its `output`. Useful for
* eval / outcome-measurement code paths that don't care about live
* rendering. The events array is preserved so the substrate can inspect
* tool calls / readiness / questions retrospectively.
*
* IMPORTANT: chat-centric UX MUST NOT call this — it defeats streaming
* (no incremental render). Use `for await (const ev of invocation.events)`
* directly in the chat surface.
*/
export async function collectAgentRun<TRunOutput>(
invocation: AgentRunInvocation<TRunOutput>,
): Promise<{ events: ReadonlyArray<RuntimeStreamEvent>; output: TRunOutput }> {
const events: RuntimeStreamEvent[] = []
for await (const ev of invocation.events) events.push(ev)
const output = await invocation.output
return { events, output }
}

export interface AgentRunContext {
Expand Down
8 changes: 7 additions & 1 deletion src/agent/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,19 @@ export type {
AgentManifest,
AgentRubric,
AgentRunContext,
AgentRunInvocation,
AgentRuntime,
AnalystConfig,
AutoApplyPolicy,
JudgeConfig,
RubricDimension,
} from './define-agent'
export { AgentManifestError, defineAgent } from './define-agent'
export {
AgentManifestError,
collectAgentRun,
defineAgent,
unimplementedAgentRun,
} from './define-agent'
export type {
CreateSurfaceImprovementAdapterOpts,
DraftPatchInput,
Expand Down
93 changes: 59 additions & 34 deletions src/agent/surfaces.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,60 +88,85 @@ export function resolveSubjectPath(
surfaces: AgentSurfaces,
repoRoot: string,
): ResolvedSurface | null {
const rel = relativePathForSubject(subject, surfaces)
if (rel === null) return null
const abs = isAbsolute(rel) ? rel : join(repoRoot, rel)
const exists = existsSync(abs)
const candidates = candidatePathsForSubject(subject, surfaces)
if (candidates.length === 0) return null

// Probe candidates in order, preferring the first one that exists on disk.
// Lets the substrate accept both the flat `<section>.md` convention and
// the skill-dir `<section>/SKILL.md` convention without forcing one layout.
// When none exists, fall back to the first candidate (canonical create-new).
for (const rel of candidates) {
const abs = isAbsolute(rel) ? rel : join(repoRoot, rel)
if (existsSync(abs)) {
return { absolutePath: abs, repoRelativePath: rel, exists: true, intent: 'edit-existing' }
}
}
const fallback = candidates[0]!
const fallbackAbs = isAbsolute(fallback) ? fallback : join(repoRoot, fallback)
return {
absolutePath: abs,
repoRelativePath: rel,
exists,
intent: exists ? 'edit-existing' : 'create-new',
absolutePath: fallbackAbs,
repoRelativePath: fallback,
exists: false,
intent: 'create-new',
}
}

function relativePathForSubject(subject: FindingSubject, surfaces: AgentSurfaces): string | null {
function candidatePathsForSubject(
subject: FindingSubject,
surfaces: AgentSurfaces,
): ReadonlyArray<string> {
switch (subject.kind) {
case 'knowledge.wiki':
case 'knowledge.stale':
return join(surfaces.knowledge, `${subject.slug}.md`)
return [join(surfaces.knowledge, `${subject.slug}.md`)]
case 'knowledge.claim':
// Claims land in a per-topic claims directory under the knowledge root.
return join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)
return [join(surfaces.knowledge, 'claims', `${slugify(subject.topic)}.md`)]
case 'knowledge.raw':
return join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)
case 'system-prompt':
return join(surfaces.systemPrompt, `${slugify(subject.section)}.md`)
return [join(surfaces.knowledge, 'raw', `${subject.sourceId}.md`)]
case 'system-prompt': {
const slug = slugify(subject.section)
// Prefer flat layout for create-new (canonical); probe skill-dir layout
// in case the existing repo (tax/legal/gtm/creative) uses
// `<section>/SKILL.md` already.
return [
join(surfaces.systemPrompt, `${slug}.md`),
join(surfaces.systemPrompt, slug, 'SKILL.md'),
join(surfaces.systemPrompt, slug, 'index.md'),
]
}
case 'tool-doc':
return subject.aspect
? join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)
: join(surfaces.tools, subject.tool, 'README.md')
if (subject.aspect) {
return [join(surfaces.tools, subject.tool, `${slugify(subject.aspect)}.md`)]
}
// tool-doc default: `<tool>/README.md`; also probe `<tool>.md` for flat
// tool-list repos.
return [
join(surfaces.tools, subject.tool, 'README.md'),
join(surfaces.tools, `${subject.tool}.md`),
]
case 'new-tool':
return join(surfaces.tools, subject.name, 'README.md')
return [join(surfaces.tools, subject.name, 'README.md')]
case 'rag':
if (!surfaces.rag) return null
return join(surfaces.rag, subject.corpus, `${subject.docId}.md`)
if (!surfaces.rag) return []
return [join(surfaces.rag, subject.corpus, `${subject.docId}.md`)]
case 'memory':
if (!surfaces.memory) return null
return join(surfaces.memory, `${slugify(subject.key)}.json`)
if (!surfaces.memory) return []
return [join(surfaces.memory, `${slugify(subject.key)}.json`)]
case 'scaffolding':
if (!surfaces.scaffolding) return null
return join(surfaces.scaffolding, `${slugify(subject.concern)}.md`)
if (!surfaces.scaffolding) return []
return [join(surfaces.scaffolding, `${slugify(subject.concern)}.md`)]
case 'output-schema':
if (!surfaces.outputSchema) return null
// outputSchema is a single file — the field name is metadata for
// the LLM-drafted patch, not a separate path.
return surfaces.outputSchema
if (!surfaces.outputSchema) return []
return [surfaces.outputSchema]
case 'websearch.outdated':
case 'prior-run-summary':
// Stale signals don't map to a single file — they're handled by
// the knowledge adapter as `agent-knowledge:stale:*` after the
// operator decides which wiki page to retract. The substrate
// doesn't auto-route them.
return null
// Stale signals don't map to a single file — handled by the knowledge
// adapter as `agent-knowledge:stale:*` after operator review.
return []
case 'cluster':
// failure-mode cluster labels are evidence, not mutations.
return null
return []
}
}

Expand Down
112 changes: 107 additions & 5 deletions tests/agent.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@ import { mkdirSync, mkdtempSync, readFileSync, rmSync, writeFileSync } from 'nod
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'
import { AgentManifestError, defineAgent } from '../src/agent/define-agent'
import {
AgentManifestError,
collectAgentRun,
defineAgent,
unimplementedAgentRun,
} from '../src/agent/define-agent'
import {
createSurfaceImprovementAdapter,
type DraftPatchInput,
Expand Down Expand Up @@ -75,7 +80,7 @@ describe('defineAgent', () => {
{ id: 'd2', weight: 0.5, score: () => 1 },
],
},
runtime: { act: async () => ({}) },
runtime: { act: () => unimplementedAgentRun() },
personas: async () => [],
analystKinds: [],
analyst: { model: 'claude-haiku-4-5' },
Expand All @@ -96,7 +101,7 @@ describe('defineAgent', () => {
personas: 'personas',
},
rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] },
runtime: { act: async () => ({}) },
runtime: { act: () => unimplementedAgentRun() },
personas: async () => [],
analystKinds: [],
analyst: { model: 'claude-haiku-4-5' },
Expand All @@ -122,7 +127,7 @@ describe('defineAgent', () => {
{ id: 'd2', weight: 5, score: () => 1 },
],
},
runtime: { act: async () => ({}) },
runtime: { act: () => unimplementedAgentRun() },
personas: async () => [],
analystKinds: [],
analyst: { model: 'claude-haiku-4-5' },
Expand All @@ -143,7 +148,7 @@ describe('defineAgent', () => {
// No scaffolding / memory / rag / outputSchema — should not throw.
},
rubric: { dimensions: [{ id: 'd1', weight: 1, score: () => 0 }] },
runtime: { act: async () => ({}) },
runtime: { act: () => unimplementedAgentRun() },
personas: async () => [],
analystKinds: [],
analyst: { model: 'claude-haiku-4-5' },
Expand Down Expand Up @@ -572,3 +577,100 @@ describe('validateSurfaces', () => {
expect(flagged[0]!.surface).toBe('rag')
})
})

describe('AgentRunInvocation streaming contract', () => {
it('unimplementedAgentRun yields no events and rejects output with a clear message', async () => {
const invocation = unimplementedAgentRun<{ score: number }>()
const events: unknown[] = []
for await (const ev of invocation.events) events.push(ev)
expect(events).toEqual([])
await expect(invocation.output).rejects.toThrow(/not yet wired/)
})

it('collectAgentRun drains events AND awaits output', async () => {
const invocation = {
events: (async function* yielder() {
yield { type: 'task_start', task: { id: 't' }, timestamp: 'now' } as never
yield { type: 'task_end', task: { id: 't' }, ok: true, timestamp: 'now' } as never
})(),
output: Promise.resolve({ score: 0.9 }),
}
const result = await collectAgentRun(invocation)
expect(result.events.length).toBe(2)
expect(result.output).toEqual({ score: 0.9 })
})

it('preserves chat-UX streaming — events consumed incrementally', async () => {
const yielded: string[] = []
const invocation = {
events: (async function* tokens() {
yielded.push('start')
yield { type: 'task_start', task: { id: 't' }, timestamp: 'now' } as never
yielded.push('mid')
yield { type: 'task_end', task: { id: 't' }, ok: true, timestamp: 'now' } as never
yielded.push('end')
})(),
output: Promise.resolve({ score: 1 }),
}
for await (const _ev of invocation.events) {
/* incremental render */
}
expect(yielded).toEqual(['start', 'mid', 'end'])
})
})

describe('multi-candidate path probing', () => {
it('probes <section>/SKILL.md skill-dir layout when present', () => {
const surfaces = {
systemPrompt: 'prompts',
tools: 'tools',
rubric: 'rubric.ts',
knowledge: '.agent-knowledge',
personas: 'personas',
}
mkdirSync(join(tmpRoot, 'prompts/expense-categorization'), { recursive: true })
writeFileSync(
join(tmpRoot, 'prompts/expense-categorization/SKILL.md'),
'# expense-categorization\n',
)
const r = resolveSubjectPath(
{ kind: 'system-prompt', section: 'expense-categorization' },
surfaces,
tmpRoot,
)
expect(r?.repoRelativePath).toBe('prompts/expense-categorization/SKILL.md')
expect(r?.intent).toBe('edit-existing')
})

it('prefers flat <section>.md when both layouts exist', () => {
const surfaces = {
systemPrompt: 'prompts',
tools: 'tools',
rubric: 'rubric.ts',
knowledge: '.agent-knowledge',
personas: 'personas',
}
mkdirSync(join(tmpRoot, 'prompts/both-layouts'), { recursive: true })
writeFileSync(join(tmpRoot, 'prompts/both-layouts.md'), '# flat\n')
writeFileSync(join(tmpRoot, 'prompts/both-layouts/SKILL.md'), '# skill\n')
const r = resolveSubjectPath(
{ kind: 'system-prompt', section: 'both-layouts' },
surfaces,
tmpRoot,
)
expect(r?.repoRelativePath).toBe('prompts/both-layouts.md')
})

it('falls back to flat tool-doc <tool>.md when <tool>/README.md is absent', () => {
const surfaces = {
systemPrompt: 'prompts',
tools: 'tools',
rubric: 'rubric.ts',
knowledge: '.agent-knowledge',
personas: 'personas',
}
writeFileSync(join(tmpRoot, 'tools/flat-tool.md'), '# flat-tool\n')
const r = resolveSubjectPath({ kind: 'tool-doc', tool: 'flat-tool' }, surfaces, tmpRoot)
expect(r?.repoRelativePath).toBe('tools/flat-tool.md')
})
})
Loading