diff --git a/package.json b/package.json index 67d971b..278cf3a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.30.1", + "version": "0.31.0", "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": { diff --git a/src/errors.ts b/src/errors.ts index e6176d8..6a86651 100644 --- a/src/errors.ts +++ b/src/errors.ts @@ -21,6 +21,7 @@ export type AgentEvalErrorCode = | 'judge' | 'verification' | 'replay' + | 'backend_integrity' export class AgentEvalError extends Error { /** Stable string code. Survives minification; safe to switch on. */ diff --git a/src/index.ts b/src/index.ts index 2aef2da..6ed0402 100644 --- a/src/index.ts +++ b/src/index.ts @@ -208,6 +208,16 @@ export { integrationManifestResolvedPayload, integrationManifestValidatedPayload, } from './integration-gates' +// ── Backend-integrity guard ─────────────────────────────────────────── +// Distinguish "agent failed" from "eval ran blind against a stub or +// unconfigured backend." Required after every canonical eval so a 0/N +// pass-rate never silently masks a misconfigured runtime. +export type { BackendIntegrityReport } from './integrity/backend-integrity' +export { + assertRealBackend, + BackendIntegrityError, + summarizeBackendIntegrity, +} from './integrity/backend-integrity' export { adversarialJudge, codeExecutionJudge, diff --git a/src/integrity/backend-integrity.ts b/src/integrity/backend-integrity.ts new file mode 100644 index 0000000..e1e0408 --- /dev/null +++ b/src/integrity/backend-integrity.ts @@ -0,0 +1,183 @@ +/** + * Backend-integrity guard: distinguish "agent failed" from "eval ran against + * a stub / unconfigured backend." 
Without this guard a canonical eval can + * silently report `0/N passed` and look like an agent-quality problem when + * the LLM was never actually called — the failure mode we just hit running + * the 4-vertical parallel eval (legal-sandbox-stub returned hard-coded 33-104 + * char strings; gtm/creative defaulted to a cli-bridge that wasn't running). + * + * The shape: + * + * const report = summarizeBackendIntegrity(records) + * assertRealBackend(records) // throws BackendIntegrityError if 100% stub + * + * A record is "stub-mode" if its `tokenUsage.input === 0 && tokenUsage.output === 0`. + * (`costUsd` alone is unreliable — some backends successfully call LLMs but + * don't propagate pricing, producing real tokens with $0 cost.) + * + * Verdicts: + * - `real` — every record has nonzero token usage (no stub-mode records) + * - `stub` — every record is stub-mode, or there are no records (eval ran blind) + * - `mixed` — some records real, some stub (partial backend failure; + * often the 429-cascade or auth-half-failed case) + */ + +import { AgentEvalError } from '../errors' +import type { RunRecord } from '../run-record' + +export interface BackendIntegrityReport { + /** Total records inspected. */ + totalRecords: number + /** Records with input=0 AND output=0 (a stub fingerprint). */ + stubRecords: number + /** Records with nonzero token usage (real LLM activity). */ + realRecords: number + /** Records where output>0 but costUsd=0 (real LLM, broken cost ledger). */ + uncostedRecords: number + /** Sum of input tokens across all records. */ + totalInputTokens: number + /** Sum of output tokens across all records. */ + totalOutputTokens: number + /** Sum of costUsd across all records. */ + totalCostUsd: number + /** Worst-case integrity verdict. */ + verdict: 'real' | 'mixed' | 'stub' + /** Human-readable diagnosis suitable for terminal output. */ + diagnosis: string +} + +/** + * Error thrown when an integrity assertion fails. 
Caller can pattern-match + * by `code === 'backend_integrity'` to differentiate from other + * errors. + */ +export class BackendIntegrityError extends AgentEvalError { + constructor( + message: string, + public readonly report: BackendIntegrityReport, + ) { + super('backend_integrity', message) + } +} + +function isStubRecord(rec: RunRecord): boolean { + return rec.tokenUsage.input === 0 && rec.tokenUsage.output === 0 +} + +function isUncostedRecord(rec: RunRecord): boolean { + return rec.tokenUsage.output > 0 && rec.costUsd === 0 +} + +/** + * Inspect a batch of RunRecords and return an integrity report. Pure + * function — no I/O, no logging. The caller decides what to do with the + * verdict (print warning, throw, gate CI, etc.). + */ +export function summarizeBackendIntegrity( + records: ReadonlyArray<RunRecord>, +): BackendIntegrityReport { + const totalRecords = records.length + let stubRecords = 0 + let realRecords = 0 + let uncostedRecords = 0 + let totalInputTokens = 0 + let totalOutputTokens = 0 + let totalCostUsd = 0 + for (const rec of records) { + totalInputTokens += rec.tokenUsage.input + totalOutputTokens += rec.tokenUsage.output + totalCostUsd += rec.costUsd + if (isStubRecord(rec)) stubRecords++ + else realRecords++ + if (isUncostedRecord(rec)) uncostedRecords++ + } + const verdict: BackendIntegrityReport['verdict'] = + totalRecords === 0 + ? 'stub' + : stubRecords === totalRecords + ? 'stub' + : stubRecords === 0 + ? 
'real' + : 'mixed' + const diagnosis = buildDiagnosis({ + totalRecords, + stubRecords, + realRecords, + uncostedRecords, + totalInputTokens, + totalOutputTokens, + totalCostUsd, + verdict, + }) + return { + totalRecords, + stubRecords, + realRecords, + uncostedRecords, + totalInputTokens, + totalOutputTokens, + totalCostUsd, + verdict, + diagnosis, + } +} + +function buildDiagnosis(r: Omit<BackendIntegrityReport, 'diagnosis'>): string { + if (r.totalRecords === 0) { + return 'no records — eval produced zero runs; backend likely failed before first turn' + } + if (r.verdict === 'stub') { + return [ + `all ${r.totalRecords} records have zero token usage — the LLM backend was never called.`, + 'common causes: --backend sandbox without a sandbox bridge running; stub model returning hard-coded strings;', + 'auth misconfigured so requests were silently dropped before the LLM. Re-run with --backend tcloud and TANGLE_API_KEY set,', + 'or boot the cli-bridge / sandbox before invoking the eval.', + ].join(' ') + } + if (r.verdict === 'mixed') { + const pct = ((r.stubRecords / r.totalRecords) * 100).toFixed(0) + return [ + `${r.stubRecords}/${r.totalRecords} records (${pct}%) have zero token usage — the backend partially failed.`, + 'common causes: rate-limit cascade (429s after the first N personas);', + 'transient auth expiry mid-run; provider outage. 
Treat the affected records as missing data, not agent failures.', + ].join(' ') + } + // verdict === 'real' + if (r.uncostedRecords > 0) { + const pct = ((r.uncostedRecords / r.totalRecords) * 100).toFixed(0) + return [ + `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens).`, + `${r.uncostedRecords} (${pct}%) have output tokens but costUsd=0 — cost ledger is mis-wired (no input-token`, + 'propagation from the runtime stream into RunRecord).', + ].join(' ') + } + return `${r.totalRecords} records with real LLM activity (in=${r.totalInputTokens}, out=${r.totalOutputTokens} tokens, $${r.totalCostUsd.toFixed(4)}).` +} + +/** + * Throw BackendIntegrityError if the verdict is 'stub' — i.e. every record + * shows zero LLM activity (or there are no records). Strict callers can pass `{ allowMixed: false }` + * to also reject mixed verdicts (recommended for CI gates). + * + * Real backends pass through silently. + */ +export function assertRealBackend( + records: ReadonlyArray<RunRecord>, + opts: { allowMixed?: boolean } = {}, +): BackendIntegrityReport { + const report = summarizeBackendIntegrity(records) + const allowMixed = opts.allowMixed ?? 
true + if (report.verdict === 'stub') { + throw new BackendIntegrityError( + `backend-integrity: ran against a stub or unconfigured backend — ${report.diagnosis}`, + report, + ) + } + if (!allowMixed && report.verdict === 'mixed') { + throw new BackendIntegrityError( + `backend-integrity: partial backend failure rejected — ${report.diagnosis}`, + report, + ) + } + return report +} diff --git a/tests/backend-integrity.test.ts b/tests/backend-integrity.test.ts new file mode 100644 index 0000000..8a70c21 --- /dev/null +++ b/tests/backend-integrity.test.ts @@ -0,0 +1,136 @@ +import { describe, expect, it } from 'vitest' +import { + assertRealBackend, + BackendIntegrityError, + summarizeBackendIntegrity, +} from '../src/integrity/backend-integrity' +import type { RunRecord } from '../src/run-record' + +function makeRecord(input: number, output: number, costUsd: number): RunRecord { + return { + runId: `r-${Math.random()}`, + experimentId: 'exp', + candidateId: 'cand', + seed: 0, + model: 'test-model@2026-01', + promptHash: 'a'.repeat(64), + configHash: 'b'.repeat(64), + commitSha: 'c'.repeat(40), + wallMs: 100, + costUsd, + tokenUsage: { input, output }, + outcome: { holdoutScore: 0.5, raw: {} }, + splitTag: 'holdout', + scenarioId: 'scn', + } +} + +describe('backend-integrity', () => { + describe('summarizeBackendIntegrity', () => { + it('classifies all-zero token usage as stub', () => { + const r = summarizeBackendIntegrity([ + makeRecord(0, 0, 0), + makeRecord(0, 0, 0), + makeRecord(0, 0, 0), + ]) + expect(r.verdict).toBe('stub') + expect(r.stubRecords).toBe(3) + expect(r.realRecords).toBe(0) + expect(r.diagnosis).toContain('LLM backend was never called') + }) + + it('classifies any real activity as real', () => { + const r = summarizeBackendIntegrity([ + makeRecord(500, 1000, 0.01), + makeRecord(600, 1200, 0.012), + ]) + expect(r.verdict).toBe('real') + expect(r.realRecords).toBe(2) + expect(r.stubRecords).toBe(0) + expect(r.totalInputTokens).toBe(1100) + 
expect(r.totalOutputTokens).toBe(2200) + }) + + it('classifies partial stub-mode as mixed', () => { + const r = summarizeBackendIntegrity([ + makeRecord(500, 1000, 0.01), + makeRecord(0, 0, 0), + makeRecord(0, 0, 0), + ]) + expect(r.verdict).toBe('mixed') + expect(r.stubRecords).toBe(2) + expect(r.realRecords).toBe(1) + expect(r.diagnosis).toContain('2/3 records (67%) have zero') + }) + + it('flags output-tokens-without-cost as uncosted', () => { + const r = summarizeBackendIntegrity([ + makeRecord(500, 1000, 0), + makeRecord(500, 1000, 0), + ]) + expect(r.verdict).toBe('real') + expect(r.uncostedRecords).toBe(2) + expect(r.diagnosis).toContain('cost ledger is mis-wired') + }) + + it('handles empty input', () => { + const r = summarizeBackendIntegrity([]) + expect(r.verdict).toBe('stub') + expect(r.totalRecords).toBe(0) + expect(r.diagnosis).toContain('no records') + }) + + it('does not count input=0+output>0 as stub (partial usage propagation)', () => { + const r = summarizeBackendIntegrity([makeRecord(0, 1000, 0)]) + expect(r.verdict).toBe('real') + expect(r.stubRecords).toBe(0) + expect(r.uncostedRecords).toBe(1) + }) + }) + + describe('assertRealBackend', () => { + it('throws on pure-stub verdict', () => { + expect(() => assertRealBackend([makeRecord(0, 0, 0)])).toThrow(BackendIntegrityError) + }) + + it('throws on empty input', () => { + expect(() => assertRealBackend([])).toThrow(BackendIntegrityError) + }) + + it('passes through on real verdict', () => { + const r = assertRealBackend([makeRecord(500, 1000, 0.01)]) + expect(r.verdict).toBe('real') + }) + + it('allows mixed by default', () => { + const r = assertRealBackend([ + makeRecord(500, 1000, 0.01), + makeRecord(0, 0, 0), + ]) + expect(r.verdict).toBe('mixed') + }) + + it('rejects mixed when allowMixed=false', () => { + expect(() => + assertRealBackend( + [makeRecord(500, 1000, 0.01), makeRecord(0, 0, 0)], + { allowMixed: false }, + ), + ).toThrow(BackendIntegrityError) + }) + + it('thrown error carries 
the report and the right code', () => { + try { + assertRealBackend([makeRecord(0, 0, 0), makeRecord(0, 0, 0)]) + expect.fail('should have thrown') + } catch (e) { + expect(e).toBeInstanceOf(BackendIntegrityError) + if (e instanceof BackendIntegrityError) { + expect(e.code).toBe('backend_integrity') + expect(e.report.verdict).toBe('stub') + expect(e.report.totalRecords).toBe(2) + } + } + }) + }) +})