From 4ca0db5e35cadbcd5391c64e38d99e8e810e0899 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Wed, 20 May 2026 08:43:40 -0600 Subject: [PATCH] feat(0.31.0): JudgeScoresRecord on RunRecord.outcome Ensemble-judge consumers were dropping per-judge per-dim scores on the floor because RunOutcome only had a slot for the composite. Adds a typed `judgeScores?: JudgeScoresRecord` field, threaded through runEvalCampaign and pinned in the consumer-contract test. Validator rejects NaN scores and non-string failedJudges entries; fail-loud test covers a panel where one judge throws. Bumps TS + Python clients to 0.31.0 in lockstep. --- CHANGELOG.md | 57 +++++++ clients/python/pyproject.toml | 2 +- clients/python/src/agent_eval_rpc/__init__.py | 2 +- package.json | 2 +- src/eval-campaign.ts | 8 + src/index.ts | 1 + src/run-record.ts | 102 +++++++++++ tests/consumer-contract.test.ts | 20 +++ tests/eval-campaign.test.ts | 158 ++++++++++++++++++ tests/run-record.test.ts | 84 ++++++++++ 10 files changed, 433 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index aa552f3..0ac87aa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,62 @@ # Changelog +## 0.31.0 — 2026-05-20 + +### `JudgeScoresRecord` on `RunRecord.outcome` — substrate-blessed ensemble shape + +Multi-judge consumers (forge-chat in agent-builder, and four sibling +product agents on the same trajectory) compute per-judge per-dimension +scores per cell, then collapse to a single composite for the gate. The +substrate's `RunOutcome` only had a slot for the composite plus a free +`raw: Record<string, number>` bag. Consumers were either dropping the +breakdown on the floor or smuggling it through stringly-typed `raw` +keys like `judge_kimi_helpfulness` — neither survives a corpus-IRR run +(0.27.2's `corpusInterRaterAgreement` expects structured per-judge +per-dim records, not parsed strings). 
+ +This release ships the typed slot so every product agent speaks the +same shape, and the inter-rater primitives consume it without a +per-consumer adapter. + +### Added + +- **`JudgeScoresRecord`** (`src/run-record.ts`) — `perJudge[judgeId][dim]` + is the canonical store; `perDimMean` and `composite` are precomputed + projections so reporters and IRR primitives don't repeat the + aggregation; `failedJudges?: string[]` records dead-judge ids + explicitly (no inferring partial-failure from missing keys); + `notes?: string` carries panel prose. +- **`RunOutcome.judgeScores?: JudgeScoresRecord`** — optional. Single- + judge or scalar-only runs leave it unset; ensemble runs populate it. +- **`CampaignRunOutcome.judgeScores?: JudgeScoresRecord`** — runners + return it on the per-cell outcome; `runEvalCampaign` threads it onto + the resulting `RunRecord.outcome.judgeScores` without coercion. + +### Validator extended + +`validateRunRecord` validates `outcome.judgeScores` when present. +Every `perJudge[judge][dim]` and every `perDimMean[dim]` and the +`composite` must be finite numbers — the NaN-as-silent-zero bug class +banned by `CLAUDE.md` cannot pass the boundary. `failedJudges` must be +an array of non-empty strings; `notes` must be a string. Round-trip +tested in `tests/run-record.test.ts`. + +### Fail-loud contract + +A judge that throws lands in `failedJudges` by id, not a silent zero +in `perJudge`. The composite is computed over surviving judges only; +the partial-failure signal is preserved through to the gate. +`tests/eval-campaign.test.ts` covers the four shapes (full, partial, +missing, with notes) plus an explicit fail-loud case where one judge +throws and the run record carries `failedJudges: ['glm-5.1@...']`. + +### Consumer contract + +`tests/consumer-contract.test.ts` pins `JudgeScoresRecord` as a +type-level export at the root entry. 
The 0.30.0 surface is preserved — +the new field is additive on `RunOutcome` and the new type is a new +export, so existing consumers stay green. + ## 0.29.0 — 2026-05-19 ### Analyst kinds + cross-run findings context diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml index ef79c26..09df840 100644 --- a/clients/python/pyproject.toml +++ b/clients/python/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "agent-eval-rpc" -version = "0.30.0" +version = "0.31.0" description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client." readme = "README.md" requires-python = ">=3.10" diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py index 5498239..c50579b 100644 --- a/clients/python/src/agent_eval_rpc/__init__.py +++ b/clients/python/src/agent_eval_rpc/__init__.py @@ -48,7 +48,7 @@ try: __version__ = version("agent-eval-rpc") except PackageNotFoundError: - __version__ = "0.30.0" + __version__ = "0.31.0" __all__ = [ "Client", diff --git a/package.json b/package.json index fa0e15e..278cf3a 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.30.0", + "version": "0.31.0", "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.", "homepage": "https://github.com/tangle-network/agent-eval#readme", "repository": { diff --git a/src/eval-campaign.ts b/src/eval-campaign.ts index 8c41a0c..8c6ac99 100644 --- a/src/eval-campaign.ts +++ b/src/eval-campaign.ts @@ -41,6 +41,7 @@ import { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client' import { canonicalize, hashJson } from './pre-registration' import type { + 
JudgeScoresRecord, RunJudgeMetadata, RunOutcome, RunRecord, @@ -120,6 +121,12 @@ export interface CampaignRunOutcome { failureMode?: string /** Optional judge metadata when a judge was used. */ judgeMetadata?: RunJudgeMetadata + /** + * Optional per-judge / per-dim breakdown for ensemble-judged runs. + * Propagated to `outcome.judgeScores` on the resulting `RunRecord`. + * Single-judge or scalar-only runs leave this unset. + */ + judgeScores?: JudgeScoresRecord } export type CampaignRunner = (ctx: CampaignRunContext) => Promise @@ -457,6 +464,7 @@ export async function runEvalCampaign( } if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score else recordOutcome.searchScore = outcome.score + if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores const record: RunRecord = { runId, diff --git a/src/index.ts b/src/index.ts index f9b4ad3..c3f519b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1087,6 +1087,7 @@ export { CallbackResearcher, NoopResearcher } from './researcher' // tournaments, adversarial, compute curves, auto-research — live on the // dedicated subpath: @tangle-network/agent-eval/rl export type { + JudgeScoresRecord, RunJudgeMetadata, RunOutcome, RunRecord, diff --git a/src/run-record.ts b/src/run-record.ts index 48fe4e2..52280da 100644 --- a/src/run-record.ts +++ b/src/run-record.ts @@ -44,6 +44,42 @@ export interface RunJudgeMetadata { fallback: boolean } +/** + * Per-judge / per-dimension breakdown for runs scored by an ensemble of + * judges over a multi-dimensional rubric. + * + * The collapsed `outcome.searchScore` / `holdoutScore` carries the + * composite the gate uses. The full breakdown belongs here so consumers + * can answer "which judge disagreed?", "which dimension dragged the + * composite down?", and "did half the panel fail?" without re-running. 
+ * + * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and + * `composite` are convenience projections — derivable but precomputed so + * downstream IRR primitives (`interRaterReliability`, + * `corpusInterRaterAgreement`) and reporters don't pay the same + * aggregation twice. + * + * Fail-loud discipline: judges that errored out land in `failedJudges` + * by id. A missing key in `perJudge` is ambiguous (silent zero vs not + * run); the explicit list makes a partial-failure recorded as such. + */ +export interface JudgeScoresRecord { + /** Per-judge per-dimension scores. `{ "kimi-k2.6": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */ + perJudge: Record<string, Record<string, number>> + /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */ + perDimMean: Record<string, number> + /** Composite mean across all dims and judges. Mirrors the score + * the gate sees on `outcome.searchScore` / `holdoutScore`. */ + composite: number + /** Judges that errored or returned an unparseable verdict. Recorded + * by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit, + * not inferred from missing keys in `perJudge`. */ + failedJudges?: string[] + /** Free-form notes the judges emitted (joined across judges or + * first-judge only — consumer's choice). */ + notes?: string +} + export interface RunOutcome { /** Score on the search/optimization split. Optional because a * holdout-only evaluation only fills `holdoutScore`. */ @@ -55,6 +91,12 @@ export interface RunOutcome { * pass/fail counters, latency stats, etc. Numeric only — keeps * reporters honest. */ raw: Record<string, number> + /** Per-judge / per-dim breakdown. Consumers writing ensemble + * judgements populate this; substrate primitives like + * `interRaterReliability` and `corpusInterRaterAgreement` accept + * these records as input. Optional — single-judge or scalar-only + * runs leave it unset. 
*/ + judgeScores?: JudgeScoresRecord } /** @@ -242,6 +284,11 @@ export function validateRunRecord(input: unknown): RunRecord { expectFiniteNumber(v, `outcome.raw.${k}`) } + // Per-judge / per-dim breakdown, optional. + if (outRec.judgeScores !== undefined) { + validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores') + } + // Failure mode optional. if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode') @@ -298,6 +345,61 @@ function expectFiniteNumber(value: unknown, path: string): void { } } +function validateJudgeScores(value: unknown, path: string): void { + if (value === null || typeof value !== 'object') { + throw new RunRecordValidationError('judgeScores must be an object', path) + } + const rec = value as Record<string, unknown> + + const perJudge = rec.perJudge + if (perJudge === null || typeof perJudge !== 'object') { + throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`) + } + for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) { + if (dims === null || typeof dims !== 'object') { + throw new RunRecordValidationError( + 'per-judge entry must be an object of dimension scores', + `${path}.perJudge.${judgeId}`, + ) + } + for (const [dim, score] of Object.entries(dims as Record<string, unknown>)) { + expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`) + } + } + + const perDimMean = rec.perDimMean + if (perDimMean === null || typeof perDimMean !== 'object') { + throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`) + } + for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) { + expectFiniteNumber(mean, `${path}.perDimMean.${dim}`) + } + + expectFiniteNumber(rec.composite, `${path}.composite`) + + if (rec.failedJudges !== undefined) { + if (!Array.isArray(rec.failedJudges)) { + throw new RunRecordValidationError( + 'failedJudges must be an array of strings', + `${path}.failedJudges`, + ) + } + for (let i = 0; i < rec.failedJudges.length; i++) { + const id = 
rec.failedJudges[i] + if (typeof id !== 'string' || id.length === 0) { + throw new RunRecordValidationError( + 'failedJudges entry must be a non-empty string', + `${path}.failedJudges[${i}]`, + ) + } + } + } + + if (rec.notes !== undefined && typeof rec.notes !== 'string') { + throw new RunRecordValidationError('notes must be a string', `${path}.notes`) + } +} + /** * Heuristic snapshot check. Accepts: * - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`) diff --git a/tests/consumer-contract.test.ts b/tests/consumer-contract.test.ts index abed82a..3e3e7c6 100644 --- a/tests/consumer-contract.test.ts +++ b/tests/consumer-contract.test.ts @@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest' import * as builderEval from '../src/builder-eval/index' import * as agentEval from '../src/index' import * as rl from '../src/rl/index' +import type { JudgeScoresRecord, RunOutcome } from '../src/index' /** * Public-surface contract for `@tangle-network/agent-eval`. @@ -109,4 +110,23 @@ describe('public-surface contract for consumers', () => { expect(proto instanceof Error, `${name} must extend Error`).toBe(true) } }) + + it('exposes JudgeScoresRecord as the canonical ensemble shape on RunOutcome', () => { + // Type-level pin: a `JudgeScoresRecord` is assignable to + // `RunOutcome.judgeScores`. If the interface gets renamed or the + // field gets dropped from `RunOutcome`, this stops compiling — the + // contract that protects forge-chat / multi-judge consumers. 
+ const judgeScores: JudgeScoresRecord = { + perJudge: { 'kimi-k2.6': { helpfulness: 0.8, clarity: 0.7 } }, + perDimMean: { helpfulness: 0.8, clarity: 0.7 }, + composite: 0.75, + } + const outcome: RunOutcome = { + holdoutScore: 0.75, + raw: {}, + judgeScores, + } + expect(outcome.judgeScores).toBe(judgeScores) + expect(outcome.judgeScores?.composite).toBe(0.75) + }) }) diff --git a/tests/eval-campaign.test.ts b/tests/eval-campaign.test.ts index e0ccd15..96b3a66 100644 --- a/tests/eval-campaign.test.ts +++ b/tests/eval-campaign.test.ts @@ -286,6 +286,164 @@ describe('runEvalCampaign — failure handling', () => { }) }) +describe('runEvalCampaign — judgeScores propagation', () => { + // Forge-chat / multi-judge consumers produce per-judge per-dim scores + // alongside the composite. The campaign must thread them onto + // `RunRecord.outcome.judgeScores` without coercion, and the record + // must survive a JSON round-trip (records.jsonl is what consumers + // ultimately persist). + + function judgeScoresRunner( + judgeScores: import('../src/run-record').JudgeScoresRecord | undefined, + ): CampaignRunner { + return async (ctx) => { + const base = await defaultRunner(ctx) + if (judgeScores === undefined) return base + return { ...base, judgeScores } + } + } + + it('full shape: lands all per-judge/per-dim/composite fields on the record + JSON round-trip', async () => { + const judgeScores = { + perJudge: { + 'kimi-k2.6@2026-04-01': { helpfulness: 0.8, clarity: 0.75, on_topic: 0.9 }, + 'glm-5.1@2026-04-02': { helpfulness: 0.85, clarity: 0.7, on_topic: 0.95 }, + }, + perDimMean: { helpfulness: 0.825, clarity: 0.725, on_topic: 0.925 }, + composite: 0.825, + } + const result = await runEvalCampaign( + baseOpts({ + variants: [{ id: 'v1', payload: { prompt: 'p' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + runner: judgeScoresRunner(judgeScores), + }), + ) + expect(result.runs).toHaveLength(1) + const rec = result.runs[0] + 
expect(rec?.outcome.judgeScores).toEqual(judgeScores) + // JSON round-trip — this is the shape that lands in records.jsonl. + const roundTripped = JSON.parse(JSON.stringify(rec)) + expect(roundTripped.outcome.judgeScores).toEqual(judgeScores) + }) + + it('partial shape (failedJudges populated): one judge errored, recorded explicitly', async () => { + // Fail-loud: a panel with one dead judge is recorded as such — not + // inferred from a missing key in perJudge. The composite + perDimMean + // are computed over the surviving judges only. + const judgeScores = { + perJudge: { + 'kimi-k2.6@2026-04-01': { helpfulness: 0.8, clarity: 0.75 }, + }, + perDimMean: { helpfulness: 0.8, clarity: 0.75 }, + composite: 0.775, + failedJudges: ['glm-5.1@2026-04-02'], + } + const result = await runEvalCampaign( + baseOpts({ + variants: [{ id: 'v1', payload: { prompt: 'p' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + runner: judgeScoresRunner(judgeScores), + }), + ) + const rec = result.runs[0] + expect(rec?.outcome.judgeScores?.failedJudges).toEqual(['glm-5.1@2026-04-02']) + expect(Object.keys(rec?.outcome.judgeScores?.perJudge ?? 
{})).toEqual(['kimi-k2.6@2026-04-01']) + }) + + it('missing shape (no ensemble): legacy / single-judge runs leave outcome.judgeScores undefined', async () => { + const result = await runEvalCampaign( + baseOpts({ + variants: [{ id: 'v1', payload: { prompt: 'p' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + runner: judgeScoresRunner(undefined), + }), + ) + const rec = result.runs[0] + expect(rec?.outcome.judgeScores).toBeUndefined() + }) + + it('with notes: judge prose survives the campaign-to-record conversion', async () => { + const judgeScores = { + perJudge: { + 'kimi-k2.6@2026-04-01': { helpfulness: 0.6, clarity: 0.55 }, + 'glm-5.1@2026-04-02': { helpfulness: 0.65, clarity: 0.5 }, + }, + perDimMean: { helpfulness: 0.625, clarity: 0.525 }, + composite: 0.575, + notes: 'panel flagged tone drift mid-response', + } + const result = await runEvalCampaign( + baseOpts({ + variants: [{ id: 'v1', payload: { prompt: 'p' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + runner: judgeScoresRunner(judgeScores), + }), + ) + const rec = result.runs[0] + expect(rec?.outcome.judgeScores?.notes).toBe('panel flagged tone drift mid-response') + }) + + it('fail-loud: a judge throwing during scoring lands in failedJudges, not swallowed', async () => { + // Consumer pattern: the runner runs the panel, catches per-judge + // throws, and records the dead judge in `failedJudges`. The + // composite is computed over survivors. The substrate's job is to + // preserve that signal — never to silently zero it. 
+ const ensembleRunner: CampaignRunner = async (ctx) => { + const base = await defaultRunner(ctx) + const judges = ['kimi-k2.6@2026-04-01', 'glm-5.1@2026-04-02'] as const + const perJudge: Record<string, Record<string, number>> = {} + const failed: string[] = [] + for (const judgeId of judges) { + try { + if (judgeId === 'glm-5.1@2026-04-02') throw new Error('upstream 503') + perJudge[judgeId] = { helpfulness: 0.7, clarity: 0.65 } + } catch { + failed.push(judgeId) + } + } + // perDimMean over surviving judges only. No silent zero. + const dims = ['helpfulness', 'clarity'] as const + const perDimMean: Record<string, number> = {} + for (const d of dims) { + const vals = Object.values(perJudge) + .map((d2) => d2[d]) + .filter((v): v is number => typeof v === 'number') + perDimMean[d] = vals.reduce((a, b) => a + b, 0) / vals.length + } + const composite = + Object.values(perDimMean).reduce((a, b) => a + b, 0) / Object.values(perDimMean).length + return { + ...base, + judgeScores: { + perJudge, + perDimMean, + composite, + failedJudges: failed, + }, + } + } + const result = await runEvalCampaign( + baseOpts({ + variants: [{ id: 'v1', payload: { prompt: 'p' } }], + scenarios: [{ scenarioId: 's1' }], + seeds: [0], + runner: ensembleRunner, + }), + ) + const rec = result.runs[0] + expect(rec?.outcome.judgeScores?.failedJudges).toEqual(['glm-5.1@2026-04-02']) + expect(rec?.outcome.judgeScores?.perJudge['glm-5.1@2026-04-02']).toBeUndefined() + expect(rec?.outcome.judgeScores?.perJudge['kimi-k2.6@2026-04-01']).toBeDefined() + // Composite is the mean over survivor dim-means — not silently zero. 
+ expect(rec?.outcome.judgeScores?.composite).toBeGreaterThan(0) + }) +}) + describe('runEvalCampaign — concurrency', () => { it('runs cells in parallel up to the configured worker count', async () => { const inFlight = { count: 0, max: 0 } diff --git a/tests/run-record.test.ts b/tests/run-record.test.ts index c25d0d6..157fa9d 100644 --- a/tests/run-record.test.ts +++ b/tests/run-record.test.ts @@ -170,3 +170,87 @@ describe('validateRunRecord — mandatory field enforcement', () => { expect(isRunRecord({ runId: 'x' })).toBe(false) }) }) + +describe('validateRunRecord — judgeScores', () => { + const fullJudgeScores = { + perJudge: { + 'kimi-k2.6@2026-04-01': { helpfulness: 0.8, clarity: 0.75 }, + 'glm-5.1@2026-04-02': { helpfulness: 0.85, clarity: 0.7 }, + }, + perDimMean: { helpfulness: 0.825, clarity: 0.725 }, + composite: 0.775, + } + + it('accepts a fully-populated judgeScores block', () => { + const r = makeRecord({ + outcome: { holdoutScore: 0.775, raw: {}, judgeScores: fullJudgeScores }, + }) + expect(() => validateRunRecord(r)).not.toThrow() + }) + + it('round-trips judgeScores through JSON', () => { + const r = makeRecord({ + outcome: { holdoutScore: 0.775, raw: {}, judgeScores: fullJudgeScores }, + }) + const out = roundTripRunRecord(r) + expect(out.outcome.judgeScores).toEqual(fullJudgeScores) + }) + + it('accepts judgeScores with failedJudges and notes', () => { + const r = makeRecord({ + outcome: { + holdoutScore: 0.5, + raw: {}, + judgeScores: { + ...fullJudgeScores, + failedJudges: ['dead-judge@2026-01-01'], + notes: 'panel split on clarity', + }, + }, + }) + expect(() => validateRunRecord(r)).not.toThrow() + }) + + it('throws on non-finite per-judge score (NaN as silent zero is the bug class we ban)', () => { + const r = makeRecord({ + outcome: { + holdoutScore: 0.5, + raw: {}, + judgeScores: { + perJudge: { 'k@2026-01-01': { helpfulness: Number.NaN } }, + perDimMean: { helpfulness: 0.5 }, + composite: 0.5, + }, + }, + }) + expect(() => 
validateRunRecord(r)).toThrow(/finite/) + }) + + it('throws when composite is missing', () => { + const r = makeRecord({ + outcome: { + holdoutScore: 0.5, + raw: {}, + judgeScores: { + perJudge: { 'k@2026-01-01': { helpfulness: 0.5 } }, + perDimMean: { helpfulness: 0.5 }, + } as unknown as import('../src/run-record').JudgeScoresRecord, + }, + }) + expect(() => validateRunRecord(r)).toThrow(/composite/) + }) + + it('throws when failedJudges contains a non-string', () => { + const r = makeRecord({ + outcome: { + holdoutScore: 0.5, + raw: {}, + judgeScores: { + ...fullJudgeScores, + failedJudges: [42 as unknown as string], + }, + }, + }) + expect(() => validateRunRecord(r)).toThrow(/failedJudges/) + }) +})