From 4ca0db5e35cadbcd5391c64e38d99e8e810e0899 Mon Sep 17 00:00:00 2001
From: Drew Stone <drewstone329@gmail.com>
Date: Wed, 20 May 2026 08:43:40 -0600
Subject: [PATCH] feat(0.31.0): JudgeScoresRecord on RunRecord.outcome

Ensemble-judge consumers were dropping per-judge per-dim scores on the
floor because RunOutcome only had a slot for the composite. Adds a
typed `judgeScores?: JudgeScoresRecord` field, threaded through
runEvalCampaign and pinned in the consumer-contract test. Validator
rejects NaN scores and non-string failedJudges entries; fail-loud
test covers a panel where one judge throws.

Bumps TS + Python clients to 0.31.0 in lockstep.
---
 CHANGELOG.md                                  |  57 +++++++
 clients/python/pyproject.toml                 |   2 +-
 clients/python/src/agent_eval_rpc/__init__.py |   2 +-
 package.json                                  |   2 +-
 src/eval-campaign.ts                          |   8 +
 src/index.ts                                  |   1 +
 src/run-record.ts                             | 102 +++++++++++
 tests/consumer-contract.test.ts               |  20 +++
 tests/eval-campaign.test.ts                   | 158 ++++++++++++++++++
 tests/run-record.test.ts                      |  84 ++++++++++
 10 files changed, 433 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa552f3..0ac87aa 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,62 @@
 # Changelog
 
+## 0.31.0 — 2026-05-20
+
+### `JudgeScoresRecord` on `RunRecord.outcome` — substrate-blessed ensemble shape
+
+Multi-judge consumers (forge-chat in agent-builder, and four sibling
+product agents on the same trajectory) compute per-judge per-dimension
+scores per cell, then collapse to a single composite for the gate. The
+substrate's `RunOutcome` only had a slot for the composite plus a free
+`raw: Record<string, number>` bag. Consumers were either dropping the
+breakdown on the floor or smuggling it through stringly-typed `raw`
+keys like `judge_kimi_helpfulness` — neither survives a corpus-IRR run
+(0.27.2's `corpusInterRaterAgreement` expects structured per-judge
+per-dim records, not parsed strings).
+
+This release ships the typed slot so every product agent speaks the
+same shape, and the inter-rater primitives consume it without a
+per-consumer adapter.
+
+### Added
+
+- **`JudgeScoresRecord`** (`src/run-record.ts`) — `perJudge[judgeId][dim]`
+  is the canonical store; `perDimMean` and `composite` are precomputed
+  projections so reporters and IRR primitives don't repeat the
+  aggregation; `failedJudges?: string[]` records dead-judge ids
+  explicitly (no inferring partial-failure from missing keys);
+  `notes?: string` carries panel prose.
+- **`RunOutcome.judgeScores?: JudgeScoresRecord`** — optional. Single-
+  judge or scalar-only runs leave it unset; ensemble runs populate it.
+- **`CampaignRunOutcome.judgeScores?: JudgeScoresRecord`** — runners
+  return it on the per-cell outcome; `runEvalCampaign` threads it onto
+  the resulting `RunRecord.outcome.judgeScores` without coercion.
+
+### Validator extended
+
+`validateRunRecord` validates `outcome.judgeScores` when present.
+Every `perJudge[judge][dim]` and every `perDimMean[dim]` and the
+`composite` must be finite numbers — the NaN-as-silent-zero bug class
+banned by `CLAUDE.md` cannot pass the boundary. `failedJudges` must be
+an array of non-empty strings; `notes` must be a string. Round-trip
+tested in `tests/run-record.test.ts`.
+
+### Fail-loud contract
+
+A judge that throws is recorded by the runner in `failedJudges` by id, not as a
+silent zero in `perJudge`. The runner computes the composite over surviving judges
+only; the substrate preserves that partial-failure signal through to the gate.
+`tests/eval-campaign.test.ts` covers the four shapes (full, partial,
+missing, with notes) plus an explicit fail-loud case where one judge
+throws and the run record carries `failedJudges: ['glm-5.1@...']`.
+
+### Consumer contract
+
+`tests/consumer-contract.test.ts` pins `JudgeScoresRecord` as a
+type-level export at the root entry. The 0.30.0 surface is preserved —
+the new field is additive on `RunOutcome` and the new type is a new
+export, so existing consumers stay green.
+
 ## 0.29.0 — 2026-05-19
 
 ### Analyst kinds + cross-run findings context
diff --git a/clients/python/pyproject.toml b/clients/python/pyproject.toml
index ef79c26..09df840 100644
--- a/clients/python/pyproject.toml
+++ b/clients/python/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "agent-eval-rpc"
-version = "0.30.0"
+version = "0.31.0"
 description = "Python RPC client for @tangle-network/agent-eval — judge content against rubrics over HTTP or stdio RPC. Eval logic runs in the Node runtime; this package is a thin wire client."
 readme = "README.md"
 requires-python = ">=3.10"
diff --git a/clients/python/src/agent_eval_rpc/__init__.py b/clients/python/src/agent_eval_rpc/__init__.py
index 5498239..c50579b 100644
--- a/clients/python/src/agent_eval_rpc/__init__.py
+++ b/clients/python/src/agent_eval_rpc/__init__.py
@@ -48,7 +48,7 @@
 try:
     __version__ = version("agent-eval-rpc")
 except PackageNotFoundError:
-    __version__ = "0.30.0"
+    __version__ = "0.31.0"
 
 __all__ = [
     "Client",
diff --git a/package.json b/package.json
index fa0e15e..278cf3a 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@tangle-network/agent-eval",
-  "version": "0.30.0",
+  "version": "0.31.0",
   "description": "Substrate for self-improving agents: traces, verifiable rewards, preferences, GEPA / reflective mutation, auto-research, replay, sequential anytime-valid stats, and release gates.",
   "homepage": "https://github.com/tangle-network/agent-eval#readme",
   "repository": {
diff --git a/src/eval-campaign.ts b/src/eval-campaign.ts
index 8c41a0c..8c6ac99 100644
--- a/src/eval-campaign.ts
+++ b/src/eval-campaign.ts
@@ -41,6 +41,7 @@
 import { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client'
 import { canonicalize, hashJson } from './pre-registration'
 import type {
+  JudgeScoresRecord,
   RunJudgeMetadata,
   RunOutcome,
   RunRecord,
@@ -120,6 +121,12 @@ export interface CampaignRunOutcome {
   failureMode?: string
   /** Optional judge metadata when a judge was used. */
   judgeMetadata?: RunJudgeMetadata
+  /**
+   * Optional per-judge / per-dim breakdown for ensemble-judged runs.
+   * Propagated to `outcome.judgeScores` on the resulting `RunRecord`.
+   * Single-judge or scalar-only runs leave this unset.
+   */
+  judgeScores?: JudgeScoresRecord
 }
 
 export type CampaignRunner<V> = (ctx: CampaignRunContext<V>) => Promise<CampaignRunOutcome>
@@ -457,6 +464,7 @@ export async function runEvalCampaign<V>(
     }
     if (splitTag === 'holdout') recordOutcome.holdoutScore = outcome.score
     else recordOutcome.searchScore = outcome.score
+    if (outcome.judgeScores !== undefined) recordOutcome.judgeScores = outcome.judgeScores
 
     const record: RunRecord = {
       runId,
diff --git a/src/index.ts b/src/index.ts
index f9b4ad3..c3f519b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1087,6 +1087,7 @@ export { CallbackResearcher, NoopResearcher } from './researcher'
 // tournaments, adversarial, compute curves, auto-research — live on the
 // dedicated subpath: @tangle-network/agent-eval/rl
 export type {
+  JudgeScoresRecord,
   RunJudgeMetadata,
   RunOutcome,
   RunRecord,
diff --git a/src/run-record.ts b/src/run-record.ts
index 48fe4e2..52280da 100644
--- a/src/run-record.ts
+++ b/src/run-record.ts
@@ -44,6 +44,42 @@ export interface RunJudgeMetadata {
   fallback: boolean
 }
 
+/**
+ * Per-judge / per-dimension breakdown for runs scored by an ensemble of
+ * judges over a multi-dimensional rubric.
+ *
+ * The collapsed `outcome.searchScore` / `holdoutScore` carries the
+ * composite the gate uses. The full breakdown belongs here so consumers
+ * can answer "which judge disagreed?", "which dimension dragged the
+ * composite down?", and "did half the panel fail?" without re-running.
+ *
+ * `perJudge[judgeId][dim]` is the canonical source; `perDimMean` and
+ * `composite` are convenience projections — derivable but precomputed so
+ * downstream IRR primitives (`interRaterReliability`,
+ * `corpusInterRaterAgreement`) and reporters don't pay the same
+ * aggregation twice.
+ *
+ * Fail-loud discipline: judges that errored out land in `failedJudges`
+ * by id. A missing key in `perJudge` is ambiguous (silent zero vs not
+ * run); the explicit list records a partial failure as such.
+ */
+export interface JudgeScoresRecord {
+  /** Per-judge per-dimension scores. `{ "kimi-k2.6": { helpfulness: 0.8, clarity: 0.7 }, ... }`. */
+  perJudge: Record<string, Record<string, number>>
+  /** Per-dim mean across judges. Convenience — derivable from `perJudge`. */
+  perDimMean: Record<string, number>
+  /** Composite mean across all dims and judges. Mirrors the score
+   *  the gate sees on `outcome.searchScore` / `holdoutScore`. */
+  composite: number
+  /** Judges that errored or returned an unparseable verdict. Recorded
+   *  by id (e.g. `['glm-5.1']`) so a partial-failure case is explicit,
+   *  not inferred from missing keys in `perJudge`. */
+  failedJudges?: string[]
+  /** Free-form notes the judges emitted (joined across judges or
+   *  first-judge only — consumer's choice). */
+  notes?: string
+}
+
 export interface RunOutcome {
   /** Score on the search/optimization split. Optional because a
    *  holdout-only evaluation only fills `holdoutScore`. */
@@ -55,6 +91,12 @@ export interface RunOutcome {
    *  pass/fail counters, latency stats, etc. Numeric only — keeps
    *  reporters honest. */
   raw: Record<string, number>
+  /** Per-judge / per-dim breakdown. Consumers writing ensemble
+   *  judgements populate this; substrate primitives like
+   *  `interRaterReliability` and `corpusInterRaterAgreement` accept
+   *  these records as input. Optional — single-judge or scalar-only
+   *  runs leave it unset. */
+  judgeScores?: JudgeScoresRecord
 }
 
 /**
@@ -242,6 +284,11 @@ export function validateRunRecord(input: unknown): RunRecord {
     expectFiniteNumber(v, `outcome.raw.${k}`)
   }
 
+  // Per-judge / per-dim breakdown, optional.
+  if (outRec.judgeScores !== undefined) {
+    validateJudgeScores(outRec.judgeScores, 'outcome.judgeScores')
+  }
+
   // Failure mode optional.
   if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode')
 
@@ -298,6 +345,61 @@ function expectFiniteNumber(value: unknown, path: string): void {
   }
 }
 
+/** Validates a `judgeScores` payload; throws RunRecordValidationError on first violation. */
+function validateJudgeScores(value: unknown, path: string): void {
+  if (value === null || typeof value !== 'object' || Array.isArray(value)) {
+    throw new RunRecordValidationError('judgeScores must be an object', path)
+  }
+  const rec = value as Record<string, unknown>
+  const { perJudge, perDimMean } = rec
+  // `typeof []` is 'object', so every record check in this function also rejects arrays.
+  if (perJudge === null || typeof perJudge !== 'object' || Array.isArray(perJudge)) {
+    throw new RunRecordValidationError('perJudge must be an object', `${path}.perJudge`)
+  }
+  for (const [judgeId, dims] of Object.entries(perJudge as Record<string, unknown>)) {
+    if (dims === null || typeof dims !== 'object' || Array.isArray(dims)) {
+      throw new RunRecordValidationError(
+        'per-judge entry must be an object of dimension scores',
+        `${path}.perJudge.${judgeId}`,
+      )
+    }
+    for (const [dim, score] of Object.entries(dims as Record<string, unknown>)) {
+      expectFiniteNumber(score, `${path}.perJudge.${judgeId}.${dim}`)
+    }
+  }
+  // perDimMean: dim -> finite mean, held to the same record rule as perJudge.
+  if (perDimMean === null || typeof perDimMean !== 'object' || Array.isArray(perDimMean)) {
+    throw new RunRecordValidationError('perDimMean must be an object', `${path}.perDimMean`)
+  }
+  for (const [dim, mean] of Object.entries(perDimMean as Record<string, unknown>)) {
+    expectFiniteNumber(mean, `${path}.perDimMean.${dim}`)
+  }
+  // Required field: there is no `!== undefined` guard, so an absent composite is checked too.
+  expectFiniteNumber(rec.composite, `${path}.composite`)
+
+  if (rec.failedJudges !== undefined) {
+    if (!Array.isArray(rec.failedJudges)) {
+      throw new RunRecordValidationError(
+        'failedJudges must be an array of strings',
+        `${path}.failedJudges`,
+      )
+    }
+    for (let i = 0; i < rec.failedJudges.length; i++) {
+      const id = rec.failedJudges[i]
+      if (typeof id !== 'string' || id.length === 0) {
+        throw new RunRecordValidationError(
+          'failedJudges entry must be a non-empty string',
+          `${path}.failedJudges[${i}]`,
+        )
+      }
+    }
+  }
+
+  if (rec.notes !== undefined && typeof rec.notes !== 'string') {
+    throw new RunRecordValidationError('notes must be a string', `${path}.notes`)
+  }
+}
+
 /**
  * Heuristic snapshot check. Accepts:
  *   - `name@YYYY-MM-DD` (Anthropic style: `claude-sonnet-4-6@2025-04-15`)
diff --git a/tests/consumer-contract.test.ts b/tests/consumer-contract.test.ts
index abed82a..3e3e7c6 100644
--- a/tests/consumer-contract.test.ts
+++ b/tests/consumer-contract.test.ts
@@ -2,6 +2,7 @@ import { describe, expect, it } from 'vitest'
 import * as builderEval from '../src/builder-eval/index'
 import * as agentEval from '../src/index'
 import * as rl from '../src/rl/index'
+import type { JudgeScoresRecord, RunOutcome } from '../src/index'
 
 /**
  * Public-surface contract for `@tangle-network/agent-eval`.
@@ -109,4 +110,23 @@ describe('public-surface contract for consumers', () => {
       expect(proto instanceof Error, `${name} must extend Error`).toBe(true)
     }
   })
+
+  it('exposes JudgeScoresRecord as the canonical ensemble shape on RunOutcome', () => {
+    // Type-level pin: a `JudgeScoresRecord` is assignable to
+    // `RunOutcome.judgeScores`. If the interface gets renamed or the
+    // field gets dropped from `RunOutcome`, this stops compiling — the
+    // contract that protects forge-chat / multi-judge consumers.
+    const judgeScores: JudgeScoresRecord = {
+      perJudge: { 'kimi-k2.6': { helpfulness: 0.8, clarity: 0.7 } },
+      perDimMean: { helpfulness: 0.8, clarity: 0.7 },
+      composite: 0.75,
+    }
+    const outcome: RunOutcome = {
+      holdoutScore: 0.75,
+      raw: {},
+      judgeScores,
+    }
+    expect(outcome.judgeScores).toBe(judgeScores)
+    expect(outcome.judgeScores?.composite).toBe(0.75)
+  })
 })
diff --git a/tests/eval-campaign.test.ts b/tests/eval-campaign.test.ts
index e0ccd15..96b3a66 100644
--- a/tests/eval-campaign.test.ts
+++ b/tests/eval-campaign.test.ts
@@ -286,6 +286,164 @@ describe('runEvalCampaign — failure handling', () => {
   })
 })
 
+describe('runEvalCampaign — judgeScores propagation', () => {
+  // Forge-chat / multi-judge consumers produce per-judge per-dim scores
+  // alongside the composite. The campaign must thread them onto
+  // `RunRecord.outcome.judgeScores` without coercion, and the record
+  // must survive a JSON round-trip (records.jsonl is what consumers
+  // ultimately persist).
+
+  function judgeScoresRunner(
+    judgeScores: import('../src/run-record').JudgeScoresRecord | undefined,
+  ): CampaignRunner<VariantPayload> {
+    return async (ctx) => {
+      const base = await defaultRunner(ctx)
+      if (judgeScores === undefined) return base
+      return { ...base, judgeScores }
+    }
+  }
+
+  it('full shape: lands all per-judge/per-dim/composite fields on the record + JSON round-trip', async () => {
+    const judgeScores = {
+      perJudge: {
+        'kimi-k2.6@2026-04-01': { helpfulness: 0.8, clarity: 0.75, on_topic: 0.9 },
+        'glm-5.1@2026-04-02': { helpfulness: 0.85, clarity: 0.7, on_topic: 0.95 },
+      },
+      perDimMean: { helpfulness: 0.825, clarity: 0.725, on_topic: 0.925 },
+      composite: 0.825,
+    }
+    const result = await runEvalCampaign(
+      baseOpts({
+        variants: [{ id: 'v1', payload: { prompt: 'p' } }],
+        scenarios: [{ scenarioId: 's1' }],
+        seeds: [0],
+        runner: judgeScoresRunner(judgeScores),
+      }),
+    )
+    expect(result.runs).toHaveLength(1)
+    const rec = result.runs[0]
+    expect(rec?.outcome.judgeScores).toEqual(judgeScores)
+    // JSON round-trip — this is the shape that lands in records.jsonl.
+    const roundTripped = JSON.parse(JSON.stringify(rec))
+    expect(roundTripped.outcome.judgeScores).toEqual(judgeScores)
+  })
+
+  it('partial shape (failedJudges populated): one judge errored, recorded explicitly', async () => {
+    // Fail-loud: a panel with one dead judge is recorded as such — not
+    // inferred from a missing key in perJudge. The composite + perDimMean
+    // are computed over the surviving judges only.
+    const judgeScores = {
+      perJudge: {
+        'kimi-k2.6@2026-04-01': { helpfulness: 0.8, clarity: 0.75 },
+      },
+      perDimMean: { helpfulness: 0.8, clarity: 0.75 },
+      composite: 0.775,
+      failedJudges: ['glm-5.1@2026-04-02'],
+    }
+    const result = await runEvalCampaign(
+      baseOpts({
+        variants: [{ id: 'v1', payload: { prompt: 'p' } }],
+        scenarios: [{ scenarioId: 's1' }],
+        seeds: [0],
+        runner: judgeScoresRunner(judgeScores),
+      }),
+    )
+    const rec = result.runs[0]
+    expect(rec?.outcome.judgeScores?.failedJudges).toEqual(['glm-5.1@2026-04-02'])
+    expect(Object.keys(rec?.outcome.judgeScores?.perJudge ?? {})).toEqual(['kimi-k2.6@2026-04-01'])
+  })
+
+  it('missing shape (no ensemble): legacy / single-judge runs leave outcome.judgeScores undefined', async () => {
+    const result = await runEvalCampaign(
+      baseOpts({
+        variants: [{ id: 'v1', payload: { prompt: 'p' } }],
+        scenarios: [{ scenarioId: 's1' }],
+        seeds: [0],
+        runner: judgeScoresRunner(undefined),
+      }),
+    )
+    const rec = result.runs[0]
+    expect(rec?.outcome.judgeScores).toBeUndefined()
+  })
+
+  it('with notes: judge prose survives the campaign-to-record conversion', async () => {
+    const judgeScores = {
+      perJudge: {
+        'kimi-k2.6@2026-04-01': { helpfulness: 0.6, clarity: 0.55 },
+        'glm-5.1@2026-04-02': { helpfulness: 0.65, clarity: 0.5 },
+      },
+      perDimMean: { helpfulness: 0.625, clarity: 0.525 },
+      composite: 0.575,
+      notes: 'panel flagged tone drift mid-response',
+    }
+    const result = await runEvalCampaign(
+      baseOpts({
+        variants: [{ id: 'v1', payload: { prompt: 'p' } }],
+        scenarios: [{ scenarioId: 's1' }],
+        seeds: [0],
+        runner: judgeScoresRunner(judgeScores),
+      }),
+    )
+    const rec = result.runs[0]
+    expect(rec?.outcome.judgeScores?.notes).toBe('panel flagged tone drift mid-response')
+  })
+
+  it('fail-loud: a judge throwing during scoring lands in failedJudges, not swallowed', async () => {
+    // Consumer pattern: the runner runs the panel, catches per-judge
+    // throws, and records the dead judge in `failedJudges`. The
+    // composite is computed over survivors. The substrate's job is to
+    // preserve that signal — never to silently zero it.
+    const ensembleRunner: CampaignRunner<VariantPayload> = async (ctx) => {
+      const base = await defaultRunner(ctx)
+      const judges = ['kimi-k2.6@2026-04-01', 'glm-5.1@2026-04-02'] as const
+      const perJudge: Record<string, Record<string, number>> = {}
+      const failed: string[] = []
+      for (const judgeId of judges) {
+        try {
+          if (judgeId === 'glm-5.1@2026-04-02') throw new Error('upstream 503')
+          perJudge[judgeId] = { helpfulness: 0.7, clarity: 0.65 }
+        } catch {
+          failed.push(judgeId)
+        }
+      }
+      // perDimMean over surviving judges only. No silent zero.
+      const dims = ['helpfulness', 'clarity'] as const
+      const perDimMean: Record<string, number> = {}
+      for (const d of dims) {
+        const vals = Object.values(perJudge)
+          .map((d2) => d2[d])
+          .filter((v): v is number => typeof v === 'number')
+        perDimMean[d] = vals.reduce((a, b) => a + b, 0) / vals.length
+      }
+      const composite =
+        Object.values(perDimMean).reduce((a, b) => a + b, 0) / Object.values(perDimMean).length
+      return {
+        ...base,
+        judgeScores: {
+          perJudge,
+          perDimMean,
+          composite,
+          failedJudges: failed,
+        },
+      }
+    }
+    const result = await runEvalCampaign(
+      baseOpts({
+        variants: [{ id: 'v1', payload: { prompt: 'p' } }],
+        scenarios: [{ scenarioId: 's1' }],
+        seeds: [0],
+        runner: ensembleRunner,
+      }),
+    )
+    const rec = result.runs[0]
+    expect(rec?.outcome.judgeScores?.failedJudges).toEqual(['glm-5.1@2026-04-02'])
+    expect(rec?.outcome.judgeScores?.perJudge['glm-5.1@2026-04-02']).toBeUndefined()
+    expect(rec?.outcome.judgeScores?.perJudge['kimi-k2.6@2026-04-01']).toBeDefined()
+    // Survivor-only mean is (0.7 + 0.65) / 2; a zero-filled dead judge would give 0.3375.
+    expect(rec?.outcome.judgeScores?.composite).toBeCloseTo(0.675)
+  })
+})
+
 describe('runEvalCampaign — concurrency', () => {
   it('runs cells in parallel up to the configured worker count', async () => {
     const inFlight = { count: 0, max: 0 }
diff --git a/tests/run-record.test.ts b/tests/run-record.test.ts
index c25d0d6..157fa9d 100644
--- a/tests/run-record.test.ts
+++ b/tests/run-record.test.ts
@@ -170,3 +170,87 @@ describe('validateRunRecord — mandatory field enforcement', () => {
     expect(isRunRecord({ runId: 'x' })).toBe(false)
   })
 })
+
+describe('validateRunRecord — judgeScores', () => {
+  const fullJudgeScores = {
+    perJudge: {
+      'kimi-k2.6@2026-04-01': { helpfulness: 0.8, clarity: 0.75 },
+      'glm-5.1@2026-04-02': { helpfulness: 0.85, clarity: 0.7 },
+    },
+    perDimMean: { helpfulness: 0.825, clarity: 0.725 },
+    composite: 0.775,
+  }
+
+  it('accepts a fully-populated judgeScores block', () => {
+    const r = makeRecord({
+      outcome: { holdoutScore: 0.775, raw: {}, judgeScores: fullJudgeScores },
+    })
+    expect(() => validateRunRecord(r)).not.toThrow()
+  })
+
+  it('round-trips judgeScores through JSON', () => {
+    const r = makeRecord({
+      outcome: { holdoutScore: 0.775, raw: {}, judgeScores: fullJudgeScores },
+    })
+    const out = roundTripRunRecord(r)
+    expect(out.outcome.judgeScores).toEqual(fullJudgeScores)
+  })
+
+  it('accepts judgeScores with failedJudges and notes', () => {
+    const r = makeRecord({
+      outcome: {
+        holdoutScore: 0.5,
+        raw: {},
+        judgeScores: {
+          ...fullJudgeScores,
+          failedJudges: ['dead-judge@2026-01-01'],
+          notes: 'panel split on clarity',
+        },
+      },
+    })
+    expect(() => validateRunRecord(r)).not.toThrow()
+  })
+
+  it('throws on non-finite per-judge score (NaN as silent zero is the bug class we ban)', () => {
+    const r = makeRecord({
+      outcome: {
+        holdoutScore: 0.5,
+        raw: {},
+        judgeScores: {
+          perJudge: { 'k@2026-01-01': { helpfulness: Number.NaN } },
+          perDimMean: { helpfulness: 0.5 },
+          composite: 0.5,
+        },
+      },
+    })
+    expect(() => validateRunRecord(r)).toThrow(/finite/)
+  })
+
+  it('throws when composite is missing', () => {
+    const r = makeRecord({
+      outcome: {
+        holdoutScore: 0.5,
+        raw: {},
+        judgeScores: {
+          perJudge: { 'k@2026-01-01': { helpfulness: 0.5 } },
+          perDimMean: { helpfulness: 0.5 },
+        } as unknown as import('../src/run-record').JudgeScoresRecord,
+      },
+    })
+    expect(() => validateRunRecord(r)).toThrow(/composite/)
+  })
+
+  it('throws when failedJudges contains a non-string', () => {
+    const r = makeRecord({
+      outcome: {
+        holdoutScore: 0.5,
+        raw: {},
+        judgeScores: {
+          ...fullJudgeScores,
+          failedJudges: [42 as unknown as string],
+        },
+      },
+    })
+    expect(() => validateRunRecord(r)).toThrow(/failedJudges/)
+  })
+})