From c013899e0a90bdb18c2a20b9ee22bdd84d408431 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 10:42:59 -0400
Subject: [PATCH 01/58] {"schema":"decodex/commit/1","summary":"Port explicit
 qrel benchmark rescue slice","authority":"manual"}

---
 .../agent_memory_quantitative_benchmark_v1.md | 764 +++++++++++++-----
 makefiles/benchmark-core.toml                 |  97 +--
 makefiles/benchmark-memory-a.toml             |   7 +
 makefiles/benchmark-memory-b.toml             |   8 +
 scripts/materialize-explicit-qrels.py         | 290 +++++++
 scripts/real-world-docker.sh                  |  14 +
 scripts/real-world-explicit-qrels.sh          |  39 +
 scripts/real-world-live-explicit-qrels.sh     |  80 ++
 8 files changed, 1021 insertions(+), 278 deletions(-)
 create mode 100755 scripts/materialize-explicit-qrels.py
 create mode 100755 scripts/real-world-explicit-qrels.sh
 create mode 100755 scripts/real-world-live-explicit-qrels.sh

diff --git a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md
index 5974e4bf..265a71c1 100644
--- a/docs/spec/agent_memory_quantitative_benchmark_v1.md
+++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md
@@ -1,216 +1,608 @@
 ---
 type: Spec
 title: "Agent Memory Quantitative Benchmark v1"
-description: "Define the public quantitative competitor scoreboard row contract and claim boundaries."
+description: "Define quantitative same-corpus memory benchmark metrics, formulas, evidence classes, and claim boundaries."
 resource: docs/spec/agent_memory_quantitative_benchmark_v1.md
 status: active
 authority: normative
 owner: spec
-last_verified: 2026-06-27
+last_verified: 2026-06-23
 tags:
   - docs
   - spec
   - benchmarking
   - agent-memory
-source_refs:
-  - XY-1098
-  - XY-1120
+source_refs: []
 code_refs:
+  - Makefile.toml
+  - makefiles/benchmark-memory-a.toml
+  - makefiles/benchmark-memory-b.toml
+  - scripts/materialize-explicit-qrels.py
+  - scripts/real-world-explicit-qrels.sh
+  - scripts/real-world-docker.sh
+  - scripts/real-world-live-explicit-qrels.sh
+  - apps/elf-eval/src/app.rs
   - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
-  - apps/elf-eval/tests/real_world_job_benchmark.rs
+  - apps/elf-eval/fixtures/real_world_memory/p1_closeout/source_candidate_approval_recall.json
+  - apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json
 related:
+  - docs/spec/agent_memory_knowledge_system_v1.md
   - docs/spec/real_world_agent_memory_benchmark_v1.md
-  - docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md
+  - docs/evidence/benchmarking/2026-06-23-p4-quality-hardening-productization-readiness-report.md
+  - docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md
 drift_watch:
   - docs/spec/agent_memory_quantitative_benchmark_v1.md
+  - Makefile.toml
+  - makefiles/benchmark-memory-a.toml
+  - makefiles/benchmark-memory-b.toml
+  - scripts/materialize-explicit-qrels.py
+  - scripts/real-world-explicit-qrels.sh
+  - scripts/real-world-docker.sh
+  - scripts/real-world-live-explicit-qrels.sh
+  - docs/spec/agent_memory_knowledge_system_v1.md
   - docs/spec/real_world_agent_memory_benchmark_v1.md
   - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
-  - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json
+  - apps/elf-eval/src/app.rs
+  - docs/evidence/benchmarking/index.md
 ---
 # Agent Memory Quantitative Benchmark v1
 
-Purpose: Define the public quantitative competitor scoreboard row contract and claim
-boundaries.
+Purpose: Define the quantitative scoreboard that must sit beside ELF's existing
+typed real-world memory benchmark reports.
 Status: normative
-Read this when: You are implementing, validating, or publishing the public
-competitor-quality scoreboard for agent memory systems.
-Not this document: Real-world job fixture schema, Work Journal behavior, operational
-runbooks, or external adapter setup procedures.
-Defines: `elf.quality_scoreboard/v1` quantitative rows, metrics, comparability gates,
-typed non-pass behavior, and optimization-direction metadata.
-
-## Scope
-
-The quantitative scoreboard turns `real_world_job` reports and external adapter
-manifest records into public product rows. It is a row-level evidence contract, not a
-universal leaderboard. It is allowed to say which metrics are proven for a row, which
-competitor strengths remain visible, and which evidence is missing before a row can be
-treated as comparable.
-
-This contract applies to reports with schema `elf.quality_scoreboard/v1`.
-
-## Scoreboard Report
-
-A report MUST include:
-
-- `schema`: exactly `elf.quality_scoreboard/v1`.
-- `result_states`: the public row-state enum.
-- `evidence_classes`: the public evidence-class enum.
-- `metric_basis`: the ranking basis used for retrieval metrics.
-- `retrieval_k`: the `k` used for recall, precision, MRR, and nDCG.
-- typed non-pass counts and visible typed non-pass states for encoded jobs, external
-  adapter rows, and the aggregate report.
-- evidence-class counts.
-- bounded encoded-job and aggregate summary claims.
-- `unqualified_win_claim_allowed`, which MUST be `false` when any typed non-pass row
-  or non-comparable row exists.
-- `claim_boundary`, a human-readable statement that prevents typed blockers or
-  fixture-only evidence from becoming broad superiority claims.
-- `rows`: one row for ELF plus one row for each tracked external product represented
-  by the loaded adapter manifest.
-- `optimization_roadmap`: concrete next optimization directions derived from missing
-  row evidence, not from hidden assumptions.
-
-## Public Row States
+Read this when: You are adding or reviewing recall, freshness, update, delete,
+expiry, latency, cost, or competitor-comparison metrics for agent memory systems.
+Not this document: A finished benchmark report, a claim that current results beat
+every competitor, or a replacement for typed non-pass outcome reporting.
+Defines: `elf.agent_memory_quantitative_benchmark/v1`, required metric families,
+formulas, denominators, evidence classes, comparability rules, and minimum report
+rows.
+
+## Core Rule
+
+Quantitative memory comparison must measure the exact behavior users care about:
+finding the right evidence, using current facts, suppressing stale or deleted facts,
+showing citations, and staying within latency/cost/resource bounds.
+
+A report must not use broad product labels such as "best memory" or "beats OpenKB"
+unless the specific metric row is same-corpus, same-task, same-evidence-class,
+same-candidate-source, same-denominator, and leaderboard eligible. Typed non-pass
+states remain first-class results.
+
+## Evidence Classes
+
+Every quantitative row must declare one evidence class:
+
+| Evidence class | Meaning | Comparable for leaderboard |
+| --- | --- | --- |
+| `fixture_backed` | Checked-in fixture scored by ELF's runner. | Only against other fixture rows with the same corpus and task. |
+| `live_baseline` | Docker-contained baseline or smoke run that may not execute real-world answer jobs. | No, unless the report states the exact same scored task. |
+| `live_real_world` | Runtime executed the same real-world job prompt and produced scored answer artifacts. | Yes, when same-corpus and same-task. |
+| `public_proxy` | Local proxy contract based on public docs or expected artifact shape, not a product runtime. | No product leaderboard claim. |
+| `private_corpus` | Operator-owned private corpus with publishable bounded metrics only. | Yes, only for private-corpus rows with a matching policy. |
+| `provider_backed` | Provider credentials/models were used and cost/latency are measured. | Yes, only against rows with an equivalent provider boundary. |
+| `research_gate` | Research-only, blocked, or reference-only evidence. | No. |
+| `mixed_evidence` | Aggregate row blends multiple evidence classes. | No; split rows before leaderboard use. |
+
+## Result States
+
+Every row must declare one result state:
 
 | State | Meaning |
 | --- | --- |
-| `pass` | The row has a scored pass under its evidence class. A pass is comparable only when every comparability gate is also true. |
-| `wrong_result` | The adapter or job reached the behavioral check but selected the wrong answer, evidence, lifecycle state, or action. |
-| `incomplete` | Setup, build, parse, adapter wiring, or runtime execution did not reach the behavioral check. |
-| `blocked` | The row cannot be completed safely without missing credentials, private input, durable runtime integration, Docker evidence, or manual product setup. |
-| `not_tested` | No benchmark execution or comparable adapter output exists for the row. |
-| `not_encoded` | The suite, scoring dimension, or adapter path is not implemented in the runner. |
-| `not_comparable` | The row has useful evidence but lacks one or more required comparability gates, so it must not be used as a product-runtime comparison pass. |
-| `unsupported_claim` | The row or source report made a substantive claim not supported by corpus evidence, source refs, or report metadata. |
-
-`not_comparable` is a public row state only. It is not a `real_world_job` status and
-must not be written back into job or suite outcome fields.
+| `pass` | The metric is measured and meets the row threshold. |
+| `wrong_result` | The task ran but selected the wrong answer, wrong evidence, or wrong lifecycle state. |
+| `incomplete` | Some required artifacts exist, but the metric denominator is not fully satisfied. |
+| `blocked` | Required setup, credentials, corpus, exported artifact, or product readback is missing. |
+| `not_encoded` | The adapter or benchmark does not implement this metric. |
+| `not_comparable` | A metric exists but evidence class, corpus, task, or denominator differs. |
+| `unsupported_claim` | The output makes a claim that the evidence cannot support. |
+
+Metric states are separate from row result states. A metric state of `measured`
+means the denominator is non-zero and the row has no typed non-pass state; it does
+not mean the value passed a leaderboard threshold. If the row result is
+`blocked`, `wrong_result`, `incomplete`, `not_encoded`, or `unsupported_claim`,
+metric states for measured values must inherit that non-pass state.
+
+Metric states may also use `partial_coverage` when a formula is computable for
+some queries but the row lacks full ranked-candidate coverage or the minimum query
+count required for leaderboard use. `partial_coverage` values are useful regression
+evidence, not product-ranking proof.
+
+## Retrieval Metrics
+
+Retrieval metrics apply when a job has relevance labels and an ordered candidate
+list. The report must name `k` for every `@k` metric. A row must also declare whether
+ranked candidates came from a product/runtime trace or a fixture trace; fixture traces
+are formula smoke tests unless the compared product emitted the same artifact shape.
+Explicit qrels live in `expected_answer.relevance_judgments` as
+`{ "evidence_id": "...", "grade": 0.0 }` records. If a legacy fixture omits qrels,
+the runner may derive binary relevance from required evidence for regression use,
+but that row must expose `qrel_source = expected_evidence_fallback` and must not
+become leaderboard eligible.
+
+`cargo make real-world-memory-explicit-qrels` is the deterministic qrel
+materialization command for fixture-mechanics evidence. It derives positive qrels
+from checked-in `expected_answer.evidence_links` and `required_evidence`, preserves
+existing explicit zero-grade judgments, and leaves unmentioned corpus evidence
+unjudged instead of converting it into synthetic negative labels. Its optional
+oracle ranked candidates are allowed only to prove metric mechanics; they are not
+product-runtime retrieval evidence and cannot satisfy leaderboard runtime, held-out,
+or leakage-audit gates.
+
+`cargo make real-world-memory-live-explicit-qrels` is the current product-runtime
+bridge from deterministic qrel materialization to ELF/qmd live adapter scoring. It
+must materialize explicit qrels with `--ranked-candidates-source none`, then let
+the live adapters emit their own runtime ranked candidates. This command can close
+the `qrel_source` gap for product-runtime rows, but it does not itself prove
+held-out status, leakage audit status, or clean leaderboard eligibility.
+
+| Metric | Formula | Required fields |
+| --- | --- | --- |
+| `recall_at_k` | `relevant_returned_in_top_k / expected_relevant_count` | relevance labels, explicit `ranked_candidate_evidence_ids`, `k` |
+| `precision_at_k` | `relevant_returned_in_top_k / k` | ordered candidates, relevance labels, `k` |
+| `mrr` | Mean over scored queries of `1 / rank(first_relevant)`, using `0` for a query when no relevant item appears | ordered candidates, relevance labels |
+| `ndcg_at_k` | `dcg_at_k / ideal_dcg_at_k` using graded relevance when available, binary otherwise | ordered candidates, relevance grades, `k` |
+| `map` | Mean of per-query average precision values | ordered candidates, relevance labels |
+| `average_precision` | Per-query sum of precision at each relevant hit divided by expected relevant count | ordered candidates, relevance labels |
+| `success_at_k` | Fraction of queries with at least one relevant candidate in the top `k` | ordered candidates, relevance labels, `k` |
+| `expected_evidence_recall` | `produced_required_evidence_count / required_evidence_count` | required evidence map, produced evidence ids |
+| `citation_coverage` | `claims_with_valid_citation / claims_requiring_citation` | claim list, citation validation result |
+| `source_ref_coverage` | `claims_with_valid_source_ref / claims_requiring_source_ref` | source-ref validation result |
+
+Retrieval metrics must not count redacted, excluded, deleted, expired, unreadable, or
+non-captured source spans as relevant current evidence. Such candidates may be
+reported separately as historical or diagnostic rows.
+
+## Memory Lifecycle Metrics
+
+Memory lifecycle metrics apply to jobs that encode state changes over time.
+
+| Metric | Formula | What it proves |
+| --- | --- | --- |
+| `update_correctness_rate` | `jobs_selecting_current_superseding_fact / update_jobs` | New facts replace old facts for current answers. |
+| `stale_suppression_rate` | `stale_facts_not_used_as_current / stale_fact_opportunities` | Stale facts do not pollute current answers. |
+| `delete_suppression_rate` | `deleted_or_tombstoned_facts_not_used / delete_opportunities` | Deleted or tombstoned facts do not reappear as current context. |
+| `expiry_suppression_rate` | `expired_facts_not_used / expiry_opportunities` | TTL or time-bounded facts are suppressed after expiry. |
+| `rollback_readback_rate` | `rollback_events_with_readback / rollback_events_expected` | Rollback and prior versions remain auditable. |
+| `history_readback_rate` | `history_events_readable / history_events_expected` | Add, update, ignore, reject, delete, restore, and derived transitions are visible. |
+| `contradiction_resolution_rate` | `contradictions_resolved_to_current_supported_answer / contradiction_opportunities` | Mutually inconsistent memories are resolved with current source support instead of arbitrary retrieval order. |
+
+The denominator must be explicit. A benchmark with no delete jobs must report
+`delete_suppression_rate = not_encoded`, not `1.000`.
+
+## Answer Safety Metrics
+
+| Metric | Formula |
+| --- | --- |
+| `unsupported_claim_rate` | `unsupported_claim_count / answer_claim_count` |
+| `stale_answer_rate` | `answers_using_stale_fact_as_current / answered_jobs` |
+| `hallucinated_evidence_rate` | `citations_not_in_candidate_or_source_set / citation_count` |
+| `redaction_leak_count` | Count of private, excluded, or redacted spans surfaced in public output. |
+| `irrelevant_context_ratio` | `irrelevant_context_items / returned_context_items` |
+| `scope_violation_count` | Count of unreadable cross-scope or grant-violating rows returned. |
 
-## Evidence Classes
+Zero values are meaningful only when the denominator is non-zero and the checked row
+actually exercises the failure mode.
+
+## Operational Metrics
 
-| Evidence class | Meaning |
+| Metric | Required unit |
 | --- | --- |
-| `fixture_backed` | Checked-in fixtures were scored. This is regression evidence, not live product-runtime evidence. |
-| `live_baseline` | Docker live-baseline retrieval or lifecycle evidence exists, but the row is not a real-world product-runtime scoreboard pass. |
-| `live_real_world` | A live adapter executed real-world job paths and emitted typed outcomes. |
-| `research_gate` | Research, source mapping, setup, credential, or resource gates are recorded before fair scoring can run. |
-
-## Row Fields
-
-Each `rows[]` entry MUST include:
-
-- `product_id` and `product_name`.
-- `row_source`: stable source label, such as `elf_report` or
-  `external_adapter_manifest`.
-- `evidence_class`.
-- `result_state`.
-- `comparable`: true only when all comparability gates are satisfied and the row has a
-  pass state with quantitative metrics.
-- comparability gates:
-  - `same_corpus`
-  - `source_id_mapped`
-  - `held_out`
-  - `leakage_audited`
-  - `product_runtime`
-  - `container_digest_identified`
-- `metrics`.
-- `strengths`: product strengths supported by the row source.
-- `weaknesses`: typed weaknesses, blockers, or non-pass evidence from the row source.
-- `next_evidence`: row-level evidence needed before the row can become comparable.
-- `source_provenance`: bounded source pointers to the input report, adapter record, or
-  suite records.
-
-`same_corpus = true` requires positive row evidence that the product or checked-in
-adapter is mapped to the benchmark corpus. A blocker sentence that says same-corpus
-evidence is missing is not sufficient. A typed same-corpus setup-blocker adapter may
-set this gate to true only when its source provenance identifies the intended shared
-benchmark corpus and the remaining blocker is runtime/source-id output, not corpus
-selection.
-
-## Metrics
-
-The `metrics` object MUST include `retrieval`, `lifecycle`, `answer_safety`,
-`operations`, and `coverage` sub-objects.
-
-`retrieval` MUST include:
-
-- `k`.
-- `metric_basis`.
-- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg`, or `null` when the row lacks
-  ranked produced evidence.
-- `expected_evidence_recall`.
-- `citation_source_ref_coverage`.
-- matched, total, and produced evidence counts.
-
-For `metric_basis = "produced_evidence_order"`, ranked retrieval metrics use the
-ordered `produced_evidence` list in the scored job output as the retrieved list.
-Expected evidence ids are the relevance set. Relevance is binary. `recall_at_k` and
-`precision_at_k` use the first `k` produced evidence ids. MRR is reciprocal rank of
-the first relevant produced evidence id. nDCG uses binary gains with the ideal DCG
-bounded by `min(k, expected_evidence_total)`.
-
-`lifecycle` MUST include:
-
-- stale suppression rate and counts.
-- update correctness rate and counts.
-- delete correctness rate and counts.
-- rollback/history readback rate and counts.
-
-`answer_safety` MUST include:
-
-- unsupported-claim rate and count.
-- stale-answer rate and count.
-- hallucinated-evidence rate when measurable.
-- redaction leak count.
-- irrelevant-context ratio.
-
-`operations` MUST include:
-
-- mean latency in milliseconds when measured.
-- total cost when cost accounting exists.
-- resource-envelope status, encoded job count, and pass count.
-
-`coverage` MUST include:
-
-- job count.
-- encoded suite count.
-- pass count.
-- typed non-pass count.
-- source-ref coverage.
-- evidence coverage.
-- evidence class.
-
-## Comparability Rules
-
-A row is comparable only when all of the following are true:
-
-- `same_corpus = true`.
-- `source_id_mapped = true`.
-- `held_out = true`.
-- `leakage_audited = true`.
-- `product_runtime = true`.
-- `container_digest_identified = true`.
-- `result_state = "pass"`.
-- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg` are present.
-
-If any required gate is false, the report MUST set `comparable = false`, add a
-specific `next_evidence` entry for each missing gate, and avoid any win, parity, or
-rank claim for that row. If an otherwise passing row is missing a required gate, the
-public row state SHOULD be `not_comparable` so the report is explicit about the
-reason no product-runtime comparison claim is allowed.
-
-## Report Claim Rules
-
-- A row with `fixture_backed`, `live_baseline`, or `research_gate` evidence MUST NOT
-  be described as a comparable product-runtime pass.
-- A row with `blocked`, `incomplete`, `not_tested`, `not_encoded`, `not_comparable`,
-  or `unsupported_claim` MUST remain visible as a non-pass row.
-- External competitors MUST have either comparable product-runtime evidence or an
-  explicit typed non-pass/blocker row with source provenance.
-- Missing Docker image digest evidence is a blocker for comparability, even if a live
-  adapter executed.
-- Public-proxy, fixture-only, local-mock, diagnostic, blocked, and not-encoded rows
-  MUST NOT be promoted into universal product superiority claims.
-- Optimization direction MUST be tied to row-level `next_evidence`, metrics, or typed
-  non-pass states.
+| `ingestion_success_rate` | successfully ingested records / records submitted |
+| `indexing_coverage` | indexed records or spans / ingestible records or spans |
+| `source_id_mapping_coverage` | returned candidates or generated claims mapped to benchmark source ids / candidates or claims requiring mapping |
+| `query_latency_p50_ms`, `query_latency_p95_ms`, `query_latency_p99_ms` | milliseconds |
+| `ingest_latency_ms` | milliseconds from submitted source to durable ingest acknowledgement |
+| `update_propagation_latency_ms` | milliseconds from write/apply/delete to searchable/readable effect |
+| `cold_start_recovery_seconds` | seconds |
+| `restore_seconds` | seconds |
+| `index_rebuild_seconds` | seconds |
+| `cost_usd` | USD with input/output token counts where applicable |
+| `available_context_token_count` | tokens available in the source corpus or memory store for the query |
+| `answer_context_token_count` | tokens supplied to the answering model or final answer context |
+| `context_token_efficiency` | `answer_context_token_count / available_context_token_count` |
+| `resource_envelope_status` | pass, blocked, incomplete, not_encoded |
+
+Provider-backed rows must include model/provider identifiers or must remain
+`not_comparable`. Fixture zero-cost rows must not imply hosted provider cost.
+
+## Quantitative Scoreboard Schema
+
+Reports that implement this spec must emit:
+
+```json
+{
+  "schema": "elf.agent_memory_quantitative_benchmark/v1",
+  "generated_at": "...",
+  "corpus_id": "...",
+  "k_values": [1, 3, 5, 10],
+  "rows": [
+    {
+      "product": "ELF",
+      "adapter_id": "elf_live_real_world",
+      "adapter_name": "ELF live real-world",
+      "suite": "memory_evolution",
+      "evidence_class": "live_real_world",
+      "result_state": "pass",
+      "comparable": true,
+      "metric_comparable": true,
+      "leaderboard_eligible": false,
+      "held_out": false,
+      "leakage_audited": false,
+      "audit_manifest_id": null,
+      "fixture_regression_only": false,
+      "sample_size": 40,
+      "ranking_query_count": 40,
+      "ranking_coverage_state": "measured",
+      "ranked_candidate_source": "runtime_trace",
+      "qrel_source": "explicit_qrels",
+      "explicit_qrel_query_count": 40,
+      "metrics": {
+        "recall_at_5": 1.0,
+        "precision_at_5": 0.4,
+        "mrr": 1.0,
+        "ndcg_at_5": 1.0,
+        "map": 1.0,
+        "average_precision": 1.0,
+        "success_at_5": 1.0,
+        "explicit_qrel_query_coverage": 1.0,
+        "relevance_judgment_count": 80,
+        "relevance_grade_sum": 160,
+        "update_correctness_rate": 1.0,
+        "stale_suppression_rate": 1.0,
+        "delete_suppression_rate": 1.0,
+        "expected_evidence_recall": 1.0,
+        "unsupported_claim_rate": 0.0,
+        "stale_answer_rate": 0.0
+      },
+      "metric_states": {
+        "recall_at_5": "measured",
+        "precision_at_5": "measured",
+        "mrr": "measured",
+        "ndcg_at_5": "measured",
+        "average_precision": "measured",
+        "map": "measured",
+        "success_at_5": "measured"
+      },
+      "denominators": {
+        "recall_at_5": 80,
+        "precision_at_5": 200,
+        "map": 40,
+        "success_at_5": 40,
+        "update_correctness_rate": 2,
+        "delete_suppression_rate": 1,
+        "stale_answer_rate": 40
+      },
+      "confidence_intervals": {
+        "recall_at_5": {
+          "method": "wilson_score",
+          "confidence": 0.95,
+          "lower": 0.954,
+          "upper": 1.0,
+          "numerator": 80,
+          "denominator": 80
+        }
+      },
+      "claim_boundary": "Comparable only against same-corpus live_real_world rows."
+    }
+  ],
+  "per_query_rows": [
+    {
+      "job_id": "memory-evolution-001",
+      "suite": "memory_evolution",
+      "evidence_class": "live_real_world",
+      "result_state": "pass",
+      "expected_relevant_count": 2,
+      "candidate_count": 8,
+      "qrel_source": "explicit_qrels",
+      "relevance_grade_sum": 4.0,
+      "product": "ELF",
+      "adapter_id": "elf_live_real_world",
+      "metrics": {
+        "recall_at_5": 1.0,
+        "precision_at_5": 0.4,
+        "mrr": 1.0,
+        "ndcg_at_5": 1.0,
+        "average_precision": 1.0,
+        "success_at_5": 1.0
+      },
+      "metric_states": {
+        "recall_at_5": "measured",
+        "precision_at_5": "measured",
+        "mrr": "measured",
+        "ndcg_at_5": "measured",
+        "average_precision": "measured",
+        "success_at_5": "measured"
+      },
+      "denominators": {
+        "recall_at_5": 2,
+        "precision_at_5": 5,
+        "mrr": 1,
+        "ndcg_at_5": 1,
+        "average_precision": 1,
+        "success_at_5": 1
+      }
+    }
+  ],
+  "ablation_rows": [
+    {
+      "product": "ELF",
+      "adapter_id": "elf_live_real_world",
+      "ablation_id": "raw_vector",
+      "job_id": "memory-evolution-001",
+      "suite": "memory_evolution",
+      "evidence_class": "live_real_world",
+      "result_state": "pass",
+      "candidate_source": "runtime_trace_ablation",
+      "qrel_source": "explicit_qrels",
+      "expected_relevant_count": 2,
+      "candidate_count": 8,
+      "metrics": {
+        "recall_at_5": 0.5,
+        "precision_at_5": 0.2,
+        "mrr": 0.5,
+        "ndcg_at_5": 0.62,
+        "average_precision": 0.5,
+        "success_at_5": 1.0
+      },
+      "metric_states": {
+        "recall_at_5": "measured",
+        "precision_at_5": "measured",
+        "mrr": "measured",
+        "ndcg_at_5": "measured",
+        "average_precision": "measured",
+        "success_at_5": "measured"
+      },
+      "denominators": {
+        "recall_at_5": 2,
+        "precision_at_5": 5,
+        "mrr": 1,
+        "ndcg_at_5": 1,
+        "average_precision": 1,
+        "success_at_5": 1
+      },
+      "claim_boundary": "Ablation rows score explicitly supplied candidate orderings for diagnosis; they are not separate product-runtime rows unless the evidence class and candidate source say so."
+    }
+  ],
+  "significance": {
+    "method": "exact_two_sided_sign_test_on_same_query_metric_deltas",
+    "state": "not_encoded_single_product_row",
+    "eligible": false,
+    "minimum_paired_query_count": 30,
+    "comparable_product_row_count": 1,
+    "paired_query_count": 0,
+    "comparisons": [],
+    "ablation_comparisons": [
+      {
+        "comparison_scope": "ablation",
+        "baseline_id": "raw_vector",
+        "candidate_id": "governed_memory",
+        "baseline_product": "raw_vector",
+        "candidate_product": "governed_memory",
+        "metric": "ndcg_at_5",
+        "paired_query_count": 1,
+        "state": "measured",
+        "effect_mean": 0.311,
+        "p_value": 1.0,
+        "win_count": 1,
+        "loss_count": 0,
+        "tie_count": 0
+      }
+    ],
+    "claim_boundary": "Pairwise wins require at least two leaderboard-eligible rows with same-query per-query metrics; otherwise p-values and win claims stay not encoded."
+  },
+  "leakage_audit": {
+    "state": "not_leaderboard_eligible",
+    "held_out": false,
+    "leakage_audited": false,
+    "corpus_profile": "synthetic",
+    "evidence_class": "fixture_backed",
+    "qrel_source": "explicit_qrels",
+    "fixture_regression_only": true,
+    "ranking_coverage_state": "partial_coverage",
+    "leaderboard_blocking_reasons": [
+      "fixture_regression_only",
+      "insufficient_query_count",
+      "no_held_out_manifest",
+      "no_leakage_audit_manifest",
+      "not_live_real_world",
+      "ranking_coverage_not_measured"
+    ],
+    "claim_boundary": "Held-out and leakage-audit fields are explicit gates; fixture or non-audited rows cannot become public leaderboard evidence by omission."
+  },
+  "non_comparable_rows": [
+    {
+      "product": "VectifyAI PageIndex",
+      "adapter_id": "pageindex_public_proxy_contract",
+      "result_state": "not_comparable",
+      "reason": "public_proxy evidence class; no PageIndex product runtime output"
+    }
+  ],
+  "controls": {
+    "same_corpus_required": true,
+    "same_task_required": true,
+    "same_evidence_class_required": true,
+    "same_budget_required": true,
+    "ranked_candidates_required_for_ranking_metrics": true,
+    "raw_ranked_candidate_artifacts_required": true,
+    "held_out_or_leakage_audited_required": true,
+    "explicit_relevance_judgments_required_for_leaderboard": true,
+    "per_query_rows_required_for_significance": true,
+    "minimum_query_count_for_leaderboard": 30,
+    "current_query_count": 40,
+    "current_ranking_query_count": 40,
+    "current_explicit_qrel_query_count": 40,
+    "comparable_product_row_count": 1,
+    "leaderboard_claim_allowed": false,
+    "statistical_significance": "not_encoded_until_at_least_two_same-corpus comparable product rows meet minimum query count, full ranking coverage, and explicit qrels",
+    "uncertainty_reporting": "single-row rates include Wilson 95% confidence intervals; competitor win claims require same-query paired significance over per-query rows.",
+    "leakage_control": "fixture rows are not public leaderboard proof; current product leaderboard rows require held-out and leakage-audited status plus an audit manifest id."
+  }
+}
+```
+
+## External Product Row Import
+
+`real_world_job_benchmark run` may accept an optional
+`--quantitative-product-manifest` file when a competitor adapter has already
+materialized same-corpus product-runtime rows outside the current ELF fixture run.
+The manifest schema is `elf.agent_memory_quantitative_product_manifest/v1`.
+Generated reports infer the quantitative row `product` from the external adapter
+manifest entry matching `--adapter-id`, with `--product` available only as an
+explicit override for old or ad hoc reports.
+
+Use `real_world_job_benchmark export-quantitative-product-manifest --report
+<report.json>` to derive this manifest from a generated `elf.real_world_job_report/v1`
+instead of hand-writing metric rows. The export command copies the report's primary
+aggregate row and matching per-query rows, rejects `ELF` self rows, and then runs
+the same manifest validation used by import. The live qmd adapter sweep writes
+`qmd-quantitative-product-manifest.json` and a combined
+`elf-qmd-quantitative-report.json` so the same-corpus qmd row is visible in
+`quantitative_scoreboard.rows` when fresh live artifacts exist.
+
+```json
+{
+  "schema": "elf.agent_memory_quantitative_product_manifest/v1",
+  "manifest_id": "qmd-live-real-world-2026-06-23",
+  "corpus_id": "...same value as quantitative_scoreboard.corpus_id...",
+  "rows": [
+    {
+      "product": "qmd",
+      "adapter_id": "qmd_live_real_world",
+      "held_out": false,
+      "leakage_audited": false,
+      "audit_manifest_id": null,
+      "metrics": {
+        "recall_at_5": 0.75,
+        "ndcg_at_5": 0.601,
+        "average_precision": 0.608
+      },
+      "metric_states": {
+        "recall_at_5": "measured",
+        "ndcg_at_5": "measured",
+        "average_precision": "measured"
+      }
+    }
+  ],
+  "per_query_rows": [
+    {
+      "product": "qmd",
+      "adapter_id": "qmd_live_real_world",
+      "job_id": "...",
+      "metrics": {
+        "recall_at_5": 0.75,
+        "ndcg_at_5": 0.601,
+        "average_precision": 0.608
+      },
+      "metric_states": {
+        "recall_at_5": "measured",
+        "ndcg_at_5": "measured",
+        "average_precision": "measured"
+      }
+    }
+  ]
+}
+```
+
+The runner must reject imported rows unless:
+
+- the manifest `corpus_id` exactly matches the current scoreboard `corpus_id`
+- each `(product, adapter_id)` matches an external adapter manifest record
+- the product is not `ELF`
+- aggregate rows and per-query rows carry the paired-comparison metrics
+  `recall_at_5`, `ndcg_at_5`, and `average_precision`
+- ranked aggregate rows have at least `ranking_query_count` matching per-query rows
+
+Imported rows replace the matching `non_comparable_rows` entry, but they do not
+automatically authorize leaderboard claims. A row marked `leaderboard_eligible`
+must also be product-runtime evidence with `result_state = pass`, minimum ranked
+query coverage, `ranked_candidate_source = runtime_trace`, `qrel_source =
+explicit_qrels`, enough explicit qrels for every ranked query, `held_out = true`,
+`leakage_audited = true`, and a non-empty `audit_manifest_id`. The current runner
+requires both held-out and leakage-audit fields, plus an audit manifest id, before
+an imported product row can remain marked leaderboard eligible. This keeps
+hand-written, public-proxy, or non-audited rows from becoming hidden wins.
+
+## Minimum Rows For P6
+
+The first implementation issue after this spec must produce a machine-readable
+`quantitative_scoreboard` from `real_world_job_benchmark`. The initial runner row may
+calculate ranking metrics only when the fixture or adapter emits explicit
+`ranked_candidate_evidence_ids`; otherwise it must mark those metrics
+`not_encoded`. If only a subset of queries emits ranked candidates, ranking metrics
+must use `partial_coverage` and must not make the row leaderboard eligible. It must
+publish metric states, denominators, sample size, ranked query count, per-query rows,
+explicit-qrel coverage, qrel source, Wilson 95% intervals for measured or partial
+rate metrics, ablation rows for explicitly supplied candidate orderings, diagnostic
+ablation pairwise comparisons with exact two-sided sign-test p-values,
+paired-significance gating state for product rows, held-out/leakage audit state, and
+controls so missing rows cannot become hidden wins. The runner may also import
+same-corpus external quantitative product rows through
+`elf.agent_memory_quantitative_product_manifest/v1`; this is an adapter artifact
+boundary, not a manual scoring exemption. It must also keep unimplemented but
+required production-memory measures visible as `not_encoded`, including source-id
+mapping coverage, ingestion/indexing coverage, contradiction resolution,
+propagation latency, and context-token efficiency.
+
+The full P6 scoreboard must produce rows for:
+
+- ELF fixture-backed memory authority and knowledge workspace jobs.
+- ELF live-real-world retrieval and memory-evolution jobs where artifacts exist.
+- qmd live-real-world retrieval/debug rows where artifacts exist.
+- mem0/OpenMemory local SDK history/export rows where artifacts exist.
+- Honcho rows as typed same-corpus blockers plus `research_gate`/`not_comparable`
+  external-adapter rows until peer/session outputs, background reasoning artifacts,
+  source-id mapped search/chat/context results, and token/context efficiency
+  measures exist for the same corpus.
+- PageIndex/OpenKB rows as `blocked` or `not_comparable` until actual product
+  artifacts exist.
+- Letta, OpenViking, Graphiti/Zep, RAGFlow, GraphRAG, and LightRAG rows as
+  `blocked`, `not_encoded`, or `not_comparable` unless same-corpus product artifacts
+  are checked in.
+
+## Research Alignment
+
+This benchmark contract is aligned with established retrieval and memory-evaluation
+practice, but it is not itself a public leaderboard until the controls permit one:
+
+- BEIR-style retrieval evaluation requires a shared corpus/query/qrels format and
+  rank-aware metrics such as nDCG@k, MAP, and success@k for comparable retrieval
+  claims.
+- RAGAS-style RAG evaluation separates retrieval context recall/precision from
+  answer faithfulness and response quality.
+- LoCoMo-style memory evaluation shows that long-term memory requires temporal,
+  multi-session, summarization, and event-grounded reasoning slices, not only
+  single-turn retrieval.
+- Production memory comparisons must report token/cost/latency budgets; Mem0's
+  public benchmark framing treats accuracy, token cost, and latency as coupled
+  production dimensions.
+- Honcho's public docs and benchmark materials position it as reasoning-first
+  memory with peer/session representations, background reasoning/dreaming, LongMem,
+  LoCoMo, BEAM, and token-efficiency framing. ELF must treat those as required
+  benchmark surfaces, not as same-corpus product results, until a Honcho adapter
+  emits source-id mapped artifacts on the benchmark corpus.
+- Scientific comparison requires held-out and leakage-audited corpora with audit
+  manifest ids, explicit qrels, raw per-query rows, repeated or paired comparable
+  runs, confidence intervals for single-row estimates, and paired product-row
+  significance tests before a leaderboard claim is allowed. Ablation pairwise tests
+  are diagnostic optimization evidence, not product leaderboard evidence.
+
+## Claim Boundaries
+
+Allowed:
+
+- "ELF has measured evidence recall, source-ref coverage, stale suppression, and
+  update/delete correctness for the rows shown."
+- "Product X is not comparable on metric Y because evidence class, corpus, or
+  product artifact coverage differs."
+- "Product X beats ELF on metric Y" only when both rows are same-corpus,
+  same-evidence-class, same-task, and comparable.
+
+Not allowed:
+
+- Claiming that a fixture-backed pass beats a provider-backed or product-runtime row.
+- Claiming that a public-proxy pass proves PageIndex, OpenKB, hosted memory,
+  provider-backed, or private-corpus product quality.
+- Reporting a missing denominator as `1.000`.
+- Counting a `blocked`, `not_encoded`, or `not_comparable` row as a win by omission.
diff --git a/makefiles/benchmark-core.toml b/makefiles/benchmark-core.toml
index 02c94349..55243485 100644
--- a/makefiles/benchmark-core.toml
+++ b/makefiles/benchmark-core.toml
@@ -1,95 +1,8 @@
-# Rust workspace tasks: Benchmark core, baseline, and operator tasks.
-
-# Rust workspace tasks: Benchmark.
-
-# Benchmark
-# | task                                       | type      | cwd |
-# | ------------------------------------------ | --------- | --- |
-# | baseline-backfill-100k-docker              | command   |     |
-# | baseline-backfill-10k-docker               | command   |     |
-# | baseline-backfill-docker                   | command   |     |
-# | baseline-live-docker                       | command   |     |
-# | baseline-live-report                       | command   |     |
-# | baseline-production-private                | command   |     |
-# | baseline-production-private-addendum       | command   |     |
-# | baseline-production-synthetic              | command   |     |
-# | baseline-soak-docker                       | command   |     |
-# | local-agent-loop                           | command   |     |
-# | openmemory-ui-export-readback              | command   |     |
-# | parity-docker                              | command   |     |
-# | real-world-first-generation-oss            | composite |     |
-# | real-world-first-generation-oss-json       | command   |     |
-# | real-world-first-generation-oss-report     | command   |     |
-# | real-world-job-operator-ux                 | composite |     |
-# | real-world-job-operator-ux-json            | command   |     |
-# | real-world-job-operator-ux-live-adapters   | command   |     |
-# | real-world-job-operator-ux-report          | command   |     |
-# | real-world-memory                          | composite |     |
-# | real-world-memory-adversarial-quality      | composite |     |
-# | real-world-memory-adversarial-quality-json | command   |     |
-# | real-world-memory-adversarial-quality-report | command |     |
-# | real-world-memory-consolidation            | composite |     |
-# | real-world-memory-consolidation-json       | command   |     |
-# | real-world-memory-consolidation-report     | command   |     |
-# | real-world-memory-p1-closeout              | composite |     |
-# | real-world-memory-p1-closeout-json         | command   |     |
-# | real-world-memory-p1-closeout-report       | command   |     |
-# | real-world-memory-p4-production-readiness  | composite |     |
-# | real-world-memory-p4-production-readiness-json | command |     |
-# | real-world-memory-p4-production-readiness-report | command |     |
-# | real-world-memory-p4-quality-hardening-closeout | composite |     |
-# | real-world-memory-p2-knowledge-closeout    | composite |     |
-# | real-world-memory-core-archival            | composite |     |
-# | real-world-memory-core-archival-json       | command   |     |
-# | real-world-memory-core-archival-report     | command   |     |
-# | real-world-memory-context-trajectory       | composite |     |
-# | real-world-memory-context-trajectory-json  | command   |     |
-# | real-world-memory-context-trajectory-report | command   |     |
-# | real-world-memory-evolution                | composite |     |
-# | real-world-memory-evolution-json           | command   |     |
-# | real-world-memory-evolution-report         | command   |     |
-# | real-world-memory-graph-rag                | composite |     |
-# | real-world-memory-graph-rag-json           | command   |     |
-# | real-world-memory-graph-rag-report         | command   |     |
-# | real-world-memory-json                     | command   |     |
-# | real-world-memory-knowledge                | composite |     |
-# | real-world-memory-knowledge-json           | command   |     |
-# | real-world-memory-knowledge-report         | command   |     |
-# | real-world-memory-live-adapters            | command   |     |
-# | real-world-memory-live-consolidation       | command   |     |
-# | real-world-memory-live-knowledge           | command   |     |
-# | real-world-memory-mem0-openmemory-letta    | composite |     |
-# | real-world-memory-mem0-openmemory-letta-json | command |     |
-# | real-world-memory-mem0-openmemory-letta-report | command |     |
-# | real-world-memory-pageindex-openkb         | composite |     |
-# | real-world-memory-pageindex-openkb-json    | command   |     |
-# | real-world-memory-pageindex-openkb-report  | command   |     |
-# | real-world-memory-proactive-brief          | composite |     |
-# | real-world-memory-proactive-brief-json     | command   |     |
-# | real-world-memory-proactive-brief-report   | command   |     |
-# | real-world-memory-production-ops           | composite |     |
-# | real-world-memory-production-ops-json      | command   |     |
-# | real-world-memory-production-ops-report    | command   |     |
-# | real-world-memory-project-decisions        | composite |     |
-# | real-world-memory-project-decisions-json   | command   |     |
-# | real-world-memory-project-decisions-report | command   |     |
-# | real-world-memory-quantitative-scoreboard  | composite |     |
-# | real-world-memory-quantitative-scoreboard-json | command |     |
-# | real-world-memory-quantitative-scoreboard-report | command |     |
-# | real-world-memory-report                   | command   |     |
-# | real-world-memory-retrieval                | composite |     |
-# | real-world-memory-retrieval-json           | command   |     |
-# | real-world-memory-retrieval-report         | command   |     |
-# | real-world-memory-scheduled                | composite |     |
-# | real-world-memory-scheduled-json           | command   |     |
-# | real-world-memory-scheduled-report         | command   |     |
-# | real-world-memory-service-native-dreaming  | command   |     |
-# | real-world-memory-summary                  | composite |     |
-# | real-world-memory-summary-json             | command   |     |
-# | real-world-memory-summary-report           | command   |     |
-# | real-world-memory-work-continuity          | composite |     |
-# | real-world-memory-work-continuity-json     | command   |     |
-# | real-world-memory-work-continuity-report   | command   |     |
+# Rust workspace tasks: benchmark core, baseline, and operator commands.
+#
+# Keep long task listings out of comments. `cargo make --list-all-steps` is the
+# source for the complete task index, while this file owns only non-sharded
+# benchmark commands.
 
 [tasks.baseline-backfill-100k-docker]
 workspace = false
diff --git a/makefiles/benchmark-memory-a.toml b/makefiles/benchmark-memory-a.toml
index a7063ca4..a7b5e6c6 100644
--- a/makefiles/benchmark-memory-a.toml
+++ b/makefiles/benchmark-memory-a.toml
@@ -364,6 +364,13 @@ args = [
 	"tmp/real-world-memory/evolution-report.md",
 ]
 
+[tasks.real-world-memory-explicit-qrels]
+workspace = false
+command = "bash"
+args = [
+	"scripts/real-world-explicit-qrels.sh",
+]
+
 [tasks.real-world-memory-graph-rag]
 workspace = false
 dependencies = [
diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml
index 8657bb36..95003f90 100644
--- a/makefiles/benchmark-memory-b.toml
+++ b/makefiles/benchmark-memory-b.toml
@@ -251,6 +251,14 @@ args = [
 	"memory-live-consolidation",
 ]
 
+[tasks.real-world-memory-live-explicit-qrels]
+workspace = false
+command = "bash"
+args = [
+	"scripts/real-world-docker.sh",
+	"memory-live-explicit-qrels",
+]
+
 [tasks.real-world-memory-live-knowledge]
 workspace = false
 command = "bash"
diff --git a/scripts/materialize-explicit-qrels.py b/scripts/materialize-explicit-qrels.py
new file mode 100755
index 00000000..779abd2f
--- /dev/null
+++ b/scripts/materialize-explicit-qrels.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+"""Generate explicit relevance-judgment fixtures from real-world job fixtures."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import shutil
+from datetime import UTC, datetime
+from pathlib import Path
+from typing import Any
+
+
+SCHEMA = "elf.real_world_explicit_qrel_materialization/v1"
+JOB_SCHEMA = "elf.real_world_job/v1"
+
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description=(
+            "Copy real_world_job fixtures and derive expected_answer.relevance_judgments "
+            "from checked-in evidence_links/required_evidence."
+        )
+    )
+    parser.add_argument("--fixtures", required=True, type=Path, help="Input fixture directory.")
+    parser.add_argument("--out-fixtures", required=True, type=Path, help="Generated fixture directory.")
+    parser.add_argument(
+        "--summary-out",
+        required=True,
+        type=Path,
+        help="Write materialization summary JSON.",
+    )
+    parser.add_argument(
+        "--ranked-candidates-source",
+        choices=["none", "oracle"],
+        default="none",
+        help="Optionally add fixture-trace ranked candidates ordered by qrel grade.",
+    )
+    parser.add_argument(
+        "--profile",
+        choices=["preserve", "generated_public"],
+        default="preserve",
+        help="Preserve original corpus profile or mark generated jobs as generated_public.",
+    )
+    parser.add_argument(
+        "--exclude-without-positive-qrels",
+        action="store_true",
+        help="Do not copy job JSON files that have no positive derived qrels.",
+    )
+    parser.add_argument(
+        "--overwrite",
+        action="store_true",
+        help="Replace existing relevance_judgments instead of preserving explicit grades.",
+    )
+
+    return parser.parse_args()
+
+
+def read_json(path: Path) -> Any:
+    with path.open(encoding="utf-8") as fh:
+        return json.load(fh)
+
+
+def write_json(path: Path, value: Any) -> None:
+    path.parent.mkdir(parents=True, exist_ok=True)
+    with path.open("w", encoding="utf-8") as fh:
+        json.dump(value, fh, indent=2, sort_keys=False)
+        fh.write("\n")
+
+
+def stable_unique(values: list[str]) -> list[str]:
+    seen: set[str] = set()
+    result: list[str] = []
+    for value in values:
+        if value and value not in seen:
+            seen.add(value)
+            result.append(value)
+
+    return result
+
+
+def evidence_link_ids(value: Any) -> list[str]:
+    if isinstance(value, str):
+        return [value]
+    if isinstance(value, list):
+        return [item for item in value if isinstance(item, str)]
+
+    return []
+
+
+def corpus_evidence_ids(job: dict[str, Any]) -> list[str]:
+    return [
+        item["evidence_id"]
+        for item in job.get("corpus", {}).get("items", [])
+        if isinstance(item, dict) and isinstance(item.get("evidence_id"), str)
+    ]
+
+
+def derive_positive_grades(job: dict[str, Any]) -> dict[str, float]:
+    grades: dict[str, float] = {}
+    expected = job.get("expected_answer", {})
+
+    for link in expected.get("evidence_links", {}).values():
+        for evidence_id in evidence_link_ids(link):
+            grades[evidence_id] = max(grades.get(evidence_id, 0.0), 2.0)
+
+    for evidence in job.get("required_evidence", []):
+        if isinstance(evidence, dict) and isinstance(evidence.get("evidence_id"), str):
+            grades[evidence["evidence_id"]] = max(grades.get(evidence["evidence_id"], 0.0), 1.0)
+
+    return grades
+
+
+def existing_qrel_grades(job: dict[str, Any]) -> dict[str, float]:
+    grades: dict[str, float] = {}
+    expected = job.get("expected_answer", {})
+    for judgment in expected.get("relevance_judgments", []):
+        if not isinstance(judgment, dict) or not isinstance(judgment.get("evidence_id"), str):
+            continue
+        grade = judgment.get("grade", 1.0)
+        if isinstance(grade, (int, float)):
+            grades[judgment["evidence_id"]] = float(grade)
+
+    return grades
+
+
+def materialized_qrels(job: dict[str, Any], overwrite: bool) -> list[dict[str, Any]]:
+    """Return corpus-ordered qrels for `job`, or [] when none of them is positive."""
+    grades = derive_positive_grades(job)
+    if not overwrite:
+        # Preserve hand-authored grades: they take precedence over derived ones.
+        grades.update(existing_qrel_grades(job))
+
+    qrels = [
+        {"evidence_id": evidence_id, "grade": grades[evidence_id]}
+        for evidence_id in corpus_evidence_ids(job)
+        if evidence_id in grades
+    ]
+    # Test for positives after the corpus filter: a grade for an id that is absent
+    # from the corpus must not keep a job whose emitted qrels are all zero.
+    return qrels if any(judgment["grade"] > 0.0 for judgment in qrels) else []
+
+
+def ranked_candidates_from_qrels(qrels: list[dict[str, Any]]) -> list[str]:
+    return [
+        judgment["evidence_id"]
+        for judgment in sorted(
+            qrels,
+            key=lambda judgment: (
+                -float(judgment.get("grade", 0.0)),
+                str(judgment.get("evidence_id", "")),
+            ),
+        )
+        if judgment.get("evidence_id")
+    ]
+
+
+def add_oracle_ranked_candidates(job: dict[str, Any], qrels: list[dict[str, Any]]) -> bool:
+    answer = job.get("corpus", {}).get("adapter_response", {}).get("answer")
+    if not isinstance(answer, dict):
+        return False
+
+    trace = answer.setdefault("trace_explainability", {})
+    trace["ranked_candidate_evidence_ids"] = ranked_candidates_from_qrels(qrels)
+    trace.setdefault("trace_id", f"{job.get('job_id', 'unknown')}-explicit-qrel-oracle")
+
+    return True
+
+
+def materialize_job(source: Path, target: Path, args: argparse.Namespace) -> dict[str, Any]:
+    """Write `source` to `target`, attaching explicit qrels when it is a job that has any.
+
+    Returns a record whose `kind` tells whether it was materialized, excluded, or copied.
+    """
+    job = read_json(source)
+    if not isinstance(job, dict) or job.get("schema") != JOB_SCHEMA:
+        shutil.copy2(source, target)
+        return {"kind": "copied_non_job_json"}
+
+    qrels = materialized_qrels(job, overwrite=args.overwrite)
+    if not qrels and args.exclude_without_positive_qrels:
+        return {
+            "kind": "excluded_without_positive_qrels",
+            "job_id": job.get("job_id"),
+        }
+
+    ranked_candidate_added = False
+    if qrels:
+        expected = job.setdefault("expected_answer", {})
+        had_existing_qrels = bool(expected.get("relevance_judgments"))
+        expected["relevance_judgments"] = qrels
+        tags = stable_unique([*job.get("tags", []), "explicit_qrels_generated"])
+        job["tags"] = tags
+
+        if args.profile == "generated_public":
+            job.setdefault("corpus", {})["profile"] = "generated_public"
+
+        if args.ranked_candidates_source == "oracle":
+            ranked_candidate_added = add_oracle_ranked_candidates(job, qrels)
+
+        write_json(target, job)
+        return {
+            "kind": "materialized_job",
+            "job_id": job.get("job_id"),
+            "judgment_count": len(qrels),
+            "positive_judgment_count": sum(1 for judgment in qrels if judgment["grade"] > 0.0),
+            "zero_grade_judgment_count": sum(1 for judgment in qrels if judgment["grade"] == 0.0),
+            "unjudged_corpus_evidence_count": len(corpus_evidence_ids(job)) - len(qrels),
+            "had_existing_qrels": had_existing_qrels,
+            "ranked_candidate_added": ranked_candidate_added,
+        }
+
+    shutil.copy2(source, target)
+    return {
+        "kind": "copied_without_positive_qrels",
+        "job_id": job.get("job_id"),
+    }
+
+
+def materialize(args: argparse.Namespace) -> dict[str, Any]:
+    """Rebuild `args.out_fixtures` from `args.fixtures` and write the summary JSON."""
+    if not args.fixtures.is_dir():
+        raise SystemExit(f"{args.fixtures} is not a directory")
+
+    if args.out_fixtures.exists():
+        shutil.rmtree(args.out_fixtures)
+    args.out_fixtures.mkdir(parents=True)
+
+    records: list[dict[str, Any]] = []
+    for source in sorted(args.fixtures.rglob("*")):
+        rel = source.relative_to(args.fixtures)
+        target = args.out_fixtures / rel
+        if source.is_dir():
+            target.mkdir(parents=True, exist_ok=True)
+        elif source.suffix == ".json":
+            records.append(materialize_job(source, target, args))
+        else:
+            target.parent.mkdir(parents=True, exist_ok=True)
+            shutil.copy2(source, target)
+
+    materialized = [record for record in records if record["kind"] == "materialized_job"]
+    excluded = [record for record in records if record["kind"] == "excluded_without_positive_qrels"]
+
+    summary = {
+        "schema": SCHEMA,
+        "generated_at": datetime.now(UTC).isoformat().replace("+00:00", "Z"),
+        "input_fixture_dir": str(args.fixtures),
+        "output_fixture_dir": str(args.out_fixtures),
+        "ranked_candidates_source": args.ranked_candidates_source,
+        "profile": args.profile,
+        "exclude_without_positive_qrels": args.exclude_without_positive_qrels,
+        "overwrite": args.overwrite,
+        "job_count": len(materialized),
+        "excluded_without_positive_qrels_count": len(excluded),
+        "judgment_count": sum(record["judgment_count"] for record in materialized),
+        "positive_judgment_count": sum(record["positive_judgment_count"] for record in materialized),
+        "zero_grade_judgment_count": sum(record["zero_grade_judgment_count"] for record in materialized),
+        "unjudged_corpus_evidence_count": sum(
+            record["unjudged_corpus_evidence_count"] for record in materialized
+        ),
+        "existing_qrel_job_count": sum(1 for record in materialized if record["had_existing_qrels"]),
+        "ranked_candidate_job_count": sum(
+            1 for record in materialized if record["ranked_candidate_added"]
+        ),
+        "excluded_job_ids": [record.get("job_id") for record in excluded],
+        "claim_boundary": (
+            "Derived qrels are deterministic benchmark labels from checked-in evidence links and "
+            "required_evidence. Unmentioned corpus evidence remains unjudged instead of being "
+            "converted into synthetic negative labels. Oracle ranked candidates test metric "
+            "mechanics only; they are not product-runtime retrieval evidence or leaderboard proof."
+        ),
+    }
+
+    write_json(args.summary_out, summary)
+    return summary
+
+
+def main() -> None:
+    args = parse_args()
+    summary = materialize(args)
+    print(
+        "materialized explicit qrels: "
+        f"{summary['job_count']} jobs, "
+        f"{summary['judgment_count']} judgments, "
+        f"{summary['ranked_candidate_job_count']} ranked-candidate traces"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh
index 163c4d1f..8afc80d5 100755
--- a/scripts/real-world-docker.sh
+++ b/scripts/real-world-docker.sh
@@ -45,6 +45,11 @@ memory-live-adapters)
 		docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb
 	fi
 	docker compose -f docker-compose.baseline.yml run --build --rm \
+		-e ELF_REAL_WORLD_LIVE_REPORT_DIR \
+		-e ELF_REAL_WORLD_LIVE_FIXTURES \
+		-e ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES \
+		-e ELF_REAL_WORLD_LIVE_WORK_DIR \
+		-e ELF_REAL_WORLD_QMD_DIR \
 		-e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW \
 		-e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG \
 		-e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG \
@@ -123,6 +128,15 @@ memory-live-adapters)
 	fi
 	exit "$status"
 	;;
+memory-live-explicit-qrels)
+	docker compose -f docker-compose.baseline.yml run --build --rm \
+		-e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR \
+		-e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_FIXTURES \
+		-e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_OPERATOR_DEBUG_FIXTURES \
+		-e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_WORK_DIR \
+		-e ELF_REAL_WORLD_QMD_DIR \
+		baseline-runner bash scripts/real-world-live-explicit-qrels.sh
+	;;
 *)
 	echo "unknown real-world Docker profile: $profile" >&2
 	exit 2
diff --git a/scripts/real-world-explicit-qrels.sh b/scripts/real-world-explicit-qrels.sh
new file mode 100755
index 00000000..ccd17cf1
--- /dev/null
+++ b/scripts/real-world-explicit-qrels.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+REPORT_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/explicit-qrels}"
+SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}"
+QREL_FIXTURE_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_OUT_FIXTURES:-${REPORT_DIR}/fixtures}"
+
+cd "${ROOT_DIR}"
+
+python3 scripts/materialize-explicit-qrels.py \
+	--fixtures "${SOURCE_FIXTURE_DIR}" \
+	--out-fixtures "${QREL_FIXTURE_DIR}" \
+	--summary-out "${REPORT_DIR}/materialization-summary.json" \
+	--ranked-candidates-source oracle \
+	--profile generated_public \
+	--exclude-without-positive-qrels
+
+cargo run -p elf-eval --bin real_world_job_benchmark -- \
+	run \
+	--fixtures "${QREL_FIXTURE_DIR}" \
+	--out "${REPORT_DIR}/report.json" \
+	--run-id real-world-memory-explicit-qrels \
+	--adapter-id fixture_explicit_qrels \
+	--adapter-name "Explicit qrel oracle fixture pack" \
+	--adapter-behavior explicit_qrel_oracle_fixture \
+	--adapter-storage-status pass \
+	--adapter-runtime-status pass \
+	--adapter-notes "Generated by scripts/materialize-explicit-qrels.py from checked-in evidence_links and required_evidence; unmentioned corpus evidence remains unjudged; oracle ranked candidates test metric mechanics only."
+
+cargo run -p elf-eval --bin real_world_job_benchmark -- \
+	publish \
+	--report "${REPORT_DIR}/report.json" \
+	--out "${REPORT_DIR}/report.md"
+
+echo "Explicit qrel benchmark report:"
+echo "  ${REPORT_DIR}/materialization-summary.json"
+echo "  ${REPORT_DIR}/report.json"
+echo "  ${REPORT_DIR}/report.md"
diff --git a/scripts/real-world-live-explicit-qrels.sh b/scripts/real-world-live-explicit-qrels.sh
new file mode 100755
index 00000000..35212ac1
--- /dev/null
+++ b/scripts/real-world-live-explicit-qrels.sh
@@ -0,0 +1,80 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+REPORT_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-explicit-qrels}"
+SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}"
+OPERATOR_SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_OPERATOR_DEBUG_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}"
+QREL_FIXTURE_DIR="${REPORT_DIR}/explicit-qrel-fixtures"
+QREL_OPERATOR_FIXTURE_DIR="${REPORT_DIR}/explicit-qrel-operator-debug-fixtures"
+LIVE_REPORT_DIR="${REPORT_DIR}/live-adapters"
+LIVE_WORK_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_WORK_DIR:-/bench/real-world-live-explicit-qrels}"
+
+if [[ ! -f "/.dockerenv" && "${ELF_REAL_WORLD_LIVE_ALLOW_HOST:-0}" != "1" ]]; then
+  echo "Refusing to run live explicit-qrel adapters outside Docker. Use cargo make real-world-memory-live-explicit-qrels." >&2
+  exit 1
+fi
+
+for cmd in bash jq python3; do
+  if ! command -v "${cmd}" >/dev/null 2>&1; then
+    echo "Missing ${cmd} in live explicit-qrel runner." >&2
+    exit 1
+  fi
+done
+
+cd "${ROOT_DIR}"
+
+rm -rf "${REPORT_DIR}"
+mkdir -p "${REPORT_DIR}"
+
+python3 scripts/materialize-explicit-qrels.py \
+  --fixtures "${SOURCE_FIXTURE_DIR}" \
+  --out-fixtures "${QREL_FIXTURE_DIR}" \
+  --summary-out "${REPORT_DIR}/memory-materialization-summary.json" \
+  --ranked-candidates-source none \
+  --profile generated_public \
+  --exclude-without-positive-qrels
+
+python3 scripts/materialize-explicit-qrels.py \
+  --fixtures "${OPERATOR_SOURCE_FIXTURE_DIR}" \
+  --out-fixtures "${QREL_OPERATOR_FIXTURE_DIR}" \
+  --summary-out "${REPORT_DIR}/operator-debug-materialization-summary.json" \
+  --ranked-candidates-source none \
+  --profile generated_public \
+  --exclude-without-positive-qrels
+
+ELF_REAL_WORLD_LIVE_REPORT_DIR="${LIVE_REPORT_DIR}" \
+  ELF_REAL_WORLD_LIVE_FIXTURES="${QREL_FIXTURE_DIR}" \
+  ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES="${QREL_OPERATOR_FIXTURE_DIR}" \
+  ELF_REAL_WORLD_LIVE_WORK_DIR="${LIVE_WORK_DIR}" \
+  ELF_REAL_WORLD_LIVE_ELF_RUN_ID="real-world-memory-live-explicit-qrels-elf" \
+  ELF_REAL_WORLD_LIVE_QMD_RUN_ID="real-world-memory-live-explicit-qrels-qmd" \
+  ELF_REAL_WORLD_LIVE_COMBINED_RUN_ID="real-world-memory-live-elf-qmd-explicit-qrels-quantitative" \
+  bash scripts/real-world-live-adapters.sh
+
+# Fold the two materialization summaries and the live-adapter summary into one sweep
+# record. Both directory fields follow the same report-dir override so they cannot disagree.
+jq -n \
+  --slurpfile memory_summary "${REPORT_DIR}/memory-materialization-summary.json" \
+  --slurpfile operator_summary "${REPORT_DIR}/operator-debug-materialization-summary.json" \
+  --slurpfile live_summary "${LIVE_REPORT_DIR}/summary.json" \
+  '{
+    schema: "elf.real_world_live_explicit_qrels_sweep/v1",
+    generated_at: (now | todateiso8601),
+    artifact_dir: (env.ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR // "tmp/real-world-memory/live-explicit-qrels"),
+    # Derive from the same override as artifact_dir instead of hard-coding the default.
+    live_report_dir: ((env.ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR // "tmp/real-world-memory/live-explicit-qrels") + "/live-adapters"),
+    materialization: {memory: $memory_summary[0], operator_debugging_ux: $operator_summary[0]},
+    live_summary: $live_summary[0],
+    boundary: "Input fixtures have deterministic explicit qrels, but ranked candidates are product-runtime traces from the live adapters. This improves qrel-source evidence only; leaderboard claims still require pass rows, full ranked coverage, held-out/leakage audit evidence, and paired significance."
+  }' >"${REPORT_DIR}/summary.json"
+
+echo "Live explicit-qrel adapter reports:"
+echo "  ${REPORT_DIR}/memory-materialization-summary.json"
+echo "  ${REPORT_DIR}/operator-debug-materialization-summary.json"
+echo "  ${LIVE_REPORT_DIR}/elf-report.json"
+echo "  ${LIVE_REPORT_DIR}/qmd-report.json"
+echo "  ${LIVE_REPORT_DIR}/qmd-quantitative-product-manifest.json"
+echo "  ${LIVE_REPORT_DIR}/elf-qmd-quantitative-report.json"
+echo "  ${LIVE_REPORT_DIR}/elf-qmd-quantitative-report.md"
+echo "  ${REPORT_DIR}/summary.json"

From 33d66158079c83419efe48897738800e66ca27d7 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 11:29:00 -0400
Subject: [PATCH 02/58] {"schema":"decodex/commit/1","summary":"Port
 quantitative benchmark report surface","authority":"manual"}

---
 .../bin/real_world_job_benchmark/commands.rs  |  18 +-
 .../bin/real_world_job_benchmark/fixtures.rs  |  13 +
 .../src/bin/real_world_job_benchmark/main.rs  |   7 +
 .../bin/real_world_job_benchmark/markdown.rs  |   8 +-
 .../markdown/quantitative.rs                  |  84 +++
 .../real_world_job_benchmark/quantitative.rs  | 489 ++++++++++++++++++
 .../quantitative_reports.rs                   |  76 +++
 .../real_world_job_benchmark/report_root.rs   |   7 +-
 .../bin/real_world_job_benchmark/scoring.rs   |   4 +
 .../scoring/answers.rs                        |  44 +-
 .../tests/real_world_job_benchmark.rs         |   1 +
 .../markdown_rendering_generated.rs           |   3 +
 .../real_world_job_benchmark/quantitative.rs  | 160 ++++++
 13 files changed, 883 insertions(+), 31 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
index 91dc476f..3e7d4ce1 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
@@ -1,7 +1,7 @@
 use crate::{
 	AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path,
-	PathBuf, PrivateCorpusRedaction, PublishArgs, REPORT_SCHEMA, RealWorldJob, RealWorldReport,
-	Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
+	PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA,
+	RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
 };
 
 pub(super) fn run_command(args: RunArgs) -> Result<()> {
@@ -103,16 +103,26 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 	)?;
 	let scoreboard = crate::scoreboard_report(jobs, &job_reports, &summary, &external_adapters);
 	let operational_evidence = crate::operational_evidence_report(jobs, &job_reports);
+	let adapter = adapter_report(args)?;
+	let generated_at = OffsetDateTime::now_utc().format(&Rfc3339)?;
+	let quantitative_scoreboard = crate::quantitative_scoreboard_report(QuantitativeReportInput {
+		generated_at: generated_at.as_str(),
+		adapter: &adapter,
+		source_jobs: jobs,
+		jobs: &job_reports,
+		summary: &summary,
+	});
 
 	Ok(RealWorldReport {
 		schema: REPORT_SCHEMA.to_string(),
 		run_id: args.run_id.clone(),
-		generated_at: OffsetDateTime::now_utc().format(&Rfc3339)?,
+		generated_at,
 		runner_version: VERSION.to_string(),
 		corpus_profile: corpus_profile(jobs),
-		adapter: adapter_report(args)?,
+		adapter,
 		scoreboard,
 		operational_evidence,
+		quantitative_scoreboard,
 		external_adapters,
 		capture_integration: capture_integration_report(jobs),
 		summary,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs
index 32a5eb13..ad8dd669 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs
@@ -87,6 +87,8 @@ pub(super) struct ExpectedAnswer {
 	pub(super) must_not_include: Vec<String>,
 	#[serde(default)]
 	pub(super) evidence_links: BTreeMap<String, EvidenceLink>,
+	#[serde(default)]
+	pub(super) relevance_judgments: Vec<RelevanceJudgment>,
 	pub(super) answer_type: String,
 	#[serde(default)]
 	pub(super) accepted_alternates: Vec<Value>,
@@ -96,6 +98,13 @@ pub(super) struct ExpectedAnswer {
 	pub(super) requires_refusal: bool,
 }
 
+#[derive(Debug, Deserialize)]
+pub(super) struct RelevanceJudgment {
+	pub(super) evidence_id: String,
+	#[serde(default = "default_relevance_grade")]
+	pub(super) grade: f64,
+}
+
 #[derive(Debug, Deserialize)]
 pub(super) struct RequiredEvidence {
 	pub(super) evidence_id: String,
@@ -250,3 +259,7 @@ pub(super) struct AdapterResponse {
 	pub(super) answer: ProducedAnswer,
 	pub(super) consolidation: Option<ConsolidationFixture>,
 }
+
+fn default_relevance_grade() -> f64 {
+	1.0
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
index 9815886f..61715b35 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
@@ -16,6 +16,8 @@ mod job_reports;
 mod markdown;
 mod operational;
 mod operational_reports;
+mod quantitative;
+mod quantitative_reports;
 mod recovery;
 mod report_root;
 mod scoreboard;
@@ -84,6 +86,11 @@ use operational_reports::{
 	OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport,
 	OperationalResourceSummary,
 };
+use quantitative::{QuantitativeReportInput, quantitative_scoreboard_report};
+use quantitative_reports::{
+	QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow,
+	QuantitativePerQueryRow,
+};
 use report_root::RealWorldReport;
 use scoreboard::scoreboard_report;
 use scoreboard_reports::{
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs
index 36f9dba6..68bcb12a 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs
@@ -6,6 +6,7 @@ mod followups;
 mod header;
 mod jobs;
 mod operational;
+mod quantitative;
 mod scoreboard;
 mod trace;
 
@@ -16,9 +17,9 @@ use crate::{
 	AdapterScenarioJudgment, AdapterSource, AdapterStatusCounts, AdapterSuiteCoverage, CostReport,
 	DEFAULT_ADAPTER_BEHAVIOR, EvolutionJobReport, ExternalAdapterReport, KnowledgeSummary,
 	MemorySummaryReport, OperatorDebugEvidence, OperatorUxGap, ProactiveBriefSummaryReport,
-	RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, ScenarioOutcomeCounts,
-	ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, ScoreboardRow,
-	TraceExplainability, WorkContinuitySummaryReport,
+	QuantitativeBenchmarkRow, RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES,
+	ScenarioOutcomeCounts, ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport,
+	ScoreboardRow, TraceExplainability, WorkContinuitySummaryReport,
 	formatting::{
 		adapter_status_str, round3, scenario_comparison_outcome_str, status_str,
 		trace_failure_stage,
@@ -32,6 +33,7 @@ pub(super) fn render_markdown(report: &RealWorldReport, report_path: &Path) -> S
 
 	self::header::render_markdown_header(&mut out, report, report_path.as_str());
 	self::scoreboard::render_markdown_scoreboard(&mut out, report);
+	self::quantitative::render_markdown_quantitative_scoreboard(&mut out, report);
 	self::operational::render_markdown_operational_evidence(&mut out, report);
 	self::adapters::render_markdown_external_adapters(&mut out, report);
 	self::adapters::render_markdown_capture_integration(&mut out, report);
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs
new file mode 100644
index 00000000..1c3ec195
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs
@@ -0,0 +1,84 @@
+use crate::markdown::{self, QuantitativeBenchmarkRow, RealWorldReport};
+
+pub(super) fn render_markdown_quantitative_scoreboard(out: &mut String, report: &RealWorldReport) {
+	let scoreboard = &report.quantitative_scoreboard;
+
+	if scoreboard.schema.is_empty() {
+		return;
+	}
+
+	out.push_str("## Quantitative Benchmark Report\n\n");
+	out.push_str(concat!(
+		"Quantitative rows expose ranking metrics and their claim controls. ",
+		"Fixture-backed rows verify benchmark mechanics; leaderboard claims require explicit qrels, ",
+		"enough queries, and leakage controls.\n\n"
+	));
+	out.push_str(&format!("- Schema: `{}`\n", markdown::md_inline(scoreboard.schema.as_str())));
+	out.push_str(&format!("- Corpus: `{}`\n", markdown::md_inline(scoreboard.corpus_id.as_str())));
+	out.push_str(&format!(
+		"- k values: `{}`\n",
+		markdown::md_inline(
+			scoreboard
+				.k_values
+				.iter()
+				.map(usize::to_string)
+				.collect::<Vec<_>>()
+				.join(", ")
+				.as_str()
+		)
+	));
+	out.push_str(&format!(
+		"- Ranking queries: `{}` of `{}`; explicit-qrel queries: `{}`\n",
+		scoreboard.controls.current_ranking_query_count,
+		scoreboard.controls.current_query_count,
+		scoreboard.controls.current_explicit_qrel_query_count
+	));
+	out.push_str(&format!(
+		"- Leaderboard claim allowed: `{}`\n",
+		scoreboard.controls.leaderboard_claim_allowed
+	));
+	out.push_str(&format!(
+		"- Claim boundary: {}\n\n",
+		markdown::md_cell(scoreboard.claim_boundary.as_str())
+	));
+	out.push_str("| Product | State | Evidence | Qrels | Sample | Ranking Queries | Recall@5 | ");
+	out.push_str("Precision@5 | MRR | nDCG@5 | AP | Leaderboard |\n");
+	out.push_str(
+		"| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n",
+	);
+
+	for row in &scoreboard.rows {
+		out.push_str(&format!(
+			"| {} | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | `{}` |\n",
+			markdown::md_cell(row.product.as_str()),
+			markdown::md_inline(row.result_state.as_str()),
+			markdown::md_inline(row.evidence_class.as_str()),
+			markdown::md_inline(row.qrel_source.as_str()),
+			row.sample_size,
+			row.ranking_query_count,
+			quantitative_metric(row, "recall_at_5"),
+			quantitative_metric(row, "precision_at_5"),
+			quantitative_metric(row, "mrr"),
+			quantitative_metric(row, "ndcg_at_5"),
+			quantitative_metric(row, "average_precision"),
+			row.leaderboard_eligible
+		));
+	}
+
+	if !scoreboard.metrics_not_encoded.is_empty() {
+		out.push_str("\nMetrics not encoded:\n");
+
+		for metric in &scoreboard.metrics_not_encoded {
+			out.push_str(&format!("- `{}`\n", markdown::md_inline(metric.as_str())));
+		}
+
+		out.push('\n');
+	}
+}
+
+fn quantitative_metric(row: &QuantitativeBenchmarkRow, metric: &str) -> String {
+	row.metrics
+		.get(metric)
+		.and_then(|value| *value)
+		.map_or_else(|| "`n/a`".to_string(), |value| format!("`{}`", markdown::round3(value)))
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
new file mode 100644
index 00000000..fa96df20
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -0,0 +1,489 @@
+use crate::{
+	AdapterReport, BTreeMap, BTreeSet, JobReport, QuantitativeBenchmarkControls,
+	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldJob,
+	ReportSummary, formatting, scoring,
+};
+
+const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
+const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10];
+const MIN_LEADERBOARD_QUERY_COUNT: usize = 30;
+const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!(
+	"Quantitative metrics are bounded to this generated report. ",
+	"Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims."
+);
+
+pub(super) struct QuantitativeReportInput<'a> {
+	pub(super) generated_at: &'a str,
+	pub(super) adapter: &'a AdapterReport,
+	pub(super) source_jobs: &'a [RealWorldJob],
+	pub(super) jobs: &'a [JobReport],
+	pub(super) summary: &'a ReportSummary,
+}
+
+pub(super) fn quantitative_scoreboard_report(
+	input: QuantitativeReportInput<'_>,
+) -> QuantitativeBenchmarkReport {
+	let corpus_id = quantitative_corpus_id(input.source_jobs);
+	let evidence_class = quantitative_evidence_class(input.adapter, input.jobs);
+	let per_query_rows = quantitative_per_query_rows(
+		input.source_jobs,
+		input.jobs,
+		corpus_id.as_str(),
+		evidence_class,
+		input.adapter.adapter_id.as_str(),
+	);
+	let ranking_query_count = per_query_rows
+		.iter()
+		.filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0)
+		.count();
+	let explicit_qrel_query_count =
+		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
+	let metric_comparable = ranking_query_count > 0;
+	let leaderboard_eligible = false;
+	let result_state = quantitative_result_state(input.summary);
+	let row = QuantitativeBenchmarkRow {
+		product: "ELF".to_string(),
+		adapter_id: input.adapter.adapter_id.clone(),
+		adapter_name: input.adapter.name.clone(),
+		suite: quantitative_suite_id(input.jobs),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.clone()),
+		result_state: result_state.to_string(),
+		comparable: metric_comparable,
+		metric_comparable,
+		leaderboard_eligible,
+		held_out: false,
+		leakage_audited: false,
+		fixture_regression_only: evidence_class == "fixture_backed",
+		sample_size: input.jobs.len(),
+		ranking_query_count,
+		ranking_coverage_state: ranking_coverage_state(
+			input.summary,
+			input.source_jobs.len(),
+			ranking_query_count,
+		)
+		.to_string(),
+		ranked_candidate_source: ranked_candidate_source(ranking_query_count).to_string(),
+		qrel_source: aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
+			.to_string(),
+		explicit_qrel_query_count,
+		metrics: aggregate_metrics(per_query_rows.as_slice()),
+		metric_states: aggregate_metric_states(result_state, metric_comparable),
+		denominators: aggregate_denominators(per_query_rows.as_slice()),
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	};
+	let controls = QuantitativeBenchmarkControls {
+		same_corpus_required: true,
+		same_task_required: true,
+		ranked_candidates_required_for_ranking_metrics: true,
+		explicit_relevance_judgments_required_for_leaderboard: true,
+		minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT,
+		current_query_count: input.source_jobs.len(),
+		current_ranking_query_count: ranking_query_count,
+		current_explicit_qrel_query_count: explicit_qrel_query_count,
+		leaderboard_claim_allowed: leaderboard_eligible,
+		leakage_control:
+			"held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims"
+				.to_string(),
+	};
+
+	QuantitativeBenchmarkReport {
+		schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(),
+		generated_at: input.generated_at.to_string(),
+		corpus_id,
+		k_values: QUANTITATIVE_K_VALUES.to_vec(),
+		rows: vec![row],
+		per_query_rows,
+		metrics_not_encoded: vec![
+			"paired_significance".to_string(),
+			"external_product_manifest_import".to_string(),
+			"audit_manifest_validation".to_string(),
+		],
+		controls,
+		claim_boundary: concat!(
+			"Do not convert fixture mechanics, missing explicit qrels, ",
+			"or partial candidate coverage into product leaderboard claims."
+		)
+		.to_string(),
+	}
+}
+
+fn quantitative_per_query_rows(
+	source_jobs: &[RealWorldJob],
+	jobs: &[JobReport],
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> Vec<QuantitativePerQueryRow> {
+	source_jobs
+		.iter()
+		.zip(jobs.iter())
+		.map(|(source_job, job)| {
+			quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id)
+		})
+		.collect()
+}
+
+fn quantitative_per_query_row(
+	source_job: &RealWorldJob,
+	job: &JobReport,
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> QuantitativePerQueryRow {
+	let relevance = relevance_grades(source_job, job);
+	let candidates = scoring::produced_evidence_order(source_job);
+	let positive_relevance_count = positive_qrel_count(&relevance);
+	let metrics = per_query_metrics(candidates.as_slice(), &relevance);
+	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
+		"not_encoded"
+	} else {
+		formatting::status_str(job.status)
+	};
+	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
+	let denominators = per_query_denominators(candidates.len(), positive_relevance_count);
+
+	QuantitativePerQueryRow {
+		job_id: job.job_id.clone(),
+		suite: job.suite_id.clone(),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.to_string()),
+		result_state: formatting::status_str(job.status).to_string(),
+		expected_relevant_count: positive_relevance_count,
+		candidate_count: candidates.len(),
+		qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(),
+		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
+		product: "ELF".to_string(),
+		adapter_id: adapter_id.to_string(),
+		metrics,
+		metric_states,
+		denominators,
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	}
+}
+
+fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<String, f64> {
+	let explicit = source_job
+		.expected_answer
+		.relevance_judgments
+		.iter()
+		.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
+		.collect::<BTreeMap<_, _>>();
+
+	if !explicit.is_empty() {
+		return explicit;
+	}
+
+	job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
+}
+
+fn per_query_metrics(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> BTreeMap<String, Option<f64>> {
+	let mut metrics = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		let relevant_at_k = relevant_at_k(candidates, relevance, *k);
+
+		metrics
+			.insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance)));
+		metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k));
+		metrics.insert(
+			format!("success_at_{k}"),
+			Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)),
+		);
+	}
+
+	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
+	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
+	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
+
+	metrics
+}
+
+fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
+	candidates
+		.iter()
+		.take(k)
+		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
+		.count()
+}
+
+fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	Some(
+		candidates
+			.iter()
+			.position(|candidate| {
+				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
+			})
+			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
+	)
+}
+
+fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	let dcg = candidates
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, candidate)| {
+			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
+				/ ((index + 2) as f64).log2()
+		})
+		.sum::<f64>();
+	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
+
+	ideal.sort_by(|left, right| right.total_cmp(left));
+
+	let idcg = ideal
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
+		.sum::<f64>();
+
+	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
+}
+
+fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	let positive_count = positive_qrel_count(relevance);
+
+	if positive_count == 0 {
+		return None;
+	}
+
+	let mut hit_count = 0;
+	let mut precision_sum = 0.0;
+	let mut seen = BTreeSet::new();
+
+	for (index, candidate) in candidates.iter().enumerate() {
+		if !seen.insert(candidate.as_str()) {
+			continue;
+		}
+		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
+			hit_count += 1;
+			precision_sum += hit_count as f64 / (index + 1) as f64;
+		}
+	}
+
+	Some(precision_sum / positive_count as f64)
+}
+
+fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
+	let mut sums = BTreeMap::<String, (f64, usize)>::new();
+	let mut metrics = quantitative_metric_names()
+		.into_iter()
+		.map(|metric| (metric, None))
+		.collect::<BTreeMap<_, _>>();
+
+	for row in rows {
+		for (metric, value) in &row.metrics {
+			if let Some(value) = value {
+				let (sum, count) = sums.entry(metric.clone()).or_default();
+
+				*sum += *value;
+				*count += 1;
+			}
+		}
+	}
+	for (metric, (sum, count)) in sums {
+		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
+	}
+
+	metrics
+}
+
+fn aggregate_metric_states(
+	result_state: &str,
+	metric_comparable: bool,
+) -> BTreeMap<String, String> {
+	let state = if metric_comparable { result_state } else { "not_encoded" };
+	let mut states = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		states.insert(format!("recall_at_{k}"), state.to_string());
+		states.insert(format!("precision_at_{k}"), state.to_string());
+		states.insert(format!("success_at_{k}"), state.to_string());
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		states.insert(metric.to_string(), state.to_string());
+	}
+
+	states
+}
+
+fn quantitative_metric_names() -> Vec<String> {
+	let mut metrics = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metrics.push(format!("recall_at_{k}"));
+		metrics.push(format!("precision_at_{k}"));
+		metrics.push(format!("success_at_{k}"));
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		metrics.push(metric.to_string());
+	}
+
+	metrics
+}
+
+fn per_query_denominators(
+	candidate_count: usize,
+	expected_relevant_count: usize,
+) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
+		denominators.insert(format!("precision_at_{k}"), *k);
+		denominators.insert(format!("success_at_{k}"), 1);
+	}
+
+	denominators.insert("mrr".to_string(), expected_relevant_count);
+	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
+	denominators.insert("average_precision".to_string(), expected_relevant_count);
+	denominators.insert("candidate_count".to_string(), candidate_count);
+
+	denominators
+}
+
+fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		denominators.insert(
+			format!("recall_at_{k}"),
+			sum_per_query_denominator(rows, &format!("recall_at_{k}")),
+		);
+		denominators.insert(
+			format!("precision_at_{k}"),
+			sum_per_query_denominator(rows, &format!("precision_at_{k}")),
+		);
+		denominators.insert(
+			format!("success_at_{k}"),
+			sum_per_query_denominator(rows, &format!("success_at_{k}")),
+		);
+	}
+
+	denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr"));
+	denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5"));
+	denominators.insert(
+		"average_precision".to_string(),
+		sum_per_query_denominator(rows, "average_precision"),
+	);
+
+	denominators
+}
+
+fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
+	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
+}
+
+fn quantitative_corpus_id(source_jobs: &[RealWorldJob]) -> String {
+	let ids = source_jobs.iter().map(|job| job.corpus.corpus_id.as_str()).collect::<BTreeSet<_>>();
+
+	if ids.len() == 1 {
+		ids.into_iter().next().unwrap_or("unknown").to_string()
+	} else {
+		"mixed".to_string()
+	}
+}
+
+fn quantitative_suite_id(jobs: &[JobReport]) -> String {
+	let suites = jobs.iter().map(|job| job.suite_id.as_str()).collect::<BTreeSet<_>>();
+
+	if suites.len() == 1 {
+		suites.into_iter().next().unwrap_or("unknown").to_string()
+	} else {
+		"mixed".to_string()
+	}
+}
+
+fn quantitative_result_state(summary: &ReportSummary) -> &'static str {
+	if summary.unsupported_claim > 0 {
+		"unsupported_claim"
+	} else if summary.wrong_result > 0 {
+		"wrong_result"
+	} else if summary.incomplete > 0 {
+		"incomplete"
+	} else if summary.blocked > 0 {
+		"blocked"
+	} else if summary.not_encoded > 0 {
+		"not_encoded"
+	} else {
+		"pass"
+	}
+}
+
+fn quantitative_evidence_class(adapter: &AdapterReport, jobs: &[JobReport]) -> &'static str {
+	if adapter.behavior == "live_real_world_adapter" {
+		"live_real_world"
+	} else if jobs.iter().any(|job| job.operational_evidence_tier == "private_corpus") {
+		"private_corpus"
+	} else if jobs.iter().any(|job| job.operational_evidence_tier == "provider_backed") {
+		"provider_backed"
+	} else if adapter.behavior.contains("public_proxy") {
+		"public_proxy"
+	} else {
+		"fixture_backed"
+	}
+}
+
+fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
+	if !source_job.expected_answer.relevance_judgments.is_empty() {
+		"explicit_qrels"
+	} else if empty {
+		"not_encoded"
+	} else {
+		"expected_evidence_fallback"
+	}
+}
+
+fn aggregate_qrel_source(
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+) -> &'static str {
+	if ranking_query_count == 0 {
+		"not_encoded"
+	} else if explicit_qrel_query_count == ranking_query_count {
+		"explicit_qrels"
+	} else if explicit_qrel_query_count == 0 {
+		"expected_evidence_fallback"
+	} else {
+		"mixed"
+	}
+}
+
+fn ranking_coverage_state(
+	summary: &ReportSummary,
+	source_job_count: usize,
+	ranking_query_count: usize,
+) -> &'static str {
+	if ranking_query_count == 0 {
+		"not_encoded"
+	} else if ranking_query_count == source_job_count && summary.not_encoded == 0 {
+		"complete"
+	} else {
+		"partial_coverage"
+	}
+}
+
+fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
+	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
+}
+
+fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
+	relevance.values().filter(|grade| **grade > 0.0).count()
+}
+
+fn rate(numerator: usize, denominator: usize) -> Option<f64> {
+	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
new file mode 100644
index 00000000..73f2b1eb
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
@@ -0,0 +1,76 @@
+use crate::{BTreeMap, Deserialize, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(super) struct QuantitativeBenchmarkReport {
+	pub(super) schema: String,
+	pub(super) generated_at: String,
+	pub(super) corpus_id: String,
+	pub(super) k_values: Vec<usize>,
+	pub(super) rows: Vec<QuantitativeBenchmarkRow>,
+	#[serde(default)]
+	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
+	#[serde(default)]
+	pub(super) metrics_not_encoded: Vec<String>,
+	pub(super) controls: QuantitativeBenchmarkControls,
+	pub(super) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(super) struct QuantitativeBenchmarkRow {
+	pub(super) product: String,
+	pub(super) adapter_id: String,
+	pub(super) adapter_name: String,
+	pub(super) suite: String,
+	pub(super) evidence_class: String,
+	pub(super) source_manifest_corpus_id: Option<String>,
+	pub(super) result_state: String,
+	pub(super) comparable: bool,
+	pub(super) metric_comparable: bool,
+	pub(super) leaderboard_eligible: bool,
+	pub(super) held_out: bool,
+	pub(super) leakage_audited: bool,
+	pub(super) fixture_regression_only: bool,
+	pub(super) sample_size: usize,
+	pub(super) ranking_query_count: usize,
+	pub(super) ranking_coverage_state: String,
+	pub(super) ranked_candidate_source: String,
+	pub(super) qrel_source: String,
+	pub(super) explicit_qrel_query_count: usize,
+	pub(super) metrics: BTreeMap<String, Option<f64>>,
+	pub(super) metric_states: BTreeMap<String, String>,
+	pub(super) denominators: BTreeMap<String, usize>,
+	pub(super) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(super) struct QuantitativePerQueryRow {
+	pub(super) job_id: String,
+	pub(super) suite: String,
+	pub(super) evidence_class: String,
+	pub(super) source_manifest_corpus_id: Option<String>,
+	pub(super) result_state: String,
+	pub(super) expected_relevant_count: usize,
+	pub(super) candidate_count: usize,
+	pub(super) qrel_source: String,
+	pub(super) relevance_grade_sum: f64,
+	pub(super) product: String,
+	pub(super) adapter_id: String,
+	pub(super) metrics: BTreeMap<String, Option<f64>>,
+	pub(super) metric_states: BTreeMap<String, String>,
+	pub(super) denominators: BTreeMap<String, usize>,
+	pub(super) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(super) struct QuantitativeBenchmarkControls {
+	pub(super) same_corpus_required: bool,
+	pub(super) same_task_required: bool,
+	pub(super) ranked_candidates_required_for_ranking_metrics: bool,
+	pub(super) explicit_relevance_judgments_required_for_leaderboard: bool,
+	pub(super) minimum_query_count_for_leaderboard: usize,
+	pub(super) current_query_count: usize,
+	pub(super) current_ranking_query_count: usize,
+	pub(super) current_explicit_qrel_query_count: usize,
+	pub(super) leaderboard_claim_allowed: bool,
+	pub(super) leakage_control: String,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs
index 9ee62f1e..797eb2ba 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs
@@ -1,7 +1,8 @@
 use crate::{
 	AdapterReport, CaptureIntegrationReport, Deserialize, EvolutionSummary, ExternalAdapterSection,
-	FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction, ReportSummary,
-	ScoreboardReport, Serialize, SuiteReport, UnsupportedClaimReport,
+	FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction,
+	QuantitativeBenchmarkReport, ReportSummary, ScoreboardReport, Serialize, SuiteReport,
+	UnsupportedClaimReport,
 };
 
 #[derive(Debug, Deserialize, Serialize)]
@@ -17,6 +18,8 @@ pub(super) struct RealWorldReport {
 	#[serde(default)]
 	pub(super) operational_evidence: OperationalEvidenceReport,
 	#[serde(default)]
+	pub(super) quantitative_scoreboard: QuantitativeBenchmarkReport,
+	#[serde(default)]
 	pub(super) external_adapters: ExternalAdapterSection,
 	pub(super) capture_integration: CaptureIntegrationReport,
 	pub(super) summary: ReportSummary,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs
index 088a8842..2f0f34a7 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs
@@ -27,6 +27,10 @@ pub(super) fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport {
 	reports::job_report(job, scoring)
 }
 
+pub(super) fn produced_evidence_order(job: &RealWorldJob) -> Vec<String> {
+	self::answers::ordered_produced_evidence_ids(self::answers::produced_answer(job))
+}
+
 pub(super) fn score_job(job: &RealWorldJob) -> JobScoring {
 	let answer = self::answers::produced_answer(job);
 	let produced_evidence = self::answers::produced_evidence_ids(answer);
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs
index 3e60e5b1..1e2d85ed 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs
@@ -61,28 +61,7 @@ pub(super) fn trap_ids_used(
 		.collect()
 }
 
-fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer {
-	let _ = job;
-
-	static EMPTY_ANSWER: std::sync::OnceLock<ProducedAnswer> = std::sync::OnceLock::new();
-
-	EMPTY_ANSWER.get_or_init(|| ProducedAnswer {
-		content: String::new(),
-		claims: Vec::new(),
-		evidence_ids: Vec::new(),
-		pages: Vec::new(),
-		memory_summaries: Vec::new(),
-		proactive_briefs: Vec::new(),
-		scheduled_tasks: Vec::new(),
-		work_journal_readbacks: Vec::new(),
-		recovery_drills: Vec::new(),
-		latency_ms: None,
-		cost: None,
-		trace_explainability: None,
-	})
-}
-
-fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec<String> {
+pub(super) fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec<String> {
 	let mut seen = BTreeSet::new();
 	let mut evidence = Vec::new();
 
@@ -180,6 +159,27 @@ fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec<String> {
 	evidence
 }
 
+fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer {
+	let _ = job;
+
+	static EMPTY_ANSWER: std::sync::OnceLock<ProducedAnswer> = std::sync::OnceLock::new();
+
+	EMPTY_ANSWER.get_or_init(|| ProducedAnswer {
+		content: String::new(),
+		claims: Vec::new(),
+		evidence_ids: Vec::new(),
+		pages: Vec::new(),
+		memory_summaries: Vec::new(),
+		proactive_briefs: Vec::new(),
+		scheduled_tasks: Vec::new(),
+		work_journal_readbacks: Vec::new(),
+		recovery_drills: Vec::new(),
+		latency_ms: None,
+		cost: None,
+		trace_explainability: None,
+	})
+}
+
 fn push_ordered_evidence(
 	evidence: &mut Vec<String>,
 	seen: &mut BTreeSet<String>,
diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs
index 6df392ce..6aa5cecb 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark.rs
@@ -20,6 +20,7 @@
 #[path = "real_world_job_benchmark/operator_debug.rs"] mod operator_debug;
 #[path = "real_world_job_benchmark/proactive_brief.rs"] mod proactive_brief;
 #[path = "real_world_job_benchmark/production_ops.rs"] mod production_ops;
+#[path = "real_world_job_benchmark/quantitative.rs"] mod quantitative;
 #[path = "real_world_job_benchmark/recall_debug_reports.rs"] mod recall_debug_reports;
 #[path = "real_world_job_benchmark/retrieval.rs"] mod retrieval;
 #[path = "real_world_job_benchmark/root_aggregate.rs"] mod root_aggregate;
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs
index f5a395c8..dc83515a 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs
@@ -38,6 +38,9 @@ fn generated_json_report_renders_markdown() -> Result<()> {
 	assert!(markdown.contains("# Real-World Job Benchmark Report"));
 	assert!(markdown.contains("work_resume"));
 	assert!(markdown.contains("Capture And Integration Coverage"));
+	assert!(markdown.contains("Quantitative Benchmark Report"));
+	assert!(markdown.contains("leaderboard claims require explicit qrels"));
+	assert!(markdown.contains("| ELF | `pass` | `fixture_backed`"));
 	assert!(markdown.contains("External Adapter Coverage"));
 	assert!(markdown.contains("live-baseline-only"));
 	assert!(markdown.contains("live real-world"));
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
new file mode 100644
index 00000000..675dbeb3
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
@@ -0,0 +1,160 @@
+use std::{env, fs, process};
+
+use color_eyre::{Result, eyre};
+use serde_json::Value;
+
+use crate::support;
+
+#[test]
+fn adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+
+	assert_eq!(
+		report.pointer("/quantitative_scoreboard/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_benchmark/v1")
+	);
+	assert_eq!(
+		report.pointer("/quantitative_scoreboard/generated_at").and_then(Value::as_str),
+		report.pointer("/generated_at").and_then(Value::as_str)
+	);
+	assert_eq!(
+		report.pointer("/quantitative_scoreboard/k_values").and_then(Value::as_array),
+		Some(&vec![Value::from(1), Value::from(3), Value::from(5), Value::from(10),])
+	);
+	assert_eq!(
+		report
+			.pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed")
+			.and_then(Value::as_bool),
+		Some(false)
+	);
+	assert_eq!(
+		report
+			.pointer("/quantitative_scoreboard/controls/current_query_count")
+			.and_then(Value::as_u64),
+		report.pointer("/summary/job_count").and_then(Value::as_u64)
+	);
+
+	assert_quantitative_row_contract(&report)?;
+	assert_quantitative_per_query_contract(&report)?;
+
+	Ok(())
+}
+
+#[test]
+fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> {
+	let source_path =
+		support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json");
+	let mut job = serde_json::from_str::<Value>(&fs::read_to_string(source_path)?)?;
+
+	support::set_json_pointer(
+		&mut job,
+		"/corpus/adapter_response/answer/evidence_ids",
+		serde_json::json!(["old-provider-note", "current-provider-report"]),
+	)?;
+
+	job.pointer_mut("/expected_answer")
+		.and_then(Value::as_object_mut)
+		.ok_or_else(|| eyre::eyre!("missing expected_answer object"))?
+		.insert(
+			"relevance_judgments".to_string(),
+			serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]),
+		);
+
+	let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id()));
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?;
+
+	let report = support::run_json_report_from(temp_dir)?;
+	let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?;
+	let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?;
+
+	assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels"));
+	assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1));
+	assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0));
+	assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0));
+	assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5));
+	assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5));
+	assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1));
+
+	let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?;
+	let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?;
+
+	assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels"));
+	assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5));
+	assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1));
+
+	Ok(())
+}
+
+fn assert_quantitative_row_contract(report: &Value) -> Result<()> {
+	let rows = support::array_at(report, "/quantitative_scoreboard/rows")?;
+
+	assert_eq!(rows.len(), 1);
+
+	let row = &rows[0];
+
+	assert_eq!(row.pointer("/product").and_then(Value::as_str), Some("ELF"));
+	assert_eq!(row.pointer("/adapter_id").and_then(Value::as_str), Some("fixture_smoke"));
+	assert_eq!(row.pointer("/suite").and_then(Value::as_str), Some("adversarial_quality"));
+	assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed"));
+	assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("pass"));
+	assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(true));
+	assert_eq!(row.pointer("/metric_comparable").and_then(Value::as_bool), Some(true));
+	assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false));
+	assert_eq!(row.pointer("/fixture_regression_only").and_then(Value::as_bool), Some(true));
+	assert_eq!(row.pointer("/ranking_coverage_state").and_then(Value::as_str), Some("complete"));
+	assert_eq!(
+		row.pointer("/ranked_candidate_source").and_then(Value::as_str),
+		Some("produced_evidence_order")
+	);
+	assert_eq!(
+		row.pointer("/qrel_source").and_then(Value::as_str),
+		Some("expected_evidence_fallback")
+	);
+	assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0));
+
+	for metric in [
+		"recall_at_1",
+		"precision_at_1",
+		"success_at_1",
+		"recall_at_5",
+		"precision_at_5",
+		"success_at_5",
+		"mrr",
+		"ndcg_at_5",
+		"average_precision",
+	] {
+		assert!(row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64).is_some());
+		assert_eq!(
+			row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str),
+			Some("pass")
+		);
+		assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some());
+	}
+
+	Ok(())
+}
+
+fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> {
+	let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?;
+	let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0);
+
+	assert_eq!(rows.len() as u64, job_count);
+
+	for row in rows {
+		assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed"));
+		assert_eq!(
+			row.pointer("/qrel_source").and_then(Value::as_str),
+			Some("expected_evidence_fallback")
+		);
+		assert!(row.pointer("/candidate_count").and_then(Value::as_u64).is_some());
+		assert!(row.pointer("/expected_relevant_count").and_then(Value::as_u64).is_some());
+		assert!(row.pointer("/metrics/recall_at_5").is_some());
+		assert!(row.pointer("/metrics/precision_at_5").is_some());
+		assert!(row.pointer("/metrics/ndcg_at_5").is_some());
+		assert!(row.pointer("/metrics/average_precision").is_some());
+	}
+
+	Ok(())
+}

From a92363be1975089599bdccfa9c879229e1d19097 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 11:46:21 -0400
Subject: [PATCH 03/58] {"schema":"decodex/commit/1","summary":"Port
 quantitative product manifest import export","authority":"manual"}

---
 .../src/bin/real_world_job_benchmark/cli.rs   |  27 ++
 .../bin/real_world_job_benchmark/commands.rs  |  21 +-
 .../src/bin/real_world_job_benchmark/main.rs  |  11 +-
 .../real_world_job_benchmark/quantitative.rs  | 284 +++++++++++++++++-
 .../quantitative_reports.rs                   |  11 +
 .../real_world_job_benchmark/quantitative.rs  | 218 +++++++++++++-
 6 files changed, 552 insertions(+), 20 deletions(-)

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
index e1bc6f32..ddcf4a7e 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
@@ -51,6 +51,9 @@ pub(super) struct RunArgs {
 	/// Skip loading the real-world external adapter coverage manifest.
 	#[arg(long)]
 	pub(super) skip_external_adapter_manifest: bool,
+	/// Optional same-corpus quantitative product manifest to merge into the report.
+	#[arg(long, value_name = "FILE")]
+	pub(super) quantitative_product_manifest: Option<PathBuf>,
 }
 
 #[derive(Debug, Parser)]
@@ -63,9 +66,33 @@ pub(super) struct PublishArgs {
 	pub(super) out: Option<PathBuf>,
 }
 
+#[derive(Debug, Parser)]
+pub(super) struct ExportQuantitativeProductManifestArgs {
+	/// Generated real_world_job JSON report to export.
+	#[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)]
+	pub(super) report: PathBuf,
+	/// Write product manifest JSON to this file. Omit to print to stdout.
+	#[arg(long, value_name = "FILE")]
+	pub(super) out: Option<PathBuf>,
+	/// Stable manifest id. Defaults to <run_id>-quantitative-product-manifest.
+	#[arg(long)]
+	pub(super) manifest_id: Option<String>,
+	/// Override the exported product name.
+	#[arg(long)]
+	pub(super) product: Option<String>,
+	/// Override the exported adapter id.
+	#[arg(long)]
+	pub(super) adapter_id: Option<String>,
+	/// Override the exported adapter name.
+	#[arg(long)]
+	pub(super) adapter_name: Option<String>,
+}
+
 #[derive(Debug, Subcommand)]
 #[command(rename_all = "kebab")]
 pub(super) enum Command {
+	/// Export the primary quantitative row as a reusable product manifest.
+	ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs),
 	/// Parse and score real_world_job fixtures, then emit a JSON report.
 	Run(RunArgs),
 	/// Render Markdown from a generated real_world_job JSON report.
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
index 3e7d4ce1..c36fedd4 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
@@ -1,7 +1,8 @@
 use crate::{
-	AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path,
-	PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA,
-	RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
+	AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile,
+	ExportQuantitativeProductManifestArgs, OffsetDateTime, Path, PathBuf, PrivateCorpusRedaction,
+	PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, RealWorldJob, RealWorldReport, Result,
+	Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
 };
 
 pub(super) fn run_command(args: RunArgs) -> Result<()> {
@@ -20,6 +21,17 @@ pub(super) fn publish_command(args: PublishArgs) -> Result<()> {
 	write_or_print(args.out.as_deref(), markdown.as_str())
 }
 
+pub(super) fn export_quantitative_product_manifest_command(
+	args: ExportQuantitativeProductManifestArgs,
+) -> Result<()> {
+	let raw = fs::read_to_string(&args.report)?;
+	let report = serde_json::from_str::<RealWorldReport>(&raw)?;
+	let manifest = crate::quantitative_product_manifest_from_report(&report, &args)?;
+	let json = serde_json::to_string_pretty(&manifest)?;
+
+	write_or_print(args.out.as_deref(), json.as_str())
+}
+
 fn load_jobs(path: &Path) -> Result<Vec<RealWorldJob>> {
 	let paths = fixture_paths(path)?;
 	let mut jobs = Vec::with_capacity(paths.len());
@@ -111,7 +123,8 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 		source_jobs: jobs,
 		jobs: &job_reports,
 		summary: &summary,
-	});
+		product_manifest_path: args.quantitative_product_manifest.as_deref(),
+	})?;
 
 	Ok(RealWorldReport {
 		schema: REPORT_SCHEMA.to_string(),
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
index 61715b35..f8bbf36b 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
@@ -51,7 +51,7 @@ use artifacts::{
 	WorkJournalNextStepArtifact, WorkJournalReadbackArtifact, WorkJournalRejectedOptionArtifact,
 	WorkJournalWhereStoppedArtifact,
 };
-use cli::{Args, Command, PublishArgs, RunArgs};
+use cli::{Args, Command, ExportQuantitativeProductManifestArgs, PublishArgs, RunArgs};
 use diagnostic_reports::{
 	OperatorDebugEvidence, OperatorUxGap, TraceExplainability, TraceStageExplainability,
 };
@@ -86,10 +86,13 @@ use operational_reports::{
 	OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport,
 	OperationalResourceSummary,
 };
-use quantitative::{QuantitativeReportInput, quantitative_scoreboard_report};
+use quantitative::{
+	QuantitativeReportInput, quantitative_product_manifest_from_report,
+	quantitative_scoreboard_report,
+};
 use quantitative_reports::{
 	QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow,
-	QuantitativePerQueryRow,
+	QuantitativePerQueryRow, QuantitativeProductManifest,
 };
 use report_root::RealWorldReport;
 use scoreboard::scoreboard_report;
@@ -174,6 +177,8 @@ fn main() -> Result<()> {
 	color_eyre::install()?;
 
 	match Args::parse().command {
+		Command::ExportQuantitativeProductManifest(args) =>
+			commands::export_quantitative_product_manifest_command(args),
 		Command::Run(args) => commands::run_command(args),
 		Command::Publish(args) => commands::publish_command(args),
 	}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index fa96df20..51d1c07e 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -1,10 +1,13 @@
 use crate::{
-	AdapterReport, BTreeMap, BTreeSet, JobReport, QuantitativeBenchmarkControls,
-	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldJob,
-	ReportSummary, formatting, scoring,
+	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeProductManifestArgs, JobReport, Path,
+	QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow,
+	QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob,
+	RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring,
 };
 
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
+const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str =
+	"elf.agent_memory_quantitative_product_manifest/v1";
 const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10];
 const MIN_LEADERBOARD_QUERY_COUNT: usize = 30;
 const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!(
@@ -18,11 +21,12 @@ pub(super) struct QuantitativeReportInput<'a> {
 	pub(super) source_jobs: &'a [RealWorldJob],
 	pub(super) jobs: &'a [JobReport],
 	pub(super) summary: &'a ReportSummary,
+	pub(super) product_manifest_path: Option<&'a Path>,
 }
 
 pub(super) fn quantitative_scoreboard_report(
 	input: QuantitativeReportInput<'_>,
-) -> QuantitativeBenchmarkReport {
+) -> Result<QuantitativeBenchmarkReport> {
 	let corpus_id = quantitative_corpus_id(input.source_jobs);
 	let evidence_class = quantitative_evidence_class(input.adapter, input.jobs);
 	let per_query_rows = quantitative_per_query_rows(
@@ -72,6 +76,16 @@ pub(super) fn quantitative_scoreboard_report(
 		denominators: aggregate_denominators(per_query_rows.as_slice()),
 		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
 	};
+	let product_manifest =
+		quantitative_product_manifest(input.product_manifest_path, corpus_id.as_str())?;
+	let imported_row_count = product_manifest.rows.len();
+	let imported_per_query_count = product_manifest.per_query_rows.len();
+	let mut rows = vec![row];
+	let mut merged_per_query_rows = per_query_rows;
+
+	rows.extend(product_manifest.rows);
+	merged_per_query_rows.extend(product_manifest.per_query_rows);
+
 	let controls = QuantitativeBenchmarkControls {
 		same_corpus_required: true,
 		same_task_required: true,
@@ -87,25 +101,271 @@ pub(super) fn quantitative_scoreboard_report(
 				.to_string(),
 	};
 
-	QuantitativeBenchmarkReport {
+	Ok(QuantitativeBenchmarkReport {
 		schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(),
 		generated_at: input.generated_at.to_string(),
 		corpus_id,
 		k_values: QUANTITATIVE_K_VALUES.to_vec(),
-		rows: vec![row],
-		per_query_rows,
-		metrics_not_encoded: vec![
-			"paired_significance".to_string(),
-			"external_product_manifest_import".to_string(),
-			"audit_manifest_validation".to_string(),
-		],
+		rows,
+		per_query_rows: merged_per_query_rows,
+		metrics_not_encoded: quantitative_metrics_not_encoded(
+			imported_row_count,
+			imported_per_query_count,
+		),
 		controls,
 		claim_boundary: concat!(
 			"Do not convert fixture mechanics, missing explicit qrels, ",
 			"or partial candidate coverage into product leaderboard claims."
 		)
 		.to_string(),
+	})
+}
+
+pub(super) fn quantitative_product_manifest_from_report(
+	report: &RealWorldReport,
+	args: &ExportQuantitativeProductManifestArgs,
+) -> Result<QuantitativeProductManifest> {
+	if report.schema != REPORT_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {REPORT_SCHEMA}.",
+			args.report.display(),
+			report.schema
+		));
+	}
+
+	let source_row =
+		report.quantitative_scoreboard.rows.first().ok_or_else(|| {
+			eyre::eyre!("{} has no quantitative product row.", args.report.display())
+		})?;
+	let source_product = source_row.product.as_str();
+	let source_adapter_id = source_row.adapter_id.as_str();
+	let product = args.product.as_deref().unwrap_or(source_product).trim();
+	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
+	let adapter_name =
+		args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim();
+
+	if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() {
+		return Err(eyre::eyre!(
+			"{} cannot export an incomplete quantitative product identity.",
+			args.report.display()
+		));
+	}
+	if product == "ELF" {
+		return Err(eyre::eyre!(
+			"{} exports product ELF; use --product for external product manifest exports.",
+			args.report.display()
+		));
+	}
+
+	let mut row = source_row.clone();
+
+	row.product = product.to_string();
+	row.adapter_id = adapter_id.to_string();
+	row.adapter_name = adapter_name.to_string();
+	row.claim_boundary = concat!(
+		"Exported from a generated real_world_job_report quantitative row; ",
+		"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
+	)
+	.to_string();
+
+	let mut per_query_rows = Vec::new();
+
+	for row in &report.quantitative_scoreboard.per_query_rows {
+		if row.product != source_product || row.adapter_id != source_adapter_id {
+			continue;
+		}
+
+		let mut row = row.clone();
+
+		row.product = product.to_string();
+		row.adapter_id = adapter_id.to_string();
+		row.claim_boundary = concat!(
+			"Exported from generated report per-query quantitative evidence; ",
+			"import does not relax paired-significance or leaderboard gates."
+		)
+		.to_string();
+
+		per_query_rows.push(row);
 	}
+
+	let manifest = QuantitativeProductManifest {
+		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)),
+		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
+		rows: vec![row],
+		per_query_rows,
+	};
+
+	validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?;
+
+	Ok(manifest)
+}
+
+fn quantitative_product_manifest(
+	path: Option<&Path>,
+	corpus_id: &str,
+) -> Result<QuantitativeProductManifest> {
+	let Some(path) = path else {
+		return Ok(QuantitativeProductManifest::default());
+	};
+	let raw = fs::read_to_string(path)?;
+	let mut manifest =
+		serde_json::from_str::<QuantitativeProductManifest>(&raw).map_err(|err| {
+			eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display())
+		})?;
+
+	for row in &mut manifest.rows {
+		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
+	}
+	for row in &mut manifest.per_query_rows {
+		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
+	}
+
+	validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
+
+	Ok(manifest)
+}
+
+fn validate_quantitative_product_manifest(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> {
+	if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() {
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.corpus_id != corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected same-corpus {}.",
+			path.display(),
+			manifest.corpus_id,
+			corpus_id
+		));
+	}
+	if manifest.rows.is_empty() {
+		return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display()));
+	}
+
+	let row_keys = manifest
+		.rows
+		.iter()
+		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
+		.collect::<BTreeSet<_>>();
+
+	for row in &manifest.rows {
+		if row.product == "ELF" {
+			return Err(eyre::eyre!(
+				"{} quantitative product manifest must not inject ELF self rows.",
+				path.display()
+			));
+		}
+		if row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.adapter_name.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative product row.",
+				path.display()
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+	}
+	for row in &manifest.per_query_rows {
+		if row.job_id.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+			|| row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.qrel_source.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative per-query product row.",
+				path.display()
+			));
+		}
+		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} has no matching product row.",
+				path.display(),
+				row.product,
+				row.adapter_id
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+	}
+	for row in &manifest.rows {
+		if row.ranking_query_count == 0 {
+			continue;
+		}
+
+		let per_query_count = manifest
+			.per_query_rows
+			.iter()
+			.filter(|per_query| {
+				per_query.product == row.product && per_query.adapter_id == row.adapter_id
+			})
+			.count();
+
+		if per_query_count < row.ranking_query_count {
+			return Err(eyre::eyre!(
+				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				row.ranking_query_count,
+				per_query_count
+			));
+		}
+	}
+
+	Ok(())
+}
+
+fn quantitative_metrics_not_encoded(
+	imported_row_count: usize,
+	imported_per_query_count: usize,
+) -> Vec<String> {
+	let mut metrics =
+		vec!["paired_significance".to_string(), "audit_manifest_validation".to_string()];
+
+	if imported_row_count == 0 {
+		metrics.push("external_product_manifest_import".to_string());
+	}
+	if imported_row_count > 0 && imported_per_query_count == 0 {
+		metrics.push("imported_product_per_query_rows".to_string());
+	}
+
+	metrics
 }
 
 fn quantitative_per_query_rows(
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
index 73f2b1eb..a4552032 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
@@ -74,3 +74,14 @@ pub(super) struct QuantitativeBenchmarkControls {
 	pub(super) leaderboard_claim_allowed: bool,
 	pub(super) leakage_control: String,
 }
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(super) struct QuantitativeProductManifest {
+	pub(super) schema: String,
+	pub(super) manifest_id: String,
+	pub(super) corpus_id: String,
+	#[serde(default)]
+	pub(super) rows: Vec<QuantitativeBenchmarkRow>,
+	#[serde(default)]
+	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
index 675dbeb3..b350eb3f 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
@@ -1,4 +1,8 @@
-use std::{env, fs, process};
+use std::{
+	env, fs,
+	path::Path,
+	process::{self, Command},
+};
 
 use color_eyre::{Result, eyre};
 use serde_json::Value;
@@ -87,6 +91,218 @@ fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> {
 	Ok(())
 }
 
+#[test]
+fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir =
+		env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--product")
+		.arg("Synthetic Rival")
+		.arg("--adapter-id")
+		.arg("synthetic_rival")
+		.arg("--adapter-name")
+		.arg("Synthetic Rival adapter")
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"product manifest export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let manifest = support::load_json(&manifest_path)?;
+
+	assert_eq!(
+		manifest.pointer("/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_product_manifest/v1")
+	);
+	assert_eq!(
+		manifest.pointer("/rows/0/product").and_then(Value::as_str),
+		Some("Synthetic Rival")
+	);
+	assert_eq!(
+		manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str),
+		Some("synthetic_rival")
+	);
+
+	let imported = run_report_with_quantitative_manifest(&manifest_path)?;
+	let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?;
+	let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?;
+
+	assert_eq!(rows.len(), 2);
+	assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival"));
+	assert!(!support::array_contains_str(
+		&imported,
+		"/quantitative_scoreboard/metrics_not_encoded",
+		"external_product_manifest_import"
+	)?);
+	assert!(
+		support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any(
+			|row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival")
+		)
+	);
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir = env::temp_dir()
+		.join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("elf-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.output()?;
+
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF"));
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir = env::temp_dir()
+		.join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--product")
+		.arg("Synthetic Rival")
+		.arg("--adapter-id")
+		.arg("synthetic_rival")
+		.arg("--adapter-name")
+		.arg("Synthetic Rival adapter")
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"product manifest export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let mut manifest = support::load_json(&manifest_path)?;
+
+	support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?;
+	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(&manifest_path)
+		.output()?;
+
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus"));
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir = env::temp_dir()
+		.join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("missing-per-query-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--product")
+		.arg("Synthetic Rival")
+		.arg("--adapter-id")
+		.arg("synthetic_rival")
+		.arg("--adapter-name")
+		.arg("Synthetic Rival adapter")
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"product manifest export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let mut manifest = support::load_json(&manifest_path)?;
+
+	support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?;
+	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(&manifest_path)
+		.output()?;
+
+	assert!(!output.status.success());
+
+	let stderr = String::from_utf8_lossy(&output.stderr);
+
+	assert!(stderr.contains("ranked queries but only 0"));
+
+	Ok(())
+}
+
+fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result<Value> {
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(manifest_path)
+		.output()?;
+
+	assert!(
+		output.status.success(),
+		"real_world_job runner failed: {}",
+		String::from_utf8_lossy(&output.stderr)
+	);
+
+	Ok(serde_json::from_slice(&output.stdout)?)
+}
+
 fn assert_quantitative_row_contract(report: &Value) -> Result<()> {
 	let rows = support::array_at(report, "/quantitative_scoreboard/rows")?;
 

From 4ee6bae78890b98d3627a6bcfc4f197b0d9f717c Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:01:09 -0400
Subject: [PATCH 04/58] {"schema":"decodex/commit/1","summary":"Port
 quantitative audit manifest gates","authority":"manual"}

---
 .../src/bin/real_world_job_benchmark/cli.rs   |  39 ++
 .../bin/real_world_job_benchmark/commands.rs  |  18 +-
 .../src/bin/real_world_job_benchmark/main.rs  |  16 +-
 .../real_world_job_benchmark/quantitative.rs  | 541 +++++++++++++++++-
 .../quantitative_reports.rs                   |  29 +
 .../real_world_job_benchmark/quantitative.rs  | 121 ++++
 6 files changed, 748 insertions(+), 16 deletions(-)

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
index ddcf4a7e..bae29a2e 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
@@ -54,6 +54,9 @@ pub(super) struct RunArgs {
 	/// Optional same-corpus quantitative product manifest to merge into the report.
 	#[arg(long, value_name = "FILE")]
 	pub(super) quantitative_product_manifest: Option<PathBuf>,
+	/// Optional audit manifest proving the current quantitative row's held-out/leakage gates.
+	#[arg(long, value_name = "FILE")]
+	pub(super) quantitative_audit_manifest: Option<PathBuf>,
 }
 
 #[derive(Debug, Parser)]
@@ -88,9 +91,45 @@ pub(super) struct ExportQuantitativeProductManifestArgs {
 	pub(super) adapter_name: Option<String>,
 }
 
+#[derive(Debug, Parser)]
+pub(super) struct ExportQuantitativeAuditManifestArgs {
+	/// Fixture file or directory containing current product-runtime real_world_job outputs.
+	#[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)]
+	pub(super) fixtures: PathBuf,
+	/// Write audit manifest JSON to this file. Omit to print to stdout.
+	#[arg(long, value_name = "FILE")]
+	pub(super) out: Option<PathBuf>,
+	/// Stable run id that the audit manifest is allowed to attest.
+	#[arg(long, default_value = DEFAULT_RUN_ID)]
+	pub(super) run_id: String,
+	/// Stable manifest id. Defaults to <run_id>-quantitative-audit-manifest.
+	#[arg(long)]
+	pub(super) manifest_id: Option<String>,
+	/// Product name for the current row.
+	#[arg(long, default_value = "ELF")]
+	pub(super) product: String,
+	/// Adapter id for the current row.
+	#[arg(long, default_value = DEFAULT_ADAPTER_ID)]
+	pub(super) adapter_id: String,
+	/// Mark the current row as held-out only when query ids were locked before runtime.
+	#[arg(long)]
+	pub(super) held_out: bool,
+	/// Mark the current row as leakage audited only when runtime inputs excluded answers/qrels.
+	#[arg(long)]
+	pub(super) leakage_audited: bool,
+	/// Audit control string. Repeat for multiple controls.
+	#[arg(long = "control")]
+	pub(super) controls: Vec<String>,
+	/// Claim boundary recorded in the audit manifest.
+	#[arg(long)]
+	pub(super) claim_boundary: Option<String>,
+}
+
 #[derive(Debug, Subcommand)]
 #[command(rename_all = "kebab")]
 pub(super) enum Command {
+	/// Export a quantitative audit manifest for the current fixture set.
+	ExportQuantitativeAuditManifest(ExportQuantitativeAuditManifestArgs),
 	/// Export the primary quantitative row as a reusable product manifest.
 	ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs),
 	/// Parse and score real_world_job fixtures, then emit a JSON report.
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
index c36fedd4..a151e6da 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
@@ -1,8 +1,8 @@
 use crate::{
 	AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile,
-	ExportQuantitativeProductManifestArgs, OffsetDateTime, Path, PathBuf, PrivateCorpusRedaction,
-	PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, RealWorldJob, RealWorldReport, Result,
-	Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
+	ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, OffsetDateTime,
+	Path, PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA,
+	RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
 };
 
 pub(super) fn run_command(args: RunArgs) -> Result<()> {
@@ -32,6 +32,16 @@ pub(super) fn export_quantitative_product_manifest_command(
 	write_or_print(args.out.as_deref(), json.as_str())
 }
 
+pub(super) fn export_quantitative_audit_manifest_command(
+	args: ExportQuantitativeAuditManifestArgs,
+) -> Result<()> {
+	let jobs = load_jobs(&args.fixtures)?; // fixtures may be a file or a directory per the CLI help
+	let manifest = crate::quantitative_audit_manifest_from_jobs(jobs.as_slice(), &args)?;
+	let json = serde_json::to_string_pretty(&manifest)?; // pretty-printed JSON document
+
+	write_or_print(args.out.as_deref(), json.as_str()) // NOTE(review): presumably stdout if no --out
+}
+
 fn load_jobs(path: &Path) -> Result<Vec<RealWorldJob>> {
 	let paths = fixture_paths(path)?;
 	let mut jobs = Vec::with_capacity(paths.len());
@@ -118,12 +128,14 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 	let adapter = adapter_report(args)?;
 	let generated_at = OffsetDateTime::now_utc().format(&Rfc3339)?;
 	let quantitative_scoreboard = crate::quantitative_scoreboard_report(QuantitativeReportInput {
+		run_id: args.run_id.as_str(),
 		generated_at: generated_at.as_str(),
 		adapter: &adapter,
 		source_jobs: jobs,
 		jobs: &job_reports,
 		summary: &summary,
 		product_manifest_path: args.quantitative_product_manifest.as_deref(),
+		audit_manifest_path: args.quantitative_audit_manifest.as_deref(),
 	})?;
 
 	Ok(RealWorldReport {
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
index f8bbf36b..50fadd82 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
@@ -51,7 +51,10 @@ use artifacts::{
 	WorkJournalNextStepArtifact, WorkJournalReadbackArtifact, WorkJournalRejectedOptionArtifact,
 	WorkJournalWhereStoppedArtifact,
 };
-use cli::{Args, Command, ExportQuantitativeProductManifestArgs, PublishArgs, RunArgs};
+use cli::{
+	Args, Command, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs,
+	PublishArgs, RunArgs,
+};
 use diagnostic_reports::{
 	OperatorDebugEvidence, OperatorUxGap, TraceExplainability, TraceStageExplainability,
 };
@@ -87,12 +90,13 @@ use operational_reports::{
 	OperationalResourceSummary,
 };
 use quantitative::{
-	QuantitativeReportInput, quantitative_product_manifest_from_report,
-	quantitative_scoreboard_report,
+	QuantitativeReportInput, quantitative_audit_manifest_from_jobs,
+	quantitative_product_manifest_from_report, quantitative_scoreboard_report,
 };
 use quantitative_reports::{
-	QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow,
-	QuantitativePerQueryRow, QuantitativeProductManifest,
+	QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls,
+	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow,
+	QuantitativeProductManifest,
 };
 use report_root::RealWorldReport;
 use scoreboard::scoreboard_report;
@@ -177,6 +181,8 @@ fn main() -> Result<()> {
 	color_eyre::install()?;
 
 	match Args::parse().command {
+		Command::ExportQuantitativeAuditManifest(args) =>
+			commands::export_quantitative_audit_manifest_command(args),
 		Command::ExportQuantitativeProductManifest(args) =>
 			commands::export_quantitative_product_manifest_command(args),
 		Command::Run(args) => commands::run_command(args),
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index 51d1c07e..f799e9fc 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -1,13 +1,22 @@
+use std::env;
+
 use crate::{
-	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeProductManifestArgs, JobReport, Path,
-	QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow,
-	QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob,
-	RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring,
+	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs,
+	ExportQuantitativeProductManifestArgs, JobReport, Path, PathBuf, QuantitativeAuditArtifact,
+	QuantitativeAuditManifest, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport,
+	QuantitativeBenchmarkRow, QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA,
+	RealWorldJob, RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring,
 };
 
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
 const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str =
 	"elf.agent_memory_quantitative_product_manifest/v1";
+const QUANTITATIVE_AUDIT_MANIFEST_SCHEMA: &str = "elf.agent_memory_quantitative_audit_manifest/v1";
+const REQUIRED_HELD_OUT_AUDIT_CONTROL: &str = "query_ids_locked_before_product_runtime";
+const REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL: &str =
+	"product_runtime_did_not_receive_expected_answers_or_qrels";
+const REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL: &str =
+	"ranked_candidates_emitted_by_product_runtime";
 const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10];
 const MIN_LEADERBOARD_QUERY_COUNT: usize = 30;
 const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!(
@@ -16,12 +25,30 @@ const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!(
 );
 
 pub(super) struct QuantitativeReportInput<'a> {
+	pub(super) run_id: &'a str,
 	pub(super) generated_at: &'a str,
 	pub(super) adapter: &'a AdapterReport,
 	pub(super) source_jobs: &'a [RealWorldJob],
 	pub(super) jobs: &'a [JobReport],
 	pub(super) summary: &'a ReportSummary,
 	pub(super) product_manifest_path: Option<&'a Path>,
+	pub(super) audit_manifest_path: Option<&'a Path>,
+}
+
+struct QuantitativeAuditContext<'a> {
+	run_id: &'a str, // compared with the manifest's run_id
+	corpus_id: &'a str, // compared with the manifest's corpus_id
+	product: &'a str, // compared, together with adapter_id, against the manifest
+	adapter_id: &'a str,
+	source_jobs: &'a [RealWorldJob], // yields the expected sample_size and ranked query ids
+	ranking_query_count: usize, // expected manifest ranking_query_count
+	explicit_qrel_query_count: usize, // expected manifest explicit_qrel_query_count
+}
+
+struct QuantitativeAuditEvidence {
+	held_out: bool, // false when no audit manifest is supplied
+	leakage_audited: bool, // false when no audit manifest is supplied
+	audit_manifest_id: Option<String>, // None when no audit manifest is supplied
+}
 
 pub(super) fn quantitative_scoreboard_report(
@@ -43,8 +70,27 @@ pub(super) fn quantitative_scoreboard_report(
 	let explicit_qrel_query_count =
 		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
 	let metric_comparable = ranking_query_count > 0;
-	let leaderboard_eligible = false;
 	let result_state = quantitative_result_state(input.summary);
+	let audit_evidence = quantitative_audit_evidence(
+		input.audit_manifest_path,
+		QuantitativeAuditContext {
+			run_id: input.run_id,
+			corpus_id: corpus_id.as_str(),
+			product: "ELF",
+			adapter_id: input.adapter.adapter_id.as_str(),
+			source_jobs: input.source_jobs,
+			ranking_query_count,
+			explicit_qrel_query_count,
+		},
+	)?;
+	let leaderboard_eligible = quantitative_row_leaderboard_eligible(
+		evidence_class,
+		input.source_jobs.len(),
+		ranking_query_count,
+		explicit_qrel_query_count,
+		metric_comparable,
+		&audit_evidence,
+	);
 	let row = QuantitativeBenchmarkRow {
 		product: "ELF".to_string(),
 		adapter_id: input.adapter.adapter_id.clone(),
@@ -56,8 +102,9 @@ pub(super) fn quantitative_scoreboard_report(
 		comparable: metric_comparable,
 		metric_comparable,
 		leaderboard_eligible,
-		held_out: false,
-		leakage_audited: false,
+		held_out: audit_evidence.held_out,
+		leakage_audited: audit_evidence.leakage_audited,
+		audit_manifest_id: audit_evidence.audit_manifest_id,
 		fixture_regression_only: evidence_class == "fixture_backed",
 		sample_size: input.jobs.len(),
 		ranking_query_count,
@@ -86,6 +133,7 @@ pub(super) fn quantitative_scoreboard_report(
 	rows.extend(product_manifest.rows);
 	merged_per_query_rows.extend(product_manifest.per_query_rows);
 
+	let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2;
 	let controls = QuantitativeBenchmarkControls {
 		same_corpus_required: true,
 		same_task_required: true,
@@ -95,7 +143,7 @@ pub(super) fn quantitative_scoreboard_report(
 		current_query_count: input.source_jobs.len(),
 		current_ranking_query_count: ranking_query_count,
 		current_explicit_qrel_query_count: explicit_qrel_query_count,
-		leaderboard_claim_allowed: leaderboard_eligible,
+		leaderboard_claim_allowed,
 		leakage_control:
 			"held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims"
 				.to_string(),
@@ -204,6 +252,303 @@ pub(super) fn quantitative_product_manifest_from_report(
 	Ok(manifest)
 }
 
+pub(super) fn quantitative_audit_manifest_from_jobs(
+	jobs: &[RealWorldJob],
+	args: &ExportQuantitativeAuditManifestArgs,
+) -> Result<QuantitativeAuditManifest> {
+	let product = args.product.trim(); // surrounding whitespace is not significant
+	let adapter_id = args.adapter_id.trim();
+
+	if product.is_empty() || adapter_id.is_empty() {
+		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
+	} // blank identities cannot be bound to a scoreboard row
+
+	let corpus_id = quantitative_corpus_id(jobs);
+	let ranking_query_count = ranking_query_count(jobs);
+	let explicit_qrel_query_count = explicit_qrel_query_count(jobs);
+	let manifest = QuantitativeAuditManifest {
+		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)),
+		run_id: args.run_id.clone(),
+		corpus_id,
+		product: product.to_string(),
+		adapter_id: adapter_id.to_string(),
+		held_out: args.held_out,
+		leakage_audited: args.leakage_audited,
+		sample_size: jobs.len(), // every loaded job, ranked or not
+		ranking_query_count,
+		explicit_qrel_query_count,
+		query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
+		controls: args.controls.clone(),
+		artifacts: vec![QuantitativeAuditArtifact {
+			role: "product_runtime_fixtures".to_string(),
+			path: audit_artifact_display_path(args.fixtures.as_path()),
+			sha256: fixture_path_digest(args.fixtures.as_path())?, // BLAKE3 hex digest
+		}],
+		claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| {
+			if args.held_out || args.leakage_audited {
+				concat!(
+					"Audit manifest supplied by operator; runner validates run/corpus/product/",
+					"adapter/count/query-id/artifact bindings before opening row gates."
+				)
+				.to_string()
+			} else {
+				concat!(
+					"Diagnostic audit manifest binds the current product-runtime fixture set to ",
+					"query ids and counts, but it does not prove held-out or leakage-audited status."
+				)
+				.to_string()
+			}
+		}),
+	};
+
+	validate_quantitative_audit_manifest(
+		&manifest,
+		args.fixtures.as_path(), // stands in for the manifest path in errors and lookups
+		QuantitativeAuditContext {
+			run_id: args.run_id.as_str(),
+			corpus_id: manifest.corpus_id.as_str(),
+			product,
+			adapter_id,
+			source_jobs: jobs,
+			ranking_query_count: manifest.ranking_query_count,
+			explicit_qrel_query_count: manifest.explicit_qrel_query_count,
+		}, // mirrors the manifest, so mainly the control and artifact checks can fail
+	)?; // self-check with the same validator the importer uses
+
+	Ok(manifest)
+}
+
+fn quantitative_audit_evidence(
+	path: Option<&Path>,
+	context: QuantitativeAuditContext<'_>,
+) -> Result<QuantitativeAuditEvidence> {
+	let Some(path) = path else {
+		return Ok(QuantitativeAuditEvidence {
+			held_out: false,
+			leakage_audited: false,
+			audit_manifest_id: None,
+		}); // no manifest supplied: every audit gate stays closed
+	};
+	let raw = fs::read_to_string(path)?; // NOTE(review): unlike the parse error, no path attached
+	let manifest = serde_json::from_str::<QuantitativeAuditManifest>(&raw).map_err(|err| {
+		eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display())
+	})?;
+
+	validate_quantitative_audit_manifest(&manifest, path, context)?; // reject before trusting flags
+
+	Ok(QuantitativeAuditEvidence {
+		held_out: manifest.held_out,
+		leakage_audited: manifest.leakage_audited,
+		audit_manifest_id: Some(manifest.manifest_id),
+	})
+}
+
+fn validate_quantitative_audit_manifest(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: QuantitativeAuditContext<'_>,
+) -> Result<()> {
+	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	} // only the exact schema string is accepted
+	if manifest.manifest_id.trim().is_empty() {
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.run_id != context.run_id {
+		return Err(eyre::eyre!(
+			"{} has run_id {}, expected {}.",
+			path.display(),
+			manifest.run_id,
+			context.run_id
+		));
+	} // a manifest only attests the run id it was exported for
+	if manifest.corpus_id != context.corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected {}.",
+			path.display(),
+			manifest.corpus_id,
+			context.corpus_id
+		));
+	}
+	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
+		return Err(eyre::eyre!(
+			"{} has product {}:{} but current row is {}:{}.",
+			path.display(),
+			manifest.product,
+			manifest.adapter_id,
+			context.product,
+			context.adapter_id
+		));
+	} // product and adapter are compared exactly (no trimming or case folding)
+	if manifest.sample_size != context.source_jobs.len() {
+		return Err(eyre::eyre!(
+			"{} has sample_size {}, expected {}.",
+			path.display(),
+			manifest.sample_size,
+			context.source_jobs.len()
+		));
+	}
+	if manifest.ranking_query_count != context.ranking_query_count {
+		return Err(eyre::eyre!(
+			"{} has ranking_query_count {}, expected {}.",
+			path.display(),
+			manifest.ranking_query_count,
+			context.ranking_query_count
+		));
+	}
+	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
+		return Err(eyre::eyre!(
+			"{} has explicit_qrel_query_count {}, expected {}.",
+			path.display(),
+			manifest.explicit_qrel_query_count,
+			context.explicit_qrel_query_count
+		));
+	}
+
+	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
+	validate_quantitative_audit_controls(manifest, path)?;
+
+	validate_quantitative_audit_artifacts(manifest, path) // last: this check reads files from disk
+}
+
+fn validate_quantitative_audit_query_ids(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	source_jobs: &[RealWorldJob],
+) -> Result<()> {
+	let expected = ranking_query_ids(source_jobs);
+	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
+
+	if actual.len() != manifest.query_ids.len() {
+		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display()));
+	} // the set is shorter than the list only when the list repeats an id
+	if actual != expected {
+		let missing = expected.difference(&actual).copied().collect::<Vec<_>>(); // in jobs only
+		let extra = actual.difference(&expected).copied().collect::<Vec<_>>(); // in manifest only
+
+		return Err(eyre::eyre!(
+			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
+			path.display(),
+			missing,
+			extra
+		));
+	}
+
+	Ok(())
+}
+
+fn validate_quantitative_audit_controls(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> {
+	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>();
+
+	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) {
+		return Err(eyre::eyre!(
+			"{} marks held_out=true without required control {}.",
+			path.display(),
+			REQUIRED_HELD_OUT_AUDIT_CONTROL
+		));
+	} // held_out requires the query-id lock control
+	if manifest.leakage_audited
+		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
+			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
+	{
+		return Err(eyre::eyre!(
+			"{} marks leakage_audited=true without required controls {} and {}.",
+			path.display(),
+			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
+		));
+	} // leakage_audited requires both the qrel and the candidate control
+	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty()
+	{
+		return Err(eyre::eyre!(
+			"{} marks audit controls true but has an empty claim_boundary.",
+			path.display()
+		));
+	} // a manifest with both flags false may leave claim_boundary blank
+
+	Ok(())
+}
+
+fn validate_quantitative_audit_artifacts(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> {
+	if manifest.artifacts.is_empty() {
+		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
+	} // at least one artifact binding is mandatory
+
+	for artifact in &manifest.artifacts {
+		if artifact.role.trim().is_empty()
+			|| artifact.path.trim().is_empty()
+			|| artifact.sha256.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative audit artifact.",
+				path.display()
+			));
+		}
+		if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit())
+		{
+			return Err(eyre::eyre!(
+				"{} artifact {} has invalid sha256 digest {}.",
+				path.display(),
+				artifact.role,
+				artifact.sha256
+			));
+		} // NOTE(review): shape check only; fixture_path_digest yields BLAKE3, not SHA-256
+
+		let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
+		let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| {
+			eyre::eyre!(
+				"{} artifact {} could not be digested at {}: {err}",
+				path.display(),
+				artifact.role,
+				artifact_path.display()
+			)
+		})?; // digest is recomputed from the files currently on disk
+
+		if actual != artifact.sha256 {
+			return Err(eyre::eyre!(
+				"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
+				path.display(),
+				artifact.role,
+				artifact_path.display(),
+				artifact.sha256,
+				actual
+			));
+		} // exact string comparison, so an uppercase-hex manifest digest will not match
+	}
+
+	Ok(())
+}
+
+fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf {
+	let raw = PathBuf::from(artifact_path);
+
+	if raw.is_absolute() {
+		return raw; // absolute artifact paths are used as-is
+	}
+
+	let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone());
+
+	if cwd_path.exists() {
+		return cwd_path; // first choice: relative to the current working directory
+	} // otherwise try beside the manifest; with no parent, fall back to the cwd candidate
+
+	manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path)
+}
+
 fn quantitative_product_manifest(
 	path: Option<&Path>,
 	corpus_id: &str,
@@ -290,6 +635,9 @@ fn validate_quantitative_product_manifest(
 				corpus_id
 			));
 		}
+		if row.leaderboard_eligible {
+			validate_leaderboard_eligible_product_row(path, row)?;
+		}
 	}
 	for row in &manifest.per_query_rows {
 		if row.job_id.trim().is_empty()
@@ -351,6 +699,34 @@ fn validate_quantitative_product_manifest(
 	Ok(())
 }
 
+fn validate_leaderboard_eligible_product_row(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+) -> Result<()> {
+	let has_audit_manifest_id = row
+		.audit_manifest_id
+		.as_deref()
+		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); // blank ids don't count
+
+	if row.evidence_class != "live_real_world"
+		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
+		|| row.ranking_query_count != row.sample_size
+		|| row.explicit_qrel_query_count != row.ranking_query_count
+		|| !row.held_out
+		|| !row.leakage_audited
+		|| !has_audit_manifest_id
+	{
+		return Err(eyre::eyre!(
+			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
+			path.display(),
+			row.product,
+			row.adapter_id
+		));
+	} // same gates as quantitative_row_leaderboard_eligible, minus metric_comparable
+
+	Ok(())
+}
+
 fn quantitative_metrics_not_encoded(
 	imported_row_count: usize,
 	imported_per_query_count: usize,
@@ -697,6 +1073,155 @@ fn quantitative_evidence_class(adapter: &AdapterReport, jobs: &[JobReport]) -> &
 	}
 }
 
+fn quantitative_row_leaderboard_eligible(
+	evidence_class: &str,
+	sample_size: usize,
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+	metric_comparable: bool,
+	audit_evidence: &QuantitativeAuditEvidence,
+) -> bool {
+	metric_comparable
+		&& evidence_class == "live_real_world" // only this evidence class can qualify
+		&& sample_size >= MIN_LEADERBOARD_QUERY_COUNT // minimum sample size
+		&& ranking_query_count == sample_size // every sampled job must be a ranked query
+		&& explicit_qrel_query_count == ranking_query_count // qrels must be explicit throughout
+		&& audit_evidence.held_out
+		&& audit_evidence.leakage_audited // both audit flags are required, not either
+		&& audit_evidence
+			.audit_manifest_id
+			.as_deref()
+			.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty())
+}
+
+fn fixture_path_digest(path: &Path) -> Result<String> {
+	let mut hasher = blake3::Hasher::new(); // BLAKE3, although callers store it as `sha256`
+
+	if path.is_file() {
+		hash_fixture_file(
+			path,
+			path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"),
+			&mut hasher,
+		)?; // a single file is keyed by its UTF-8 file name, else by "fixture"
+
+		return Ok(hasher.finalize().to_hex().to_string());
+	}
+
+	let paths = audit_fixture_paths(path)?; // sorted, so the digest does not depend on read order
+
+	for fixture in paths {
+		let relative = fixture
+			.strip_prefix(path)
+			.map(|relative| relative.to_string_lossy().replace('\\', "/"))
+			.unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/"));
+
+		hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; // keyed by rel. path
+	}
+
+	Ok(hasher.finalize().to_hex().to_string()) // one digest over all collected files
+}
+
+fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
+	let mut paths = Vec::new();
+
+	collect_audit_fixture_paths(path, &mut paths)?; // recursive walk starting at `path`
+
+	paths.sort(); // directory listing order is not guaranteed; sort for a stable result
+
+	Ok(paths)
+}
+
+fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
+	if path.is_file() {
+		paths.push(path.to_path_buf()); // an explicit file is accepted whatever its extension
+
+		return Ok(());
+	}
+
+	for entry in fs::read_dir(path)? {
+		let entry_path = entry?.path();
+
+		if entry_path.is_dir() {
+			collect_audit_fixture_paths(entry_path.as_path(), paths)?; // recurse
+		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
+			paths.push(entry_path); // inside directories only `.json` entries are collected
+		}
+	}
+
+	Ok(())
+}
+
+fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
+	hasher.update(logical_path.as_bytes()); // bind the file's logical name into the digest
+	hasher.update(b"\0"); // NUL separates the name from the content
+	hasher.update(&fs::read(path)?); // raw file bytes, read in full
+	hasher.update(b"\0"); // NUL terminates this file's record
+
+	Ok(())
+}
+
+fn audit_artifact_display_path(path: &Path) -> String {
+	let display_path = if path.is_absolute() {
+		env::current_dir()
+			.ok()
+			.and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf))
+			.unwrap_or_else(|| path.to_path_buf()) // not under the cwd: keep it absolute
+	} else {
+		path.to_path_buf() // relative inputs are recorded unchanged
+	};
+
+	display_path.to_string_lossy().replace('\\', "/") // always emit forward slashes
+}
+
+fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
+	source_jobs
+		.iter()
+		.filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job))
+		.map(|job| job.job_id.as_str()) // the job id serves as the query id
+		.collect() // BTreeSet: unique and sorted
+}
+
+fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	ranking_query_ids(source_jobs).len()
+}
+
+fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
+}
+
+fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
+	if !source_job.expected_answer.relevance_judgments.is_empty() {
+		return source_job
+			.expected_answer
+			.relevance_judgments
+			.iter()
+			.filter(|judgment| judgment.grade > 0.0) // keep only positive grades
+			.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
+			.collect(); // explicit judgments win, even when none of them is positive
+	}
+
+	source_job
+		.required_evidence
+		.iter()
+		.filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain"))
+		.map(|evidence| (evidence.evidence_id.clone(), 1.0)) // fallback grades are all 1.0
+		.collect()
+}
+
+fn ranking_query_attempted(job: &RealWorldJob) -> bool {
+	if !scoring::produced_evidence_order(job).is_empty() {
+		return true; // a non-empty produced evidence order is enough on its own
+	}
+
+	let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else {
+		return false; // no adapter response recorded
+	}; // otherwise require a retrieve trace stage plus a finite, positive latency
+
+	answer.trace_explainability.as_ref().is_some_and(|trace| {
+		trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
+	}) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0)
+}
+
 fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
 	if !source_job.expected_answer.relevance_judgments.is_empty() {
 		"explicit_qrels"
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
index a4552032..6c953802 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
@@ -29,6 +29,7 @@ pub(super) struct QuantitativeBenchmarkRow {
 	pub(super) leaderboard_eligible: bool,
 	pub(super) held_out: bool,
 	pub(super) leakage_audited: bool,
+	pub(super) audit_manifest_id: Option<String>,
 	pub(super) fixture_regression_only: bool,
 	pub(super) sample_size: usize,
 	pub(super) ranking_query_count: usize,
@@ -85,3 +86,31 @@ pub(super) struct QuantitativeProductManifest {
 	#[serde(default)]
 	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
 }
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub(super) struct QuantitativeAuditManifest {
+	pub(super) schema: String, // must equal the audit manifest schema constant
+	pub(super) manifest_id: String, // must be non-blank
+	pub(super) run_id: String, // must equal the importing run's run id
+	pub(super) corpus_id: String,
+	pub(super) product: String,
+	pub(super) adapter_id: String,
+	pub(super) held_out: bool, // needs the query-id lock control to validate
+	pub(super) leakage_audited: bool, // needs both leakage controls to validate
+	pub(super) sample_size: usize,
+	pub(super) ranking_query_count: usize,
+	pub(super) explicit_qrel_query_count: usize,
+	pub(super) query_ids: Vec<String>, // must equal the ranked-query id set, without duplicates
+	#[serde(default)]
+	pub(super) controls: Vec<String>, // may be omitted in JSON
+	#[serde(default)]
+	pub(super) artifacts: Vec<QuantitativeAuditArtifact>, // may be omitted, but empty is rejected
+	pub(super) claim_boundary: String, // must be non-blank when either audit flag is true
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub(super) struct QuantitativeAuditArtifact {
+	pub(super) role: String, // must be non-blank
+	pub(super) path: String, // relative: tried against the cwd, then the manifest's directory
+	pub(super) sha256: String, // NOTE(review): 64-hex BLAKE3 digest despite the field name
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
index b350eb3f..f2b03d5c 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
@@ -285,6 +285,107 @@ fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence(
 	Ok(())
 }
 
+#[test]
+fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> {
+	let temp_dir =
+		env::temp_dir().join(format!("elf-quantitative-audit-manifest-test-{}", process::id()));
+	let manifest_path = temp_dir.join("audit-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?; // per-process directory; the export overwrites the manifest
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-audit-manifest")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--run-id")
+		.arg("audit-import-test")
+		.arg("--held-out")
+		.arg("--leakage-audited")
+		.arg("--control")
+		.arg("query_ids_locked_before_product_runtime")
+		.arg("--control")
+		.arg("product_runtime_did_not_receive_expected_answers_or_qrels")
+		.arg("--control")
+		.arg("ranked_candidates_emitted_by_product_runtime")
+		.output()?; // all three controls required by the held-out and leakage gates are passed
+
+	assert!(
+		export.status.success(),
+		"quantitative audit export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let manifest = support::load_json(&manifest_path)?;
+
+	assert_eq!(
+		manifest.pointer("/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_audit_manifest/v1")
+	);
+	assert_eq!(manifest.pointer("/held_out").and_then(Value::as_bool), Some(true));
+	assert_eq!(manifest.pointer("/leakage_audited").and_then(Value::as_bool), Some(true));
+	assert_eq!(
+		Some(support::array_at(&manifest, "/query_ids")?.len() as u64),
+		manifest.pointer("/ranking_query_count").and_then(Value::as_u64)
+	); // compare as Option so a missing or non-numeric count fails instead of reading as 0
+
+	let imported = run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?;
+	let row = support::array_at(&imported, "/quantitative_scoreboard/rows")?
+		.first()
+		.ok_or_else(|| eyre::eyre!("missing quantitative row"))?;
+
+	assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(true));
+	assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(true));
+	assert_eq!(
+		row.pointer("/audit_manifest_id").and_then(Value::as_str),
+		Some("audit-import-test-quantitative-audit-manifest")
+	); // default id is "<run_id>-quantitative-audit-manifest"
+	assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false));
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> {
+	let temp_dir =
+		env::temp_dir().join(format!("elf-quantitative-audit-manifest-run-test-{}", process::id()));
+	let manifest_path = temp_dir.join("audit-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-audit-manifest")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--run-id")
+		.arg("audit-import-test")
+		.output()?; // no --held-out/--leakage-audited: a diagnostic manifest is enough here
+
+	assert!(
+		export.status.success(),
+		"quantitative audit export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--run-id")
+		.arg("different-run")
+		.arg("--quantitative-audit-manifest")
+		.arg(&manifest_path)
+		.output()?; // same fixtures, but a run id the manifest was not exported for
+
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("expected different-run"));
+
+	Ok(())
+}
+
 fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result<Value> {
 	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
 		.arg("run")
@@ -303,6 +404,26 @@ fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result<Value>
 	Ok(serde_json::from_slice(&output.stdout)?)
 }
 
+/// Runs the benchmark binary's `run` command with `run_id` and the audit manifest at
+/// `manifest_path`, then parses stdout as JSON; panics with stderr if the command fails.
+fn run_report_with_quantitative_audit(manifest_path: &Path, run_id: &str) -> Result<Value> {
+	let mut runner = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"));
+
+	runner.arg("run").arg("--fixtures").arg(support::adversarial_quality_fixture_dir());
+	runner.arg("--run-id").arg(run_id);
+	runner.arg("--quantitative-audit-manifest").arg(manifest_path);
+
+	let finished = runner.output()?;
+
+	assert!(
+		finished.status.success(),
+		"real_world_job runner failed: {}",
+		String::from_utf8_lossy(&finished.stderr)
+	);
+
+	Ok(serde_json::from_slice(&finished.stdout)?)
+}
+
 fn assert_quantitative_row_contract(report: &Value) -> Result<()> {
 	let rows = support::array_at(report, "/quantitative_scoreboard/rows")?;
 

From 8c95885575752d0c44ed6fd512c9c872ba347a97 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:06:38 -0400
Subject: [PATCH 05/58] {"schema":"decodex/commit/1","summary":"Add
 quantitative rate confidence intervals","authority":"manual"}

---
 .../src/bin/real_world_job_benchmark/main.rs  |  4 +-
 .../real_world_job_benchmark/quantitative.rs  | 85 ++++++++++++++++++-
 .../quantitative_reports.rs                   | 12 +++
 .../real_world_job_benchmark/quantitative.rs  | 16 ++++
 4 files changed, 113 insertions(+), 4 deletions(-)

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
index 50fadd82..dc77d8f0 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
@@ -95,8 +95,8 @@ use quantitative::{
 };
 use quantitative_reports::{
 	QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls,
-	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow,
-	QuantitativeProductManifest,
+	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval,
+	QuantitativePerQueryRow, QuantitativeProductManifest,
 };
 use report_root::RealWorldReport;
 use scoreboard::scoreboard_report;
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index f799e9fc..ac782c30 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -4,8 +4,9 @@ use crate::{
 	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs,
 	ExportQuantitativeProductManifestArgs, JobReport, Path, PathBuf, QuantitativeAuditArtifact,
 	QuantitativeAuditManifest, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport,
-	QuantitativeBenchmarkRow, QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA,
-	RealWorldJob, RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring,
+	QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, QuantitativePerQueryRow,
+	QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob, RealWorldReport, ReportSummary,
+	Result, eyre, formatting, fs, scoring,
 };
 
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
@@ -19,6 +20,7 @@ const REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL: &str =
 	"ranked_candidates_emitted_by_product_runtime";
 const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10];
 const MIN_LEADERBOARD_QUERY_COUNT: usize = 30;
+const WILSON_95_Z: f64 = 1.959963984540054; // standard normal z-score for a two-sided 95% interval
 const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!(
 	"Quantitative metrics are bounded to this generated report. ",
 	"Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims."
@@ -121,6 +123,7 @@ pub(super) fn quantitative_scoreboard_report(
 		metrics: aggregate_metrics(per_query_rows.as_slice()),
 		metric_states: aggregate_metric_states(result_state, metric_comparable),
 		denominators: aggregate_denominators(per_query_rows.as_slice()),
+		confidence_intervals: aggregate_confidence_intervals(per_query_rows.as_slice()),
 		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
 	};
 	let product_manifest =
@@ -1019,6 +1022,84 @@ fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String,
 	denominators
 }
 
+/// Builds a confidence interval for every rate metric named by `rate_metric_names`.
+///
+/// Per-query counts are pooled with `aggregate_rate_numerator_denominator`. A metric whose
+/// pooled denominator is zero gets no entry in the returned map. The pooled numerator is
+/// capped at the pooled denominator before it is passed to `wilson_confidence_interval`, so
+/// the interval is always computed for a proportion of at most 1.
+fn aggregate_confidence_intervals(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, QuantitativeConfidenceInterval> {
+	rate_metric_names()
+		.into_iter()
+		.filter_map(|metric| {
+			let (hits, trials) = aggregate_rate_numerator_denominator(rows, metric.as_str());
+
+			(trials > 0).then(|| (metric, wilson_confidence_interval(hits.min(trials), trials)))
+		})
+		.collect()
+}
+
+/// Lists the rate metrics that receive a confidence interval.
+///
+/// For every cutoff `k` in `QUANTITATIVE_K_VALUES` this yields `recall_at_{k}`,
+/// `precision_at_{k}`, and `success_at_{k}`, in that order. The same names are used to look up
+/// each per-query row's `metrics` and `denominators` entries when the counts are pooled.
+fn rate_metric_names() -> Vec<String> {
+	QUANTITATIVE_K_VALUES
+		.iter()
+		.flat_map(|k| ["recall", "precision", "success"].map(|rate| format!("{rate}_at_{k}")))
+		.collect()
+}
+
+/// Pools the per-query counts behind `metric` into one `(numerator, denominator)` pair.
+///
+/// A row contributes only when it has a non-`None` value for `metric` and a non-zero
+/// denominator for it; every other row is skipped.
+///
+/// Each row's share of the numerator is recovered as `round(value * denominator)`, treating
+/// the stored metric value as a rate.
+///
+/// The numerator is not capped here: it can exceed the denominator if a row reports a value
+/// above 1.0.
+fn aggregate_rate_numerator_denominator(
+	rows: &[QuantitativePerQueryRow],
+	metric: &str,
+) -> (usize, usize) {
+	let per_row_counts = rows.iter().filter_map(|row| {
+		let rate = row.metrics.get(metric).copied().flatten()?;
+		let trials = row.denominators.get(metric).copied().filter(|trials| *trials > 0)?;
+
+		Some(((rate * trials as f64).round() as usize, trials))
+	});
+
+	per_row_counts.fold((0, 0), |(hits, total), (row_hits, row_trials)| {
+		(hits + row_hits, total + row_trials)
+	})
+}
+
+fn wilson_confidence_interval(
+	numerator: usize,
+	denominator: usize,
+) -> QuantitativeConfidenceInterval {
+	let n = denominator as f64; // not guarded: the visible caller only passes denominator > 0
+	let rate = numerator as f64 / n;
+	let z_sq = WILSON_95_Z * WILSON_95_Z;
+	let scale = 1.0 + z_sq / n;
+	let center = (rate + z_sq / (2.0 * n)) / scale;
+	let margin = WILSON_95_Z * (rate * (1.0 - rate) / n + z_sq / (4.0 * n * n)).sqrt() / scale;
+
+	QuantitativeConfidenceInterval {
+		method: "wilson_score".to_string(),
+		confidence: 0.95,
+		lower: formatting::round3((center - margin).clamp(0.0, 1.0)),
+		upper: formatting::round3((center + margin).clamp(0.0, 1.0)),
+		numerator,
+		denominator,
+	}
+}
+
 fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
 	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
index 6c953802..ded35360 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
@@ -40,6 +40,8 @@ pub(super) struct QuantitativeBenchmarkRow {
 	pub(super) metrics: BTreeMap<String, Option<f64>>,
 	pub(super) metric_states: BTreeMap<String, String>,
 	pub(super) denominators: BTreeMap<String, usize>,
+	#[serde(default)]
+	pub(super) confidence_intervals: BTreeMap<String, QuantitativeConfidenceInterval>,
 	pub(super) claim_boundary: String,
 }
 
@@ -76,6 +78,16 @@ pub(super) struct QuantitativeBenchmarkControls {
 	pub(super) leakage_control: String,
 }
 
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(super) struct QuantitativeConfidenceInterval {
+	pub(super) method: String, // interval method label, e.g. "wilson_score"
+	pub(super) confidence: f64, // confidence level as a fraction, e.g. 0.95
+	pub(super) lower: f64, // lower bound of the interval
+	pub(super) upper: f64, // upper bound of the interval
+	pub(super) numerator: usize, // numerator of the rate the interval describes
+	pub(super) denominator: usize, // denominator of the rate the interval describes
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub(super) struct QuantitativeProductManifest {
 	pub(super) schema: String,
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
index f2b03d5c..249c48e2 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
@@ -469,6 +469,22 @@ fn assert_quantitative_row_contract(report: &Value) -> Result<()> {
 		);
 		assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some());
 	}
+	for metric in ["recall_at_5", "precision_at_5", "success_at_5"] {
+		assert_eq!(
+			row.pointer(&format!("/confidence_intervals/{metric}/method")).and_then(Value::as_str),
+			Some("wilson_score")
+		);
+		assert_eq!(
+			row.pointer(&format!("/confidence_intervals/{metric}/confidence"))
+				.and_then(Value::as_f64),
+			Some(0.95)
+		);
+		assert!(
+			row.pointer(&format!("/confidence_intervals/{metric}/denominator"))
+				.and_then(Value::as_u64)
+				.is_some()
+		);
+	}
 
 	Ok(())
 }

From 486c476331fcd6fff30e2c8ba2b4b2a3fb482adf Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:12:59 -0400
Subject: [PATCH 06/58] {"schema":"decodex/commit/1","summary":"Clean up split
 makefile comments","authority":"manual"}

---
 docs/spec/agent_memory_knowledge_system_v1.md |  3 +--
 makefiles/benchmark-memory-a.toml             |  2 +-
 makefiles/benchmark-memory-b.toml             | 11 +-------
 makefiles/check.toml                          | 21 ----------------
 makefiles/clean.toml                          | 16 ------------
 makefiles/format.toml                         | 17 -------------
 makefiles/lint-fix.toml                       | 19 --------------
 makefiles/lint.toml                           | 14 -----------
 makefiles/research.toml                       | 25 -------------------
 makefiles/smoke.toml                          | 22 ----------------
 makefiles/test.toml                           |  9 -------
 11 files changed, 3 insertions(+), 156 deletions(-)

diff --git a/docs/spec/agent_memory_knowledge_system_v1.md b/docs/spec/agent_memory_knowledge_system_v1.md
index 35d18ca8..070df71f 100644
--- a/docs/spec/agent_memory_knowledge_system_v1.md
+++ b/docs/spec/agent_memory_knowledge_system_v1.md
@@ -272,8 +272,7 @@ Repository-native validation is authoritative.
   docs are validation-ready.
 - Before a PR handoff or any push that refreshes a PR head, run the registered
   Decodex workflow gate: `cargo make fmt`, `cargo make lint-fix`, then
-  `cargo make checks`. In this Makefile tree, `checks` aliases the repo-native
-  aggregate `check` task.
+  `cargo make check`.
 - If a phase changes commands, schemas, config, runtime behavior, status semantics,
   or benchmark claims, update the owning docs and include drift evidence as required
   by `docs/policy.md`.
diff --git a/makefiles/benchmark-memory-a.toml b/makefiles/benchmark-memory-a.toml
index a7b5e6c6..3f09c7d4 100644
--- a/makefiles/benchmark-memory-a.toml
+++ b/makefiles/benchmark-memory-a.toml
@@ -1,4 +1,4 @@
-# Rust workspace tasks: Benchmark real-world memory tasks, first half.
+# Rust workspace tasks: real-world memory benchmark fixtures A-G.
 
 [tasks.real-world-memory]
 workspace = false
diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml
index 95003f90..3b47da39 100644
--- a/makefiles/benchmark-memory-b.toml
+++ b/makefiles/benchmark-memory-b.toml
@@ -1,4 +1,4 @@
-# Rust workspace tasks: Benchmark real-world memory tasks, second half.
+# Rust workspace tasks: real-world memory benchmark fixtures K-W and aggregate runners.
 
 [tasks.real-world-memory-json]
 workspace = false
@@ -686,12 +686,3 @@ args = [
 	"--out",
 	"tmp/real-world-memory/memory-summary/report.md",
 ]
-
-# Check
-# | task             | type      | cwd |
-# | ---------------- | --------- | --- |
-# | check            | composite |     |
-# | check-docs       | command   |     |
-# | check-rust       | command   |     |
-# | check-trace-gate | command   |     |
-# | checks           | composite |     |
diff --git a/makefiles/check.toml b/makefiles/check.toml
index 5756ac55..c6ab6569 100644
--- a/makefiles/check.toml
+++ b/makefiles/check.toml
@@ -1,14 +1,5 @@
 # Rust workspace tasks: Check.
 
-# Check
-# | task             | type      | cwd |
-# | ---------------- | --------- | --- |
-# | check            | composite |     |
-# | check-docs       | command   |     |
-# | check-rust       | command   |     |
-# | check-trace-gate | command   |     |
-# | checks           | composite |     |
-
 [tasks.check]
 clear = true
 workspace = false
@@ -43,15 +34,3 @@ command = "bash"
 args = [
 	"scripts/trace-gate.sh",
 ]
-
-[tasks.checks]
-workspace = false
-dependencies = [
-	"check",
-]
-
-# Clean
-# | task                       | type    | cwd |
-# | -------------------------- | ------- | --- |
-# | clean-baseline-live-docker | command |     |
-# | clean-parity-docker        | command |     |
diff --git a/makefiles/clean.toml b/makefiles/clean.toml
index 7fc71c62..bf899af0 100644
--- a/makefiles/clean.toml
+++ b/makefiles/clean.toml
@@ -1,11 +1,5 @@
 # Rust workspace tasks: Clean.
 
-# Clean
-# | task                       | type    | cwd |
-# | -------------------------- | ------- | --- |
-# | clean-baseline-live-docker | command |     |
-# | clean-parity-docker        | command |     |
-
 [tasks.clean-baseline-live-docker]
 workspace = false
 command = "docker"
@@ -29,13 +23,3 @@ args = [
 	"-v",
 	"--remove-orphans",
 ]
-
-# Format
-# | task           | type      | cwd |
-# | -------------- | --------- | --- |
-# | fmt            | composite |     |
-# | fmt-check      | composite |     |
-# | fmt-rust       | command   |     |
-# | fmt-rust-check | extend    |     |
-# | fmt-toml       | command   |     |
-# | fmt-toml-check | extend    |     |
diff --git a/makefiles/format.toml b/makefiles/format.toml
index e214c216..8046cfb9 100644
--- a/makefiles/format.toml
+++ b/makefiles/format.toml
@@ -1,15 +1,5 @@
 # Rust workspace tasks: Format.
 
-# Format
-# | task           | type      | cwd |
-# | -------------- | --------- | --- |
-# | fmt            | composite |     |
-# | fmt-check      | composite |     |
-# | fmt-rust       | command   |     |
-# | fmt-rust-check | extend    |     |
-# | fmt-toml       | command   |     |
-# | fmt-toml-check | extend    |     |
-
 [tasks.fmt]
 workspace = false
 dependencies = [
@@ -45,10 +35,3 @@ args = [
 	"fmt",
 	"--check",
 ]
-
-# Lint
-# | task        | type      | cwd |
-# | ----------- | --------- | --- |
-# | lint        | composite |     |
-# | lint-rust   | command   |     |
-# | lint-vstyle | command   |     |
diff --git a/makefiles/lint-fix.toml b/makefiles/lint-fix.toml
index 5aada462..aa2f8a4f 100644
--- a/makefiles/lint-fix.toml
+++ b/makefiles/lint-fix.toml
@@ -1,12 +1,5 @@
 # Rust workspace tasks: Lint Fix.
 
-# Lint Fix
-# | task            | type      | cwd |
-# | --------------- | --------- | --- |
-# | lint-fix        | composite |     |
-# | lint-fix-rust   | command   |     |
-# | lint-fix-vstyle | command   |     |
-
 [tasks.lint-fix]
 workspace = false
 dependencies = [
@@ -55,15 +48,3 @@ args = [
 	"--all-features",
 	"--strict",
 ]
-
-# Research
-# | task                                    | type      | cwd |
-# | --------------------------------------- | --------- | --- |
-# | external-memory-radar                   | command   |     |
-# | external-memory-radar-artifact          | composite |     |
-# | external-memory-radar-artifact-json     | command   |     |
-# | external-memory-radar-artifact-validate | command   |     |
-# | external-memory-radar-dry-run           | composite |     |
-# | external-memory-radar-dry-run-json      | command   |     |
-# | external-memory-radar-dry-run-validate  | command   |     |
-# | external-memory-radar-validate          | command   |     |
diff --git a/makefiles/lint.toml b/makefiles/lint.toml
index 1cedd668..a09517af 100644
--- a/makefiles/lint.toml
+++ b/makefiles/lint.toml
@@ -1,12 +1,5 @@
 # Rust workspace tasks: Lint.
 
-# Lint
-# | task        | type      | cwd |
-# | ----------- | --------- | --- |
-# | lint        | composite |     |
-# | lint-rust   | command   |     |
-# | lint-vstyle | command   |     |
-
 [tasks.lint]
 workspace = false
 dependencies = [
@@ -52,10 +45,3 @@ args = [
 	"--workspace",
 	"--all-features",
 ]
-
-# Lint Fix
-# | task            | type      | cwd |
-# | --------------- | --------- | --- |
-# | lint-fix        | composite |     |
-# | lint-fix-rust   | command   |     |
-# | lint-fix-vstyle | command   |     |
diff --git a/makefiles/research.toml b/makefiles/research.toml
index 1c9db279..45b5770c 100644
--- a/makefiles/research.toml
+++ b/makefiles/research.toml
@@ -1,17 +1,5 @@
 # Rust workspace tasks: Research.
 
-# Research
-# | task                                    | type      | cwd |
-# | --------------------------------------- | --------- | --- |
-# | external-memory-radar                   | command   |     |
-# | external-memory-radar-artifact          | composite |     |
-# | external-memory-radar-artifact-json     | command   |     |
-# | external-memory-radar-artifact-validate | command   |     |
-# | external-memory-radar-dry-run           | composite |     |
-# | external-memory-radar-dry-run-json      | command   |     |
-# | external-memory-radar-dry-run-validate  | command   |     |
-# | external-memory-radar-validate          | command   |     |
-
 [tasks.external-memory-radar]
 workspace = false
 command = "cargo"
@@ -127,16 +115,3 @@ args = [
 	"--cursor",
 	"apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json",
 ]
-
-# Smoke
-# | task                               | type      | cwd |
-# | ---------------------------------- | --------- | --- |
-# | smoke-graphify-docker-graph-report | command   |     |
-# | smoke-graphiti-zep-docker-temporal | command   |     |
-# | smoke-graphrag-docker              | command   |     |
-# | smoke-letta-core-archive-export-readback | command   |     |
-# | smoke-lightrag-docker-context      | command   |     |
-# | smoke-ragflow-docker               | command   |     |
-# | smoke-real-world-job               | composite |     |
-# | smoke-real-world-job-json          | command   |     |
-# | smoke-real-world-job-report        | command   |     |
diff --git a/makefiles/smoke.toml b/makefiles/smoke.toml
index 88c4e494..43b9874d 100644
--- a/makefiles/smoke.toml
+++ b/makefiles/smoke.toml
@@ -1,18 +1,5 @@
 # Rust workspace tasks: Smoke.
 
-# Smoke
-# | task                               | type      | cwd |
-# | ---------------------------------- | --------- | --- |
-# | smoke-graphify-docker-graph-report | command   |     |
-# | smoke-graphiti-zep-docker-temporal | command   |     |
-# | smoke-graphrag-docker              | command   |     |
-# | smoke-letta-core-archive-export-readback | command   |     |
-# | smoke-lightrag-docker-context      | command   |     |
-# | smoke-ragflow-docker               | command   |     |
-# | smoke-real-world-job               | composite |     |
-# | smoke-real-world-job-json          | command   |     |
-# | smoke-real-world-job-report        | command   |     |
-
 [tasks.smoke-graphify-docker-graph-report]
 workspace = false
 command = "bash"
@@ -102,12 +89,3 @@ args = [
 	"--out",
 	"tmp/real-world-job/real-world-job-smoke-report.md",
 ]
-
-# Test
-# | task                  | type      | cwd |
-# | --------------------- | --------- | --- |
-# | test                  | composite |     |
-# | test-e2e              | command   |     |
-# | test-rust             | command   |     |
-# | test-rust-all         | command   |     |
-# | test-rust-integration | command   |     |
diff --git a/makefiles/test.toml b/makefiles/test.toml
index 4245ab58..9ee899d8 100644
--- a/makefiles/test.toml
+++ b/makefiles/test.toml
@@ -1,14 +1,5 @@
 # Rust workspace tasks: Test.
 
-# Test
-# | task                  | type      | cwd |
-# | --------------------- | --------- | --- |
-# | test                  | composite |     |
-# | test-e2e              | command   |     |
-# | test-rust             | command   |     |
-# | test-rust-all         | command   |     |
-# | test-rust-integration | command   |     |
-
 [tasks.test]
 clear = true
 workspace = false

From d766be86f1e9ec33cde7fec9518420e262707a6f Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:17:00 -0400
Subject: [PATCH 07/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product manifests","authority":"manual"}

---
 .../real_world_job_benchmark/quantitative.rs  | 277 +-----------------
 .../quantitative/product_manifest.rs          | 267 +++++++++++++++++
 2 files changed, 277 insertions(+), 267 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index ac782c30..80fd746d 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -1,14 +1,18 @@
 use std::env;
 
+mod product_manifest;
+
+pub(super) use product_manifest::quantitative_product_manifest_from_report;
+
 use crate::{
-	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs,
-	ExportQuantitativeProductManifestArgs, JobReport, Path, PathBuf, QuantitativeAuditArtifact,
-	QuantitativeAuditManifest, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport,
-	QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, QuantitativePerQueryRow,
-	QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob, RealWorldReport, ReportSummary,
-	Result, eyre, formatting, fs, scoring,
+	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, JobReport, Path,
+	PathBuf, QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls,
+	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval,
+	QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, eyre, formatting, fs, scoring,
 };
 
+use product_manifest::quantitative_product_manifest;
+
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
 const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str =
 	"elf.agent_memory_quantitative_product_manifest/v1";
@@ -172,89 +176,6 @@ pub(super) fn quantitative_scoreboard_report(
 	})
 }
 
-pub(super) fn quantitative_product_manifest_from_report(
-	report: &RealWorldReport,
-	args: &ExportQuantitativeProductManifestArgs,
-) -> Result<QuantitativeProductManifest> {
-	if report.schema != REPORT_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {REPORT_SCHEMA}.",
-			args.report.display(),
-			report.schema
-		));
-	}
-
-	let source_row =
-		report.quantitative_scoreboard.rows.first().ok_or_else(|| {
-			eyre::eyre!("{} has no quantitative product row.", args.report.display())
-		})?;
-	let source_product = source_row.product.as_str();
-	let source_adapter_id = source_row.adapter_id.as_str();
-	let product = args.product.as_deref().unwrap_or(source_product).trim();
-	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
-	let adapter_name =
-		args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim();
-
-	if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() {
-		return Err(eyre::eyre!(
-			"{} cannot export an incomplete quantitative product identity.",
-			args.report.display()
-		));
-	}
-	if product == "ELF" {
-		return Err(eyre::eyre!(
-			"{} exports product ELF; use --product for external product manifest exports.",
-			args.report.display()
-		));
-	}
-
-	let mut row = source_row.clone();
-
-	row.product = product.to_string();
-	row.adapter_id = adapter_id.to_string();
-	row.adapter_name = adapter_name.to_string();
-	row.claim_boundary = concat!(
-		"Exported from a generated real_world_job_report quantitative row; ",
-		"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
-	)
-	.to_string();
-
-	let mut per_query_rows = Vec::new();
-
-	for row in &report.quantitative_scoreboard.per_query_rows {
-		if row.product != source_product || row.adapter_id != source_adapter_id {
-			continue;
-		}
-
-		let mut row = row.clone();
-
-		row.product = product.to_string();
-		row.adapter_id = adapter_id.to_string();
-		row.claim_boundary = concat!(
-			"Exported from generated report per-query quantitative evidence; ",
-			"import does not relax paired-significance or leaderboard gates."
-		)
-		.to_string();
-
-		per_query_rows.push(row);
-	}
-
-	let manifest = QuantitativeProductManifest {
-		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
-		manifest_id: args
-			.manifest_id
-			.clone()
-			.unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)),
-		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
-		rows: vec![row],
-		per_query_rows,
-	};
-
-	validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?;
-
-	Ok(manifest)
-}
-
 pub(super) fn quantitative_audit_manifest_from_jobs(
 	jobs: &[RealWorldJob],
 	args: &ExportQuantitativeAuditManifestArgs,
@@ -552,184 +473,6 @@ fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path:
 	manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path)
 }
 
-fn quantitative_product_manifest(
-	path: Option<&Path>,
-	corpus_id: &str,
-) -> Result<QuantitativeProductManifest> {
-	let Some(path) = path else {
-		return Ok(QuantitativeProductManifest::default());
-	};
-	let raw = fs::read_to_string(path)?;
-	let mut manifest =
-		serde_json::from_str::<QuantitativeProductManifest>(&raw).map_err(|err| {
-			eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display())
-		})?;
-
-	for row in &mut manifest.rows {
-		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
-	}
-	for row in &mut manifest.per_query_rows {
-		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
-	}
-
-	validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
-
-	Ok(manifest)
-}
-
-fn validate_quantitative_product_manifest(
-	manifest: &QuantitativeProductManifest,
-	path: &Path,
-	corpus_id: &str,
-) -> Result<()> {
-	if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.",
-			path.display(),
-			manifest.schema
-		));
-	}
-	if manifest.manifest_id.trim().is_empty() {
-		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
-	}
-	if manifest.corpus_id != corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected same-corpus {}.",
-			path.display(),
-			manifest.corpus_id,
-			corpus_id
-		));
-	}
-	if manifest.rows.is_empty() {
-		return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display()));
-	}
-
-	let row_keys = manifest
-		.rows
-		.iter()
-		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
-		.collect::<BTreeSet<_>>();
-
-	for row in &manifest.rows {
-		if row.product == "ELF" {
-			return Err(eyre::eyre!(
-				"{} quantitative product manifest must not inject ELF self rows.",
-				path.display()
-			));
-		}
-		if row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.adapter_name.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative product row.",
-				path.display()
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-		if row.leaderboard_eligible {
-			validate_leaderboard_eligible_product_row(path, row)?;
-		}
-	}
-	for row in &manifest.per_query_rows {
-		if row.job_id.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-			|| row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.qrel_source.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative per-query product row.",
-				path.display()
-			));
-		}
-		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} has no matching product row.",
-				path.display(),
-				row.product,
-				row.adapter_id
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-	}
-	for row in &manifest.rows {
-		if row.ranking_query_count == 0 {
-			continue;
-		}
-
-		let per_query_count = manifest
-			.per_query_rows
-			.iter()
-			.filter(|per_query| {
-				per_query.product == row.product && per_query.adapter_id == row.adapter_id
-			})
-			.count();
-
-		if per_query_count < row.ranking_query_count {
-			return Err(eyre::eyre!(
-				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				row.ranking_query_count,
-				per_query_count
-			));
-		}
-	}
-
-	Ok(())
-}
-
-fn validate_leaderboard_eligible_product_row(
-	path: &Path,
-	row: &QuantitativeBenchmarkRow,
-) -> Result<()> {
-	let has_audit_manifest_id = row
-		.audit_manifest_id
-		.as_deref()
-		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
-
-	if row.evidence_class != "live_real_world"
-		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
-		|| row.ranking_query_count != row.sample_size
-		|| row.explicit_qrel_query_count != row.ranking_query_count
-		|| !row.held_out
-		|| !row.leakage_audited
-		|| !has_audit_manifest_id
-	{
-		return Err(eyre::eyre!(
-			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
-			path.display(),
-			row.product,
-			row.adapter_id
-		));
-	}
-
-	Ok(())
-}
-
 fn quantitative_metrics_not_encoded(
 	imported_row_count: usize,
 	imported_per_query_count: usize,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
new file mode 100644
index 00000000..ed3844d4
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
@@ -0,0 +1,267 @@
+use crate::{
+	BTreeSet, ExportQuantitativeProductManifestArgs, Path, QuantitativeBenchmarkRow,
+	QuantitativeProductManifest, REPORT_SCHEMA, RealWorldReport, Result, eyre, fs,
+};
+
+use super::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA};
+
+pub(crate) fn quantitative_product_manifest_from_report(
+	report: &RealWorldReport,
+	args: &ExportQuantitativeProductManifestArgs,
+) -> Result<QuantitativeProductManifest> { // Builds a one-row product manifest from the report's first quantitative row, relabelled with any overrides in `args`.
+	if report.schema != REPORT_SCHEMA { // reports written under any other schema are rejected up front
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {REPORT_SCHEMA}.",
+			args.report.display(),
+			report.schema
+		));
+	}
+
+	let source_row = // only the first scoreboard row is exported; an empty scoreboard is an error
+		report.quantitative_scoreboard.rows.first().ok_or_else(|| {
+			eyre::eyre!("{} has no quantitative product row.", args.report.display())
+		})?;
+	let source_product = source_row.product.as_str();
+	let source_adapter_id = source_row.adapter_id.as_str();
+	let product = args.product.as_deref().unwrap_or(source_product).trim(); // an override wins over the source value; whichever is used is trimmed
+	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
+	let adapter_name =
+		args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim();
+
+	if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() {
+		return Err(eyre::eyre!(
+			"{} cannot export an incomplete quantitative product identity.",
+			args.report.display()
+		));
+	}
+	if product == "ELF" { // exact, case-sensitive comparison against the literal "ELF"
+		return Err(eyre::eyre!(
+			"{} exports product ELF; use --product for external product manifest exports.",
+			args.report.display()
+		));
+	}
+
+	let mut row = source_row.clone(); // owned copy that is relabelled below
+
+	row.product = product.to_string();
+	row.adapter_id = adapter_id.to_string();
+	row.adapter_name = adapter_name.to_string();
+	row.claim_boundary = concat!(
+		"Exported from a generated real_world_job_report quantitative row; ",
+		"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
+	)
+	.to_string();
+
+	let mut per_query_rows = Vec::new();
+
+	for row in &report.quantitative_scoreboard.per_query_rows {
+		if row.product != source_product || row.adapter_id != source_adapter_id { // matched on the pre-override identity of the source row
+			continue;
+		}
+
+		let mut row = row.clone(); // shadows the loop binding with an owned copy
+
+		row.product = product.to_string();
+		row.adapter_id = adapter_id.to_string();
+		row.claim_boundary = concat!(
+			"Exported from generated report per-query quantitative evidence; ",
+			"import does not relax paired-significance or leaderboard gates."
+		)
+		.to_string();
+
+		per_query_rows.push(row);
+	}
+
+	let manifest = QuantitativeProductManifest {
+		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), // default id is derived from the report's run_id
+		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
+		rows: vec![row],
+		per_query_rows,
+	};
+
+	validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?; // NOTE(review): rows are not back-filled with source_manifest_corpus_id here (the import path does that), yet validation requires it to equal corpus_id - confirm report rows always carry it
+
+	Ok(manifest)
+}
+
+pub(super) fn quantitative_product_manifest(
+	path: Option<&Path>,
+	corpus_id: &str,
+) -> Result<QuantitativeProductManifest> { // Loads and validates an optional product manifest file against the expected `corpus_id`.
+	let Some(path) = path else {
+		return Ok(QuantitativeProductManifest::default()); // no path supplied: return the default manifest without validating it
+	};
+	let raw = fs::read_to_string(path)?; // read errors propagate as-is; only parse errors get the path-bearing message below
+	let mut manifest =
+		serde_json::from_str::<QuantitativeProductManifest>(&raw).map_err(|err| {
+			eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display())
+		})?;
+
+	for row in &mut manifest.rows {
+		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); // rows that omit the field inherit the manifest-level corpus_id; explicit values are kept
+	}
+	for row in &mut manifest.per_query_rows {
+		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); // same back-fill for per-query rows
+	}
+
+	validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
+
+	Ok(manifest)
+}
+
+fn validate_quantitative_product_manifest(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> { // Checks the manifest header, each product row, each per-query row, then per-row query coverage; returns the first failure. `path` is used only in error text.
+	if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() {
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.corpus_id != corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected same-corpus {}.",
+			path.display(),
+			manifest.corpus_id,
+			corpus_id
+		));
+	}
+	if manifest.rows.is_empty() {
+		return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display()));
+	}
+
+	let row_keys = manifest // (product, adapter_id) pairs, used below to match per-query rows to a product row
+		.rows
+		.iter()
+		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
+		.collect::<BTreeSet<_>>();
+
+	for row in &manifest.rows {
+		if row.product == "ELF" { // exact, case-sensitive comparison against the literal "ELF"
+			return Err(eyre::eyre!(
+				"{} quantitative product manifest must not inject ELF self rows.",
+				path.display()
+			));
+		}
+		if row.product.trim().is_empty() // whitespace-only values count as missing
+			|| row.adapter_id.trim().is_empty()
+			|| row.adapter_name.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative product row.",
+				path.display()
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { // a missing (None) value fails as well as a different one
+			return Err(eyre::eyre!(
+				"{} row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+		if row.leaderboard_eligible { // the leaderboard controls are only enforced on rows that claim eligibility
+			validate_leaderboard_eligible_product_row(path, row)?;
+		}
+	}
+	for row in &manifest.per_query_rows {
+		if row.job_id.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+			|| row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.qrel_source.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative per-query product row.",
+				path.display()
+			));
+		}
+		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { // every per-query row must belong to a declared product row
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} has no matching product row.",
+				path.display(),
+				row.product,
+				row.adapter_id
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+	}
+	for row in &manifest.rows {
+		if row.ranking_query_count == 0 { // rows that declare no ranked queries need no per-query rows
+			continue;
+		}
+
+		let per_query_count = manifest
+			.per_query_rows
+			.iter()
+			.filter(|per_query| {
+				per_query.product == row.product && per_query.adapter_id == row.adapter_id
+			})
+			.count();
+
+		if per_query_count < row.ranking_query_count { // only a shortfall is an error; extra per-query rows are tolerated
+			return Err(eyre::eyre!(
+				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				row.ranking_query_count,
+				per_query_count
+			));
+		}
+	}
+
+	Ok(())
+}
+
+fn validate_leaderboard_eligible_product_row(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+) -> Result<()> { // Rejects a row unless every leaderboard control below holds; the caller invokes this only for rows marked leaderboard_eligible.
+	let has_audit_manifest_id = row // true only when audit_manifest_id is present and not blank
+		.audit_manifest_id
+		.as_deref()
+		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
+
+	if row.evidence_class != "live_real_world" // any single failing control rejects the row
+		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
+		|| row.ranking_query_count != row.sample_size // every sampled query must be a ranked query
+		|| row.explicit_qrel_query_count != row.ranking_query_count // and every ranked query must have explicit qrels
+		|| !row.held_out
+		|| !row.leakage_audited
+		|| !has_audit_manifest_id
+	{
+		return Err(eyre::eyre!(
+			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
+			path.display(),
+			row.product,
+			row.adapter_id
+		));
+	}
+
+	Ok(())
+}

From 5b60c392dd6b318ad8fd7b2b7fe3420bb0c25387 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:20:36 -0400
Subject: [PATCH 08/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit manifests","authority":"manual"}

---
 .../real_world_job_benchmark/quantitative.rs  | 404 +-----------------
 .../quantitative/audit_manifest.rs            | 404 ++++++++++++++++++
 2 files changed, 411 insertions(+), 397 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index 80fd746d..ec62228f 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -1,16 +1,18 @@
-use std::env;
-
+mod audit_manifest;
 mod product_manifest;
 
+pub(super) use audit_manifest::quantitative_audit_manifest_from_jobs;
 pub(super) use product_manifest::quantitative_product_manifest_from_report;
 
 use crate::{
-	AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, JobReport, Path,
-	PathBuf, QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls,
+	AdapterReport, BTreeMap, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls,
 	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval,
-	QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, eyre, formatting, fs, scoring,
+	QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, formatting, scoring,
 };
 
+use audit_manifest::{
+	QuantitativeAuditContext, QuantitativeAuditEvidence, quantitative_audit_evidence,
+};
 use product_manifest::quantitative_product_manifest;
 
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
@@ -41,22 +43,6 @@ pub(super) struct QuantitativeReportInput<'a> {
 	pub(super) audit_manifest_path: Option<&'a Path>,
 }
 
-struct QuantitativeAuditContext<'a> {
-	run_id: &'a str,
-	corpus_id: &'a str,
-	product: &'a str,
-	adapter_id: &'a str,
-	source_jobs: &'a [RealWorldJob],
-	ranking_query_count: usize,
-	explicit_qrel_query_count: usize,
-}
-
-struct QuantitativeAuditEvidence {
-	held_out: bool,
-	leakage_audited: bool,
-	audit_manifest_id: Option<String>,
-}
-
 pub(super) fn quantitative_scoreboard_report(
 	input: QuantitativeReportInput<'_>,
 ) -> Result<QuantitativeBenchmarkReport> {
@@ -176,303 +162,6 @@ pub(super) fn quantitative_scoreboard_report(
 	})
 }
 
-pub(super) fn quantitative_audit_manifest_from_jobs(
-	jobs: &[RealWorldJob],
-	args: &ExportQuantitativeAuditManifestArgs,
-) -> Result<QuantitativeAuditManifest> {
-	let product = args.product.trim();
-	let adapter_id = args.adapter_id.trim();
-
-	if product.is_empty() || adapter_id.is_empty() {
-		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
-	}
-
-	let corpus_id = quantitative_corpus_id(jobs);
-	let ranking_query_count = ranking_query_count(jobs);
-	let explicit_qrel_query_count = explicit_qrel_query_count(jobs);
-	let manifest = QuantitativeAuditManifest {
-		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
-		manifest_id: args
-			.manifest_id
-			.clone()
-			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)),
-		run_id: args.run_id.clone(),
-		corpus_id,
-		product: product.to_string(),
-		adapter_id: adapter_id.to_string(),
-		held_out: args.held_out,
-		leakage_audited: args.leakage_audited,
-		sample_size: jobs.len(),
-		ranking_query_count,
-		explicit_qrel_query_count,
-		query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
-		controls: args.controls.clone(),
-		artifacts: vec![QuantitativeAuditArtifact {
-			role: "product_runtime_fixtures".to_string(),
-			path: audit_artifact_display_path(args.fixtures.as_path()),
-			sha256: fixture_path_digest(args.fixtures.as_path())?,
-		}],
-		claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| {
-			if args.held_out || args.leakage_audited {
-				concat!(
-					"Audit manifest supplied by operator; runner validates run/corpus/product/",
-					"adapter/count/query-id/artifact bindings before opening row gates."
-				)
-				.to_string()
-			} else {
-				concat!(
-					"Diagnostic audit manifest binds the current product-runtime fixture set to ",
-					"query ids and counts, but it does not prove held-out or leakage-audited status."
-				)
-				.to_string()
-			}
-		}),
-	};
-
-	validate_quantitative_audit_manifest(
-		&manifest,
-		args.fixtures.as_path(),
-		QuantitativeAuditContext {
-			run_id: args.run_id.as_str(),
-			corpus_id: manifest.corpus_id.as_str(),
-			product,
-			adapter_id,
-			source_jobs: jobs,
-			ranking_query_count: manifest.ranking_query_count,
-			explicit_qrel_query_count: manifest.explicit_qrel_query_count,
-		},
-	)?;
-
-	Ok(manifest)
-}
-
-fn quantitative_audit_evidence(
-	path: Option<&Path>,
-	context: QuantitativeAuditContext<'_>,
-) -> Result<QuantitativeAuditEvidence> {
-	let Some(path) = path else {
-		return Ok(QuantitativeAuditEvidence {
-			held_out: false,
-			leakage_audited: false,
-			audit_manifest_id: None,
-		});
-	};
-	let raw = fs::read_to_string(path)?;
-	let manifest = serde_json::from_str::<QuantitativeAuditManifest>(&raw).map_err(|err| {
-		eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display())
-	})?;
-
-	validate_quantitative_audit_manifest(&manifest, path, context)?;
-
-	Ok(QuantitativeAuditEvidence {
-		held_out: manifest.held_out,
-		leakage_audited: manifest.leakage_audited,
-		audit_manifest_id: Some(manifest.manifest_id),
-	})
-}
-
-fn validate_quantitative_audit_manifest(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-	context: QuantitativeAuditContext<'_>,
-) -> Result<()> {
-	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
-			path.display(),
-			manifest.schema
-		));
-	}
-	if manifest.manifest_id.trim().is_empty() {
-		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
-	}
-	if manifest.run_id != context.run_id {
-		return Err(eyre::eyre!(
-			"{} has run_id {}, expected {}.",
-			path.display(),
-			manifest.run_id,
-			context.run_id
-		));
-	}
-	if manifest.corpus_id != context.corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected {}.",
-			path.display(),
-			manifest.corpus_id,
-			context.corpus_id
-		));
-	}
-	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
-		return Err(eyre::eyre!(
-			"{} has product {}:{} but current row is {}:{}.",
-			path.display(),
-			manifest.product,
-			manifest.adapter_id,
-			context.product,
-			context.adapter_id
-		));
-	}
-	if manifest.sample_size != context.source_jobs.len() {
-		return Err(eyre::eyre!(
-			"{} has sample_size {}, expected {}.",
-			path.display(),
-			manifest.sample_size,
-			context.source_jobs.len()
-		));
-	}
-	if manifest.ranking_query_count != context.ranking_query_count {
-		return Err(eyre::eyre!(
-			"{} has ranking_query_count {}, expected {}.",
-			path.display(),
-			manifest.ranking_query_count,
-			context.ranking_query_count
-		));
-	}
-	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
-		return Err(eyre::eyre!(
-			"{} has explicit_qrel_query_count {}, expected {}.",
-			path.display(),
-			manifest.explicit_qrel_query_count,
-			context.explicit_qrel_query_count
-		));
-	}
-
-	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
-	validate_quantitative_audit_controls(manifest, path)?;
-
-	validate_quantitative_audit_artifacts(manifest, path)
-}
-
-fn validate_quantitative_audit_query_ids(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-	source_jobs: &[RealWorldJob],
-) -> Result<()> {
-	let expected = ranking_query_ids(source_jobs);
-	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
-
-	if actual.len() != manifest.query_ids.len() {
-		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display()));
-	}
-	if actual != expected {
-		let missing = expected.difference(&actual).copied().collect::<Vec<_>>();
-		let extra = actual.difference(&expected).copied().collect::<Vec<_>>();
-
-		return Err(eyre::eyre!(
-			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
-			path.display(),
-			missing,
-			extra
-		));
-	}
-
-	Ok(())
-}
-
-fn validate_quantitative_audit_controls(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-) -> Result<()> {
-	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>();
-
-	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) {
-		return Err(eyre::eyre!(
-			"{} marks held_out=true without required control {}.",
-			path.display(),
-			REQUIRED_HELD_OUT_AUDIT_CONTROL
-		));
-	}
-	if manifest.leakage_audited
-		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
-			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
-	{
-		return Err(eyre::eyre!(
-			"{} marks leakage_audited=true without required controls {} and {}.",
-			path.display(),
-			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
-			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
-		));
-	}
-	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty()
-	{
-		return Err(eyre::eyre!(
-			"{} marks audit controls true but has an empty claim_boundary.",
-			path.display()
-		));
-	}
-
-	Ok(())
-}
-
-fn validate_quantitative_audit_artifacts(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-) -> Result<()> {
-	if manifest.artifacts.is_empty() {
-		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
-	}
-
-	for artifact in &manifest.artifacts {
-		if artifact.role.trim().is_empty()
-			|| artifact.path.trim().is_empty()
-			|| artifact.sha256.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative audit artifact.",
-				path.display()
-			));
-		}
-		if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit())
-		{
-			return Err(eyre::eyre!(
-				"{} artifact {} has invalid sha256 digest {}.",
-				path.display(),
-				artifact.role,
-				artifact.sha256
-			));
-		}
-
-		let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
-		let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| {
-			eyre::eyre!(
-				"{} artifact {} could not be digested at {}: {err}",
-				path.display(),
-				artifact.role,
-				artifact_path.display()
-			)
-		})?;
-
-		if actual != artifact.sha256 {
-			return Err(eyre::eyre!(
-				"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
-				path.display(),
-				artifact.role,
-				artifact_path.display(),
-				artifact.sha256,
-				actual
-			));
-		}
-	}
-
-	Ok(())
-}
-
-fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf {
-	let raw = PathBuf::from(artifact_path);
-
-	if raw.is_absolute() {
-		return raw;
-	}
-
-	let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone());
-
-	if cwd_path.exists() {
-		return cwd_path;
-	}
-
-	manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path)
-}
-
 fn quantitative_metrics_not_encoded(
 	imported_row_count: usize,
 	imported_per_query_count: usize,
@@ -918,85 +607,6 @@ fn quantitative_row_leaderboard_eligible(
 			.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty())
 }
 
-fn fixture_path_digest(path: &Path) -> Result<String> {
-	let mut hasher = blake3::Hasher::new();
-
-	if path.is_file() {
-		hash_fixture_file(
-			path,
-			path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"),
-			&mut hasher,
-		)?;
-
-		return Ok(hasher.finalize().to_hex().to_string());
-	}
-
-	let paths = audit_fixture_paths(path)?;
-
-	for fixture in paths {
-		let relative = fixture
-			.strip_prefix(path)
-			.map(|relative| relative.to_string_lossy().replace('\\', "/"))
-			.unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/"));
-
-		hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?;
-	}
-
-	Ok(hasher.finalize().to_hex().to_string())
-}
-
-fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
-	let mut paths = Vec::new();
-
-	collect_audit_fixture_paths(path, &mut paths)?;
-
-	paths.sort();
-
-	Ok(paths)
-}
-
-fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
-	if path.is_file() {
-		paths.push(path.to_path_buf());
-
-		return Ok(());
-	}
-
-	for entry in fs::read_dir(path)? {
-		let entry_path = entry?.path();
-
-		if entry_path.is_dir() {
-			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
-		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
-			paths.push(entry_path);
-		}
-	}
-
-	Ok(())
-}
-
-fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
-	hasher.update(logical_path.as_bytes());
-	hasher.update(b"\0");
-	hasher.update(&fs::read(path)?);
-	hasher.update(b"\0");
-
-	Ok(())
-}
-
-fn audit_artifact_display_path(path: &Path) -> String {
-	let display_path = if path.is_absolute() {
-		env::current_dir()
-			.ok()
-			.and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf))
-			.unwrap_or_else(|| path.to_path_buf())
-	} else {
-		path.to_path_buf()
-	};
-
-	display_path.to_string_lossy().replace('\\', "/")
-}
-
 fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
 	source_jobs
 		.iter()
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
new file mode 100644
index 00000000..dbdb861d
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
@@ -0,0 +1,404 @@
+use std::env;
+
+use crate::{
+	BTreeSet, ExportQuantitativeAuditManifestArgs, Path, PathBuf, QuantitativeAuditArtifact,
+	QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs,
+};
+
+use super::{
+	QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
+	REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+	explicit_qrel_query_count, quantitative_corpus_id, ranking_query_count, ranking_query_ids,
+};
+
+pub(super) struct QuantitativeAuditContext<'a> {
+	pub(super) run_id: &'a str,
+	pub(super) corpus_id: &'a str,
+	pub(super) product: &'a str,
+	pub(super) adapter_id: &'a str, // run_id, corpus_id, product and adapter_id must each equal the audit manifest's value
+	pub(super) source_jobs: &'a [RealWorldJob], // its length is compared with the manifest's sample_size, its ranked query ids with query_ids
+	pub(super) ranking_query_count: usize, // expected value of the manifest's ranking_query_count
+	pub(super) explicit_qrel_query_count: usize, // expected value of the manifest's explicit_qrel_query_count
+}
+
+pub(super) struct QuantitativeAuditEvidence {
+	pub(super) held_out: bool, // copied from a validated audit manifest; false when no manifest path was supplied
+	pub(super) leakage_audited: bool, // copied from a validated audit manifest; false when no manifest path was supplied
+	pub(super) audit_manifest_id: Option<String>, // None when no manifest path was supplied
+}
+
+pub(crate) fn quantitative_audit_manifest_from_jobs(
+	jobs: &[RealWorldJob],
+	args: &ExportQuantitativeAuditManifestArgs,
+) -> Result<QuantitativeAuditManifest> { // Builds an audit manifest for `jobs` from `args` and validates it before returning it.
+	let product = args.product.trim();
+	let adapter_id = args.adapter_id.trim();
+
+	if product.is_empty() || adapter_id.is_empty() { // both identifiers are required once trimmed
+		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
+	}
+
+	let corpus_id = quantitative_corpus_id(jobs);
+	let ranking_query_count = ranking_query_count(jobs);
+	let explicit_qrel_query_count = explicit_qrel_query_count(jobs);
+	let manifest = QuantitativeAuditManifest {
+		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), // default id is derived from args.run_id
+		run_id: args.run_id.clone(),
+		corpus_id,
+		product: product.to_string(),
+		adapter_id: adapter_id.to_string(),
+		held_out: args.held_out, // both flags are taken from args as given; validation below only requires the matching control names
+		leakage_audited: args.leakage_audited,
+		sample_size: jobs.len(),
+		ranking_query_count,
+		explicit_qrel_query_count,
+		query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
+		controls: args.controls.clone(),
+		artifacts: vec![QuantitativeAuditArtifact {
+			role: "product_runtime_fixtures".to_string(),
+			path: audit_artifact_display_path(args.fixtures.as_path()),
+			sha256: fixture_path_digest(args.fixtures.as_path())?, // NOTE(review): fixture_path_digest hashes with blake3, despite this field's name
+		}],
+		claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| { // default wording depends on whether either audit flag is set
+			if args.held_out || args.leakage_audited {
+				concat!(
+					"Audit manifest supplied by operator; runner validates run/corpus/product/",
+					"adapter/count/query-id/artifact bindings before opening row gates."
+				)
+				.to_string()
+			} else {
+				concat!(
+					"Diagnostic audit manifest binds the current product-runtime fixture set to ",
+					"query ids and counts, but it does not prove held-out or leakage-audited status."
+				)
+				.to_string()
+			}
+		}),
+	};
+
+	validate_quantitative_audit_manifest(
+		&manifest,
+		args.fixtures.as_path(), // stands in for the manifest path: used in error text and as the fallback base for relative artifact paths
+		QuantitativeAuditContext {
+			run_id: args.run_id.as_str(),
+			corpus_id: manifest.corpus_id.as_str(), // the context mirrors the manifest just built, so the header comparisons are self-checks
+			product,
+			adapter_id,
+			source_jobs: jobs,
+			ranking_query_count: manifest.ranking_query_count,
+			explicit_qrel_query_count: manifest.explicit_qrel_query_count,
+		},
+	)?;
+
+	Ok(manifest)
+}
+
+pub(super) fn quantitative_audit_evidence(
+	path: Option<&Path>,
+	context: QuantitativeAuditContext<'_>,
+) -> Result<QuantitativeAuditEvidence> { // Loads audit evidence from an optional manifest file and validates it against `context`.
+	let Some(path) = path else {
+		return Ok(QuantitativeAuditEvidence { // no manifest supplied: both flags false and no manifest id
+			held_out: false,
+			leakage_audited: false,
+			audit_manifest_id: None,
+		});
+	};
+	let raw = fs::read_to_string(path)?; // read errors propagate as-is; only parse errors get the path-bearing message below
+	let manifest = serde_json::from_str::<QuantitativeAuditManifest>(&raw).map_err(|err| {
+		eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display())
+	})?;
+
+	validate_quantitative_audit_manifest(&manifest, path, context)?; // the flags below are returned only if the manifest matches the current context
+
+	Ok(QuantitativeAuditEvidence {
+		held_out: manifest.held_out,
+		leakage_audited: manifest.leakage_audited,
+		audit_manifest_id: Some(manifest.manifest_id),
+	})
+}
+
+fn validate_quantitative_audit_manifest(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: QuantitativeAuditContext<'_>,
+) -> Result<()> { // Compares each manifest header field with `context`, then checks query ids, controls and artifacts; returns the first mismatch.
+	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() { // a whitespace-only id counts as empty
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.run_id != context.run_id {
+		return Err(eyre::eyre!(
+			"{} has run_id {}, expected {}.",
+			path.display(),
+			manifest.run_id,
+			context.run_id
+		));
+	}
+	if manifest.corpus_id != context.corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected {}.",
+			path.display(),
+			manifest.corpus_id,
+			context.corpus_id
+		));
+	}
+	if manifest.product != context.product || manifest.adapter_id != context.adapter_id { // compared exactly; the manifest values are not trimmed here
+		return Err(eyre::eyre!(
+			"{} has product {}:{} but current row is {}:{}.",
+			path.display(),
+			manifest.product,
+			manifest.adapter_id,
+			context.product,
+			context.adapter_id
+		));
+	}
+	if manifest.sample_size != context.source_jobs.len() { // sample_size is compared with the total number of source jobs
+		return Err(eyre::eyre!(
+			"{} has sample_size {}, expected {}.",
+			path.display(),
+			manifest.sample_size,
+			context.source_jobs.len()
+		));
+	}
+	if manifest.ranking_query_count != context.ranking_query_count {
+		return Err(eyre::eyre!(
+			"{} has ranking_query_count {}, expected {}.",
+			path.display(),
+			manifest.ranking_query_count,
+			context.ranking_query_count
+		));
+	}
+	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
+		return Err(eyre::eyre!(
+			"{} has explicit_qrel_query_count {}, expected {}.",
+			path.display(),
+			manifest.explicit_qrel_query_count,
+			context.explicit_qrel_query_count
+		));
+	}
+
+	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
+	validate_quantitative_audit_controls(manifest, path)?;
+
+	validate_quantitative_audit_artifacts(manifest, path) // tail expression: the artifact check's result is this function's result
+}
+
+fn validate_quantitative_audit_query_ids( // Requires manifest.query_ids to be duplicate-free and equal, as a set, to ranking_query_ids(source_jobs).
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	source_jobs: &[RealWorldJob],
+) -> Result<()> {
+	let expected = ranking_query_ids(source_jobs);
+	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
+
+	if actual.len() != manifest.query_ids.len() {
+		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); // the set collapsed duplicates, so the lengths differ
+	}
+	if actual != expected { // set comparison: the order of query_ids in the manifest does not matter
+		let missing = expected.difference(&actual).copied().collect::<Vec<_>>(); // expected but absent from the manifest
+		let extra = actual.difference(&expected).copied().collect::<Vec<_>>(); // listed in the manifest but not expected
+
+		return Err(eyre::eyre!(
+			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
+			path.display(),
+			missing,
+			extra
+		));
+	}
+
+	Ok(())
+}
+
+fn validate_quantitative_audit_controls(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> { // Ensures each asserted audit flag is backed by its required control names and by a non-blank claim_boundary.
+	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>(); // set for exact-match lookups of control names
+
+	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { // held_out needs its one required control
+		return Err(eyre::eyre!(
+			"{} marks held_out=true without required control {}.",
+			path.display(),
+			REQUIRED_HELD_OUT_AUDIT_CONTROL
+		));
+	}
+	if manifest.leakage_audited // leakage_audited needs both the qrel and the candidate leakage controls
+		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
+			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
+	{
+		return Err(eyre::eyre!(
+			"{} marks leakage_audited=true without required controls {} and {}.",
+			path.display(),
+			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
+		));
+	}
+	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() // a blank claim_boundary is allowed only when neither flag is set
+	{
+		return Err(eyre::eyre!(
+			"{} marks audit controls true but has an empty claim_boundary.",
+			path.display()
+		));
+	}
+
+	Ok(())
+}
+
+fn validate_quantitative_audit_artifacts(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> { // Requires at least one artifact; each must be complete, carry a 64-hex-character digest, and match the digest recomputed from disk.
+	if manifest.artifacts.is_empty() {
+		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
+	}
+
+	for artifact in &manifest.artifacts {
+		if artifact.role.trim().is_empty() // whitespace-only values count as missing
+			|| artifact.path.trim().is_empty()
+			|| artifact.sha256.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative audit artifact.",
+				path.display()
+			));
+		}
+		if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) // NOTE(review): shape check only; the digest compared below is produced by blake3 in fixture_path_digest, despite the sha256 naming
+		{
+			return Err(eyre::eyre!(
+				"{} artifact {} has invalid sha256 digest {}.",
+				path.display(),
+				artifact.role,
+				artifact.sha256
+			));
+		}
+
+		let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); // relative paths resolve against the cwd first, then the manifest's directory
+		let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| {
+			eyre::eyre!(
+				"{} artifact {} could not be digested at {}: {err}",
+				path.display(),
+				artifact.role,
+				artifact_path.display()
+			)
+		})?;
+
+		if actual != artifact.sha256 { // exact string comparison, so hex letter case is significant
+			return Err(eyre::eyre!(
+				"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
+				path.display(),
+				artifact.role,
+				artifact_path.display(),
+				artifact.sha256,
+				actual
+			));
+		}
+	}
+
+	Ok(())
+}
+
+fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { // Resolves an artifact path: absolute as-is, else cwd-relative if that exists, else relative to the manifest's parent directory.
+	let raw = PathBuf::from(artifact_path);
+
+	if raw.is_absolute() {
+		return raw;
+	}
+
+	let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); // if the cwd cannot be read, the raw relative path is used as the candidate
+
+	if cwd_path.exists() { // the cwd candidate wins whenever it exists
+		return cwd_path;
+	}
+
+	manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) // no parent: fall back to the cwd candidate even though it does not exist
+}
+
+fn fixture_path_digest(path: &Path) -> Result<String> { // Returns a hex digest of one fixture file, or of every collected file under a directory in sorted path order.
+	let mut hasher = blake3::Hasher::new(); // NOTE(review): BLAKE3, although callers store and report the result as "sha256"
+
+	if path.is_file() {
+		hash_fixture_file(
+			path,
+			path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), // a missing or non-UTF-8 file name is hashed under the label "fixture"
+			&mut hasher,
+		)?;
+
+		return Ok(hasher.finalize().to_hex().to_string());
+	}
+
+	let paths = audit_fixture_paths(path)?;
+
+	for fixture in paths {
+		let relative = fixture // label is the path relative to `path` with '\\' replaced by '/'; the full path is used if the prefix cannot be stripped
+			.strip_prefix(path)
+			.map(|relative| relative.to_string_lossy().replace('\\', "/"))
+			.unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/"));
+
+		hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?;
+	}
+
+	Ok(hasher.finalize().to_hex().to_string())
+}
+
+fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> { // Collects the fixture files under `path` and returns them sorted.
+	let mut paths = Vec::new();
+
+	collect_audit_fixture_paths(path, &mut paths)?;
+
+	paths.sort(); // fixed order, so the result does not depend on the order read_dir yields entries
+
+	Ok(paths)
+}
+
+fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> { // Recursively appends fixture files to `paths`; inside directories only files with a "json" extension are kept.
+	if path.is_file() { // a directly named file is accepted whatever its extension
+		paths.push(path.to_path_buf());
+
+		return Ok(());
+	}
+
+	for entry in fs::read_dir(path)? { // a path that is neither a file nor a readable directory surfaces as this read_dir error
+		let entry_path = entry?.path();
+
+		if entry_path.is_dir() {
+			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
+		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { // exact, case-sensitive extension match
+			paths.push(entry_path);
+		}
+	}
+
+	Ok(())
+}
+
+fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { // Feeds the label, a NUL byte, the file's bytes and another NUL byte into `hasher`.
+	hasher.update(logical_path.as_bytes()); // the label is hashed too, so renaming a file changes the digest
+	hasher.update(b"\0"); // separator between the label and the file contents
+	hasher.update(&fs::read(path)?); // the whole file is read into memory before hashing
+	hasher.update(b"\0"); // terminator after the file contents
+
+	Ok(())
+}
+
+fn audit_artifact_display_path(path: &Path) -> String { // Renders a path for the manifest: absolute paths under the cwd become cwd-relative, and '\\' is replaced by '/'.
+	let display_path = if path.is_absolute() {
+		env::current_dir()
+			.ok()
+			.and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf))
+			.unwrap_or_else(|| path.to_path_buf()) // cwd unavailable or path not under it: keep the absolute path
+	} else {
+		path.to_path_buf() // relative paths are kept as given
+	};
+
+	display_path.to_string_lossy().replace('\\', "/") // lossy conversion: non-UTF-8 path bytes are replaced
+}

From a083844af8fbbe1fef489cd9127d0ad09f2d8ac5 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:24:55 -0400
Subject: [PATCH 09/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative metric helpers","authority":"manual"}

---
 .../real_world_job_benchmark/quantitative.rs  | 499 +----------------
 .../quantitative/audit_manifest.rs            |  20 +-
 .../quantitative/metrics.rs                   | 503 ++++++++++++++++++
 .../quantitative/product_manifest.rs          |   3 +-
 4 files changed, 534 insertions(+), 491 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index ec62228f..16365e66 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -1,19 +1,17 @@
 mod audit_manifest;
+mod metrics;
 mod product_manifest;
 
-pub(super) use audit_manifest::quantitative_audit_manifest_from_jobs;
-pub(super) use product_manifest::quantitative_product_manifest_from_report;
-
-use crate::{
-	AdapterReport, BTreeMap, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls,
-	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval,
-	QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, formatting, scoring,
+pub(super) use self::{
+	audit_manifest::quantitative_audit_manifest_from_jobs,
+	product_manifest::quantitative_product_manifest_from_report,
 };
 
-use audit_manifest::{
-	QuantitativeAuditContext, QuantitativeAuditEvidence, quantitative_audit_evidence,
+use self::audit_manifest::{QuantitativeAuditContext, QuantitativeAuditEvidence};
+use crate::{
+	AdapterReport, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls,
+	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result,
 };
-use product_manifest::quantitative_product_manifest;
 
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
 const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str =
@@ -48,7 +46,7 @@ pub(super) fn quantitative_scoreboard_report(
 ) -> Result<QuantitativeBenchmarkReport> {
 	let corpus_id = quantitative_corpus_id(input.source_jobs);
 	let evidence_class = quantitative_evidence_class(input.adapter, input.jobs);
-	let per_query_rows = quantitative_per_query_rows(
+	let per_query_rows = metrics::quantitative_per_query_rows(
 		input.source_jobs,
 		input.jobs,
 		corpus_id.as_str(),
@@ -63,7 +61,7 @@ pub(super) fn quantitative_scoreboard_report(
 		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
 	let metric_comparable = ranking_query_count > 0;
 	let result_state = quantitative_result_state(input.summary);
-	let audit_evidence = quantitative_audit_evidence(
+	let audit_evidence = audit_manifest::quantitative_audit_evidence(
 		input.audit_manifest_path,
 		QuantitativeAuditContext {
 			run_id: input.run_id,
@@ -100,24 +98,26 @@ pub(super) fn quantitative_scoreboard_report(
 		fixture_regression_only: evidence_class == "fixture_backed",
 		sample_size: input.jobs.len(),
 		ranking_query_count,
-		ranking_coverage_state: ranking_coverage_state(
+		ranking_coverage_state: metrics::ranking_coverage_state(
 			input.summary,
 			input.source_jobs.len(),
 			ranking_query_count,
 		)
 		.to_string(),
-		ranked_candidate_source: ranked_candidate_source(ranking_query_count).to_string(),
-		qrel_source: aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
+		ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(),
+		qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
 			.to_string(),
 		explicit_qrel_query_count,
-		metrics: aggregate_metrics(per_query_rows.as_slice()),
-		metric_states: aggregate_metric_states(result_state, metric_comparable),
-		denominators: aggregate_denominators(per_query_rows.as_slice()),
-		confidence_intervals: aggregate_confidence_intervals(per_query_rows.as_slice()),
+		metrics: metrics::aggregate_metrics(per_query_rows.as_slice()),
+		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
+		denominators: metrics::aggregate_denominators(per_query_rows.as_slice()),
+		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()),
 		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
 	};
-	let product_manifest =
-		quantitative_product_manifest(input.product_manifest_path, corpus_id.as_str())?;
+	let product_manifest = product_manifest::quantitative_product_manifest(
+		input.product_manifest_path,
+		corpus_id.as_str(),
+	)?;
 	let imported_row_count = product_manifest.rows.len();
 	let imported_per_query_count = product_manifest.per_query_rows.len();
 	let mut rows = vec![row];
@@ -179,363 +179,6 @@ fn quantitative_metrics_not_encoded(
 	metrics
 }
 
-fn quantitative_per_query_rows(
-	source_jobs: &[RealWorldJob],
-	jobs: &[JobReport],
-	corpus_id: &str,
-	evidence_class: &str,
-	adapter_id: &str,
-) -> Vec<QuantitativePerQueryRow> {
-	source_jobs
-		.iter()
-		.zip(jobs.iter())
-		.map(|(source_job, job)| {
-			quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id)
-		})
-		.collect()
-}
-
-fn quantitative_per_query_row(
-	source_job: &RealWorldJob,
-	job: &JobReport,
-	corpus_id: &str,
-	evidence_class: &str,
-	adapter_id: &str,
-) -> QuantitativePerQueryRow {
-	let relevance = relevance_grades(source_job, job);
-	let candidates = scoring::produced_evidence_order(source_job);
-	let positive_relevance_count = positive_qrel_count(&relevance);
-	let metrics = per_query_metrics(candidates.as_slice(), &relevance);
-	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
-		"not_encoded"
-	} else {
-		formatting::status_str(job.status)
-	};
-	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
-	let denominators = per_query_denominators(candidates.len(), positive_relevance_count);
-
-	QuantitativePerQueryRow {
-		job_id: job.job_id.clone(),
-		suite: job.suite_id.clone(),
-		evidence_class: evidence_class.to_string(),
-		source_manifest_corpus_id: Some(corpus_id.to_string()),
-		result_state: formatting::status_str(job.status).to_string(),
-		expected_relevant_count: positive_relevance_count,
-		candidate_count: candidates.len(),
-		qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(),
-		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
-		product: "ELF".to_string(),
-		adapter_id: adapter_id.to_string(),
-		metrics,
-		metric_states,
-		denominators,
-		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
-	}
-}
-
-fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<String, f64> {
-	let explicit = source_job
-		.expected_answer
-		.relevance_judgments
-		.iter()
-		.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
-		.collect::<BTreeMap<_, _>>();
-
-	if !explicit.is_empty() {
-		return explicit;
-	}
-
-	job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
-}
-
-fn per_query_metrics(
-	candidates: &[String],
-	relevance: &BTreeMap<String, f64>,
-) -> BTreeMap<String, Option<f64>> {
-	let mut metrics = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		let relevant_at_k = relevant_at_k(candidates, relevance, *k);
-
-		metrics
-			.insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance)));
-		metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k));
-		metrics.insert(
-			format!("success_at_{k}"),
-			Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)),
-		);
-	}
-
-	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
-	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
-	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
-
-	metrics
-}
-
-fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
-	candidates
-		.iter()
-		.take(k)
-		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
-		.count()
-}
-
-fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	Some(
-		candidates
-			.iter()
-			.position(|candidate| {
-				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
-			})
-			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
-	)
-}
-
-fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	let dcg = candidates
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, candidate)| {
-			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
-				/ ((index + 2) as f64).log2()
-		})
-		.sum::<f64>();
-	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
-
-	ideal.sort_by(|left, right| right.total_cmp(left));
-
-	let idcg = ideal
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
-		.sum::<f64>();
-
-	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
-}
-
-fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	let positive_count = positive_qrel_count(relevance);
-
-	if positive_count == 0 {
-		return None;
-	}
-
-	let mut hit_count = 0;
-	let mut precision_sum = 0.0;
-	let mut seen = BTreeSet::new();
-
-	for (index, candidate) in candidates.iter().enumerate() {
-		if !seen.insert(candidate.as_str()) {
-			continue;
-		}
-		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
-			hit_count += 1;
-			precision_sum += hit_count as f64 / (index + 1) as f64;
-		}
-	}
-
-	Some(precision_sum / positive_count as f64)
-}
-
-fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
-	let mut sums = BTreeMap::<String, (f64, usize)>::new();
-	let mut metrics = quantitative_metric_names()
-		.into_iter()
-		.map(|metric| (metric, None))
-		.collect::<BTreeMap<_, _>>();
-
-	for row in rows {
-		for (metric, value) in &row.metrics {
-			if let Some(value) = value {
-				let (sum, count) = sums.entry(metric.clone()).or_default();
-
-				*sum += *value;
-				*count += 1;
-			}
-		}
-	}
-	for (metric, (sum, count)) in sums {
-		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
-	}
-
-	metrics
-}
-
-fn aggregate_metric_states(
-	result_state: &str,
-	metric_comparable: bool,
-) -> BTreeMap<String, String> {
-	let state = if metric_comparable { result_state } else { "not_encoded" };
-	let mut states = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		states.insert(format!("recall_at_{k}"), state.to_string());
-		states.insert(format!("precision_at_{k}"), state.to_string());
-		states.insert(format!("success_at_{k}"), state.to_string());
-	}
-	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
-		states.insert(metric.to_string(), state.to_string());
-	}
-
-	states
-}
-
-fn quantitative_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
-		metrics.push(metric.to_string());
-	}
-
-	metrics
-}
-
-fn per_query_denominators(
-	candidate_count: usize,
-	expected_relevant_count: usize,
-) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
-		denominators.insert(format!("precision_at_{k}"), *k);
-		denominators.insert(format!("success_at_{k}"), 1);
-	}
-
-	denominators.insert("mrr".to_string(), expected_relevant_count);
-	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
-	denominators.insert("average_precision".to_string(), expected_relevant_count);
-	denominators.insert("candidate_count".to_string(), candidate_count);
-
-	denominators
-}
-
-fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(
-			format!("recall_at_{k}"),
-			sum_per_query_denominator(rows, &format!("recall_at_{k}")),
-		);
-		denominators.insert(
-			format!("precision_at_{k}"),
-			sum_per_query_denominator(rows, &format!("precision_at_{k}")),
-		);
-		denominators.insert(
-			format!("success_at_{k}"),
-			sum_per_query_denominator(rows, &format!("success_at_{k}")),
-		);
-	}
-
-	denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr"));
-	denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5"));
-	denominators.insert(
-		"average_precision".to_string(),
-		sum_per_query_denominator(rows, "average_precision"),
-	);
-
-	denominators
-}
-
-fn aggregate_confidence_intervals(
-	rows: &[QuantitativePerQueryRow],
-) -> BTreeMap<String, QuantitativeConfidenceInterval> {
-	let mut confidence_intervals = BTreeMap::new();
-
-	for metric in rate_metric_names() {
-		let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str());
-
-		if denominator > 0 {
-			confidence_intervals.insert(
-				metric,
-				wilson_confidence_interval(numerator.min(denominator), denominator),
-			);
-		}
-	}
-
-	confidence_intervals
-}
-
-fn rate_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-
-	metrics
-}
-
-fn aggregate_rate_numerator_denominator(
-	rows: &[QuantitativePerQueryRow],
-	metric: &str,
-) -> (usize, usize) {
-	let mut numerator = 0;
-	let mut denominator = 0;
-
-	for row in rows {
-		let Some(value) = row.metrics.get(metric).and_then(|value| *value) else {
-			continue;
-		};
-		let Some(row_denominator) = row.denominators.get(metric).copied() else {
-			continue;
-		};
-
-		if row_denominator == 0 {
-			continue;
-		}
-
-		denominator += row_denominator;
-		numerator += (value * row_denominator as f64).round() as usize;
-	}
-
-	(numerator, denominator)
-}
-
-fn wilson_confidence_interval(
-	numerator: usize,
-	denominator: usize,
-) -> QuantitativeConfidenceInterval {
-	let n = denominator as f64;
-	let p = numerator as f64 / n;
-	let z2 = WILSON_95_Z * WILSON_95_Z;
-	let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n);
-	let half_width =
-		WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n);
-
-	QuantitativeConfidenceInterval {
-		method: "wilson_score".to_string(),
-		confidence: 0.95,
-		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
-		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
-		numerator,
-		denominator,
-	}
-}
-
-fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
-	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
-}
-
 fn quantitative_corpus_id(source_jobs: &[RealWorldJob]) -> String {
 	let ids = source_jobs.iter().map(|job| job.corpus.corpus_id.as_str()).collect::<BTreeSet<_>>();
 
@@ -606,103 +249,3 @@ fn quantitative_row_leaderboard_eligible(
 			.as_deref()
 			.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty())
 }
-
-fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
-	source_jobs
-		.iter()
-		.filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job))
-		.map(|job| job.job_id.as_str())
-		.collect()
-}
-
-fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	ranking_query_ids(source_jobs).len()
-}
-
-fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
-}
-
-fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
-	if !source_job.expected_answer.relevance_judgments.is_empty() {
-		return source_job
-			.expected_answer
-			.relevance_judgments
-			.iter()
-			.filter(|judgment| judgment.grade > 0.0)
-			.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
-			.collect();
-	}
-
-	source_job
-		.required_evidence
-		.iter()
-		.filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain"))
-		.map(|evidence| (evidence.evidence_id.clone(), 1.0))
-		.collect()
-}
-
-fn ranking_query_attempted(job: &RealWorldJob) -> bool {
-	if !scoring::produced_evidence_order(job).is_empty() {
-		return true;
-	}
-
-	let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else {
-		return false;
-	};
-
-	answer.trace_explainability.as_ref().is_some_and(|trace| {
-		trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
-	}) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0)
-}
-
-fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
-	if !source_job.expected_answer.relevance_judgments.is_empty() {
-		"explicit_qrels"
-	} else if empty {
-		"not_encoded"
-	} else {
-		"expected_evidence_fallback"
-	}
-}
-
-fn aggregate_qrel_source(
-	ranking_query_count: usize,
-	explicit_qrel_query_count: usize,
-) -> &'static str {
-	if ranking_query_count == 0 {
-		"not_encoded"
-	} else if explicit_qrel_query_count == ranking_query_count {
-		"explicit_qrels"
-	} else if explicit_qrel_query_count == 0 {
-		"expected_evidence_fallback"
-	} else {
-		"mixed"
-	}
-}
-
-fn ranking_coverage_state(
-	summary: &ReportSummary,
-	source_job_count: usize,
-	ranking_query_count: usize,
-) -> &'static str {
-	if ranking_query_count == 0 {
-		"not_encoded"
-	} else if ranking_query_count == source_job_count && summary.not_encoded == 0 {
-		"complete"
-	} else {
-		"partial_coverage"
-	}
-}
-
-fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
-	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
-}
-
-fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
-	relevance.values().filter(|grade| **grade > 0.0).count()
-}
-
-fn rate(numerator: usize, denominator: usize) -> Option<f64> {
-	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
index dbdb861d..be8b9e50 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
@@ -3,12 +3,10 @@ use std::env;
 use crate::{
 	BTreeSet, ExportQuantitativeAuditManifestArgs, Path, PathBuf, QuantitativeAuditArtifact,
 	QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs,
-};
-
-use super::{
-	QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
-	REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
-	explicit_qrel_query_count, quantitative_corpus_id, ranking_query_count, ranking_query_ids,
+	quantitative::{
+		QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
+		REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, metrics,
+	},
 };
 
 pub(super) struct QuantitativeAuditContext<'a> {
@@ -38,9 +36,9 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
 	}
 
-	let corpus_id = quantitative_corpus_id(jobs);
-	let ranking_query_count = ranking_query_count(jobs);
-	let explicit_qrel_query_count = explicit_qrel_query_count(jobs);
+	let corpus_id = super::quantitative_corpus_id(jobs);
+	let ranking_query_count = metrics::ranking_query_count(jobs);
+	let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs);
 	let manifest = QuantitativeAuditManifest {
 		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
 		manifest_id: args
@@ -56,7 +54,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 		sample_size: jobs.len(),
 		ranking_query_count,
 		explicit_qrel_query_count,
-		query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
+		query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
 		controls: args.controls.clone(),
 		artifacts: vec![QuantitativeAuditArtifact {
 			role: "product_runtime_fixtures".to_string(),
@@ -199,7 +197,7 @@ fn validate_quantitative_audit_query_ids(
 	path: &Path,
 	source_jobs: &[RealWorldJob],
 ) -> Result<()> {
-	let expected = ranking_query_ids(source_jobs);
+	let expected = metrics::ranking_query_ids(source_jobs);
 	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
 
 	if actual.len() != manifest.query_ids.len() {
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
new file mode 100644
index 00000000..e5377d7b
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
@@ -0,0 +1,503 @@
+use crate::{
+	BTreeMap, BTreeSet, JobReport, QuantitativeConfidenceInterval, QuantitativePerQueryRow,
+	RealWorldJob, ReportSummary, formatting,
+	quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY, WILSON_95_Z},
+	scoring,
+};
+
+pub(super) fn quantitative_per_query_rows(
+	source_jobs: &[RealWorldJob],
+	jobs: &[JobReport],
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> Vec<QuantitativePerQueryRow> {
+	source_jobs
+		.iter()
+		.zip(jobs.iter())
+		.map(|(source_job, job)| {
+			quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id)
+		})
+		.collect()
+}
+
+pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
+	aggregate_metrics_impl(rows)
+}
+
+pub(super) fn aggregate_metric_states(
+	result_state: &str,
+	metric_comparable: bool,
+) -> BTreeMap<String, String> {
+	aggregate_metric_states_impl(result_state, metric_comparable)
+}
+
+pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
+	aggregate_denominators_impl(rows)
+}
+
+pub(super) fn aggregate_confidence_intervals(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, QuantitativeConfidenceInterval> {
+	aggregate_confidence_intervals_impl(rows)
+}
+
+pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
+	ranking_query_ids_impl(source_jobs)
+}
+
+pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	ranking_query_ids(source_jobs).len()
+}
+
+pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
+}
+
+pub(super) fn aggregate_qrel_source(
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+) -> &'static str {
+	aggregate_qrel_source_impl(ranking_query_count, explicit_qrel_query_count)
+}
+
+pub(super) fn ranking_coverage_state(
+	summary: &ReportSummary,
+	source_job_count: usize,
+	ranking_query_count: usize,
+) -> &'static str {
+	ranking_coverage_state_impl(summary, source_job_count, ranking_query_count)
+}
+
+pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
+	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
+}
+
+fn quantitative_per_query_row(
+	source_job: &RealWorldJob,
+	job: &JobReport,
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> QuantitativePerQueryRow {
+	let relevance = relevance_grades(source_job, job);
+	let candidates = scoring::produced_evidence_order(source_job);
+	let positive_relevance_count = positive_qrel_count(&relevance);
+	let metrics = per_query_metrics(candidates.as_slice(), &relevance);
+	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
+		"not_encoded"
+	} else {
+		formatting::status_str(job.status)
+	};
+	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
+	let denominators = per_query_denominators(candidates.len(), positive_relevance_count);
+
+	QuantitativePerQueryRow {
+		job_id: job.job_id.clone(),
+		suite: job.suite_id.clone(),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.to_string()),
+		result_state: formatting::status_str(job.status).to_string(),
+		expected_relevant_count: positive_relevance_count,
+		candidate_count: candidates.len(),
+		qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(),
+		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
+		product: "ELF".to_string(),
+		adapter_id: adapter_id.to_string(),
+		metrics,
+		metric_states,
+		denominators,
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	}
+}
+
+fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<String, f64> {
+	let explicit = source_job
+		.expected_answer
+		.relevance_judgments
+		.iter()
+		.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
+		.collect::<BTreeMap<_, _>>();
+
+	if !explicit.is_empty() {
+		return explicit;
+	}
+
+	job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
+}
+
+fn per_query_metrics(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> BTreeMap<String, Option<f64>> {
+	let mut metrics = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		let relevant_at_k = relevant_at_k(candidates, relevance, *k);
+
+		metrics
+			.insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance)));
+		metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k));
+		metrics.insert(
+			format!("success_at_{k}"),
+			Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)),
+		);
+	}
+
+	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
+	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
+	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
+
+	metrics
+}
+
+fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
+	candidates
+		.iter()
+		.take(k)
+		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
+		.count()
+}
+
+fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	Some(
+		candidates
+			.iter()
+			.position(|candidate| {
+				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
+			})
+			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
+	)
+}
+
+fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	let dcg = candidates
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, candidate)| {
+			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
+				/ ((index + 2) as f64).log2()
+		})
+		.sum::<f64>();
+	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
+
+	ideal.sort_by(|left, right| right.total_cmp(left));
+
+	let idcg = ideal
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
+		.sum::<f64>();
+
+	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
+}
+
+fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	let positive_count = positive_qrel_count(relevance);
+
+	if positive_count == 0 {
+		return None;
+	}
+
+	let mut hit_count = 0;
+	let mut precision_sum = 0.0;
+	let mut seen = BTreeSet::new();
+
+	for (index, candidate) in candidates.iter().enumerate() {
+		if !seen.insert(candidate.as_str()) {
+			continue;
+		}
+		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
+			hit_count += 1;
+			precision_sum += hit_count as f64 / (index + 1) as f64;
+		}
+	}
+
+	Some(precision_sum / positive_count as f64)
+}
+
+fn aggregate_metrics_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
+	let mut sums = BTreeMap::<String, (f64, usize)>::new();
+	let mut metrics = quantitative_metric_names()
+		.into_iter()
+		.map(|metric| (metric, None))
+		.collect::<BTreeMap<_, _>>();
+
+	for row in rows {
+		for (metric, value) in &row.metrics {
+			if let Some(value) = value {
+				let (sum, count) = sums.entry(metric.clone()).or_default();
+
+				*sum += *value;
+				*count += 1;
+			}
+		}
+	}
+	for (metric, (sum, count)) in sums {
+		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
+	}
+
+	metrics
+}
+
+fn aggregate_metric_states_impl(
+	result_state: &str,
+	metric_comparable: bool,
+) -> BTreeMap<String, String> {
+	let state = if metric_comparable { result_state } else { "not_encoded" };
+	let mut states = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		states.insert(format!("recall_at_{k}"), state.to_string());
+		states.insert(format!("precision_at_{k}"), state.to_string());
+		states.insert(format!("success_at_{k}"), state.to_string());
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		states.insert(metric.to_string(), state.to_string());
+	}
+
+	states
+}
+
+fn quantitative_metric_names() -> Vec<String> {
+	let mut metrics = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metrics.push(format!("recall_at_{k}"));
+		metrics.push(format!("precision_at_{k}"));
+		metrics.push(format!("success_at_{k}"));
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		metrics.push(metric.to_string());
+	}
+
+	metrics
+}
+
+fn per_query_denominators(
+	candidate_count: usize,
+	expected_relevant_count: usize,
+) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
+		denominators.insert(format!("precision_at_{k}"), *k);
+		denominators.insert(format!("success_at_{k}"), 1);
+	}
+
+	denominators.insert("mrr".to_string(), expected_relevant_count);
+	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
+	denominators.insert("average_precision".to_string(), expected_relevant_count);
+	denominators.insert("candidate_count".to_string(), candidate_count);
+
+	denominators
+}
+
+fn aggregate_denominators_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		denominators.insert(
+			format!("recall_at_{k}"),
+			sum_per_query_denominator(rows, &format!("recall_at_{k}")),
+		);
+		denominators.insert(
+			format!("precision_at_{k}"),
+			sum_per_query_denominator(rows, &format!("precision_at_{k}")),
+		);
+		denominators.insert(
+			format!("success_at_{k}"),
+			sum_per_query_denominator(rows, &format!("success_at_{k}")),
+		);
+	}
+
+	denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr"));
+	denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5"));
+	denominators.insert(
+		"average_precision".to_string(),
+		sum_per_query_denominator(rows, "average_precision"),
+	);
+
+	denominators
+}
+
+fn aggregate_confidence_intervals_impl(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, QuantitativeConfidenceInterval> {
+	let mut confidence_intervals = BTreeMap::new();
+
+	for metric in rate_metric_names() {
+		let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str());
+
+		if denominator > 0 {
+			confidence_intervals.insert(
+				metric,
+				wilson_confidence_interval(numerator.min(denominator), denominator),
+			);
+		}
+	}
+
+	confidence_intervals
+}
+
+fn rate_metric_names() -> Vec<String> {
+	let mut metrics = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metrics.push(format!("recall_at_{k}"));
+		metrics.push(format!("precision_at_{k}"));
+		metrics.push(format!("success_at_{k}"));
+	}
+
+	metrics
+}
+
+/// Pools one rate metric over `rows` into a `(numerator, denominator)` pair of counts.
+///
+/// A row contributes only when it has a value for `metric` and a non-zero denominator
+/// for it. The row's numerator share is recovered as `value * denominator`, rounded to
+/// the nearest integer, since only the value and denominator are read from each row.
+/// Returns `(0, 0)` when no row contributes.
+fn aggregate_rate_numerator_denominator(
+	rows: &[QuantitativePerQueryRow],
+	metric: &str,
+) -> (usize, usize) {
+	rows.iter()
+		.filter_map(|row| {
+			// Skip rows that have no value recorded for this metric.
+			let rate = row.metrics.get(metric).and_then(|rate| *rate)?;
+			// Skip rows whose denominator is missing or zero.
+			let trials = row.denominators.get(metric).copied().filter(|trials| *trials != 0)?;
+
+			// Turn the rate back into a hit count for this row.
+			Some(((rate * trials as f64).round() as usize, trials))
+		})
+		.fold((0, 0), |(hits, total), (row_hits, row_trials)| {
+			// Hits and trials are summed independently.
+			(hits + row_hits, total + row_trials)
+		})
+}
+
+/// Computes the 95% Wilson score interval for a pooled proportion, clamped to `[0, 1]`.
+fn wilson_confidence_interval(
+	numerator: usize,
+	denominator: usize,
+) -> QuantitativeConfidenceInterval {
+	let (trials, z_squared) = (denominator as f64, WILSON_95_Z * WILSON_95_Z);
+	let observed = numerator as f64 / trials;
+	let center = (observed + z_squared / (2.0 * trials)) / (1.0 + z_squared / trials);
+	let spread = observed * (1.0 - observed) / trials + z_squared / (4.0 * trials * trials);
+	let half_width = WILSON_95_Z * spread.sqrt() / (1.0 + z_squared / trials);
+
+	QuantitativeConfidenceInterval {
+		method: "wilson_score".to_string(),
+		confidence: 0.95,
+		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
+		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
+		numerator,
+		denominator,
+	}
+}
+
+fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
+	rows.iter().filter_map(|row| row.denominators.get(metric)).sum() // absent keys are skipped
+}
+
+/// Ids of the jobs that have non-empty relevance grades and whose query was attempted.
+fn ranking_query_ids_impl(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
+	let is_ranked = |job: &RealWorldJob| {
+		!ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)
+	};
+	source_jobs.iter().filter(|&job| is_ranked(job)).map(|job| job.job_id.as_str()).collect()
+}
+
+/// Maps evidence ids to grades: positive explicit judgments when any judgments exist,
+/// otherwise `1.0` for each required evidence item marked `cite`, `use` or `explain`.
+fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
+	let judgments = &source_job.expected_answer.relevance_judgments;
+
+	if judgments.is_empty() {
+		return source_job
+			.required_evidence
+			.iter()
+			.filter(|item| matches!(item.requirement.as_str(), "cite" | "use" | "explain"))
+			.map(|item| (item.evidence_id.clone(), 1.0))
+			.collect();
+	}
+
+	let positive = judgments.iter().filter(|judgment| judgment.grade > 0.0);
+
+	positive.map(|judgment| (judgment.evidence_id.clone(), judgment.grade)).collect()
+}
+
+/// A query counts as attempted when evidence was produced, or when the adapter answer has
+/// a `live_adapter.retrieve` trace stage together with a finite, positive latency.
+fn ranking_query_attempted(job: &RealWorldJob) -> bool {
+	!scoring::produced_evidence_order(job).is_empty()
+		|| job.corpus.adapter_response.as_ref().is_some_and(|response| {
+			let answer = &response.answer;
+			let retrieved = answer.trace_explainability.as_ref().is_some_and(|trace| {
+				trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
+			});
+
+			retrieved && answer.latency_ms.is_some_and(|ms| ms.is_finite() && ms > 0.0)
+		})
+}
+
+/// Labels where a job's qrels come from; explicit judgments take precedence over `empty`.
+fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
+	let has_judgments = !source_job.expected_answer.relevance_judgments.is_empty();
+	match (has_judgments, empty) {
+		(true, _) => "explicit_qrels",
+		(false, true) => "not_encoded",
+		(false, false) => "expected_evidence_fallback",
+	}
+}
+
+/// Summarizes the qrel provenance of a row from its ranked and explicit-qrel query counts.
+///
+/// Arms are checked in order, so a row without ranked queries is always `not_encoded`.
+fn aggregate_qrel_source_impl(
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+) -> &'static str {
+	match (ranking_query_count, explicit_qrel_query_count) {
+		(0, _) => "not_encoded",
+		(ranked, explicit) if explicit == ranked => "explicit_qrels",
+		(_, 0) => "expected_evidence_fallback",
+		_ => "mixed",
+	}
+}
+
+/// Classifies ranking coverage as `not_encoded`, `complete` or `partial_coverage`.
+fn ranking_coverage_state_impl(
+	summary: &ReportSummary,
+	source_job_count: usize,
+	ranking_query_count: usize,
+) -> &'static str {
+	match ranking_query_count {
+		0 => "not_encoded",
+		// Complete: every source job was ranked and `summary.not_encoded` is zero.
+		ranked if ranked == source_job_count && summary.not_encoded == 0 => "complete",
+		_ => "partial_coverage",
+	}
+}
+
+fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
+	relevance.values().filter(|grade| **grade > 0.0).count() // only grades above zero
+}
+
+fn rate(numerator: usize, denominator: usize) -> Option<f64> { // `None` if `denominator` is 0
+	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
index ed3844d4..111459e9 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
@@ -1,10 +1,9 @@
 use crate::{
 	BTreeSet, ExportQuantitativeProductManifestArgs, Path, QuantitativeBenchmarkRow,
 	QuantitativeProductManifest, REPORT_SCHEMA, RealWorldReport, Result, eyre, fs,
+	quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA},
 };
 
-use super::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA};
-
 pub(crate) fn quantitative_product_manifest_from_report(
 	report: &RealWorldReport,
 	args: &ExportQuantitativeProductManifestArgs,

From dee9e0cbc342d8f3e899fa2a7c151da1a05d2094 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:39:57 -0400
Subject: [PATCH 10/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative benchmark tests","authority":"manual"}

---
 .../real_world_job_benchmark/quantitative.rs  | 478 +-----------------
 .../quantitative/audit_manifest.rs            | 110 ++++
 .../quantitative/contracts.rs                 | 127 +++++
 .../quantitative/metrics.rs                   |  53 ++
 .../quantitative/product_manifest.rs          | 203 ++++++++
 5 files changed, 500 insertions(+), 471 deletions(-)
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs

diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
index 249c48e2..9bcc07c8 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs
@@ -1,391 +1,15 @@
-use std::{
-	env, fs,
-	path::Path,
-	process::{self, Command},
-};
+#[path = "quantitative/audit_manifest.rs"] mod audit_manifest;
+#[path = "quantitative/contracts.rs"] mod contracts;
+#[path = "quantitative/metrics.rs"] mod metrics;
+#[path = "quantitative/product_manifest.rs"] mod product_manifest;
 
-use color_eyre::{Result, eyre};
+use std::{path::Path, process::Command};
+
+use color_eyre::Result;
 use serde_json::Value;
 
 use crate::support;
 
-#[test]
-fn adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-
-	assert_eq!(
-		report.pointer("/quantitative_scoreboard/schema").and_then(Value::as_str),
-		Some("elf.agent_memory_quantitative_benchmark/v1")
-	);
-	assert_eq!(
-		report.pointer("/quantitative_scoreboard/generated_at").and_then(Value::as_str),
-		report.pointer("/generated_at").and_then(Value::as_str)
-	);
-	assert_eq!(
-		report.pointer("/quantitative_scoreboard/k_values").and_then(Value::as_array),
-		Some(&vec![Value::from(1), Value::from(3), Value::from(5), Value::from(10),])
-	);
-	assert_eq!(
-		report
-			.pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed")
-			.and_then(Value::as_bool),
-		Some(false)
-	);
-	assert_eq!(
-		report
-			.pointer("/quantitative_scoreboard/controls/current_query_count")
-			.and_then(Value::as_u64),
-		report.pointer("/summary/job_count").and_then(Value::as_u64)
-	);
-
-	assert_quantitative_row_contract(&report)?;
-	assert_quantitative_per_query_contract(&report)?;
-
-	Ok(())
-}
-
-#[test]
-fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> {
-	let source_path =
-		support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json");
-	let mut job = serde_json::from_str::<Value>(&fs::read_to_string(source_path)?)?;
-
-	support::set_json_pointer(
-		&mut job,
-		"/corpus/adapter_response/answer/evidence_ids",
-		serde_json::json!(["old-provider-note", "current-provider-report"]),
-	)?;
-
-	job.pointer_mut("/expected_answer")
-		.and_then(Value::as_object_mut)
-		.ok_or_else(|| eyre::eyre!("missing expected_answer object"))?
-		.insert(
-			"relevance_judgments".to_string(),
-			serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]),
-		);
-
-	let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id()));
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?;
-
-	let report = support::run_json_report_from(temp_dir)?;
-	let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?;
-	let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?;
-
-	assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels"));
-	assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1));
-	assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0));
-	assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0));
-	assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5));
-	assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5));
-	assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1));
-
-	let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?;
-	let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?;
-
-	assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels"));
-	assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5));
-	assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1));
-
-	Ok(())
-}
-
-#[test]
-fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir =
-		env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--product")
-		.arg("Synthetic Rival")
-		.arg("--adapter-id")
-		.arg("synthetic_rival")
-		.arg("--adapter-name")
-		.arg("Synthetic Rival adapter")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"product manifest export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let manifest = support::load_json(&manifest_path)?;
-
-	assert_eq!(
-		manifest.pointer("/schema").and_then(Value::as_str),
-		Some("elf.agent_memory_quantitative_product_manifest/v1")
-	);
-	assert_eq!(
-		manifest.pointer("/rows/0/product").and_then(Value::as_str),
-		Some("Synthetic Rival")
-	);
-	assert_eq!(
-		manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str),
-		Some("synthetic_rival")
-	);
-
-	let imported = run_report_with_quantitative_manifest(&manifest_path)?;
-	let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?;
-	let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?;
-
-	assert_eq!(rows.len(), 2);
-	assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival"));
-	assert!(!support::array_contains_str(
-		&imported,
-		"/quantitative_scoreboard/metrics_not_encoded",
-		"external_product_manifest_import"
-	)?);
-	assert!(
-		support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any(
-			|row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival")
-		)
-	);
-
-	Ok(())
-}
-
-#[test]
-fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir = env::temp_dir()
-		.join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("elf-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.output()?;
-
-	assert!(!output.status.success());
-	assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF"));
-
-	Ok(())
-}
-
-#[test]
-fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir = env::temp_dir()
-		.join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--product")
-		.arg("Synthetic Rival")
-		.arg("--adapter-id")
-		.arg("synthetic_rival")
-		.arg("--adapter-name")
-		.arg("Synthetic Rival adapter")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"product manifest export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let mut manifest = support::load_json(&manifest_path)?;
-
-	support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?;
-	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("run")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--quantitative-product-manifest")
-		.arg(&manifest_path)
-		.output()?;
-
-	assert!(!output.status.success());
-	assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus"));
-
-	Ok(())
-}
-
-#[test]
-fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir = env::temp_dir()
-		.join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("missing-per-query-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--product")
-		.arg("Synthetic Rival")
-		.arg("--adapter-id")
-		.arg("synthetic_rival")
-		.arg("--adapter-name")
-		.arg("Synthetic Rival adapter")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"product manifest export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let mut manifest = support::load_json(&manifest_path)?;
-
-	support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?;
-	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("run")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--quantitative-product-manifest")
-		.arg(&manifest_path)
-		.output()?;
-
-	assert!(!output.status.success());
-
-	let stderr = String::from_utf8_lossy(&output.stderr);
-
-	assert!(stderr.contains("ranked queries but only 0"));
-
-	Ok(())
-}
-
-#[test]
-fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> {
-	let temp_dir =
-		env::temp_dir().join(format!("elf-quantitative-audit-manifest-test-{}", process::id()));
-	let manifest_path = temp_dir.join("audit-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-audit-manifest")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--run-id")
-		.arg("audit-import-test")
-		.arg("--held-out")
-		.arg("--leakage-audited")
-		.arg("--control")
-		.arg("query_ids_locked_before_product_runtime")
-		.arg("--control")
-		.arg("product_runtime_did_not_receive_expected_answers_or_qrels")
-		.arg("--control")
-		.arg("ranked_candidates_emitted_by_product_runtime")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"quantitative audit export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let manifest = support::load_json(&manifest_path)?;
-
-	assert_eq!(
-		manifest.pointer("/schema").and_then(Value::as_str),
-		Some("elf.agent_memory_quantitative_audit_manifest/v1")
-	);
-	assert_eq!(manifest.pointer("/held_out").and_then(Value::as_bool), Some(true));
-	assert_eq!(manifest.pointer("/leakage_audited").and_then(Value::as_bool), Some(true));
-	assert_eq!(
-		support::array_at(&manifest, "/query_ids")?.len() as u64,
-		manifest.pointer("/ranking_query_count").and_then(Value::as_u64).unwrap_or_default()
-	);
-
-	let imported = run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?;
-	let row = support::array_at(&imported, "/quantitative_scoreboard/rows")?
-		.first()
-		.ok_or_else(|| eyre::eyre!("missing quantitative row"))?;
-
-	assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(true));
-	assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(true));
-	assert_eq!(
-		row.pointer("/audit_manifest_id").and_then(Value::as_str),
-		Some("audit-import-test-quantitative-audit-manifest")
-	);
-	assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false));
-
-	Ok(())
-}
-
-#[test]
-fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> {
-	let temp_dir =
-		env::temp_dir().join(format!("elf-quantitative-audit-manifest-run-test-{}", process::id()));
-	let manifest_path = temp_dir.join("audit-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-audit-manifest")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--run-id")
-		.arg("audit-import-test")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"quantitative audit export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("run")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--run-id")
-		.arg("different-run")
-		.arg("--quantitative-audit-manifest")
-		.arg(&manifest_path)
-		.output()?;
-
-	assert!(!output.status.success());
-	assert!(String::from_utf8_lossy(&output.stderr).contains("expected different-run"));
-
-	Ok(())
-}
-
 fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result<Value> {
 	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
 		.arg("run")
@@ -423,91 +47,3 @@ fn run_report_with_quantitative_audit(manifest_path: &Path, run_id: &str) -> Res
 
 	Ok(serde_json::from_slice(&output.stdout)?)
 }
-
-fn assert_quantitative_row_contract(report: &Value) -> Result<()> {
-	let rows = support::array_at(report, "/quantitative_scoreboard/rows")?;
-
-	assert_eq!(rows.len(), 1);
-
-	let row = &rows[0];
-
-	assert_eq!(row.pointer("/product").and_then(Value::as_str), Some("ELF"));
-	assert_eq!(row.pointer("/adapter_id").and_then(Value::as_str), Some("fixture_smoke"));
-	assert_eq!(row.pointer("/suite").and_then(Value::as_str), Some("adversarial_quality"));
-	assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed"));
-	assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("pass"));
-	assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(true));
-	assert_eq!(row.pointer("/metric_comparable").and_then(Value::as_bool), Some(true));
-	assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false));
-	assert_eq!(row.pointer("/fixture_regression_only").and_then(Value::as_bool), Some(true));
-	assert_eq!(row.pointer("/ranking_coverage_state").and_then(Value::as_str), Some("complete"));
-	assert_eq!(
-		row.pointer("/ranked_candidate_source").and_then(Value::as_str),
-		Some("produced_evidence_order")
-	);
-	assert_eq!(
-		row.pointer("/qrel_source").and_then(Value::as_str),
-		Some("expected_evidence_fallback")
-	);
-	assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0));
-
-	for metric in [
-		"recall_at_1",
-		"precision_at_1",
-		"success_at_1",
-		"recall_at_5",
-		"precision_at_5",
-		"success_at_5",
-		"mrr",
-		"ndcg_at_5",
-		"average_precision",
-	] {
-		assert!(row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64).is_some());
-		assert_eq!(
-			row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str),
-			Some("pass")
-		);
-		assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some());
-	}
-	for metric in ["recall_at_5", "precision_at_5", "success_at_5"] {
-		assert_eq!(
-			row.pointer(&format!("/confidence_intervals/{metric}/method")).and_then(Value::as_str),
-			Some("wilson_score")
-		);
-		assert_eq!(
-			row.pointer(&format!("/confidence_intervals/{metric}/confidence"))
-				.and_then(Value::as_f64),
-			Some(0.95)
-		);
-		assert!(
-			row.pointer(&format!("/confidence_intervals/{metric}/denominator"))
-				.and_then(Value::as_u64)
-				.is_some()
-		);
-	}
-
-	Ok(())
-}
-
-fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> {
-	let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?;
-	let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0);
-
-	assert_eq!(rows.len() as u64, job_count);
-
-	for row in rows {
-		assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed"));
-		assert_eq!(
-			row.pointer("/qrel_source").and_then(Value::as_str),
-			Some("expected_evidence_fallback")
-		);
-		assert!(row.pointer("/candidate_count").and_then(Value::as_u64).is_some());
-		assert!(row.pointer("/expected_relevant_count").and_then(Value::as_u64).is_some());
-		assert!(row.pointer("/metrics/recall_at_5").is_some());
-		assert!(row.pointer("/metrics/precision_at_5").is_some());
-		assert!(row.pointer("/metrics/ndcg_at_5").is_some());
-		assert!(row.pointer("/metrics/average_precision").is_some());
-	}
-
-	Ok(())
-}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs
new file mode 100644
index 00000000..5d8777cd
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs
@@ -0,0 +1,110 @@
+use std::{
+	env, fs,
+	process::{self, Command},
+};
+
+use color_eyre::{Result, eyre};
+use serde_json::Value;
+
+use crate::support;
+
+/// Exports an audit manifest carrying the held-out and leakage-audited flags plus three
+/// controls, then imports it under the same run id and checks the scoreboard row gates.
+#[test]
+fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> {
+	let dir_name = format!("elf-quantitative-audit-manifest-test-{}", process::id());
+	let temp_dir = env::temp_dir().join(dir_name);
+	let manifest_path = temp_dir.join("audit-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+
+	// Each `args` call passes a flag together with its value (or two bare switches).
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-audit-manifest")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--out")
+		.arg(&manifest_path)
+		.args(["--run-id", "audit-import-test"])
+		.args(["--held-out", "--leakage-audited"])
+		.args(["--control", "query_ids_locked_before_product_runtime"])
+		.args(["--control", "product_runtime_did_not_receive_expected_answers_or_qrels"])
+		.args(["--control", "ranked_candidates_emitted_by_product_runtime"])
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"quantitative audit export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let manifest = support::load_json(&manifest_path)?;
+	let manifest_flag = |pointer: &str| manifest.pointer(pointer).and_then(Value::as_bool);
+
+	assert_eq!(
+		manifest.pointer("/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_audit_manifest/v1")
+	);
+	assert_eq!(manifest_flag("/held_out"), Some(true));
+	assert_eq!(manifest_flag("/leakage_audited"), Some(true));
+	assert_eq!(
+		support::array_at(&manifest, "/query_ids")?.len() as u64,
+		manifest.pointer("/ranking_query_count").and_then(Value::as_u64).unwrap_or_default()
+	);
+
+	// Importing under the matching run id should surface the audit flags on the row.
+	let imported = super::run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?;
+	let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?;
+	let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?;
+	let row_flag = |pointer: &str| row.pointer(pointer).and_then(Value::as_bool);
+
+	assert_eq!(row_flag("/held_out"), Some(true));
+	assert_eq!(row_flag("/leakage_audited"), Some(true));
+	assert_eq!(
+		row.pointer("/audit_manifest_id").and_then(Value::as_str),
+		Some("audit-import-test-quantitative-audit-manifest")
+	);
+	assert_eq!(row_flag("/leaderboard_eligible"), Some(false));
+
+	Ok(())
+}
+
+/// A manifest exported for one run id must be refused by a run using another `--run-id`.
+#[test]
+fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> {
+	let dir_name = format!("elf-quantitative-audit-manifest-run-test-{}", process::id());
+	let temp_dir = env::temp_dir().join(dir_name);
+	let manifest_path = temp_dir.join("audit-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-audit-manifest")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--out")
+		.arg(&manifest_path)
+		.args(["--run-id", "audit-import-test"])
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"quantitative audit export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.args(["--run-id", "different-run"])
+		.arg("--quantitative-audit-manifest")
+		.arg(&manifest_path)
+		.output()?;
+	let stderr = String::from_utf8_lossy(&output.stderr);
+
+	assert!(!output.status.success());
+	assert!(stderr.contains("expected different-run"));
+
+	Ok(())
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs
new file mode 100644
index 00000000..fc158b77
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs
@@ -0,0 +1,127 @@
+use color_eyre::Result;
+use serde_json::Value;
+
+use crate::support;
+
+/// Checks the scoreboard envelope of the fixture-backed report (schema, timestamp, cutoffs,
+/// controls) before delegating to the row and per-query contract helpers.
+#[test]
+fn adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let count_at = |pointer: &str| report.pointer(pointer).and_then(Value::as_u64);
+	let claim_allowed = report
+		.pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed")
+		.and_then(Value::as_bool);
+
+	assert_eq!(
+		report.pointer("/quantitative_scoreboard/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_benchmark/v1")
+	);
+	assert_eq!(
+		report.pointer("/quantitative_scoreboard/generated_at").and_then(Value::as_str),
+		report.pointer("/generated_at").and_then(Value::as_str)
+	);
+	// The cutoffs must be exactly 1, 3, 5 and 10, in that order.
+	assert_eq!(
+		report.pointer("/quantitative_scoreboard/k_values"),
+		Some(&serde_json::json!([1, 3, 5, 10]))
+	);
+	assert_eq!(claim_allowed, Some(false));
+	assert_eq!(
+		count_at("/quantitative_scoreboard/controls/current_query_count"),
+		count_at("/summary/job_count")
+	);
+
+	assert_quantitative_row_contract(&report)?;
+	assert_quantitative_per_query_contract(&report)?;
+
+	Ok(())
+}
+
+/// Asserts that the report has exactly one scoreboard row and that it matches the
+/// fixture-backed ELF contract: identity and state labels, gate flags, and metric payloads.
+fn assert_quantitative_row_contract(report: &Value) -> Result<()> {
+	let rows = support::array_at(report, "/quantitative_scoreboard/rows")?;
+
+	assert_eq!(rows.len(), 1);
+
+	let row = &rows[0];
+
+	for (pointer, expected) in [
+		("/product", "ELF"),
+		("/adapter_id", "fixture_smoke"),
+		("/suite", "adversarial_quality"),
+		("/evidence_class", "fixture_backed"),
+		("/result_state", "pass"),
+		("/ranking_coverage_state", "complete"),
+		("/ranked_candidate_source", "produced_evidence_order"),
+		("/qrel_source", "expected_evidence_fallback"),
+	] {
+		assert_eq!(row.pointer(pointer).and_then(Value::as_str), Some(expected), "{pointer}");
+	}
+	for (pointer, expected) in [
+		("/comparable", true),
+		("/metric_comparable", true),
+		("/leaderboard_eligible", false),
+		("/fixture_regression_only", true),
+	] {
+		assert_eq!(row.pointer(pointer).and_then(Value::as_bool), Some(expected), "{pointer}");
+	}
+
+	assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0));
+
+	for metric in [
+		"recall_at_1",
+		"precision_at_1",
+		"success_at_1",
+		"recall_at_5",
+		"precision_at_5",
+		"success_at_5",
+		"mrr",
+		"ndcg_at_5",
+		"average_precision",
+	] {
+		let value = row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64);
+		let state = row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str);
+		let denominator = row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64);
+
+		assert!(value.is_some());
+		assert_eq!(state, Some("pass"));
+		assert!(denominator.is_some());
+	}
+	for metric in ["recall_at_5", "precision_at_5", "success_at_5"] {
+		let base = format!("/confidence_intervals/{metric}");
+		let method = row.pointer(&format!("{base}/method")).and_then(Value::as_str);
+		let confidence = row.pointer(&format!("{base}/confidence")).and_then(Value::as_f64);
+		let denominator = row.pointer(&format!("{base}/denominator")).and_then(Value::as_u64);
+
+		assert_eq!(method, Some("wilson_score"));
+		assert_eq!(confidence, Some(0.95));
+		assert!(denominator.is_some());
+	}
+
+	Ok(())
+}
+
+/// Asserts one per-query row per job, each fixture-backed with fallback qrels and metrics.
+fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> {
+	let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?;
+	let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0);
+
+	assert_eq!(rows.len() as u64, job_count);
+
+	for row in rows {
+		let evidence_class = row.pointer("/evidence_class").and_then(Value::as_str);
+		let qrel_source = row.pointer("/qrel_source").and_then(Value::as_str);
+		assert_eq!(evidence_class, Some("fixture_backed"));
+		assert_eq!(qrel_source, Some("expected_evidence_fallback"));
+		for count in ["/candidate_count", "/expected_relevant_count"] {
+			assert!(row.pointer(count).and_then(Value::as_u64).is_some(), "{count}");
+		}
+		for metric in ["recall_at_5", "precision_at_5", "ndcg_at_5", "average_precision"] {
+			assert!(row.pointer(&format!("/metrics/{metric}")).is_some(), "{metric}");
+		}
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs
new file mode 100644
index 00000000..3b9262a0
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs
@@ -0,0 +1,53 @@
+use std::{env, fs, process};
+
+use color_eyre::{Result, eyre};
+use serde_json::Value;
+
+use crate::support;
+
+#[test]
+fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> {
+	let source_path =
+		support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json");
+	let mut job = serde_json::from_str::<Value>(&fs::read_to_string(source_path)?)?;
+
+	support::set_json_pointer(
+		&mut job,
+		"/corpus/adapter_response/answer/evidence_ids",
+		serde_json::json!(["old-provider-note", "current-provider-report"]),
+	)?;
+
+	job.pointer_mut("/expected_answer")
+		.and_then(Value::as_object_mut)
+		.ok_or_else(|| eyre::eyre!("missing expected_answer object"))?
+		.insert(
+			"relevance_judgments".to_string(),
+			serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]),
+		);
+
+	let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id()));
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?;
+
+	let report = support::run_json_report_from(temp_dir)?;
+	let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?;
+	let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?;
+
+	assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels"));
+	assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1));
+	assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0));
+	assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0));
+	assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5));
+	assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5));
+	assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1));
+
+	let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?;
+	let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?;
+
+	assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels"));
+	assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5));
+	assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1));
+
+	Ok(())
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs
new file mode 100644
index 00000000..c7b543c5
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs
@@ -0,0 +1,203 @@
+use std::{
+	env, fs,
+	process::{self, Command},
+};
+
+use color_eyre::Result;
+use serde_json::Value;
+
+use crate::support;
+
+#[test]
+fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir =
+		env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--product")
+		.arg("Synthetic Rival")
+		.arg("--adapter-id")
+		.arg("synthetic_rival")
+		.arg("--adapter-name")
+		.arg("Synthetic Rival adapter")
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"product manifest export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let manifest = support::load_json(&manifest_path)?;
+
+	assert_eq!(
+		manifest.pointer("/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_product_manifest/v1")
+	);
+	assert_eq!(
+		manifest.pointer("/rows/0/product").and_then(Value::as_str),
+		Some("Synthetic Rival")
+	);
+	assert_eq!(
+		manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str),
+		Some("synthetic_rival")
+	);
+
+	let imported = super::run_report_with_quantitative_manifest(&manifest_path)?;
+	let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?;
+	let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?;
+
+	assert_eq!(rows.len(), 2);
+	assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival"));
+	assert!(!support::array_contains_str(
+		&imported,
+		"/quantitative_scoreboard/metrics_not_encoded",
+		"external_product_manifest_import"
+	)?);
+	assert!(
+		support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any(
+			|row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival")
+		)
+	);
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir = env::temp_dir()
+		.join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("elf-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.output()?;
+
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF"));
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir = env::temp_dir()
+		.join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--product")
+		.arg("Synthetic Rival")
+		.arg("--adapter-id")
+		.arg("synthetic_rival")
+		.arg("--adapter-name")
+		.arg("Synthetic Rival adapter")
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"product manifest export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let mut manifest = support::load_json(&manifest_path)?;
+
+	support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?;
+	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(&manifest_path)
+		.output()?;
+
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus"));
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> {
+	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
+	let temp_dir = env::temp_dir()
+		.join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id()));
+	let report_path = temp_dir.join("report.json");
+	let manifest_path = temp_dir.join("missing-per-query-product-manifest.json");
+
+	fs::create_dir_all(&temp_dir)?;
+	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+
+	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&report_path)
+		.arg("--out")
+		.arg(&manifest_path)
+		.arg("--product")
+		.arg("Synthetic Rival")
+		.arg("--adapter-id")
+		.arg("synthetic_rival")
+		.arg("--adapter-name")
+		.arg("Synthetic Rival adapter")
+		.output()?;
+
+	assert!(
+		export.status.success(),
+		"product manifest export failed: {}",
+		String::from_utf8_lossy(&export.stderr)
+	);
+
+	let mut manifest = support::load_json(&manifest_path)?;
+
+	support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?;
+	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
+
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(&manifest_path)
+		.output()?;
+
+	assert!(!output.status.success());
+
+	let stderr = String::from_utf8_lossy(&output.stderr);
+
+	assert!(stderr.contains("ranked queries but only 0"));
+
+	Ok(())
+}

From f65b0e28c357e58f676e6b4b21b40b4bd48440a8 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:44:49 -0400
Subject: [PATCH 11/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative metric submodules","authority":"manual"}

---
 .../quantitative/metrics.rs                   | 465 +-----------------
 .../quantitative/metrics/aggregate.rs         | 172 +++++++
 .../quantitative/metrics/per_query.rs         | 212 ++++++++
 .../quantitative/metrics/ranking.rs           |  83 ++++
 4 files changed, 483 insertions(+), 449 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
index e5377d7b..779329f6 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
@@ -1,8 +1,10 @@
+mod aggregate;
+mod per_query;
+mod ranking;
+
 use crate::{
 	BTreeMap, BTreeSet, JobReport, QuantitativeConfidenceInterval, QuantitativePerQueryRow,
-	RealWorldJob, ReportSummary, formatting,
-	quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY, WILSON_95_Z},
-	scoring,
+	RealWorldJob, ReportSummary,
 };
 
 pub(super) fn quantitative_per_query_rows(
@@ -12,53 +14,47 @@ pub(super) fn quantitative_per_query_rows(
 	evidence_class: &str,
 	adapter_id: &str,
 ) -> Vec<QuantitativePerQueryRow> {
-	source_jobs
-		.iter()
-		.zip(jobs.iter())
-		.map(|(source_job, job)| {
-			quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id)
-		})
-		.collect()
+	per_query::quantitative_per_query_rows(source_jobs, jobs, corpus_id, evidence_class, adapter_id)
 }
 
 pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
-	aggregate_metrics_impl(rows)
+	aggregate::aggregate_metrics(rows)
 }
 
 pub(super) fn aggregate_metric_states(
 	result_state: &str,
 	metric_comparable: bool,
 ) -> BTreeMap<String, String> {
-	aggregate_metric_states_impl(result_state, metric_comparable)
+	aggregate::aggregate_metric_states(result_state, metric_comparable)
 }
 
 pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
-	aggregate_denominators_impl(rows)
+	aggregate::aggregate_denominators(rows)
 }
 
 pub(super) fn aggregate_confidence_intervals(
 	rows: &[QuantitativePerQueryRow],
 ) -> BTreeMap<String, QuantitativeConfidenceInterval> {
-	aggregate_confidence_intervals_impl(rows)
+	aggregate::aggregate_confidence_intervals(rows)
 }
 
 pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
-	ranking_query_ids_impl(source_jobs)
+	ranking::ranking_query_ids(source_jobs)
 }
 
 pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	ranking_query_ids(source_jobs).len()
+	ranking::ranking_query_count(source_jobs)
 }
 
 pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
+	ranking::explicit_qrel_query_count(source_jobs)
 }
 
 pub(super) fn aggregate_qrel_source(
 	ranking_query_count: usize,
 	explicit_qrel_query_count: usize,
 ) -> &'static str {
-	aggregate_qrel_source_impl(ranking_query_count, explicit_qrel_query_count)
+	ranking::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
 }
 
 pub(super) fn ranking_coverage_state(
@@ -66,438 +62,9 @@ pub(super) fn ranking_coverage_state(
 	source_job_count: usize,
 	ranking_query_count: usize,
 ) -> &'static str {
-	ranking_coverage_state_impl(summary, source_job_count, ranking_query_count)
+	ranking::ranking_coverage_state(summary, source_job_count, ranking_query_count)
 }
 
 pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
-	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
-}
-
-fn quantitative_per_query_row(
-	source_job: &RealWorldJob,
-	job: &JobReport,
-	corpus_id: &str,
-	evidence_class: &str,
-	adapter_id: &str,
-) -> QuantitativePerQueryRow {
-	let relevance = relevance_grades(source_job, job);
-	let candidates = scoring::produced_evidence_order(source_job);
-	let positive_relevance_count = positive_qrel_count(&relevance);
-	let metrics = per_query_metrics(candidates.as_slice(), &relevance);
-	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
-		"not_encoded"
-	} else {
-		formatting::status_str(job.status)
-	};
-	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
-	let denominators = per_query_denominators(candidates.len(), positive_relevance_count);
-
-	QuantitativePerQueryRow {
-		job_id: job.job_id.clone(),
-		suite: job.suite_id.clone(),
-		evidence_class: evidence_class.to_string(),
-		source_manifest_corpus_id: Some(corpus_id.to_string()),
-		result_state: formatting::status_str(job.status).to_string(),
-		expected_relevant_count: positive_relevance_count,
-		candidate_count: candidates.len(),
-		qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(),
-		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
-		product: "ELF".to_string(),
-		adapter_id: adapter_id.to_string(),
-		metrics,
-		metric_states,
-		denominators,
-		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
-	}
-}
-
-fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<String, f64> {
-	let explicit = source_job
-		.expected_answer
-		.relevance_judgments
-		.iter()
-		.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
-		.collect::<BTreeMap<_, _>>();
-
-	if !explicit.is_empty() {
-		return explicit;
-	}
-
-	job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
-}
-
-fn per_query_metrics(
-	candidates: &[String],
-	relevance: &BTreeMap<String, f64>,
-) -> BTreeMap<String, Option<f64>> {
-	let mut metrics = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		let relevant_at_k = relevant_at_k(candidates, relevance, *k);
-
-		metrics
-			.insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance)));
-		metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k));
-		metrics.insert(
-			format!("success_at_{k}"),
-			Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)),
-		);
-	}
-
-	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
-	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
-	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
-
-	metrics
-}
-
-fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
-	candidates
-		.iter()
-		.take(k)
-		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
-		.count()
-}
-
-fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	Some(
-		candidates
-			.iter()
-			.position(|candidate| {
-				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
-			})
-			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
-	)
-}
-
-fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	let dcg = candidates
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, candidate)| {
-			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
-				/ ((index + 2) as f64).log2()
-		})
-		.sum::<f64>();
-	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
-
-	ideal.sort_by(|left, right| right.total_cmp(left));
-
-	let idcg = ideal
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
-		.sum::<f64>();
-
-	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
-}
-
-fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	let positive_count = positive_qrel_count(relevance);
-
-	if positive_count == 0 {
-		return None;
-	}
-
-	let mut hit_count = 0;
-	let mut precision_sum = 0.0;
-	let mut seen = BTreeSet::new();
-
-	for (index, candidate) in candidates.iter().enumerate() {
-		if !seen.insert(candidate.as_str()) {
-			continue;
-		}
-		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
-			hit_count += 1;
-			precision_sum += hit_count as f64 / (index + 1) as f64;
-		}
-	}
-
-	Some(precision_sum / positive_count as f64)
-}
-
-fn aggregate_metrics_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
-	let mut sums = BTreeMap::<String, (f64, usize)>::new();
-	let mut metrics = quantitative_metric_names()
-		.into_iter()
-		.map(|metric| (metric, None))
-		.collect::<BTreeMap<_, _>>();
-
-	for row in rows {
-		for (metric, value) in &row.metrics {
-			if let Some(value) = value {
-				let (sum, count) = sums.entry(metric.clone()).or_default();
-
-				*sum += *value;
-				*count += 1;
-			}
-		}
-	}
-	for (metric, (sum, count)) in sums {
-		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
-	}
-
-	metrics
-}
-
-fn aggregate_metric_states_impl(
-	result_state: &str,
-	metric_comparable: bool,
-) -> BTreeMap<String, String> {
-	let state = if metric_comparable { result_state } else { "not_encoded" };
-	let mut states = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		states.insert(format!("recall_at_{k}"), state.to_string());
-		states.insert(format!("precision_at_{k}"), state.to_string());
-		states.insert(format!("success_at_{k}"), state.to_string());
-	}
-	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
-		states.insert(metric.to_string(), state.to_string());
-	}
-
-	states
-}
-
-fn quantitative_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
-		metrics.push(metric.to_string());
-	}
-
-	metrics
-}
-
-fn per_query_denominators(
-	candidate_count: usize,
-	expected_relevant_count: usize,
-) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
-		denominators.insert(format!("precision_at_{k}"), *k);
-		denominators.insert(format!("success_at_{k}"), 1);
-	}
-
-	denominators.insert("mrr".to_string(), expected_relevant_count);
-	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
-	denominators.insert("average_precision".to_string(), expected_relevant_count);
-	denominators.insert("candidate_count".to_string(), candidate_count);
-
-	denominators
-}
-
-fn aggregate_denominators_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(
-			format!("recall_at_{k}"),
-			sum_per_query_denominator(rows, &format!("recall_at_{k}")),
-		);
-		denominators.insert(
-			format!("precision_at_{k}"),
-			sum_per_query_denominator(rows, &format!("precision_at_{k}")),
-		);
-		denominators.insert(
-			format!("success_at_{k}"),
-			sum_per_query_denominator(rows, &format!("success_at_{k}")),
-		);
-	}
-
-	denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr"));
-	denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5"));
-	denominators.insert(
-		"average_precision".to_string(),
-		sum_per_query_denominator(rows, "average_precision"),
-	);
-
-	denominators
-}
-
-fn aggregate_confidence_intervals_impl(
-	rows: &[QuantitativePerQueryRow],
-) -> BTreeMap<String, QuantitativeConfidenceInterval> {
-	let mut confidence_intervals = BTreeMap::new();
-
-	for metric in rate_metric_names() {
-		let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str());
-
-		if denominator > 0 {
-			confidence_intervals.insert(
-				metric,
-				wilson_confidence_interval(numerator.min(denominator), denominator),
-			);
-		}
-	}
-
-	confidence_intervals
-}
-
-fn rate_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-
-	metrics
-}
-
-fn aggregate_rate_numerator_denominator(
-	rows: &[QuantitativePerQueryRow],
-	metric: &str,
-) -> (usize, usize) {
-	let mut numerator = 0;
-	let mut denominator = 0;
-
-	for row in rows {
-		let Some(value) = row.metrics.get(metric).and_then(|value| *value) else {
-			continue;
-		};
-		let Some(row_denominator) = row.denominators.get(metric).copied() else {
-			continue;
-		};
-
-		if row_denominator == 0 {
-			continue;
-		}
-
-		denominator += row_denominator;
-		numerator += (value * row_denominator as f64).round() as usize;
-	}
-
-	(numerator, denominator)
-}
-
-fn wilson_confidence_interval(
-	numerator: usize,
-	denominator: usize,
-) -> QuantitativeConfidenceInterval {
-	let n = denominator as f64;
-	let p = numerator as f64 / n;
-	let z2 = WILSON_95_Z * WILSON_95_Z;
-	let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n);
-	let half_width =
-		WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n);
-
-	QuantitativeConfidenceInterval {
-		method: "wilson_score".to_string(),
-		confidence: 0.95,
-		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
-		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
-		numerator,
-		denominator,
-	}
-}
-
-fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
-	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
-}
-
-fn ranking_query_ids_impl(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
-	source_jobs
-		.iter()
-		.filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job))
-		.map(|job| job.job_id.as_str())
-		.collect()
-}
-
-fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
-	if !source_job.expected_answer.relevance_judgments.is_empty() {
-		return source_job
-			.expected_answer
-			.relevance_judgments
-			.iter()
-			.filter(|judgment| judgment.grade > 0.0)
-			.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
-			.collect();
-	}
-
-	source_job
-		.required_evidence
-		.iter()
-		.filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain"))
-		.map(|evidence| (evidence.evidence_id.clone(), 1.0))
-		.collect()
-}
-
-fn ranking_query_attempted(job: &RealWorldJob) -> bool {
-	if !scoring::produced_evidence_order(job).is_empty() {
-		return true;
-	}
-
-	let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else {
-		return false;
-	};
-
-	answer.trace_explainability.as_ref().is_some_and(|trace| {
-		trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
-	}) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0)
-}
-
-fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
-	if !source_job.expected_answer.relevance_judgments.is_empty() {
-		"explicit_qrels"
-	} else if empty {
-		"not_encoded"
-	} else {
-		"expected_evidence_fallback"
-	}
-}
-
-fn aggregate_qrel_source_impl(
-	ranking_query_count: usize,
-	explicit_qrel_query_count: usize,
-) -> &'static str {
-	if ranking_query_count == 0 {
-		"not_encoded"
-	} else if explicit_qrel_query_count == ranking_query_count {
-		"explicit_qrels"
-	} else if explicit_qrel_query_count == 0 {
-		"expected_evidence_fallback"
-	} else {
-		"mixed"
-	}
-}
-
-fn ranking_coverage_state_impl(
-	summary: &ReportSummary,
-	source_job_count: usize,
-	ranking_query_count: usize,
-) -> &'static str {
-	if ranking_query_count == 0 {
-		"not_encoded"
-	} else if ranking_query_count == source_job_count && summary.not_encoded == 0 {
-		"complete"
-	} else {
-		"partial_coverage"
-	}
-}
-
-fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
-	relevance.values().filter(|grade| **grade > 0.0).count()
-}
-
-fn rate(numerator: usize, denominator: usize) -> Option<f64> {
-	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
+	ranking::ranked_candidate_source(ranking_query_count)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
new file mode 100644
index 00000000..cb2dd63d
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
@@ -0,0 +1,172 @@
+use crate::{
+	BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting,
+	quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z},
+};
+
+pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
+	let mut sums = BTreeMap::<String, (f64, usize)>::new();
+	let mut metrics = quantitative_metric_names()
+		.into_iter()
+		.map(|metric| (metric, None))
+		.collect::<BTreeMap<_, _>>();
+
+	for row in rows {
+		for (metric, value) in &row.metrics {
+			if let Some(value) = value {
+				let (sum, count) = sums.entry(metric.clone()).or_default();
+
+				*sum += *value;
+				*count += 1;
+			}
+		}
+	}
+	for (metric, (sum, count)) in sums {
+		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
+	}
+
+	metrics
+}
+
+pub(super) fn aggregate_metric_states(
+	result_state: &str,
+	metric_comparable: bool,
+) -> BTreeMap<String, String> {
+	let state = if metric_comparable { result_state } else { "not_encoded" };
+	let mut states = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		states.insert(format!("recall_at_{k}"), state.to_string());
+		states.insert(format!("precision_at_{k}"), state.to_string());
+		states.insert(format!("success_at_{k}"), state.to_string());
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		states.insert(metric.to_string(), state.to_string());
+	}
+
+	states
+}
+
+pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		denominators.insert(
+			format!("recall_at_{k}"),
+			sum_per_query_denominator(rows, &format!("recall_at_{k}")),
+		);
+		denominators.insert(
+			format!("precision_at_{k}"),
+			sum_per_query_denominator(rows, &format!("precision_at_{k}")),
+		);
+		denominators.insert(
+			format!("success_at_{k}"),
+			sum_per_query_denominator(rows, &format!("success_at_{k}")),
+		);
+	}
+
+	denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr"));
+	denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5"));
+	denominators.insert(
+		"average_precision".to_string(),
+		sum_per_query_denominator(rows, "average_precision"),
+	);
+
+	denominators
+}
+
+pub(super) fn aggregate_confidence_intervals(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, QuantitativeConfidenceInterval> {
+	let mut confidence_intervals = BTreeMap::new();
+
+	for metric in rate_metric_names() {
+		let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str());
+
+		if denominator > 0 {
+			confidence_intervals.insert(
+				metric,
+				wilson_confidence_interval(numerator.min(denominator), denominator),
+			);
+		}
+	}
+
+	confidence_intervals
+}
+
+fn quantitative_metric_names() -> Vec<String> {
+	let mut metrics = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metrics.push(format!("recall_at_{k}"));
+		metrics.push(format!("precision_at_{k}"));
+		metrics.push(format!("success_at_{k}"));
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		metrics.push(metric.to_string());
+	}
+
+	metrics
+}
+
+fn rate_metric_names() -> Vec<String> {
+	let mut metrics = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metrics.push(format!("recall_at_{k}"));
+		metrics.push(format!("precision_at_{k}"));
+		metrics.push(format!("success_at_{k}"));
+	}
+
+	metrics
+}
+
+fn aggregate_rate_numerator_denominator(
+	rows: &[QuantitativePerQueryRow],
+	metric: &str,
+) -> (usize, usize) {
+	let mut numerator = 0;
+	let mut denominator = 0;
+
+	for row in rows {
+		let Some(value) = row.metrics.get(metric).and_then(|value| *value) else {
+			continue;
+		};
+		let Some(row_denominator) = row.denominators.get(metric).copied() else {
+			continue;
+		};
+
+		if row_denominator == 0 {
+			continue;
+		}
+
+		denominator += row_denominator;
+		numerator += (value * row_denominator as f64).round() as usize;
+	}
+
+	(numerator, denominator)
+}
+
+fn wilson_confidence_interval(
+	numerator: usize,
+	denominator: usize,
+) -> QuantitativeConfidenceInterval {
+	let n = denominator as f64;
+	let p = numerator as f64 / n;
+	let z2 = WILSON_95_Z * WILSON_95_Z;
+	let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n);
+	let half_width =
+		WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n);
+
+	QuantitativeConfidenceInterval {
+		method: "wilson_score".to_string(),
+		confidence: 0.95,
+		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
+		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
+		numerator,
+		denominator,
+	}
+}
+
+fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
+	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
new file mode 100644
index 00000000..db9e932c
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
@@ -0,0 +1,212 @@
+use crate::{
+	BTreeMap, BTreeSet, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
+	quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY},
+	scoring,
+};
+
+/// Builds one per-query row for each source job paired by position with its job report.
+/// Pairing stops at the shorter slice, so unmatched trailing entries produce no row.
+pub(super) fn quantitative_per_query_rows(
+	source_jobs: &[RealWorldJob],
+	jobs: &[JobReport],
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> Vec<QuantitativePerQueryRow> {
+	let mut rows = Vec::with_capacity(source_jobs.len().min(jobs.len()));
+	for (source, report) in source_jobs.iter().zip(jobs) {
+		rows.push(quantitative_per_query_row(source, report, corpus_id, evidence_class, adapter_id));
+	}
+	rows
+}
+
+/// Assembles the ELF per-query row for one job: relevance grades, ranked candidates,
+/// metric values, a shared per-metric state, and the denominators behind each metric.
+fn quantitative_per_query_row(
+	source_job: &RealWorldJob,
+	job: &JobReport,
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> QuantitativePerQueryRow {
+	let relevance = relevance_grades(source_job, job);
+	let candidates = scoring::produced_evidence_order(source_job);
+	let expected_relevant_count = positive_qrel_count(&relevance);
+	let candidate_count = candidates.len();
+	let result_state = formatting::status_str(job.status);
+	// With no positive qrels or no candidates, every metric is marked `not_encoded`.
+	let rankable = expected_relevant_count > 0 && candidate_count > 0;
+	let metric_state = if rankable { result_state } else { "not_encoded" };
+	let metrics = per_query_metrics(candidates.as_slice(), &relevance);
+	let metric_states = metrics.keys().map(|name| (name.clone(), metric_state.to_string())).collect();
+	QuantitativePerQueryRow {
+		job_id: job.job_id.clone(),
+		suite: job.suite_id.clone(),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.to_string()),
+		result_state: result_state.to_string(),
+		expected_relevant_count,
+		candidate_count,
+		qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(),
+		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
+		product: "ELF".to_string(),
+		adapter_id: adapter_id.to_string(),
+		denominators: per_query_denominators(candidate_count, expected_relevant_count),
+		metrics,
+		metric_states,
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	}
+}
+
+/// Maps evidence ids to relevance grades. Explicit relevance judgments on the source job win;
+/// if they yield no entry, every expected-evidence id on the job report is graded 1.0.
+fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<String, f64> {
+	let mut grades = BTreeMap::new();
+	for judgment in source_job.expected_answer.relevance_judgments.iter() {
+		grades.insert(judgment.evidence_id.clone(), judgment.grade);
+	}
+	if grades.is_empty() {
+		// No explicit qrels: fall back to binary relevance from the expected evidence.
+		grades.extend(job.expected_evidence.iter().map(|item| (item.evidence_id.clone(), 1.0)));
+	}
+
+	grades
+}
+
+/// Computes every per-query metric for a ranked candidate list against graded relevance.
+/// Emits recall, precision, and success at each configured `k`, plus `mrr`, `ndcg_at_5`,
+/// and `average_precision`; a `None` value marks a metric that could not be computed.
+fn per_query_metrics(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> BTreeMap<String, Option<f64>> {
+	let positives = positive_qrel_count(relevance);
+	let mut metrics: BTreeMap<String, Option<f64>> = QUANTITATIVE_K_VALUES
+		.iter()
+		.flat_map(|k| {
+			let hits = relevant_at_k(candidates, relevance, *k);
+			[
+				(format!("recall_at_{k}"), rate(hits, positives)),
+				(format!("precision_at_{k}"), rate(hits, *k)),
+				(format!("success_at_{k}"), Some(f64::from(hits > 0 && positives > 0))),
+			]
+		})
+		.collect();
+	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
+	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
+	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
+	metrics
+}
+
+/// Counts distinct positively graded candidates in the first `k` ranks; repeats count once.
+fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
+	let graded = |id: &&String| relevance.get(id.as_str()).is_some_and(|grade| *grade > 0.0);
+
+	// A set keeps the hit count from exceeding the qrel count when the ranking repeats an id.
+	candidates.iter().take(k).filter(graded).collect::<BTreeSet<_>>().len()
+}
+
+/// Reciprocal rank of the first positively graded candidate. Returns `None` when no positive
+/// qrel exists and `Some(0.0)` when positives exist but none appears among the candidates.
+fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+	for (index, candidate) in candidates.iter().enumerate() {
+		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
+			return Some(1.0 / (index + 1) as f64);
+		}
+	}
+
+	Some(0.0)
+}
+
+/// Normalized discounted cumulative gain over the first `k` candidates, using grades as gains.
+///
+/// Missing or negative grades contribute zero gain, and the ideal ranking orders the positive
+/// grades from highest to lowest. Returns `None` when there is no positive qrel.
+fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	// Zero-based rank `index` is discounted by log2(index + 2).
+	let discount = |index: usize| ((index + 2) as f64).log2();
+	let gain_of = |id: &String| relevance.get(id.as_str()).map_or(0.0, |&grade| grade.max(0.0));
+	let ranked = candidates.iter().take(k).enumerate();
+	let dcg = ranked.map(|(index, id)| gain_of(id) / discount(index)).sum::<f64>();
+
+	// The ideal DCG ranks the positive grades best-first.
+	let mut ideal_gains: Vec<f64> = relevance.values().copied().filter(|g| *g > 0.0).collect();
+	ideal_gains.sort_by(|left, right| right.total_cmp(left));
+	let ideal = ideal_gains.iter().take(k).enumerate();
+	let idcg = ideal.map(|(index, gain)| gain / discount(index)).sum::<f64>();
+
+	if idcg > 0.0 {
+		Some(dcg / idcg)
+	} else {
+		Some(0.0)
+	}
+}
+
+/// Average precision over the full candidate ranking, normalized by the positive qrel count.
+/// Repeated candidate ids keep their rank position but only the first occurrence can score.
+/// Returns `None` when there is no positive qrel.
+fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	let positive_count = positive_qrel_count(relevance);
+	if positive_count == 0 {
+		return None;
+	}
+
+	let mut visited = BTreeSet::new();
+	let (mut hits, mut precision_total) = (0, 0.0);
+	// Ranks are one-based, so precision at a hit is hits / rank.
+	for (rank, candidate) in (1_usize..).zip(candidates) {
+		let id = candidate.as_str();
+		let first_occurrence = visited.insert(id);
+		if first_occurrence && relevance.get(id).is_some_and(|grade| *grade > 0.0) {
+			hits += 1;
+			precision_total += hits as f64 / rank as f64;
+		}
+	}
+
+	Some(precision_total / positive_count as f64)
+}
+
+/// Records the denominator behind every per-query metric, plus the raw candidate count:
+/// positive qrels for recall/`mrr`/`average_precision` (capped at 5 for `ndcg_at_5`), `k`, or 1.
+fn per_query_denominators(
+	candidate_count: usize,
+	expected_relevant_count: usize,
+) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::from([
+		("mrr".to_string(), expected_relevant_count),
+		("ndcg_at_5".to_string(), expected_relevant_count.min(5)),
+		("average_precision".to_string(), expected_relevant_count),
+		("candidate_count".to_string(), candidate_count),
+	]);
+	for k in QUANTITATIVE_K_VALUES.iter().copied() {
+		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
+		denominators.insert(format!("precision_at_{k}"), k);
+		denominators.insert(format!("success_at_{k}"), 1);
+	}
+	denominators
+}
+
+/// Names the qrel source: explicit judgments, else `not_encoded` if `empty`, else the fallback.
+fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
+	let has_explicit = !source_job.expected_answer.relevance_judgments.is_empty();
+	match (has_explicit, empty) {
+		(true, _) => "explicit_qrels",
+		(false, true) => "not_encoded",
+		(false, false) => "expected_evidence_fallback",
+	}
+}
+/// Counts the relevance entries whose grade is strictly positive.
+fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
+	relevance.iter().filter(|(_, grade)| **grade > 0.0).count()
+}
+/// Returns `numerator / denominator` passed through `round3`, or `None` for a zero denominator.
+fn rate(numerator: usize, denominator: usize) -> Option<f64> {
+	match denominator { 0 => None, d => Some(formatting::round3(numerator as f64 / d as f64)) }
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
new file mode 100644
index 00000000..918a8613
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
@@ -0,0 +1,83 @@
+use crate::{BTreeMap, BTreeSet, RealWorldJob, ReportSummary, scoring};
+
+/// Collects the ids of jobs that have ranking relevance grades and an attempted ranking query.
+pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
+	let rankable = |job: &&RealWorldJob| {
+		!ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)
+	};
+	source_jobs.iter().filter(rankable).map(|job| job.job_id.as_str()).collect()
+}
+/// Counts the jobs selected by `ranking_query_ids`.
+pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	ranking_query_ids(source_jobs).len()
+}
+/// Counts source jobs that carry at least one explicit relevance judgment.
+pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
+}
+
+/// Summarizes the qrel source across ranked queries. Returns `not_encoded` when nothing was
+/// ranked, `explicit_qrels` when the explicit-qrel count equals the ranked count,
+/// `expected_evidence_fallback` when no query has explicit qrels, and `mixed` otherwise.
+pub(super) fn aggregate_qrel_source(
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+) -> &'static str {
+	match (ranking_query_count, explicit_qrel_query_count) {
+		(0, _) => "not_encoded",
+		(ranked, explicit) if ranked == explicit => "explicit_qrels",
+		(_, 0) => "expected_evidence_fallback",
+		_ => "mixed",
+	}
+}
+
+/// Classifies ranking coverage: `not_encoded` with no ranked queries, `complete` when every
+/// source job is ranked and the summary reports no `not_encoded` jobs, else `partial_coverage`.
+pub(super) fn ranking_coverage_state(
+	summary: &ReportSummary,
+	source_job_count: usize,
+	ranking_query_count: usize,
+) -> &'static str {
+	if ranking_query_count == 0 {
+		return "not_encoded";
+	}
+	let fully_ranked = ranking_query_count == source_job_count && summary.not_encoded == 0;
+	if fully_ranked { "complete" } else { "partial_coverage" }
+}
+/// Names the ranked-candidate source; `not_encoded` when no query was ranked.
+pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
+	match ranking_query_count { 0 => "not_encoded", _ => "produced_evidence_order" }
+}
+
+/// Grades that make a job rankable: positive explicit judgments when any judgment exists,
+/// otherwise required evidence marked `cite`, `use`, or `explain`, each graded 1.0.
+fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
+	let judgments = &source_job.expected_answer.relevance_judgments;
+	let mut grades = BTreeMap::new();
+	if judgments.is_empty() {
+		for evidence in source_job.required_evidence.iter() {
+			if matches!(evidence.requirement.as_str(), "cite" | "use" | "explain") {
+				grades.insert(evidence.evidence_id.clone(), 1.0);
+			}
+		}
+	} else {
+		for judgment in judgments.iter().filter(|judgment| judgment.grade > 0.0) {
+			grades.insert(judgment.evidence_id.clone(), judgment.grade);
+		}
+	}
+	grades
+}
+
+/// A ranking query counts as attempted when evidence order was produced, or when the adapter
+/// answer has a `live_adapter.retrieve` trace stage and a finite, positive latency.
+fn ranking_query_attempted(job: &RealWorldJob) -> bool {
+	if !scoring::produced_evidence_order(job).is_empty() {
+		return true;
+	}
+	let Some(response) = job.corpus.adapter_response.as_ref() else { return false };
+	let answer = &response.answer;
+	let retrieved = answer.trace_explainability.as_ref().is_some_and(|trace| {
+		trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
+	});
+	retrieved && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0)
+}

From ce6f82c58b8561f2d1b6bbb8ee24ffec2ec9df83 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:51:28 -0400
Subject: [PATCH 12/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit artifacts","authority":"manual"}

---
 .../quantitative/audit_manifest.rs            | 158 +-----------------
 .../quantitative/audit_manifest/artifacts.rs  | 151 +++++++++++++++++
 2 files changed, 156 insertions(+), 153 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
index be8b9e50..e927bbac 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
@@ -1,7 +1,7 @@
-use std::env;
+mod artifacts;
 
 use crate::{
-	BTreeSet, ExportQuantitativeAuditManifestArgs, Path, PathBuf, QuantitativeAuditArtifact,
+	BTreeSet, ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact,
 	QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs,
 	quantitative::{
 		QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
@@ -58,8 +58,8 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 		controls: args.controls.clone(),
 		artifacts: vec![QuantitativeAuditArtifact {
 			role: "product_runtime_fixtures".to_string(),
-			path: audit_artifact_display_path(args.fixtures.as_path()),
-			sha256: fixture_path_digest(args.fixtures.as_path())?,
+			path: artifacts::audit_artifact_display_path(args.fixtures.as_path()),
+			sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?,
 		}],
 		claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| {
 			if args.held_out || args.leakage_audited {
@@ -189,7 +189,7 @@ fn validate_quantitative_audit_manifest(
 	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
 	validate_quantitative_audit_controls(manifest, path)?;
 
-	validate_quantitative_audit_artifacts(manifest, path)
+	artifacts::validate_quantitative_audit_artifacts(manifest, path)
 }
 
 fn validate_quantitative_audit_query_ids(
@@ -252,151 +252,3 @@ fn validate_quantitative_audit_controls(
 
 	Ok(())
 }
-
-fn validate_quantitative_audit_artifacts(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-) -> Result<()> {
-	if manifest.artifacts.is_empty() {
-		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
-	}
-
-	for artifact in &manifest.artifacts {
-		if artifact.role.trim().is_empty()
-			|| artifact.path.trim().is_empty()
-			|| artifact.sha256.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative audit artifact.",
-				path.display()
-			));
-		}
-		if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit())
-		{
-			return Err(eyre::eyre!(
-				"{} artifact {} has invalid sha256 digest {}.",
-				path.display(),
-				artifact.role,
-				artifact.sha256
-			));
-		}
-
-		let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
-		let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| {
-			eyre::eyre!(
-				"{} artifact {} could not be digested at {}: {err}",
-				path.display(),
-				artifact.role,
-				artifact_path.display()
-			)
-		})?;
-
-		if actual != artifact.sha256 {
-			return Err(eyre::eyre!(
-				"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
-				path.display(),
-				artifact.role,
-				artifact_path.display(),
-				artifact.sha256,
-				actual
-			));
-		}
-	}
-
-	Ok(())
-}
-
-fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf {
-	let raw = PathBuf::from(artifact_path);
-
-	if raw.is_absolute() {
-		return raw;
-	}
-
-	let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone());
-
-	if cwd_path.exists() {
-		return cwd_path;
-	}
-
-	manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path)
-}
-
-fn fixture_path_digest(path: &Path) -> Result<String> {
-	let mut hasher = blake3::Hasher::new();
-
-	if path.is_file() {
-		hash_fixture_file(
-			path,
-			path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"),
-			&mut hasher,
-		)?;
-
-		return Ok(hasher.finalize().to_hex().to_string());
-	}
-
-	let paths = audit_fixture_paths(path)?;
-
-	for fixture in paths {
-		let relative = fixture
-			.strip_prefix(path)
-			.map(|relative| relative.to_string_lossy().replace('\\', "/"))
-			.unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/"));
-
-		hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?;
-	}
-
-	Ok(hasher.finalize().to_hex().to_string())
-}
-
-fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
-	let mut paths = Vec::new();
-
-	collect_audit_fixture_paths(path, &mut paths)?;
-
-	paths.sort();
-
-	Ok(paths)
-}
-
-fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
-	if path.is_file() {
-		paths.push(path.to_path_buf());
-
-		return Ok(());
-	}
-
-	for entry in fs::read_dir(path)? {
-		let entry_path = entry?.path();
-
-		if entry_path.is_dir() {
-			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
-		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
-			paths.push(entry_path);
-		}
-	}
-
-	Ok(())
-}
-
-fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
-	hasher.update(logical_path.as_bytes());
-	hasher.update(b"\0");
-	hasher.update(&fs::read(path)?);
-	hasher.update(b"\0");
-
-	Ok(())
-}
-
-fn audit_artifact_display_path(path: &Path) -> String {
-	let display_path = if path.is_absolute() {
-		env::current_dir()
-			.ok()
-			.and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf))
-			.unwrap_or_else(|| path.to_path_buf())
-	} else {
-		path.to_path_buf()
-	};
-
-	display_path.to_string_lossy().replace('\\', "/")
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
new file mode 100644
index 00000000..9e033400
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
@@ -0,0 +1,151 @@
+use std::env;
+
+use crate::{Path, PathBuf, QuantitativeAuditManifest, Result, eyre, fs};
+
+/// Checks each manifest artifact for a non-empty role, path, and 64-hex-character digest that
+/// matches the digest recomputed by `fixture_path_digest` (BLAKE3, despite the `sha256` name).
+/// Returns an error for an empty artifact list or the first artifact that fails a check.
+pub(super) fn validate_quantitative_audit_artifacts(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> {
+	if manifest.artifacts.is_empty() {
+		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
+	}
+	let is_blank = |field: &str| field.trim().is_empty();
+	for artifact in &manifest.artifacts {
+		if is_blank(&artifact.role) || is_blank(&artifact.path) || is_blank(&artifact.sha256) {
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative audit artifact.",
+				path.display()
+			));
+		}
+		let digest = &artifact.sha256;
+		if digest.len() != 64 || !digest.bytes().all(|byte| byte.is_ascii_hexdigit()) {
+			return Err(eyre::eyre!(
+				"{} artifact {} has invalid sha256 digest {}.",
+				path.display(),
+				artifact.role,
+				artifact.sha256
+			));
+		}
+
+		let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
+		let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| {
+			eyre::eyre!(
+				"{} artifact {} could not be digested at {}: {err}",
+				path.display(),
+				artifact.role,
+				artifact_path.display()
+			)
+		})?;
+		// `to_hex` output is lowercase, while the format check above also admits uppercase hex.
+		if !actual.eq_ignore_ascii_case(digest) {
+			return Err(eyre::eyre!(
+				"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
+				path.display(),
+				artifact.role,
+				artifact_path.display(),
+				artifact.sha256,
+				actual
+			));
+		}
+	}
+
+	Ok(())
+}
+
+/// Computes the BLAKE3 hex digest of one fixture file or of all fixtures under a directory.
+///
+/// A single file is hashed under its file name (`fixture` if the name is missing or not
+/// UTF-8). Otherwise the paths from `audit_fixture_paths` are hashed in sorted order, each
+/// under its path relative to `path` with backslashes replaced by `/`.
+///
+/// # Errors
+/// Propagates errors from listing the directory or reading a fixture file.
+pub(super) fn fixture_path_digest(path: &Path) -> Result<String> {
+	let mut hasher = blake3::Hasher::new();
+
+	if path.is_file() {
+		let name = path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture");
+		hash_fixture_file(path, name, &mut hasher)?;
+	} else {
+		for fixture in audit_fixture_paths(path)? {
+			let logical = match fixture.strip_prefix(path) {
+				Ok(relative) => relative.to_string_lossy().replace('\\', "/"),
+				Err(_) => fixture.to_string_lossy().replace('\\', "/"),
+			};
+			hash_fixture_file(fixture.as_path(), logical.as_str(), &mut hasher)?;
+		}
+	}
+
+	Ok(hasher.finalize().to_hex().to_string())
+}
+
+/// Renders an artifact path for the manifest with `/` separators. An absolute path under
+/// the current working directory is shown relative to it; other paths are shown unchanged.
+pub(super) fn audit_artifact_display_path(path: &Path) -> String {
+	let relative_to_cwd = || {
+		let cwd = env::current_dir().ok()?;
+		path.strip_prefix(cwd).ok().map(Path::to_path_buf)
+	};
+	let shown = path.is_absolute().then(relative_to_cwd).flatten();
+	let shown = shown.unwrap_or_else(|| path.to_path_buf());
+
+	shown.to_string_lossy().replace('\\', "/")
+}
+
+/// Resolves a manifest artifact path. Absolute paths are returned unchanged; a relative path
+/// is joined to the current directory and, if nothing exists there, to the manifest's parent
+/// directory instead (the current-directory candidate is kept when there is no parent).
+fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf {
+	let raw = Path::new(artifact_path);
+	if raw.is_absolute() {
+		return raw.to_path_buf();
+	}
+
+	let candidate = env::current_dir().map_or_else(|_| raw.to_path_buf(), |cwd| cwd.join(raw));
+	match manifest_path.parent() {
+		Some(parent) if !candidate.exists() => parent.join(raw),
+		_ => candidate,
+	}
+}
+
+/// Lists every fixture file collected from `path`, sorted by path.
+fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
+	let mut paths = Vec::new();
+
+	collect_audit_fixture_paths(path, &mut paths)?;
+	// Sort so the result does not depend on the order in which entries were listed.
+	paths.sort();
+	Ok(paths)
+}
+
+/// Recursively appends fixture files found under `path` to `paths`.
+/// A `path` that is itself a file is appended whatever its extension.
+fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
+	if path.is_file() {
+		paths.push(path.to_path_buf());
+		return Ok(());
+	}
+
+	for entry in fs::read_dir(path)? {
+		let entry_path = entry?.path();
+		// Descend into subdirectories; among other entries keep only `.json` files.
+		if entry_path.is_dir() {
+			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
+		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
+			paths.push(entry_path);
+		}
+	}
+	Ok(())
+}
+
+/// Feeds `logical_path` and then the file's bytes into `hasher`, each followed by a NUL byte.
+fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
+	hasher.update(logical_path.as_bytes());
+	hasher.update(b"\0");
+	hasher.update(&fs::read(path)?);
+	hasher.update(b"\0");
+	Ok(())
+}

From 06ec4c1ba920482488a6f65e1b730dfa0059c221 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 12:55:12 -0400
Subject: [PATCH 13/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product validation","authority":"manual"}

---
 .../quantitative/product_manifest.rs          | 168 ++----------------
 .../product_manifest/validation.rs            | 157 ++++++++++++++++
 2 files changed, 167 insertions(+), 158 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
index 111459e9..ad9a2dee 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
@@ -1,7 +1,8 @@
+mod validation;
+
 use crate::{
-	BTreeSet, ExportQuantitativeProductManifestArgs, Path, QuantitativeBenchmarkRow,
-	QuantitativeProductManifest, REPORT_SCHEMA, RealWorldReport, Result, eyre, fs,
-	quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA},
+	ExportQuantitativeProductManifestArgs, Path, QuantitativeProductManifest, REPORT_SCHEMA,
+	RealWorldReport, Result, eyre, fs, quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA,
 };
 
 pub(crate) fn quantitative_product_manifest_from_report(
@@ -82,7 +83,11 @@ pub(crate) fn quantitative_product_manifest_from_report(
 		per_query_rows,
 	};
 
-	validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?;
+	validation::validate_quantitative_product_manifest(
+		&manifest,
+		&args.report,
+		manifest.corpus_id.as_str(),
+	)?;
 
 	Ok(manifest)
 }
@@ -107,160 +112,7 @@ pub(super) fn quantitative_product_manifest(
 		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
 	}
 
-	validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
+	validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
 
 	Ok(manifest)
 }
-
-fn validate_quantitative_product_manifest(
-	manifest: &QuantitativeProductManifest,
-	path: &Path,
-	corpus_id: &str,
-) -> Result<()> {
-	if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.",
-			path.display(),
-			manifest.schema
-		));
-	}
-	if manifest.manifest_id.trim().is_empty() {
-		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
-	}
-	if manifest.corpus_id != corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected same-corpus {}.",
-			path.display(),
-			manifest.corpus_id,
-			corpus_id
-		));
-	}
-	if manifest.rows.is_empty() {
-		return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display()));
-	}
-
-	let row_keys = manifest
-		.rows
-		.iter()
-		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
-		.collect::<BTreeSet<_>>();
-
-	for row in &manifest.rows {
-		if row.product == "ELF" {
-			return Err(eyre::eyre!(
-				"{} quantitative product manifest must not inject ELF self rows.",
-				path.display()
-			));
-		}
-		if row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.adapter_name.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative product row.",
-				path.display()
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-		if row.leaderboard_eligible {
-			validate_leaderboard_eligible_product_row(path, row)?;
-		}
-	}
-	for row in &manifest.per_query_rows {
-		if row.job_id.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-			|| row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.qrel_source.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative per-query product row.",
-				path.display()
-			));
-		}
-		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} has no matching product row.",
-				path.display(),
-				row.product,
-				row.adapter_id
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-	}
-	for row in &manifest.rows {
-		if row.ranking_query_count == 0 {
-			continue;
-		}
-
-		let per_query_count = manifest
-			.per_query_rows
-			.iter()
-			.filter(|per_query| {
-				per_query.product == row.product && per_query.adapter_id == row.adapter_id
-			})
-			.count();
-
-		if per_query_count < row.ranking_query_count {
-			return Err(eyre::eyre!(
-				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				row.ranking_query_count,
-				per_query_count
-			));
-		}
-	}
-
-	Ok(())
-}
-
-fn validate_leaderboard_eligible_product_row(
-	path: &Path,
-	row: &QuantitativeBenchmarkRow,
-) -> Result<()> {
-	let has_audit_manifest_id = row
-		.audit_manifest_id
-		.as_deref()
-		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
-
-	if row.evidence_class != "live_real_world"
-		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
-		|| row.ranking_query_count != row.sample_size
-		|| row.explicit_qrel_query_count != row.ranking_query_count
-		|| !row.held_out
-		|| !row.leakage_audited
-		|| !has_audit_manifest_id
-	{
-		return Err(eyre::eyre!(
-			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
-			path.display(),
-			row.product,
-			row.adapter_id
-		));
-	}
-
-	Ok(())
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs
new file mode 100644
index 00000000..0ae5bf33
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs
@@ -0,0 +1,157 @@
+use crate::{
+	BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre,
+	quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA},
+};
+
+/// Validates a quantitative product manifest against `corpus_id`: schema, manifest id, corpus
+/// match, complete non-ELF aggregate rows, and per-query rows that back each ranked row.
+pub(super) fn validate_quantitative_product_manifest(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> {
+	if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() {
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.corpus_id != corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected same-corpus {}.",
+			path.display(),
+			manifest.corpus_id,
+			corpus_id
+		));
+	}
+	if manifest.rows.is_empty() {
+		return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display()));
+	}
+	// Product/adapter pairs declared by aggregate rows; per-query rows must reference one.
+	let row_keys = manifest
+		.rows
+		.iter()
+		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
+		.collect::<BTreeSet<_>>();
+	for row in &manifest.rows {
+		if row.product == "ELF" {
+			return Err(eyre::eyre!(
+				"{} quantitative product manifest must not inject ELF self rows.",
+				path.display()
+			));
+		}
+		if row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.adapter_name.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative product row.",
+				path.display()
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+		if row.leaderboard_eligible {
+			validate_leaderboard_eligible_product_row(path, row)?;
+		}
+	}
+	for row in &manifest.per_query_rows {
+		if row.job_id.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+			|| row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.qrel_source.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative per-query product row.",
+				path.display()
+			));
+		}
+		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} has no matching product row.",
+				path.display(),
+				row.product,
+				row.adapter_id
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+	}
+	for row in &manifest.rows {
+		if row.ranking_query_count == 0 {
+			continue;
+		}
+		let per_query_count = manifest
+			.per_query_rows
+			.iter()
+			.filter(|per_query| {
+				per_query.product == row.product && per_query.adapter_id == row.adapter_id
+			})
+			.count();
+		// A row needs at least as many matching per-query rows as its declared ranked queries.
+		if per_query_count < row.ranking_query_count {
+			return Err(eyre::eyre!(
+				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				row.ranking_query_count,
+				per_query_count
+			));
+		}
+	}
+
+	Ok(())
+}
+
+/// Rejects a row flagged `leaderboard_eligible` unless it is `live_real_world` evidence with
+/// enough queries, full ranking and explicit-qrel coverage, held-out and leakage-audited
+/// flags, and a non-blank audit manifest id.
+fn validate_leaderboard_eligible_product_row(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+) -> Result<()> {
+	let audit_manifest_named =
+		row.audit_manifest_id.as_deref().is_some_and(|id| !id.trim().is_empty());
+	let eligible = row.evidence_class == "live_real_world"
+		&& row.sample_size >= MIN_LEADERBOARD_QUERY_COUNT
+		&& row.ranking_query_count == row.sample_size
+		&& row.explicit_qrel_query_count == row.ranking_query_count
+		&& row.held_out
+		&& row.leakage_audited
+		&& audit_manifest_named;
+	if eligible {
+		return Ok(());
+	}
+
+	Err(eyre::eyre!(
+		"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
+		path.display(),
+		row.product,
+		row.adapter_id
+	))
+}

From c869f8bc33a58c5f99738d24b39588b73d33be03 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:00:33 -0400
Subject: [PATCH 14/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit validation","authority":"manual"}

---
 .../quantitative/audit_manifest.rs            | 145 +-----------------
 .../quantitative/audit_manifest/validation.rs | 142 +++++++++++++++++
 2 files changed, 147 insertions(+), 140 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
index e927bbac..d3e696a9 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
@@ -1,12 +1,10 @@
 mod artifacts;
+mod validation;
 
 use crate::{
-	BTreeSet, ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact,
+	ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact,
 	QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs,
-	quantitative::{
-		QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
-		REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, metrics,
-	},
+	quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, metrics},
 };
 
 pub(super) struct QuantitativeAuditContext<'a> {
@@ -78,7 +76,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 		}),
 	};
 
-	validate_quantitative_audit_manifest(
+	validation::validate_quantitative_audit_manifest(
 		&manifest,
 		args.fixtures.as_path(),
 		QuantitativeAuditContext {
@@ -111,7 +109,7 @@ pub(super) fn quantitative_audit_evidence(
 		eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display())
 	})?;
 
-	validate_quantitative_audit_manifest(&manifest, path, context)?;
+	validation::validate_quantitative_audit_manifest(&manifest, path, context)?;
 
 	Ok(QuantitativeAuditEvidence {
 		held_out: manifest.held_out,
@@ -119,136 +117,3 @@ pub(super) fn quantitative_audit_evidence(
 		audit_manifest_id: Some(manifest.manifest_id),
 	})
 }
-
-fn validate_quantitative_audit_manifest(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-	context: QuantitativeAuditContext<'_>,
-) -> Result<()> {
-	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
-			path.display(),
-			manifest.schema
-		));
-	}
-	if manifest.manifest_id.trim().is_empty() {
-		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
-	}
-	if manifest.run_id != context.run_id {
-		return Err(eyre::eyre!(
-			"{} has run_id {}, expected {}.",
-			path.display(),
-			manifest.run_id,
-			context.run_id
-		));
-	}
-	if manifest.corpus_id != context.corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected {}.",
-			path.display(),
-			manifest.corpus_id,
-			context.corpus_id
-		));
-	}
-	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
-		return Err(eyre::eyre!(
-			"{} has product {}:{} but current row is {}:{}.",
-			path.display(),
-			manifest.product,
-			manifest.adapter_id,
-			context.product,
-			context.adapter_id
-		));
-	}
-	if manifest.sample_size != context.source_jobs.len() {
-		return Err(eyre::eyre!(
-			"{} has sample_size {}, expected {}.",
-			path.display(),
-			manifest.sample_size,
-			context.source_jobs.len()
-		));
-	}
-	if manifest.ranking_query_count != context.ranking_query_count {
-		return Err(eyre::eyre!(
-			"{} has ranking_query_count {}, expected {}.",
-			path.display(),
-			manifest.ranking_query_count,
-			context.ranking_query_count
-		));
-	}
-	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
-		return Err(eyre::eyre!(
-			"{} has explicit_qrel_query_count {}, expected {}.",
-			path.display(),
-			manifest.explicit_qrel_query_count,
-			context.explicit_qrel_query_count
-		));
-	}
-
-	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
-	validate_quantitative_audit_controls(manifest, path)?;
-
-	artifacts::validate_quantitative_audit_artifacts(manifest, path)
-}
-
-fn validate_quantitative_audit_query_ids(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-	source_jobs: &[RealWorldJob],
-) -> Result<()> {
-	let expected = metrics::ranking_query_ids(source_jobs);
-	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
-
-	if actual.len() != manifest.query_ids.len() {
-		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display()));
-	}
-	if actual != expected {
-		let missing = expected.difference(&actual).copied().collect::<Vec<_>>();
-		let extra = actual.difference(&expected).copied().collect::<Vec<_>>();
-
-		return Err(eyre::eyre!(
-			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
-			path.display(),
-			missing,
-			extra
-		));
-	}
-
-	Ok(())
-}
-
-fn validate_quantitative_audit_controls(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-) -> Result<()> {
-	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>();
-
-	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) {
-		return Err(eyre::eyre!(
-			"{} marks held_out=true without required control {}.",
-			path.display(),
-			REQUIRED_HELD_OUT_AUDIT_CONTROL
-		));
-	}
-	if manifest.leakage_audited
-		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
-			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
-	{
-		return Err(eyre::eyre!(
-			"{} marks leakage_audited=true without required controls {} and {}.",
-			path.display(),
-			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
-			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
-		));
-	}
-	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty()
-	{
-		return Err(eyre::eyre!(
-			"{} marks audit controls true but has an empty claim_boundary.",
-			path.display()
-		));
-	}
-
-	Ok(())
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs
new file mode 100644
index 00000000..5aab2c4f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs
@@ -0,0 +1,142 @@
+use crate::{
+	BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre,
+	quantitative::{
+		QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
+		REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+		audit_manifest::{QuantitativeAuditContext, artifacts},
+		metrics,
+	},
+};
+
+pub(super) fn validate_quantitative_audit_manifest(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: QuantitativeAuditContext<'_>,
+) -> Result<()> { // Returns the first mismatch between `manifest` and `context` as an error; `path` only labels messages.
+	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() { // whitespace-only ids are rejected as empty
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.run_id != context.run_id {
+		return Err(eyre::eyre!(
+			"{} has run_id {}, expected {}.",
+			path.display(),
+			manifest.run_id,
+			context.run_id
+		));
+	}
+	if manifest.corpus_id != context.corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected {}.",
+			path.display(),
+			manifest.corpus_id,
+			context.corpus_id
+		));
+	}
+	if manifest.product != context.product || manifest.adapter_id != context.adapter_id { // product and adapter_id must both match
+		return Err(eyre::eyre!(
+			"{} has product {}:{} but current row is {}:{}.",
+			path.display(),
+			manifest.product,
+			manifest.adapter_id,
+			context.product,
+			context.adapter_id
+		));
+	}
+	if manifest.sample_size != context.source_jobs.len() { // expected sample size is the number of source jobs
+		return Err(eyre::eyre!(
+			"{} has sample_size {}, expected {}.",
+			path.display(),
+			manifest.sample_size,
+			context.source_jobs.len()
+		));
+	}
+	if manifest.ranking_query_count != context.ranking_query_count {
+		return Err(eyre::eyre!(
+			"{} has ranking_query_count {}, expected {}.",
+			path.display(),
+			manifest.ranking_query_count,
+			context.ranking_query_count
+		));
+	}
+	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
+		return Err(eyre::eyre!(
+			"{} has explicit_qrel_query_count {}, expected {}.",
+			path.display(),
+			manifest.explicit_qrel_query_count,
+			context.explicit_qrel_query_count
+		));
+	}
+
+	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; // scalar fields passed; now compare the query-id set
+	validate_quantitative_audit_controls(manifest, path)?; // then the held-out / leakage control claims
+
+	artifacts::validate_quantitative_audit_artifacts(manifest, path) // tail expression: the artifact check's Result is returned as-is
+}
+
+fn validate_quantitative_audit_query_ids(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	source_jobs: &[RealWorldJob],
+) -> Result<()> { // Errors if manifest.query_ids repeats an id or differs from metrics::ranking_query_ids(source_jobs).
+	let expected = metrics::ranking_query_ids(source_jobs);
+	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>(); // set form collapses repeated ids
+
+	if actual.len() != manifest.query_ids.len() { // fewer set entries than list entries means an id was repeated
+		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display()));
+	}
+	if actual != expected {
+		let missing = expected.difference(&actual).copied().collect::<Vec<_>>(); // expected ids absent from the manifest
+		let extra = actual.difference(&expected).copied().collect::<Vec<_>>(); // manifest ids that are not expected
+
+		return Err(eyre::eyre!(
+			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
+			path.display(),
+			missing,
+			extra
+		));
+	}
+
+	Ok(())
+}
+
+fn validate_quantitative_audit_controls(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> { // Errors when held_out / leakage_audited are claimed without the backing controls or a claim_boundary.
+	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>(); // borrowed set for membership tests
+
+	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { // control is only demanded when held_out is claimed
+		return Err(eyre::eyre!(
+			"{} marks held_out=true without required control {}.",
+			path.display(),
+			REQUIRED_HELD_OUT_AUDIT_CONTROL
+		));
+	}
+	if manifest.leakage_audited
+		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
+			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
+	{ // a leakage_audited claim fails if either of the two leakage controls is absent
+		return Err(eyre::eyre!(
+			"{} marks leakage_audited=true without required controls {} and {}.",
+			path.display(),
+			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
+		));
+	}
+	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty()
+	{ // either claim requires a claim_boundary that is not blank after trimming
+		return Err(eyre::eyre!(
+			"{} marks audit controls true but has an empty claim_boundary.",
+			path.display()
+		));
+	}
+
+	Ok(())
+}

From 660172501cca286a38c30b750502563bc23de2a3 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:09:56 -0400
Subject: [PATCH 15/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report assembly","authority":"manual"}

---
 .../real_world_job_benchmark/quantitative.rs  | 141 +----------------
 .../quantitative/report.rs                    | 142 ++++++++++++++++++
 2 files changed, 146 insertions(+), 137 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
index 16365e66..4032c770 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs
@@ -1,17 +1,16 @@
 mod audit_manifest;
 mod metrics;
 mod product_manifest;
+mod report;
 
 pub(super) use self::{
 	audit_manifest::quantitative_audit_manifest_from_jobs,
 	product_manifest::quantitative_product_manifest_from_report,
+	report::{QuantitativeReportInput, quantitative_scoreboard_report},
 };
 
-use self::audit_manifest::{QuantitativeAuditContext, QuantitativeAuditEvidence};
-use crate::{
-	AdapterReport, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls,
-	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result,
-};
+use self::audit_manifest::QuantitativeAuditEvidence;
+use crate::{AdapterReport, BTreeSet, JobReport, RealWorldJob, ReportSummary};
 
 const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1";
 const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str =
@@ -30,138 +29,6 @@ const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!(
 	"Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims."
 );
 
-pub(super) struct QuantitativeReportInput<'a> {
-	pub(super) run_id: &'a str,
-	pub(super) generated_at: &'a str,
-	pub(super) adapter: &'a AdapterReport,
-	pub(super) source_jobs: &'a [RealWorldJob],
-	pub(super) jobs: &'a [JobReport],
-	pub(super) summary: &'a ReportSummary,
-	pub(super) product_manifest_path: Option<&'a Path>,
-	pub(super) audit_manifest_path: Option<&'a Path>,
-}
-
-pub(super) fn quantitative_scoreboard_report(
-	input: QuantitativeReportInput<'_>,
-) -> Result<QuantitativeBenchmarkReport> {
-	let corpus_id = quantitative_corpus_id(input.source_jobs);
-	let evidence_class = quantitative_evidence_class(input.adapter, input.jobs);
-	let per_query_rows = metrics::quantitative_per_query_rows(
-		input.source_jobs,
-		input.jobs,
-		corpus_id.as_str(),
-		evidence_class,
-		input.adapter.adapter_id.as_str(),
-	);
-	let ranking_query_count = per_query_rows
-		.iter()
-		.filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0)
-		.count();
-	let explicit_qrel_query_count =
-		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
-	let metric_comparable = ranking_query_count > 0;
-	let result_state = quantitative_result_state(input.summary);
-	let audit_evidence = audit_manifest::quantitative_audit_evidence(
-		input.audit_manifest_path,
-		QuantitativeAuditContext {
-			run_id: input.run_id,
-			corpus_id: corpus_id.as_str(),
-			product: "ELF",
-			adapter_id: input.adapter.adapter_id.as_str(),
-			source_jobs: input.source_jobs,
-			ranking_query_count,
-			explicit_qrel_query_count,
-		},
-	)?;
-	let leaderboard_eligible = quantitative_row_leaderboard_eligible(
-		evidence_class,
-		input.source_jobs.len(),
-		ranking_query_count,
-		explicit_qrel_query_count,
-		metric_comparable,
-		&audit_evidence,
-	);
-	let row = QuantitativeBenchmarkRow {
-		product: "ELF".to_string(),
-		adapter_id: input.adapter.adapter_id.clone(),
-		adapter_name: input.adapter.name.clone(),
-		suite: quantitative_suite_id(input.jobs),
-		evidence_class: evidence_class.to_string(),
-		source_manifest_corpus_id: Some(corpus_id.clone()),
-		result_state: result_state.to_string(),
-		comparable: metric_comparable,
-		metric_comparable,
-		leaderboard_eligible,
-		held_out: audit_evidence.held_out,
-		leakage_audited: audit_evidence.leakage_audited,
-		audit_manifest_id: audit_evidence.audit_manifest_id,
-		fixture_regression_only: evidence_class == "fixture_backed",
-		sample_size: input.jobs.len(),
-		ranking_query_count,
-		ranking_coverage_state: metrics::ranking_coverage_state(
-			input.summary,
-			input.source_jobs.len(),
-			ranking_query_count,
-		)
-		.to_string(),
-		ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(),
-		qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
-			.to_string(),
-		explicit_qrel_query_count,
-		metrics: metrics::aggregate_metrics(per_query_rows.as_slice()),
-		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
-		denominators: metrics::aggregate_denominators(per_query_rows.as_slice()),
-		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()),
-		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
-	};
-	let product_manifest = product_manifest::quantitative_product_manifest(
-		input.product_manifest_path,
-		corpus_id.as_str(),
-	)?;
-	let imported_row_count = product_manifest.rows.len();
-	let imported_per_query_count = product_manifest.per_query_rows.len();
-	let mut rows = vec![row];
-	let mut merged_per_query_rows = per_query_rows;
-
-	rows.extend(product_manifest.rows);
-	merged_per_query_rows.extend(product_manifest.per_query_rows);
-
-	let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2;
-	let controls = QuantitativeBenchmarkControls {
-		same_corpus_required: true,
-		same_task_required: true,
-		ranked_candidates_required_for_ranking_metrics: true,
-		explicit_relevance_judgments_required_for_leaderboard: true,
-		minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT,
-		current_query_count: input.source_jobs.len(),
-		current_ranking_query_count: ranking_query_count,
-		current_explicit_qrel_query_count: explicit_qrel_query_count,
-		leaderboard_claim_allowed,
-		leakage_control:
-			"held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims"
-				.to_string(),
-	};
-
-	Ok(QuantitativeBenchmarkReport {
-		schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(),
-		generated_at: input.generated_at.to_string(),
-		corpus_id,
-		k_values: QUANTITATIVE_K_VALUES.to_vec(),
-		rows,
-		per_query_rows: merged_per_query_rows,
-		metrics_not_encoded: quantitative_metrics_not_encoded(
-			imported_row_count,
-			imported_per_query_count,
-		),
-		controls,
-		claim_boundary: concat!(
-			"Do not convert fixture mechanics, missing explicit qrels, ",
-			"or partial candidate coverage into product leaderboard claims."
-		)
-		.to_string(),
-	})
-}
-
 fn quantitative_metrics_not_encoded(
 	imported_row_count: usize,
 	imported_per_query_count: usize,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
new file mode 100644
index 00000000..bb3ab895
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
@@ -0,0 +1,142 @@
+use crate::{
+	AdapterReport, JobReport, Path, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport,
+	QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result,
+	quantitative::{
+		self, MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY,
+		QUANTITATIVE_SCOREBOARD_SCHEMA,
+		audit_manifest::{self, QuantitativeAuditContext},
+		metrics, product_manifest,
+	},
+};
+
+pub(crate) struct QuantitativeReportInput<'a> { // Borrowed inputs consumed by quantitative_scoreboard_report.
+	pub(crate) run_id: &'a str, // forwarded into the audit context
+	pub(crate) generated_at: &'a str, // copied into the report's generated_at field
+	pub(crate) adapter: &'a AdapterReport, // supplies adapter_id and name for the locally built row
+	pub(crate) source_jobs: &'a [RealWorldJob], // its length is used as the current query count
+	pub(crate) jobs: &'a [JobReport], // its length is used as the local row's sample_size
+	pub(crate) summary: &'a ReportSummary, // feeds the result state and ranking coverage state
+	pub(crate) product_manifest_path: Option<&'a Path>, // optional; handed to product_manifest::quantitative_product_manifest
+	pub(crate) audit_manifest_path: Option<&'a Path>, // optional; handed to audit_manifest::quantitative_audit_evidence
+}
+
+pub(crate) fn quantitative_scoreboard_report(
+	input: QuantitativeReportInput<'_>,
+) -> Result<QuantitativeBenchmarkReport> { // Builds the local "ELF" row from this run, then appends rows imported from the product manifest.
+	let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs);
+	let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs);
+	let per_query_rows = metrics::quantitative_per_query_rows(
+		input.source_jobs,
+		input.jobs,
+		corpus_id.as_str(),
+		evidence_class,
+		input.adapter.adapter_id.as_str(),
+	);
+	let ranking_query_count = per_query_rows
+		.iter()
+		.filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) // ranked = has candidates and has expected relevant items
+		.count();
+	let explicit_qrel_query_count =
+		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
+	let metric_comparable = ranking_query_count > 0; // one ranked query is enough to mark the row comparable
+	let result_state = quantitative::quantitative_result_state(input.summary);
+	let audit_evidence = audit_manifest::quantitative_audit_evidence(
+		input.audit_manifest_path,
+		QuantitativeAuditContext {
+			run_id: input.run_id,
+			corpus_id: corpus_id.as_str(),
+			product: "ELF",
+			adapter_id: input.adapter.adapter_id.as_str(),
+			source_jobs: input.source_jobs,
+			ranking_query_count,
+			explicit_qrel_query_count,
+		},
+	)?; // any error from loading the audit evidence aborts report assembly
+	let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible(
+		evidence_class,
+		input.source_jobs.len(),
+		ranking_query_count,
+		explicit_qrel_query_count,
+		metric_comparable,
+		&audit_evidence,
+	);
+	let row = QuantitativeBenchmarkRow {
+		product: "ELF".to_string(),
+		adapter_id: input.adapter.adapter_id.clone(),
+		adapter_name: input.adapter.name.clone(),
+		suite: quantitative::quantitative_suite_id(input.jobs),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.clone()),
+		result_state: result_state.to_string(),
+		comparable: metric_comparable, // carries the same value as metric_comparable below
+		metric_comparable,
+		leaderboard_eligible,
+		held_out: audit_evidence.held_out,
+		leakage_audited: audit_evidence.leakage_audited,
+		audit_manifest_id: audit_evidence.audit_manifest_id,
+		fixture_regression_only: evidence_class == "fixture_backed",
+		sample_size: input.jobs.len(), // NOTE(review): jobs.len() here, but eligibility and coverage use source_jobs.len() — confirm the two always agree
+		ranking_query_count,
+		ranking_coverage_state: metrics::ranking_coverage_state(
+			input.summary,
+			input.source_jobs.len(),
+			ranking_query_count,
+		)
+		.to_string(),
+		ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(),
+		qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
+			.to_string(),
+		explicit_qrel_query_count,
+		metrics: metrics::aggregate_metrics(per_query_rows.as_slice()),
+		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
+		denominators: metrics::aggregate_denominators(per_query_rows.as_slice()),
+		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()),
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	};
+	let product_manifest = product_manifest::quantitative_product_manifest(
+		input.product_manifest_path,
+		corpus_id.as_str(),
+	)?;
+	let imported_row_count = product_manifest.rows.len(); // counts captured before the manifest's vectors are moved below
+	let imported_per_query_count = product_manifest.per_query_rows.len();
+	let mut rows = vec![row]; // the local row always comes first
+	let mut merged_per_query_rows = per_query_rows;
+
+	rows.extend(product_manifest.rows);
+	merged_per_query_rows.extend(product_manifest.per_query_rows);
+
+	let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; // needs at least two eligible rows
+	let controls = QuantitativeBenchmarkControls {
+		same_corpus_required: true,
+		same_task_required: true,
+		ranked_candidates_required_for_ranking_metrics: true,
+		explicit_relevance_judgments_required_for_leaderboard: true,
+		minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT,
+		current_query_count: input.source_jobs.len(), // the current_* fields describe the local run only, not imported rows
+		current_ranking_query_count: ranking_query_count,
+		current_explicit_qrel_query_count: explicit_qrel_query_count,
+		leaderboard_claim_allowed,
+		leakage_control:
+			"held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims"
+				.to_string(),
+	};
+
+	Ok(QuantitativeBenchmarkReport {
+		schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(),
+		generated_at: input.generated_at.to_string(),
+		corpus_id,
+		k_values: QUANTITATIVE_K_VALUES.to_vec(),
+		rows,
+		per_query_rows: merged_per_query_rows,
+		metrics_not_encoded: quantitative::quantitative_metrics_not_encoded(
+			imported_row_count,
+			imported_per_query_count,
+		),
+		controls,
+		claim_boundary: concat!(
+			"Do not convert fixture mechanics, missing explicit qrels, ",
+			"or partial candidate coverage into product leaderboard claims."
+		)
+		.to_string(),
+	})
+}

From 6261914c54be6a7d8ba07a72e4455faa43f1b9af Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:14:46 -0400
Subject: [PATCH 16/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query metrics","authority":"manual"}

---
 .../quantitative/metrics/per_query.rs         | 145 ++----------------
 .../metrics/per_query/query_metrics.rs        | 129 ++++++++++++++++
 2 files changed, 139 insertions(+), 135 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
index db9e932c..fbbce5db 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
@@ -1,7 +1,8 @@
+mod query_metrics;
+
 use crate::{
-	BTreeMap, BTreeSet, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
-	quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY},
-	scoring,
+	BTreeMap, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
+	quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, scoring,
 };
 
 pub(super) fn quantitative_per_query_rows(
@@ -29,15 +30,14 @@ fn quantitative_per_query_row(
 ) -> QuantitativePerQueryRow {
 	let relevance = relevance_grades(source_job, job);
 	let candidates = scoring::produced_evidence_order(source_job);
-	let positive_relevance_count = positive_qrel_count(&relevance);
-	let metrics = per_query_metrics(candidates.as_slice(), &relevance);
+	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
+	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
 	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
 		"not_encoded"
 	} else {
 		formatting::status_str(job.status)
 	};
 	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
-	let denominators = per_query_denominators(candidates.len(), positive_relevance_count);
 
 	QuantitativePerQueryRow {
 		job_id: job.job_id.clone(),
@@ -53,7 +53,10 @@ fn quantitative_per_query_row(
 		adapter_id: adapter_id.to_string(),
 		metrics,
 		metric_states,
-		denominators,
+		denominators: query_metrics::per_query_denominators(
+			candidates.len(),
+			positive_relevance_count,
+		),
 		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
 	}
 }
@@ -73,126 +76,6 @@ fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<Stri
 	job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
 }
 
-fn per_query_metrics(
-	candidates: &[String],
-	relevance: &BTreeMap<String, f64>,
-) -> BTreeMap<String, Option<f64>> {
-	let mut metrics = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		let relevant_at_k = relevant_at_k(candidates, relevance, *k);
-
-		metrics
-			.insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance)));
-		metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k));
-		metrics.insert(
-			format!("success_at_{k}"),
-			Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)),
-		);
-	}
-
-	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
-	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
-	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
-
-	metrics
-}
-
-fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
-	candidates
-		.iter()
-		.take(k)
-		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
-		.count()
-}
-
-fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	Some(
-		candidates
-			.iter()
-			.position(|candidate| {
-				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
-			})
-			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
-	)
-}
-
-fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	let dcg = candidates
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, candidate)| {
-			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
-				/ ((index + 2) as f64).log2()
-		})
-		.sum::<f64>();
-	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
-
-	ideal.sort_by(|left, right| right.total_cmp(left));
-
-	let idcg = ideal
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
-		.sum::<f64>();
-
-	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
-}
-
-fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	let positive_count = positive_qrel_count(relevance);
-
-	if positive_count == 0 {
-		return None;
-	}
-
-	let mut hit_count = 0;
-	let mut precision_sum = 0.0;
-	let mut seen = BTreeSet::new();
-
-	for (index, candidate) in candidates.iter().enumerate() {
-		if !seen.insert(candidate.as_str()) {
-			continue;
-		}
-		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
-			hit_count += 1;
-			precision_sum += hit_count as f64 / (index + 1) as f64;
-		}
-	}
-
-	Some(precision_sum / positive_count as f64)
-}
-
-fn per_query_denominators(
-	candidate_count: usize,
-	expected_relevant_count: usize,
-) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
-		denominators.insert(format!("precision_at_{k}"), *k);
-		denominators.insert(format!("success_at_{k}"), 1);
-	}
-
-	denominators.insert("mrr".to_string(), expected_relevant_count);
-	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
-	denominators.insert("average_precision".to_string(), expected_relevant_count);
-	denominators.insert("candidate_count".to_string(), candidate_count);
-
-	denominators
-}
-
 fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
 	if !source_job.expected_answer.relevance_judgments.is_empty() {
 		"explicit_qrels"
@@ -202,11 +85,3 @@ fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
 		"expected_evidence_fallback"
 	}
 }
-
-fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
-	relevance.values().filter(|grade| **grade > 0.0).count()
-}
-
-fn rate(numerator: usize, denominator: usize) -> Option<f64> {
-	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs
new file mode 100644
index 00000000..01babc1d
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs
@@ -0,0 +1,129 @@
+use crate::{BTreeMap, BTreeSet, formatting, quantitative::QUANTITATIVE_K_VALUES};
+
+pub(super) fn per_query_metrics(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> BTreeMap<String, Option<f64>> {
+	// The positive-qrel count does not depend on the cutoff, so it is computed once up front.
+	let positive_count = positive_qrel_count(relevance);
+	let mut metrics = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		let hits = relevant_at_k(candidates, relevance, *k);
+		// `rate` returns `None` on a zero denominator, so recall is absent without positives.
+		metrics.insert(format!("recall_at_{k}"), rate(hits, positive_count));
+		metrics.insert(format!("precision_at_{k}"), rate(hits, *k));
+		// Success is always `Some`: 1.0 if a positively graded candidate is in the top k.
+		let succeeded = hits > 0 && positive_count > 0;
+		metrics.insert(format!("success_at_{k}"), Some(f64::from(succeeded)));
+	}
+
+	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
+	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
+	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
+
+	metrics
+}
+
+pub(super) fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
+	relevance.values().map(|grade| usize::from(*grade > 0.0)).sum()
+}
+
+pub(super) fn per_query_denominators(
+	candidate_count: usize,
+	expected_relevant_count: usize,
+) -> BTreeMap<String, usize> {
+	// Fixed-name entries are seeded first; the per-cutoff entries are added in the loop below.
+	let mut by_metric = BTreeMap::from([
+		("mrr".to_string(), expected_relevant_count),
+		("ndcg_at_5".to_string(), expected_relevant_count.min(5)),
+		("average_precision".to_string(), expected_relevant_count),
+		("candidate_count".to_string(), candidate_count),
+	]);
+
+	for cutoff in QUANTITATIVE_K_VALUES {
+		by_metric.insert(format!("recall_at_{cutoff}"), expected_relevant_count);
+		by_metric.insert(format!("precision_at_{cutoff}"), *cutoff);
+		by_metric.insert(format!("success_at_{cutoff}"), 1);
+	}
+	by_metric
+}
+
+fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
+	// Count each relevant id once so a repeated candidate cannot push recall above 1.0.
+	let mut seen = BTreeSet::new();
+	let hit = |id: &&String| relevance.get(id.as_str()).is_some_and(|grade| *grade > 0.0);
+
+	candidates.iter().take(k).filter(hit).filter(|id| seen.insert(id.as_str())).count()
+}
+
+fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	// Undefined (`None`) rather than zero when the query has no positive qrels.
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+	let is_relevant =
+		|id: &String| relevance.get(id.as_str()).is_some_and(|grade| *grade > 0.0);
+
+	// Ranks are 1-based; a list without any relevant candidate scores 0.0.
+	match candidates.iter().position(is_relevant) {
+		Some(index) => Some(1.0 / (index + 1) as f64),
+		None => Some(0.0),
+	}
+}
+
+fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
+	// Undefined when nothing is relevant: there is no ideal ranking to normalise against.
+	if positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	// log2(rank + 1) discount for a 0-based position.
+	let discount = |index: usize| ((index + 2) as f64).log2();
+	let mut seen = BTreeSet::new();
+	// A repeated candidate earns gain only at its first rank; otherwise DCG could exceed the
+	// ideal DCG (which holds each qrel once) and push the score above 1.0.
+	let dcg = candidates
+		.iter()
+		.take(k)
+		.enumerate()
+		.filter(|(_, candidate)| seen.insert(candidate.as_str()))
+		.map(|(index, candidate)| {
+			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) / discount(index)
+		})
+		.sum::<f64>();
+	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
+
+	ideal.sort_by(|left, right| right.total_cmp(left));
+	let idcg = ideal.iter().take(k).enumerate().map(|(i, grade)| grade / discount(i)).sum::<f64>();
+
+	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
+}
+
+fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
+	// Undefined (`None`) when the query has no positive qrels to average over.
+	let total_relevant = positive_qrel_count(relevance);
+
+	if total_relevant == 0 {
+		return None;
+	}
+
+	let mut visited = BTreeSet::new();
+	let mut hits = 0;
+	let mut running_sum = 0.0;
+
+	// Ranks stay positional: a skipped duplicate still occupies its slot in the list.
+	for (index, id) in candidates.iter().enumerate() {
+		let first_occurrence = visited.insert(id.as_str());
+		let relevant = relevance.get(id.as_str()).is_some_and(|grade| *grade > 0.0);
+		if first_occurrence && relevant {
+			hits += 1;
+			running_sum += hits as f64 / (index + 1) as f64;
+		}
+	}
+	Some(running_sum / total_relevant as f64)
+}
+
+fn rate(numerator: usize, denominator: usize) -> Option<f64> {
+	Some(denominator).filter(|d| *d > 0).map(|d| formatting::round3(numerator as f64 / d as f64))
+}

From 69af28a58114d96d5ae0adb6a147a30dcd0aaed1 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:18:20 -0400
Subject: [PATCH 17/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative aggregate confidence","authority":"manual"}

---
 .../quantitative/metrics/aggregate.rs         | 78 +-----------------
 .../metrics/aggregate/confidence.rs           | 82 +++++++++++++++++++
 2 files changed, 86 insertions(+), 74 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
index cb2dd63d..4d737d85 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
@@ -1,6 +1,8 @@
+mod confidence;
+
 use crate::{
 	BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting,
-	quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z},
+	quantitative::QUANTITATIVE_K_VALUES,
 };
 
 pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
@@ -77,20 +79,7 @@ pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeM
 pub(super) fn aggregate_confidence_intervals(
 	rows: &[QuantitativePerQueryRow],
 ) -> BTreeMap<String, QuantitativeConfidenceInterval> {
-	let mut confidence_intervals = BTreeMap::new();
-
-	for metric in rate_metric_names() {
-		let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str());
-
-		if denominator > 0 {
-			confidence_intervals.insert(
-				metric,
-				wilson_confidence_interval(numerator.min(denominator), denominator),
-			);
-		}
-	}
-
-	confidence_intervals
+	confidence::aggregate_confidence_intervals(rows)
 }
 
 fn quantitative_metric_names() -> Vec<String> {
@@ -108,65 +97,6 @@ fn quantitative_metric_names() -> Vec<String> {
 	metrics
 }
 
-fn rate_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-
-	metrics
-}
-
-fn aggregate_rate_numerator_denominator(
-	rows: &[QuantitativePerQueryRow],
-	metric: &str,
-) -> (usize, usize) {
-	let mut numerator = 0;
-	let mut denominator = 0;
-
-	for row in rows {
-		let Some(value) = row.metrics.get(metric).and_then(|value| *value) else {
-			continue;
-		};
-		let Some(row_denominator) = row.denominators.get(metric).copied() else {
-			continue;
-		};
-
-		if row_denominator == 0 {
-			continue;
-		}
-
-		denominator += row_denominator;
-		numerator += (value * row_denominator as f64).round() as usize;
-	}
-
-	(numerator, denominator)
-}
-
-fn wilson_confidence_interval(
-	numerator: usize,
-	denominator: usize,
-) -> QuantitativeConfidenceInterval {
-	let n = denominator as f64;
-	let p = numerator as f64 / n;
-	let z2 = WILSON_95_Z * WILSON_95_Z;
-	let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n);
-	let half_width =
-		WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n);
-
-	QuantitativeConfidenceInterval {
-		method: "wilson_score".to_string(),
-		confidence: 0.95,
-		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
-		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
-		numerator,
-		denominator,
-	}
-}
-
 fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
 	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs
new file mode 100644
index 00000000..e1db5fb8
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs
@@ -0,0 +1,82 @@
+use crate::{
+	BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting,
+	quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z},
+};
+
+pub(super) fn aggregate_confidence_intervals(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, QuantitativeConfidenceInterval> {
+	// One Wilson interval per rate metric; metrics with a zero pooled denominator are left out.
+	rate_metric_names()
+		.into_iter()
+		.filter_map(|metric| {
+			let (successes, trials) = aggregate_rate_numerator_denominator(rows, &metric);
+
+			if trials == 0 {
+				return None;
+			}
+
+			// Cap the reconstructed numerator so it never exceeds the denominator.
+			Some((metric, wilson_confidence_interval(successes.min(trials), trials)))
+		})
+		.collect()
+}
+
+fn rate_metric_names() -> Vec<String> {
+	// Three rate-style metrics exist per cutoff, emitted in recall/precision/success order.
+	const FAMILIES: [&str; 3] = ["recall", "precision", "success"];
+
+	let mut names = Vec::new();
+
+	for cutoff in QUANTITATIVE_K_VALUES {
+		names.extend(FAMILIES.iter().map(|family| format!("{family}_at_{cutoff}")));
+	}
+	names
+}
+
+fn aggregate_rate_numerator_denominator(
+	rows: &[QuantitativePerQueryRow],
+	metric: &str,
+) -> (usize, usize) {
+	// Rows that lack a value for `metric`, or whose denominator for it is missing or zero,
+	// contribute nothing to either total.
+	let usable = rows.iter().filter_map(|row| {
+		let value = row.metrics.get(metric).copied().flatten()?;
+		let trials = row.denominators.get(metric).copied().filter(|count| *count > 0)?;
+
+		Some((value, trials))
+	});
+
+	let mut numerator = 0;
+	let mut denominator = 0;
+
+	for (value, trials) in usable {
+		// Each row stores a rate, so its integer hit count is recovered as
+		// round(rate * trials) before pooling.
+		numerator += (value * trials as f64).round() as usize;
+		denominator += trials;
+	}
+
+	(numerator, denominator)
+}
+
+fn wilson_confidence_interval(
+	numerator: usize,
+	denominator: usize,
+) -> QuantitativeConfidenceInterval {
+	// Wilson score bounds; assumes `denominator > 0` (zero would divide by zero below).
+	let (trials, z_squared) = (denominator as f64, WILSON_95_Z * WILSON_95_Z);
+	let observed = numerator as f64 / trials;
+	let scale = 1.0 + z_squared / trials;
+	let radicand = observed * (1.0 - observed) / trials + z_squared / (4.0 * trials * trials);
+	let center = (observed + z_squared / (2.0 * trials)) / scale;
+	let half_width = WILSON_95_Z * radicand.sqrt() / scale;
+	QuantitativeConfidenceInterval {
+		method: "wilson_score".to_string(),
+		confidence: 0.95,
+		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
+		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
+		numerator,
+		denominator,
+	}
+}

From 0d546f8eafed53f70eee70bfd7380de3d15a0e18 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:24:10 -0400
Subject: [PATCH 18/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product manifest tests","authority":"manual"}

---
 .../quantitative/product_manifest.rs          | 185 +++---------------
 .../quantitative/product_manifest/export.rs   |  73 +++++++
 .../product_manifest/validation.rs            |  64 ++++++
 3 files changed, 162 insertions(+), 160 deletions(-)
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs
 create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs

diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs
index c7b543c5..054e70f3 100644
--- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs
@@ -1,5 +1,9 @@
+#[path = "product_manifest/export.rs"] mod export;
+#[path = "product_manifest/validation.rs"] mod validation;
+
 use std::{
 	env, fs,
+	path::PathBuf,
 	process::{self, Command},
 };
 
@@ -8,114 +12,40 @@ use serde_json::Value;
 
 use crate::support;
 
-#[test]
-fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir =
-		env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--product")
-		.arg("Synthetic Rival")
-		.arg("--adapter-id")
-		.arg("synthetic_rival")
-		.arg("--adapter-name")
-		.arg("Synthetic Rival adapter")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"product manifest export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let manifest = support::load_json(&manifest_path)?;
-
-	assert_eq!(
-		manifest.pointer("/schema").and_then(Value::as_str),
-		Some("elf.agent_memory_quantitative_product_manifest/v1")
-	);
-	assert_eq!(
-		manifest.pointer("/rows/0/product").and_then(Value::as_str),
-		Some("Synthetic Rival")
-	);
-	assert_eq!(
-		manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str),
-		Some("synthetic_rival")
-	);
-
-	let imported = super::run_report_with_quantitative_manifest(&manifest_path)?;
-	let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?;
-	let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?;
+struct ProductManifestPaths {
+	temp_dir: PathBuf,
+	report_path: PathBuf,
+	manifest_path: PathBuf,
+}
 
-	assert_eq!(rows.len(), 2);
-	assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival"));
-	assert!(!support::array_contains_str(
-		&imported,
-		"/quantitative_scoreboard/metrics_not_encoded",
-		"external_product_manifest_import"
-	)?);
-	assert!(
-		support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any(
-			|row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival")
-		)
-	);
+fn product_manifest_paths(temp_name: &str, manifest_file: &str) -> ProductManifestPaths {
+	let temp_dir = env::temp_dir().join(format!("{temp_name}-{}", process::id()));
 
-	Ok(())
+	ProductManifestPaths {
+		report_path: temp_dir.join("report.json"),
+		manifest_path: temp_dir.join(manifest_file),
+		temp_dir,
+	}
 }
 
-#[test]
-fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> {
+fn write_adversarial_report(paths: &ProductManifestPaths) -> Result<()> {
 	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir = env::temp_dir()
-		.join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("elf-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.output()?;
 
-	assert!(!output.status.success());
-	assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF"));
+	fs::create_dir_all(&paths.temp_dir)?;
+	fs::write(&paths.report_path, serde_json::to_vec_pretty(&report)?)?;
 
 	Ok(())
 }
 
-#[test]
-fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir = env::temp_dir()
-		.join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
+fn export_synthetic_rival_manifest(paths: &ProductManifestPaths) -> Result<()> {
+	write_adversarial_report(paths)?;
 
 	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
 		.arg("export-quantitative-product-manifest")
 		.arg("--report")
-		.arg(&report_path)
+		.arg(&paths.report_path)
 		.arg("--out")
-		.arg(&manifest_path)
+		.arg(&paths.manifest_path)
 		.arg("--product")
 		.arg("Synthetic Rival")
 		.arg("--adapter-id")
@@ -130,74 +60,9 @@ fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> {
 		String::from_utf8_lossy(&export.stderr)
 	);
 
-	let mut manifest = support::load_json(&manifest_path)?;
-
-	support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?;
-	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("run")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--quantitative-product-manifest")
-		.arg(&manifest_path)
-		.output()?;
-
-	assert!(!output.status.success());
-	assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus"));
-
 	Ok(())
 }
 
-#[test]
-fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> {
-	let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
-	let temp_dir = env::temp_dir()
-		.join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id()));
-	let report_path = temp_dir.join("report.json");
-	let manifest_path = temp_dir.join("missing-per-query-product-manifest.json");
-
-	fs::create_dir_all(&temp_dir)?;
-	fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?;
-
-	let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("export-quantitative-product-manifest")
-		.arg("--report")
-		.arg(&report_path)
-		.arg("--out")
-		.arg(&manifest_path)
-		.arg("--product")
-		.arg("Synthetic Rival")
-		.arg("--adapter-id")
-		.arg("synthetic_rival")
-		.arg("--adapter-name")
-		.arg("Synthetic Rival adapter")
-		.output()?;
-
-	assert!(
-		export.status.success(),
-		"product manifest export failed: {}",
-		String::from_utf8_lossy(&export.stderr)
-	);
-
-	let mut manifest = support::load_json(&manifest_path)?;
-
-	support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?;
-	fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
-
-	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
-		.arg("run")
-		.arg("--fixtures")
-		.arg(support::adversarial_quality_fixture_dir())
-		.arg("--quantitative-product-manifest")
-		.arg(&manifest_path)
-		.output()?;
-
-	assert!(!output.status.success());
-
-	let stderr = String::from_utf8_lossy(&output.stderr);
-
-	assert!(stderr.contains("ranked queries but only 0"));
-
-	Ok(())
+fn run_report_with_manifest(paths: &ProductManifestPaths) -> Result<Value> {
+	super::run_report_with_quantitative_manifest(&paths.manifest_path)
 }
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs
new file mode 100644
index 00000000..d56f2bd7
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs
@@ -0,0 +1,73 @@
+use std::process::Command;
+
+use color_eyre::Result;
+use serde_json::Value;
+
+use crate::support;
+
+#[test]
+fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> {
+	let paths = super::product_manifest_paths(
+		"elf-quantitative-product-manifest-test",
+		"synthetic-rival-product-manifest.json",
+	);
+	// Round trip, step 1: export a manifest for the synthetic rival via the shared helper.
+	super::export_synthetic_rival_manifest(&paths)?;
+
+	let manifest = support::load_json(&paths.manifest_path)?;
+	// The exported file must carry the v1 schema tag and the rival's product/adapter identity.
+	assert_eq!(
+		manifest.pointer("/schema").and_then(Value::as_str),
+		Some("elf.agent_memory_quantitative_product_manifest/v1")
+	);
+	assert_eq!(
+		manifest.pointer("/rows/0/product").and_then(Value::as_str),
+		Some("Synthetic Rival")
+	);
+	assert_eq!(
+		manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str),
+		Some("synthetic_rival")
+	);
+	// Round trip, step 2: feed the manifest back into a benchmark run and inspect the report.
+	let imported = super::run_report_with_manifest(&paths)?;
+	let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?;
+	let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?;
+	// Two scoreboard rows are expected (presumably the built-in row plus the imported rival).
+	assert_eq!(rows.len(), 2);
+	assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival"));
+	assert!(!support::array_contains_str(
+		&imported,
+		"/quantitative_scoreboard/metrics_not_encoded",
+		"external_product_manifest_import"
+	)?);
+	assert!(
+		support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any(
+			|row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival")
+		)
+	);
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> {
+	let paths = super::product_manifest_paths(
+		"elf-quantitative-product-manifest-elf-test",
+		"elf-product-manifest.json",
+	);
+	// Only the report is written here; the export itself is attempted by hand below.
+	super::write_adversarial_report(&paths)?;
+	// No --product/--adapter-id/--adapter-name overrides are passed to this export.
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("export-quantitative-product-manifest")
+		.arg("--report")
+		.arg(&paths.report_path)
+		.arg("--out")
+		.arg(&paths.manifest_path)
+		.output()?;
+	// The command must fail and mention the ELF product in its stderr output.
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF"));
+
+	Ok(())
+}
diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs
new file mode 100644
index 00000000..e4e302b3
--- /dev/null
+++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs
@@ -0,0 +1,64 @@
+use std::{fs, process::Command};
+
+use color_eyre::Result;
+
+use crate::support;
+
+#[test]
+fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> {
+	let paths = super::product_manifest_paths(
+		"elf-quantitative-product-manifest-corpus-test",
+		"wrong-corpus-product-manifest.json",
+	);
+	// Start from a successfully exported manifest, then corrupt it.
+	super::export_synthetic_rival_manifest(&paths)?;
+
+	let mut manifest = support::load_json(&paths.manifest_path)?;
+	// Overwrite the top-level corpus id so it no longer matches the fixture run.
+	support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?;
+	fs::write(&paths.manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
+	// Import the tampered manifest into a run over the same adversarial fixtures.
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(&paths.manifest_path)
+		.output()?;
+	// The run must fail with the same-corpus validation message on stderr.
+	assert!(!output.status.success());
+	assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus"));
+
+	Ok(())
+}
+
+#[test]
+fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> {
+	let paths = super::product_manifest_paths(
+		"elf-quantitative-product-manifest-per-query-test",
+		"missing-per-query-product-manifest.json",
+	);
+	// Start from a successfully exported manifest, then strip its evidence.
+	super::export_synthetic_rival_manifest(&paths)?;
+
+	let mut manifest = support::load_json(&paths.manifest_path)?;
+	// Empty the per-query rows while leaving the aggregate product rows in place.
+	support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?;
+	fs::write(&paths.manifest_path, serde_json::to_vec_pretty(&manifest)?)?;
+	// Import the tampered manifest into a run over the same adversarial fixtures.
+	let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
+		.arg("run")
+		.arg("--fixtures")
+		.arg(support::adversarial_quality_fixture_dir())
+		.arg("--quantitative-product-manifest")
+		.arg(&paths.manifest_path)
+		.output()?;
+	// The run must fail.
+	assert!(!output.status.success());
+
+	let stderr = String::from_utf8_lossy(&output.stderr);
+	// The error must report that zero per-query rows back the declared ranked queries.
+	assert!(stderr.contains("ranked queries but only 0"));
+
+	Ok(())
+}

From e19440ab2f38e4cbc319496a112db7bd25031440 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:28:02 -0400
Subject: [PATCH 19/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product row validation","authority":"manual"}

---
 .../product_manifest/validation.rs            | 131 +--------------
 .../product_manifest/validation/rows.rs       | 152 ++++++++++++++++++
 2 files changed, 159 insertions(+), 124 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs
index 0ae5bf33..fe86d636 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs
@@ -1,6 +1,8 @@
+mod rows;
+
 use crate::{
-	BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre,
-	quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA},
+	Path, QuantitativeProductManifest, Result, eyre,
+	quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA,
 };
 
 pub(super) fn validate_quantitative_product_manifest(
@@ -30,128 +32,9 @@ pub(super) fn validate_quantitative_product_manifest(
 		return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display()));
 	}
 
-	let row_keys = manifest
-		.rows
-		.iter()
-		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
-		.collect::<BTreeSet<_>>();
-
-	for row in &manifest.rows {
-		if row.product == "ELF" {
-			return Err(eyre::eyre!(
-				"{} quantitative product manifest must not inject ELF self rows.",
-				path.display()
-			));
-		}
-		if row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.adapter_name.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative product row.",
-				path.display()
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-		if row.leaderboard_eligible {
-			validate_leaderboard_eligible_product_row(path, row)?;
-		}
-	}
-	for row in &manifest.per_query_rows {
-		if row.job_id.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-			|| row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.qrel_source.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative per-query product row.",
-				path.display()
-			));
-		}
-		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} has no matching product row.",
-				path.display(),
-				row.product,
-				row.adapter_id
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-	}
-	for row in &manifest.rows {
-		if row.ranking_query_count == 0 {
-			continue;
-		}
-
-		let per_query_count = manifest
-			.per_query_rows
-			.iter()
-			.filter(|per_query| {
-				per_query.product == row.product && per_query.adapter_id == row.adapter_id
-			})
-			.count();
-
-		if per_query_count < row.ranking_query_count {
-			return Err(eyre::eyre!(
-				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				row.ranking_query_count,
-				per_query_count
-			));
-		}
-	}
-
-	Ok(())
-}
-
-fn validate_leaderboard_eligible_product_row(
-	path: &Path,
-	row: &QuantitativeBenchmarkRow,
-) -> Result<()> {
-	let has_audit_manifest_id = row
-		.audit_manifest_id
-		.as_deref()
-		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
-
-	if row.evidence_class != "live_real_world"
-		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
-		|| row.ranking_query_count != row.sample_size
-		|| row.explicit_qrel_query_count != row.ranking_query_count
-		|| !row.held_out
-		|| !row.leakage_audited
-		|| !has_audit_manifest_id
-	{
-		return Err(eyre::eyre!(
-			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
-			path.display(),
-			row.product,
-			row.adapter_id
-		));
-	}
+	rows::validate_quantitative_product_rows(manifest, path, corpus_id)?;
+	rows::validate_quantitative_per_query_rows(manifest, path, corpus_id)?;
+	rows::validate_ranked_row_evidence(manifest, path)?;
 
 	Ok(())
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs
new file mode 100644
index 00000000..055234ed
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs
@@ -0,0 +1,152 @@
+use crate::{
+	BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre,
+	quantitative::MIN_LEADERBOARD_QUERY_COUNT,
+};
+
+pub(super) fn validate_quantitative_product_rows(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> {
+	for row in &manifest.rows {
+		if row.product == "ELF" {
+			return Err(eyre::eyre!(
+				"{} quantitative product manifest must not inject ELF self rows.",
+				path.display()
+			));
+		}
+		if row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.adapter_name.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative product row.",
+				path.display()
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+		if row.leaderboard_eligible {
+			validate_leaderboard_eligible_product_row(path, row)?;
+		}
+	}
+
+	Ok(())
+}
+
+pub(super) fn validate_quantitative_per_query_rows(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> {
+	let row_keys = manifest
+		.rows
+		.iter()
+		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
+		.collect::<BTreeSet<_>>();
+
+	for row in &manifest.per_query_rows {
+		if row.job_id.trim().is_empty()
+			|| row.suite.trim().is_empty()
+			|| row.evidence_class.trim().is_empty()
+			|| row.result_state.trim().is_empty()
+			|| row.product.trim().is_empty()
+			|| row.adapter_id.trim().is_empty()
+			|| row.qrel_source.trim().is_empty()
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative per-query product row.",
+				path.display()
+			));
+		}
+		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} has no matching product row.",
+				path.display(),
+				row.product,
+				row.adapter_id
+			));
+		}
+		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} is not same-corpus {}.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				corpus_id
+			));
+		}
+	}
+
+	Ok(())
+}
+
+pub(super) fn validate_ranked_row_evidence(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+) -> Result<()> {
+	for row in &manifest.rows {
+		if row.ranking_query_count == 0 {
+			continue;
+		}
+
+		let per_query_count = manifest
+			.per_query_rows
+			.iter()
+			.filter(|per_query| {
+				per_query.product == row.product && per_query.adapter_id == row.adapter_id
+			})
+			.count();
+
+		if per_query_count < row.ranking_query_count {
+			return Err(eyre::eyre!(
+				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				row.ranking_query_count,
+				per_query_count
+			));
+		}
+	}
+
+	Ok(())
+}
+
+fn validate_leaderboard_eligible_product_row(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+) -> Result<()> {
+	let has_audit_manifest_id = row
+		.audit_manifest_id
+		.as_deref()
+		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
+
+	if row.evidence_class != "live_real_world"
+		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
+		|| row.ranking_query_count != row.sample_size
+		|| row.explicit_qrel_query_count != row.ranking_query_count
+		|| !row.held_out
+		|| !row.leakage_audited
+		|| !has_audit_manifest_id
+	{
+		return Err(eyre::eyre!(
+			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
+			path.display(),
+			row.product,
+			row.adapter_id
+		));
+	}
+
+	Ok(())
+}

From 974489b3f7eaa3457a5dafaa9f380abacc4ac5db Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:34:30 -0400
Subject: [PATCH 20/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product row checks","authority":"manual"}

---
 .../product_manifest/validation/rows.rs       | 140 +-----------------
 .../validation/rows/per_query.rs              |  48 ++++++
 .../validation/rows/product.rs                |  73 +++++++++
 .../validation/rows/ranking.rs                |  33 +++++
 4 files changed, 162 insertions(+), 132 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs
index 055234ed..36009dfa 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs
@@ -1,47 +1,15 @@
-use crate::{
-	BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre,
-	quantitative::MIN_LEADERBOARD_QUERY_COUNT,
-};
+mod per_query;
+mod product;
+mod ranking;
+
+use crate::{Path, QuantitativeProductManifest, Result};
 
 pub(super) fn validate_quantitative_product_rows(
 	manifest: &QuantitativeProductManifest,
 	path: &Path,
 	corpus_id: &str,
 ) -> Result<()> {
-	for row in &manifest.rows {
-		if row.product == "ELF" {
-			return Err(eyre::eyre!(
-				"{} quantitative product manifest must not inject ELF self rows.",
-				path.display()
-			));
-		}
-		if row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.adapter_name.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative product row.",
-				path.display()
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-		if row.leaderboard_eligible {
-			validate_leaderboard_eligible_product_row(path, row)?;
-		}
-	}
-
-	Ok(())
+	product::validate_quantitative_product_rows(manifest, path, corpus_id)
 }
 
 pub(super) fn validate_quantitative_per_query_rows(
@@ -49,104 +17,12 @@ pub(super) fn validate_quantitative_per_query_rows(
 	path: &Path,
 	corpus_id: &str,
 ) -> Result<()> {
-	let row_keys = manifest
-		.rows
-		.iter()
-		.map(|row| (row.product.as_str(), row.adapter_id.as_str()))
-		.collect::<BTreeSet<_>>();
-
-	for row in &manifest.per_query_rows {
-		if row.job_id.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-			|| row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.qrel_source.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative per-query product row.",
-				path.display()
-			));
-		}
-		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} has no matching product row.",
-				path.display(),
-				row.product,
-				row.adapter_id
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
-	}
-
-	Ok(())
+	per_query::validate_quantitative_per_query_rows(manifest, path, corpus_id)
 }
 
 pub(super) fn validate_ranked_row_evidence(
 	manifest: &QuantitativeProductManifest,
 	path: &Path,
 ) -> Result<()> {
-	for row in &manifest.rows {
-		if row.ranking_query_count == 0 {
-			continue;
-		}
-
-		let per_query_count = manifest
-			.per_query_rows
-			.iter()
-			.filter(|per_query| {
-				per_query.product == row.product && per_query.adapter_id == row.adapter_id
-			})
-			.count();
-
-		if per_query_count < row.ranking_query_count {
-			return Err(eyre::eyre!(
-				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				row.ranking_query_count,
-				per_query_count
-			));
-		}
-	}
-
-	Ok(())
-}
-
-fn validate_leaderboard_eligible_product_row(
-	path: &Path,
-	row: &QuantitativeBenchmarkRow,
-) -> Result<()> {
-	let has_audit_manifest_id = row
-		.audit_manifest_id
-		.as_deref()
-		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
-
-	if row.evidence_class != "live_real_world"
-		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
-		|| row.ranking_query_count != row.sample_size
-		|| row.explicit_qrel_query_count != row.ranking_query_count
-		|| !row.held_out
-		|| !row.leakage_audited
-		|| !has_audit_manifest_id
-	{
-		return Err(eyre::eyre!(
-			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
-			path.display(),
-			row.product,
-			row.adapter_id
-		));
-	}
-
-	Ok(())
+	ranking::validate_ranked_row_evidence(manifest, path)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs
new file mode 100644
index 00000000..4e720a68
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs
@@ -0,0 +1,48 @@
+use crate::{BTreeSet, Path, QuantitativeProductManifest, Result, eyre};
+
+/// Checks each per-query row for completeness, a matching product row, and the expected corpus.
+pub(super) fn validate_quantitative_per_query_rows(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> {
+	let is_blank = |value: &str| value.trim().is_empty();
+	let declared_keys: BTreeSet<_> = manifest
+		.rows
+		.iter()
+		.map(|declared| (declared.product.as_str(), declared.adapter_id.as_str()))
+		.collect();
+
+	for entry in &manifest.per_query_rows {
+		if is_blank(&entry.job_id)
+			|| is_blank(&entry.suite)
+			|| is_blank(&entry.evidence_class)
+			|| is_blank(&entry.result_state)
+			|| is_blank(&entry.product)
+			|| is_blank(&entry.adapter_id)
+			|| is_blank(&entry.qrel_source)
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative per-query product row.",
+				path.display()
+			));
+		} else if !declared_keys.contains(&(entry.product.as_str(), entry.adapter_id.as_str())) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} has no matching product row.",
+				path.display(),
+				entry.product,
+				entry.adapter_id
+			));
+		} else if entry.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} per-query row {}:{} is not same-corpus {}.",
+				path.display(),
+				entry.product,
+				entry.adapter_id,
+				corpus_id
+			));
+		}
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs
new file mode 100644
index 00000000..913b0628
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs
@@ -0,0 +1,73 @@
+use crate::{
+	Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre,
+	quantitative::MIN_LEADERBOARD_QUERY_COUNT,
+};
+
+/// Validates imported product rows: no ELF rows, no blank identity fields, same corpus only.
+pub(super) fn validate_quantitative_product_rows(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+	corpus_id: &str,
+) -> Result<()> {
+	let is_blank = |value: &str| value.trim().is_empty();
+	for entry in &manifest.rows {
+		// Imported manifests may only carry competitor rows; an ELF row here is rejected.
+		if entry.product == "ELF" {
+			return Err(eyre::eyre!(
+				"{} quantitative product manifest must not inject ELF self rows.",
+				path.display()
+			));
+		} else if is_blank(&entry.product)
+			|| is_blank(&entry.adapter_id)
+			|| is_blank(&entry.adapter_name)
+			|| is_blank(&entry.suite)
+			|| is_blank(&entry.evidence_class)
+			|| is_blank(&entry.result_state)
+		{
+			return Err(eyre::eyre!(
+				"{} has an incomplete quantitative product row.",
+				path.display()
+			));
+		} else if entry.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+			return Err(eyre::eyre!(
+				"{} row {}:{} is not same-corpus {}.",
+				path.display(),
+				entry.product,
+				entry.adapter_id,
+				corpus_id
+			));
+		} else if entry.leaderboard_eligible {
+			validate_leaderboard_eligible_product_row(path, entry)?;
+		}
+	}
+
+	Ok(())
+}
+
+/// Rejects a row that claims leaderboard eligibility without satisfying every required control;
+/// only called for rows whose `leaderboard_eligible` flag is set.
+fn validate_leaderboard_eligible_product_row(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+) -> Result<()> {
+	// A whitespace-only audit manifest id is treated as absent.
+	let has_audit_id = row.audit_manifest_id.as_deref().is_some_and(|id| !id.trim().is_empty());
+	let controls_satisfied = row.evidence_class == "live_real_world"
+		&& row.sample_size >= MIN_LEADERBOARD_QUERY_COUNT
+		&& row.ranking_query_count == row.sample_size
+		&& row.explicit_qrel_query_count == row.ranking_query_count
+		&& row.held_out
+		&& row.leakage_audited
+		&& has_audit_id;
+
+	if controls_satisfied {
+		return Ok(());
+	}
+
+	Err(eyre::eyre!(
+		"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
+		path.display(),
+		row.product,
+		row.adapter_id
+	))
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs
new file mode 100644
index 00000000..8206e54b
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs
@@ -0,0 +1,33 @@
+use crate::{Path, QuantitativeProductManifest, Result, eyre};
+
+/// Ensures each product row that declares ranked queries has enough per-query evidence rows.
+pub(super) fn validate_ranked_row_evidence(
+	manifest: &QuantitativeProductManifest,
+	path: &Path,
+) -> Result<()> {
+	// Rows that declare no ranked queries need no per-query evidence.
+	for row in manifest.rows.iter().filter(|row| row.ranking_query_count > 0) {
+		let declared = row.ranking_query_count;
+		let observed = manifest
+			.per_query_rows
+			.iter()
+			.filter(|candidate| {
+				candidate.product == row.product && candidate.adapter_id == row.adapter_id
+			})
+			.count();
+
+		// Surplus per-query rows are accepted; only a shortfall is an error.
+		if observed < declared {
+			return Err(eyre::eyre!(
+				"{} row {}:{} declares {} ranked queries but only {} per-query rows.",
+				path.display(),
+				row.product,
+				row.adapter_id,
+				declared,
+				observed
+			));
+		}
+	}
+
+	Ok(())
+}

From e1fc0e4033222afe5f6bc8bde9ca9b6c59d6f0d3 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:41:35 -0400
Subject: [PATCH 21/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report assembly","authority":"manual"}

---
 .../quantitative/report.rs                    | 111 +++---------------
 .../quantitative/report/controls.rs           |  26 ++++
 .../quantitative/report/row.rs                | 100 ++++++++++++++++
 3 files changed, 142 insertions(+), 95 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
index bb3ab895..331acc70 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
@@ -1,12 +1,10 @@
+mod controls;
+mod row;
+
 use crate::{
-	AdapterReport, JobReport, Path, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport,
-	QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result,
-	quantitative::{
-		self, MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY,
-		QUANTITATIVE_SCOREBOARD_SCHEMA,
-		audit_manifest::{self, QuantitativeAuditContext},
-		metrics, product_manifest,
-	},
+	AdapterReport, JobReport, Path, QuantitativeBenchmarkReport, RealWorldJob, ReportSummary,
+	Result,
+	quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA, product_manifest},
 };
 
 pub(crate) struct QuantitativeReportInput<'a> {
@@ -23,108 +21,31 @@ pub(crate) struct QuantitativeReportInput<'a> {
 pub(crate) fn quantitative_scoreboard_report(
 	input: QuantitativeReportInput<'_>,
 ) -> Result<QuantitativeBenchmarkReport> {
-	let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs);
-	let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs);
-	let per_query_rows = metrics::quantitative_per_query_rows(
-		input.source_jobs,
-		input.jobs,
-		corpus_id.as_str(),
-		evidence_class,
-		input.adapter.adapter_id.as_str(),
-	);
-	let ranking_query_count = per_query_rows
-		.iter()
-		.filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0)
-		.count();
-	let explicit_qrel_query_count =
-		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
-	let metric_comparable = ranking_query_count > 0;
-	let result_state = quantitative::quantitative_result_state(input.summary);
-	let audit_evidence = audit_manifest::quantitative_audit_evidence(
-		input.audit_manifest_path,
-		QuantitativeAuditContext {
-			run_id: input.run_id,
-			corpus_id: corpus_id.as_str(),
-			product: "ELF",
-			adapter_id: input.adapter.adapter_id.as_str(),
-			source_jobs: input.source_jobs,
-			ranking_query_count,
-			explicit_qrel_query_count,
-		},
-	)?;
-	let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible(
-		evidence_class,
-		input.source_jobs.len(),
-		ranking_query_count,
-		explicit_qrel_query_count,
-		metric_comparable,
-		&audit_evidence,
-	);
-	let row = QuantitativeBenchmarkRow {
-		product: "ELF".to_string(),
-		adapter_id: input.adapter.adapter_id.clone(),
-		adapter_name: input.adapter.name.clone(),
-		suite: quantitative::quantitative_suite_id(input.jobs),
-		evidence_class: evidence_class.to_string(),
-		source_manifest_corpus_id: Some(corpus_id.clone()),
-		result_state: result_state.to_string(),
-		comparable: metric_comparable,
-		metric_comparable,
-		leaderboard_eligible,
-		held_out: audit_evidence.held_out,
-		leakage_audited: audit_evidence.leakage_audited,
-		audit_manifest_id: audit_evidence.audit_manifest_id,
-		fixture_regression_only: evidence_class == "fixture_backed",
-		sample_size: input.jobs.len(),
-		ranking_query_count,
-		ranking_coverage_state: metrics::ranking_coverage_state(
-			input.summary,
-			input.source_jobs.len(),
-			ranking_query_count,
-		)
-		.to_string(),
-		ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(),
-		qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
-			.to_string(),
-		explicit_qrel_query_count,
-		metrics: metrics::aggregate_metrics(per_query_rows.as_slice()),
-		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
-		denominators: metrics::aggregate_denominators(per_query_rows.as_slice()),
-		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()),
-		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
-	};
+	let current_row = row::current_quantitative_row(&input)?;
 	let product_manifest = product_manifest::quantitative_product_manifest(
 		input.product_manifest_path,
-		corpus_id.as_str(),
+		current_row.corpus_id.as_str(),
 	)?;
 	let imported_row_count = product_manifest.rows.len();
 	let imported_per_query_count = product_manifest.per_query_rows.len();
-	let mut rows = vec![row];
-	let mut merged_per_query_rows = per_query_rows;
+	let mut rows = vec![current_row.row];
+	let mut merged_per_query_rows = current_row.per_query_rows;
 
 	rows.extend(product_manifest.rows);
 	merged_per_query_rows.extend(product_manifest.per_query_rows);
 
 	let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2;
-	let controls = QuantitativeBenchmarkControls {
-		same_corpus_required: true,
-		same_task_required: true,
-		ranked_candidates_required_for_ranking_metrics: true,
-		explicit_relevance_judgments_required_for_leaderboard: true,
-		minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT,
-		current_query_count: input.source_jobs.len(),
-		current_ranking_query_count: ranking_query_count,
-		current_explicit_qrel_query_count: explicit_qrel_query_count,
+	let controls = controls::quantitative_benchmark_controls(
+		&input,
+		current_row.ranking_query_count,
+		current_row.explicit_qrel_query_count,
 		leaderboard_claim_allowed,
-		leakage_control:
-			"held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims"
-				.to_string(),
-	};
+	);
 
 	Ok(QuantitativeBenchmarkReport {
 		schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(),
 		generated_at: input.generated_at.to_string(),
-		corpus_id,
+		corpus_id: current_row.corpus_id,
 		k_values: QUANTITATIVE_K_VALUES.to_vec(),
 		rows,
 		per_query_rows: merged_per_query_rows,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs
new file mode 100644
index 00000000..78d4b723
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs
@@ -0,0 +1,26 @@
+use crate::{
+	QuantitativeBenchmarkControls,
+	quantitative::{MIN_LEADERBOARD_QUERY_COUNT, report::QuantitativeReportInput},
+};
+
+pub(super) fn quantitative_benchmark_controls(
+	input: &QuantitativeReportInput<'_>,
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+	leaderboard_claim_allowed: bool,
+) -> QuantitativeBenchmarkControls {
+	let leakage_control =
+		"held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims".to_string();
+	QuantitativeBenchmarkControls {
+		current_query_count: input.source_jobs.len(), // per-run values first
+		current_ranking_query_count: ranking_query_count,
+		current_explicit_qrel_query_count: explicit_qrel_query_count,
+		leaderboard_claim_allowed,
+		minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT,
+		same_corpus_required: true, // fixed policy flags from here on
+		same_task_required: true,
+		ranked_candidates_required_for_ranking_metrics: true,
+		explicit_relevance_judgments_required_for_leaderboard: true,
+		leakage_control,
+	}
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
new file mode 100644
index 00000000..d3f8b232
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
@@ -0,0 +1,100 @@
+use crate::{
+	QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result,
+	quantitative::{
+		self, QUANTITATIVE_ROW_CLAIM_BOUNDARY,
+		audit_manifest::{self, QuantitativeAuditContext},
+		metrics,
+		report::QuantitativeReportInput,
+	},
+};
+
+pub(super) struct CurrentQuantitativeRow {
+	pub(super) row: QuantitativeBenchmarkRow, // the ELF row built for the current run
+	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>, // per-query rows behind `row`
+	pub(super) corpus_id: String, // corpus id, also stored in `row.source_manifest_corpus_id`
+	pub(super) ranking_query_count: usize, // same value as `row.ranking_query_count`
+	pub(super) explicit_qrel_query_count: usize, // same value as `row.explicit_qrel_query_count`
+}
+
+/// Builds the current run's ELF row together with its per-query rows and query counts.
+pub(super) fn current_quantitative_row(
+	input: &QuantitativeReportInput<'_>,
+) -> Result<CurrentQuantitativeRow> {
+	let adapter_id = input.adapter.adapter_id.as_str();
+	let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs);
+	let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs);
+	let per_query_rows = metrics::quantitative_per_query_rows(
+		input.source_jobs,
+		input.jobs,
+		corpus_id.as_str(),
+		evidence_class,
+		adapter_id,
+	);
+	let ranked_queries = per_query_rows
+		.iter()
+		.filter(|query| query.candidate_count > 0 && query.expected_relevant_count > 0)
+		.count();
+	let explicit_queries =
+		per_query_rows.iter().filter(|query| query.qrel_source == "explicit_qrels").count();
+	let metric_comparable = ranked_queries > 0;
+	let result_state = quantitative::quantitative_result_state(input.summary);
+	let audit_context = QuantitativeAuditContext {
+		run_id: input.run_id,
+		corpus_id: corpus_id.as_str(),
+		product: "ELF",
+		adapter_id,
+		source_jobs: input.source_jobs,
+		ranking_query_count: ranked_queries,
+		explicit_qrel_query_count: explicit_queries,
+	};
+	let audit_evidence =
+		audit_manifest::quantitative_audit_evidence(input.audit_manifest_path, audit_context)?;
+	let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible(
+		evidence_class,
+		input.source_jobs.len(),
+		ranked_queries,
+		explicit_queries,
+		metric_comparable,
+		&audit_evidence,
+	);
+	let row = QuantitativeBenchmarkRow {
+		product: "ELF".to_string(),
+		adapter_id: input.adapter.adapter_id.clone(),
+		adapter_name: input.adapter.name.clone(),
+		suite: quantitative::quantitative_suite_id(input.jobs),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.clone()),
+		result_state: result_state.to_string(),
+		comparable: metric_comparable,
+		metric_comparable,
+		leaderboard_eligible,
+		held_out: audit_evidence.held_out,
+		leakage_audited: audit_evidence.leakage_audited,
+		audit_manifest_id: audit_evidence.audit_manifest_id,
+		fixture_regression_only: evidence_class == "fixture_backed",
+		sample_size: input.jobs.len(), // NOTE(review): not source_jobs.len() — confirm
+		ranking_query_count: ranked_queries,
+		ranking_coverage_state: metrics::ranking_coverage_state(
+			input.summary,
+			input.source_jobs.len(),
+			ranked_queries,
+		)
+		.to_string(),
+		ranked_candidate_source: metrics::ranked_candidate_source(ranked_queries).to_string(),
+		qrel_source: metrics::aggregate_qrel_source(ranked_queries, explicit_queries).to_string(),
+		explicit_qrel_query_count: explicit_queries,
+		metrics: metrics::aggregate_metrics(per_query_rows.as_slice()),
+		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
+		denominators: metrics::aggregate_denominators(per_query_rows.as_slice()),
+		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()),
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	};
+
+	Ok(CurrentQuantitativeRow {
+		corpus_id,
+		row,
+		per_query_rows,
+		ranking_query_count: ranked_queries,
+		explicit_qrel_query_count: explicit_queries,
+	})
+}

From 00148a88ebb21832069b26258de32e86fe618c75 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:47:03 -0400
Subject: [PATCH 22/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit artifact helpers","authority":"manual"}

---
 .../quantitative/audit_manifest/artifacts.rs  | 107 ++----------------
 .../audit_manifest/artifacts/digest.rs        |  67 +++++++++++
 .../audit_manifest/artifacts/paths.rs         |  35 ++++++
 3 files changed, 110 insertions(+), 99 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
index 9e033400..25a0bbb0 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
@@ -1,6 +1,9 @@
-use std::env;
+mod digest;
+mod paths;
 
-use crate::{Path, PathBuf, QuantitativeAuditManifest, Result, eyre, fs};
+pub(super) use self::{digest::fixture_path_digest, paths::audit_artifact_display_path};
+
+use crate::{Path, QuantitativeAuditManifest, Result, eyre};
 
 pub(super) fn validate_quantitative_audit_artifacts(
 	manifest: &QuantitativeAuditManifest,
@@ -30,8 +33,9 @@ pub(super) fn validate_quantitative_audit_artifacts(
 			));
 		}
 
-		let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
-		let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| {
+		let artifact_path =
+			paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
+		let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| {
 			eyre::eyre!(
 				"{} artifact {} could not be digested at {}: {err}",
 				path.display(),
@@ -54,98 +58,3 @@ pub(super) fn validate_quantitative_audit_artifacts(
 
 	Ok(())
 }
-
-pub(super) fn fixture_path_digest(path: &Path) -> Result<String> {
-	let mut hasher = blake3::Hasher::new();
-
-	if path.is_file() {
-		hash_fixture_file(
-			path,
-			path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"),
-			&mut hasher,
-		)?;
-
-		return Ok(hasher.finalize().to_hex().to_string());
-	}
-
-	let paths = audit_fixture_paths(path)?;
-
-	for fixture in paths {
-		let relative = fixture
-			.strip_prefix(path)
-			.map(|relative| relative.to_string_lossy().replace('\\', "/"))
-			.unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/"));
-
-		hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?;
-	}
-
-	Ok(hasher.finalize().to_hex().to_string())
-}
-
-pub(super) fn audit_artifact_display_path(path: &Path) -> String {
-	let display_path = if path.is_absolute() {
-		env::current_dir()
-			.ok()
-			.and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf))
-			.unwrap_or_else(|| path.to_path_buf())
-	} else {
-		path.to_path_buf()
-	};
-
-	display_path.to_string_lossy().replace('\\', "/")
-}
-
-fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf {
-	let raw = PathBuf::from(artifact_path);
-
-	if raw.is_absolute() {
-		return raw;
-	}
-
-	let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone());
-
-	if cwd_path.exists() {
-		return cwd_path;
-	}
-
-	manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path)
-}
-
-fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
-	let mut paths = Vec::new();
-
-	collect_audit_fixture_paths(path, &mut paths)?;
-
-	paths.sort();
-
-	Ok(paths)
-}
-
-fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
-	if path.is_file() {
-		paths.push(path.to_path_buf());
-
-		return Ok(());
-	}
-
-	for entry in fs::read_dir(path)? {
-		let entry_path = entry?.path();
-
-		if entry_path.is_dir() {
-			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
-		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
-			paths.push(entry_path);
-		}
-	}
-
-	Ok(())
-}
-
-fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
-	hasher.update(logical_path.as_bytes());
-	hasher.update(b"\0");
-	hasher.update(&fs::read(path)?);
-	hasher.update(b"\0");
-
-	Ok(())
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs
new file mode 100644
index 00000000..bb75c802
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs
@@ -0,0 +1,67 @@
+use crate::{Path, PathBuf, Result, fs};
+
+/// Computes the BLAKE3 hex digest of a fixture file, or of every fixture under a directory.
+///
+/// Each file is hashed together with a logical path, so the digest covers names as well as
+/// contents.
+pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) -> Result<String> {
+	let mut hasher = blake3::Hasher::new();
+
+	if path.is_file() {
+		// A lone file is keyed by its file name, or "fixture" without a UTF-8 file name.
+		let logical = path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture");
+
+		hash_fixture_file(path, logical, &mut hasher)?;
+	} else {
+		// Directory entries are keyed by their `/`-separated path relative to `path`.
+		for fixture in audit_fixture_paths(path)? {
+			let logical = match fixture.strip_prefix(path) {
+				Ok(relative) => relative.to_string_lossy().replace('\\', "/"),
+				Err(_) => fixture.to_string_lossy().replace('\\', "/"),
+			};
+
+			hash_fixture_file(fixture.as_path(), logical.as_str(), &mut hasher)?;
+		}
+	}
+
+	Ok(hasher.finalize().to_hex().to_string())
+}
+
+/// Lists the fixture files under `path`, sorted so the order does not depend on `read_dir`.
+fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
+	let mut collected = Vec::new();
+
+	collect_audit_fixture_paths(path, &mut collected)?;
+	collected.sort();
+
+	Ok(collected)
+}
+
+/// Appends `path` itself when it is a file (any extension); otherwise walks the directory
+/// recursively and appends every nested file whose extension is `json`.
+fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
+	if path.is_file() {
+		paths.push(path.to_path_buf());
+		return Ok(());
+	}
+	for dir_entry in fs::read_dir(path)? {
+		let child = dir_entry?.path();
+		let is_json = child.extension().and_then(|ext| ext.to_str()) == Some("json");
+		if child.is_dir() {
+			collect_audit_fixture_paths(child.as_path(), paths)?;
+		} else if is_json {
+			paths.push(child);
+		}
+	}
+
+	Ok(())
+}
+
+/// Feeds one fixture into `hasher` as `logical_path`, NUL, file bytes, NUL.
+fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
+	const SEPARATOR: &[u8] = b"\0";
+
+	hasher.update(logical_path.as_bytes()).update(SEPARATOR);
+	hasher.update(fs::read(path)?.as_slice()).update(SEPARATOR);
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs
new file mode 100644
index 00000000..3dd15d54
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs
@@ -0,0 +1,35 @@
+use std::env;
+
+use crate::{Path, PathBuf};
+
+/// Renders `path` for display.
+///
+/// An absolute path that lies under the current directory is shown relative to it; any other
+/// path is shown as given.
+/// Backslashes are replaced with `/` in either case.
+pub(in crate::quantitative::audit_manifest) fn audit_artifact_display_path(path: &Path) -> String {
+	// `None` when the current directory is unavailable or is not a prefix of `path`.
+	let below_cwd = || path.strip_prefix(env::current_dir().ok()?).ok();
+	let shown = if path.is_absolute() { below_cwd().unwrap_or(path) } else { path };
+
+	shown.to_string_lossy().replace('\\', "/")
+}
+
+/// Resolves a manifest-declared artifact path: absolute paths are returned as given, relative
+/// ones use the cwd-relative location when it exists, else the manifest's directory.
+pub(super) fn resolve_quantitative_audit_artifact_path(
+	manifest_path: &Path,
+	artifact_path: &str,
+) -> PathBuf {
+	let artifact = Path::new(artifact_path);
+	if artifact.is_absolute() {
+		return artifact.to_path_buf();
+	}
+	let from_cwd =
+		env::current_dir().map_or_else(|_| artifact.to_path_buf(), |cwd| cwd.join(artifact));
+	let found_in_cwd = from_cwd.exists();
+	match manifest_path.parent() {
+		Some(manifest_dir) if !found_in_cwd => manifest_dir.join(artifact),
+		_ => from_cwd, // also the fallback when the manifest path has no parent
+	}
+}

From 48781e60d558e93d28b5249fd697fba91aff8181 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:53:08 -0400
Subject: [PATCH 23/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit validation checks","authority":"manual"}

---
 .../quantitative/audit_manifest/validation.rs | 140 ++----------------
 .../audit_manifest/validation/controls.rs     |  42 ++++++
 .../audit_manifest/validation/identity.rs     |  73 +++++++++
 .../audit_manifest/validation/queries.rs      |  29 ++++
 4 files changed, 153 insertions(+), 131 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs
index 5aab2c4f..5a37d191 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs
@@ -1,11 +1,10 @@
+mod controls;
+mod identity;
+mod queries;
+
 use crate::{
-	BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre,
-	quantitative::{
-		QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL,
-		REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
-		audit_manifest::{QuantitativeAuditContext, artifacts},
-		metrics,
-	},
+	Path, QuantitativeAuditManifest, Result,
+	quantitative::audit_manifest::{QuantitativeAuditContext, artifacts},
 };
 
 pub(super) fn validate_quantitative_audit_manifest(
@@ -13,130 +12,9 @@ pub(super) fn validate_quantitative_audit_manifest(
 	path: &Path,
 	context: QuantitativeAuditContext<'_>,
 ) -> Result<()> {
-	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
-			path.display(),
-			manifest.schema
-		));
-	}
-	if manifest.manifest_id.trim().is_empty() {
-		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
-	}
-	if manifest.run_id != context.run_id {
-		return Err(eyre::eyre!(
-			"{} has run_id {}, expected {}.",
-			path.display(),
-			manifest.run_id,
-			context.run_id
-		));
-	}
-	if manifest.corpus_id != context.corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected {}.",
-			path.display(),
-			manifest.corpus_id,
-			context.corpus_id
-		));
-	}
-	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
-		return Err(eyre::eyre!(
-			"{} has product {}:{} but current row is {}:{}.",
-			path.display(),
-			manifest.product,
-			manifest.adapter_id,
-			context.product,
-			context.adapter_id
-		));
-	}
-	if manifest.sample_size != context.source_jobs.len() {
-		return Err(eyre::eyre!(
-			"{} has sample_size {}, expected {}.",
-			path.display(),
-			manifest.sample_size,
-			context.source_jobs.len()
-		));
-	}
-	if manifest.ranking_query_count != context.ranking_query_count {
-		return Err(eyre::eyre!(
-			"{} has ranking_query_count {}, expected {}.",
-			path.display(),
-			manifest.ranking_query_count,
-			context.ranking_query_count
-		));
-	}
-	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
-		return Err(eyre::eyre!(
-			"{} has explicit_qrel_query_count {}, expected {}.",
-			path.display(),
-			manifest.explicit_qrel_query_count,
-			context.explicit_qrel_query_count
-		));
-	}
-
-	validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
-	validate_quantitative_audit_controls(manifest, path)?;
+	identity::validate_quantitative_audit_identity(manifest, path, &context)?;
+	queries::validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?;
+	controls::validate_quantitative_audit_controls(manifest, path)?;
 
 	artifacts::validate_quantitative_audit_artifacts(manifest, path)
 }
-
-fn validate_quantitative_audit_query_ids(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-	source_jobs: &[RealWorldJob],
-) -> Result<()> {
-	let expected = metrics::ranking_query_ids(source_jobs);
-	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
-
-	if actual.len() != manifest.query_ids.len() {
-		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display()));
-	}
-	if actual != expected {
-		let missing = expected.difference(&actual).copied().collect::<Vec<_>>();
-		let extra = actual.difference(&expected).copied().collect::<Vec<_>>();
-
-		return Err(eyre::eyre!(
-			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
-			path.display(),
-			missing,
-			extra
-		));
-	}
-
-	Ok(())
-}
-
-fn validate_quantitative_audit_controls(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-) -> Result<()> {
-	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>();
-
-	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) {
-		return Err(eyre::eyre!(
-			"{} marks held_out=true without required control {}.",
-			path.display(),
-			REQUIRED_HELD_OUT_AUDIT_CONTROL
-		));
-	}
-	if manifest.leakage_audited
-		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
-			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
-	{
-		return Err(eyre::eyre!(
-			"{} marks leakage_audited=true without required controls {} and {}.",
-			path.display(),
-			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
-			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
-		));
-	}
-	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty()
-	{
-		return Err(eyre::eyre!(
-			"{} marks audit controls true but has an empty claim_boundary.",
-			path.display()
-		));
-	}
-
-	Ok(())
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs
new file mode 100644
index 00000000..9b15c1ae
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs
@@ -0,0 +1,42 @@
+use crate::{
+	BTreeSet, Path, QuantitativeAuditManifest, Result, eyre,
+	quantitative::{
+		REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, REQUIRED_HELD_OUT_AUDIT_CONTROL,
+		REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+	},
+};
+
+pub(super) fn validate_quantitative_audit_controls(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> {
+	let controls = manifest.controls.iter().map(String::as_str).collect::<BTreeSet<_>>();
+
+	if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) {
+		return Err(eyre::eyre!(
+			"{} marks held_out=true without required control {}.",
+			path.display(),
+			REQUIRED_HELD_OUT_AUDIT_CONTROL
+		));
+	}
+	if manifest.leakage_audited
+		&& (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL)
+			|| !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL))
+	{
+		return Err(eyre::eyre!(
+			"{} marks leakage_audited=true without required controls {} and {}.",
+			path.display(),
+			REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL,
+			REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL
+		));
+	}
+	if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty()
+	{
+		return Err(eyre::eyre!(
+			"{} marks audit controls true but has an empty claim_boundary.",
+			path.display()
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs
new file mode 100644
index 00000000..461e9eb6
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs
@@ -0,0 +1,73 @@
+use crate::{
+	Path, QuantitativeAuditManifest, Result, eyre,
+	quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, audit_manifest::QuantitativeAuditContext},
+};
+
+pub(super) fn validate_quantitative_audit_identity(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: &QuantitativeAuditContext<'_>,
+) -> Result<()> {
+	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() {
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+	if manifest.run_id != context.run_id {
+		return Err(eyre::eyre!(
+			"{} has run_id {}, expected {}.",
+			path.display(),
+			manifest.run_id,
+			context.run_id
+		));
+	}
+	if manifest.corpus_id != context.corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected {}.",
+			path.display(),
+			manifest.corpus_id,
+			context.corpus_id
+		));
+	}
+	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
+		return Err(eyre::eyre!(
+			"{} has product {}:{} but current row is {}:{}.",
+			path.display(),
+			manifest.product,
+			manifest.adapter_id,
+			context.product,
+			context.adapter_id
+		));
+	}
+	if manifest.sample_size != context.source_jobs.len() {
+		return Err(eyre::eyre!(
+			"{} has sample_size {}, expected {}.",
+			path.display(),
+			manifest.sample_size,
+			context.source_jobs.len()
+		));
+	}
+	if manifest.ranking_query_count != context.ranking_query_count {
+		return Err(eyre::eyre!(
+			"{} has ranking_query_count {}, expected {}.",
+			path.display(),
+			manifest.ranking_query_count,
+			context.ranking_query_count
+		));
+	}
+	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
+		return Err(eyre::eyre!(
+			"{} has explicit_qrel_query_count {}, expected {}.",
+			path.display(),
+			manifest.explicit_qrel_query_count,
+			context.explicit_qrel_query_count
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs
new file mode 100644
index 00000000..9910b436
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs
@@ -0,0 +1,29 @@
+use crate::{
+	BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre, quantitative::metrics,
+};
+
+pub(super) fn validate_quantitative_audit_query_ids(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	source_jobs: &[RealWorldJob],
+) -> Result<()> {
+	let expected = metrics::ranking_query_ids(source_jobs);
+	let actual = manifest.query_ids.iter().map(String::as_str).collect::<BTreeSet<_>>();
+
+	if actual.len() != manifest.query_ids.len() {
+		return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display()));
+	}
+	if actual != expected {
+		let missing = expected.difference(&actual).copied().collect::<Vec<_>>();
+		let extra = actual.difference(&expected).copied().collect::<Vec<_>>();
+
+		return Err(eyre::eyre!(
+			"{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.",
+			path.display(),
+			missing,
+			extra
+		));
+	}
+
+	Ok(())
+}

From c478442703fcab68edb096b8952f64eda8afeb30 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 13:59:56 -0400
Subject: [PATCH 24/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query metric formulas","authority":"manual"}

---
 .../metrics/per_query/query_metrics.rs        | 128 +++---------------
 .../per_query/query_metrics/denominators.rs   |  21 +++
 .../per_query/query_metrics/ranking.rs        |  78 +++++++++++
 .../per_query/query_metrics/relevance.rs      |  23 ++++
 4 files changed, 139 insertions(+), 111 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs
index 01babc1d..6685aa6e 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs
@@ -1,4 +1,10 @@
-use crate::{BTreeMap, BTreeSet, formatting, quantitative::QUANTITATIVE_K_VALUES};
+mod denominators;
+mod ranking;
+mod relevance;
+
+pub(super) use self::{denominators::per_query_denominators, relevance::positive_qrel_count};
+
+use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES};
 
 pub(super) fn per_query_metrics(
 	candidates: &[String],
@@ -7,123 +13,23 @@ pub(super) fn per_query_metrics(
 	let mut metrics = BTreeMap::new();
 
 	for k in QUANTITATIVE_K_VALUES {
-		let relevant_at_k = relevant_at_k(candidates, relevance, *k);
+		let relevant_at_k = relevance::relevant_at_k(candidates, relevance, *k);
 
-		metrics
-			.insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance)));
-		metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k));
+		metrics.insert(
+			format!("recall_at_{k}"),
+			relevance::rate(relevant_at_k, positive_qrel_count(relevance)),
+		);
+		metrics.insert(format!("precision_at_{k}"), relevance::rate(relevant_at_k, *k));
 		metrics.insert(
 			format!("success_at_{k}"),
 			Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)),
 		);
 	}
 
-	metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance));
-	metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5));
-	metrics.insert("average_precision".to_string(), average_precision(candidates, relevance));
-
+	metrics.insert("mrr".to_string(), ranking::reciprocal_rank(candidates, relevance));
+	metrics.insert("ndcg_at_5".to_string(), ranking::ndcg_at_k(candidates, relevance, 5));
 	metrics
-}
-
-pub(super) fn positive_qrel_count(relevance: &BTreeMap<String, f64>) -> usize {
-	relevance.values().filter(|grade| **grade > 0.0).count()
-}
-
-pub(super) fn per_query_denominators(
-	candidate_count: usize,
-	expected_relevant_count: usize,
-) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
-		denominators.insert(format!("precision_at_{k}"), *k);
-		denominators.insert(format!("success_at_{k}"), 1);
-	}
-
-	denominators.insert("mrr".to_string(), expected_relevant_count);
-	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
-	denominators.insert("average_precision".to_string(), expected_relevant_count);
-	denominators.insert("candidate_count".to_string(), candidate_count);
-
-	denominators
-}
-
-fn relevant_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> usize {
-	candidates
-		.iter()
-		.take(k)
-		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
-		.count()
-}
-
-fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
+		.insert("average_precision".to_string(), ranking::average_precision(candidates, relevance));
 
-	Some(
-		candidates
-			.iter()
-			.position(|candidate| {
-				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
-			})
-			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
-	)
-}
-
-fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap<String, f64>, k: usize) -> Option<f64> {
-	if positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	let dcg = candidates
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, candidate)| {
-			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
-				/ ((index + 2) as f64).log2()
-		})
-		.sum::<f64>();
-	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
-
-	ideal.sort_by(|left, right| right.total_cmp(left));
-
-	let idcg = ideal
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
-		.sum::<f64>();
-
-	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
-}
-
-fn average_precision(candidates: &[String], relevance: &BTreeMap<String, f64>) -> Option<f64> {
-	let positive_count = positive_qrel_count(relevance);
-
-	if positive_count == 0 {
-		return None;
-	}
-
-	let mut hit_count = 0;
-	let mut precision_sum = 0.0;
-	let mut seen = BTreeSet::new();
-
-	for (index, candidate) in candidates.iter().enumerate() {
-		if !seen.insert(candidate.as_str()) {
-			continue;
-		}
-		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
-			hit_count += 1;
-			precision_sum += hit_count as f64 / (index + 1) as f64;
-		}
-	}
-
-	Some(precision_sum / positive_count as f64)
-}
-
-fn rate(numerator: usize, denominator: usize) -> Option<f64> {
-	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
+	metrics
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs
new file mode 100644
index 00000000..7ef22bc8
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs
@@ -0,0 +1,21 @@
+use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES};
+
+pub(in crate::quantitative::metrics::per_query) fn per_query_denominators(
+	candidate_count: usize,
+	expected_relevant_count: usize,
+) -> BTreeMap<String, usize> {
+	let mut denominators = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		denominators.insert(format!("recall_at_{k}"), expected_relevant_count);
+		denominators.insert(format!("precision_at_{k}"), *k);
+		denominators.insert(format!("success_at_{k}"), 1);
+	}
+
+	denominators.insert("mrr".to_string(), expected_relevant_count);
+	denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5));
+	denominators.insert("average_precision".to_string(), expected_relevant_count);
+	denominators.insert("candidate_count".to_string(), candidate_count);
+
+	denominators
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
new file mode 100644
index 00000000..515bfaed
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
@@ -0,0 +1,78 @@
+use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics};
+
+pub(super) fn reciprocal_rank(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> Option<f64> {
+	if query_metrics::positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	Some(
+		candidates
+			.iter()
+			.position(|candidate| {
+				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
+			})
+			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
+	)
+}
+
+pub(super) fn ndcg_at_k(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+	k: usize,
+) -> Option<f64> {
+	if query_metrics::positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	let dcg = candidates
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, candidate)| {
+			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
+				/ ((index + 2) as f64).log2()
+		})
+		.sum::<f64>();
+	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
+
+	ideal.sort_by(|left, right| right.total_cmp(left));
+
+	let idcg = ideal
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
+		.sum::<f64>();
+
+	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
+}
+
+pub(super) fn average_precision(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> Option<f64> {
+	let positive_count = query_metrics::positive_qrel_count(relevance);
+
+	if positive_count == 0 {
+		return None;
+	}
+
+	let mut hit_count = 0;
+	let mut precision_sum = 0.0;
+	let mut seen = BTreeSet::new();
+
+	for (index, candidate) in candidates.iter().enumerate() {
+		if !seen.insert(candidate.as_str()) {
+			continue;
+		}
+		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
+			hit_count += 1;
+			precision_sum += hit_count as f64 / (index + 1) as f64;
+		}
+	}
+
+	Some(precision_sum / positive_count as f64)
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs
new file mode 100644
index 00000000..a3644eb1
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs
@@ -0,0 +1,23 @@
+use crate::{BTreeMap, formatting};
+
+pub(in crate::quantitative::metrics::per_query) fn positive_qrel_count(
+	relevance: &BTreeMap<String, f64>,
+) -> usize {
+	relevance.values().filter(|grade| **grade > 0.0).count()
+}
+
+pub(super) fn relevant_at_k(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+	k: usize,
+) -> usize {
+	candidates
+		.iter()
+		.take(k)
+		.filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0))
+		.count()
+}
+
+pub(super) fn rate(numerator: usize, denominator: usize) -> Option<f64> {
+	(denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64))
+}

From 5dd6220d289ae7db5a2ca55e87c68403bb0a5b75 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:05:30 -0400
Subject: [PATCH 25/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit manifest flow","authority":"manual"}

---
 .../quantitative/audit_manifest.rs            | 100 ++----------------
 .../quantitative/audit_manifest/evidence.rs   |  31 ++++++
 .../quantitative/audit_manifest/export.rs     |  83 +++++++++++++++
 3 files changed, 120 insertions(+), 94 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
index d3e696a9..01f7e463 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs
@@ -1,11 +1,11 @@
 mod artifacts;
+mod evidence;
+mod export;
 mod validation;
 
-use crate::{
-	ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact,
-	QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs,
-	quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, metrics},
-};
+pub(crate) use self::export::quantitative_audit_manifest_from_jobs;
+
+use crate::{Path, RealWorldJob, Result};
 
 pub(super) struct QuantitativeAuditContext<'a> {
 	pub(super) run_id: &'a str,
@@ -23,97 +23,9 @@ pub(super) struct QuantitativeAuditEvidence {
 	pub(super) audit_manifest_id: Option<String>,
 }
 
-pub(crate) fn quantitative_audit_manifest_from_jobs(
-	jobs: &[RealWorldJob],
-	args: &ExportQuantitativeAuditManifestArgs,
-) -> Result<QuantitativeAuditManifest> {
-	let product = args.product.trim();
-	let adapter_id = args.adapter_id.trim();
-
-	if product.is_empty() || adapter_id.is_empty() {
-		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
-	}
-
-	let corpus_id = super::quantitative_corpus_id(jobs);
-	let ranking_query_count = metrics::ranking_query_count(jobs);
-	let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs);
-	let manifest = QuantitativeAuditManifest {
-		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
-		manifest_id: args
-			.manifest_id
-			.clone()
-			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)),
-		run_id: args.run_id.clone(),
-		corpus_id,
-		product: product.to_string(),
-		adapter_id: adapter_id.to_string(),
-		held_out: args.held_out,
-		leakage_audited: args.leakage_audited,
-		sample_size: jobs.len(),
-		ranking_query_count,
-		explicit_qrel_query_count,
-		query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
-		controls: args.controls.clone(),
-		artifacts: vec![QuantitativeAuditArtifact {
-			role: "product_runtime_fixtures".to_string(),
-			path: artifacts::audit_artifact_display_path(args.fixtures.as_path()),
-			sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?,
-		}],
-		claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| {
-			if args.held_out || args.leakage_audited {
-				concat!(
-					"Audit manifest supplied by operator; runner validates run/corpus/product/",
-					"adapter/count/query-id/artifact bindings before opening row gates."
-				)
-				.to_string()
-			} else {
-				concat!(
-					"Diagnostic audit manifest binds the current product-runtime fixture set to ",
-					"query ids and counts, but it does not prove held-out or leakage-audited status."
-				)
-				.to_string()
-			}
-		}),
-	};
-
-	validation::validate_quantitative_audit_manifest(
-		&manifest,
-		args.fixtures.as_path(),
-		QuantitativeAuditContext {
-			run_id: args.run_id.as_str(),
-			corpus_id: manifest.corpus_id.as_str(),
-			product,
-			adapter_id,
-			source_jobs: jobs,
-			ranking_query_count: manifest.ranking_query_count,
-			explicit_qrel_query_count: manifest.explicit_qrel_query_count,
-		},
-	)?;
-
-	Ok(manifest)
-}
-
 pub(super) fn quantitative_audit_evidence(
 	path: Option<&Path>,
 	context: QuantitativeAuditContext<'_>,
 ) -> Result<QuantitativeAuditEvidence> {
-	let Some(path) = path else {
-		return Ok(QuantitativeAuditEvidence {
-			held_out: false,
-			leakage_audited: false,
-			audit_manifest_id: None,
-		});
-	};
-	let raw = fs::read_to_string(path)?;
-	let manifest = serde_json::from_str::<QuantitativeAuditManifest>(&raw).map_err(|err| {
-		eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display())
-	})?;
-
-	validation::validate_quantitative_audit_manifest(&manifest, path, context)?;
-
-	Ok(QuantitativeAuditEvidence {
-		held_out: manifest.held_out,
-		leakage_audited: manifest.leakage_audited,
-		audit_manifest_id: Some(manifest.manifest_id),
-	})
+	evidence::quantitative_audit_evidence(path, context)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs
new file mode 100644
index 00000000..f9b2e0d4
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs
@@ -0,0 +1,31 @@
+use crate::{
+	Path, QuantitativeAuditManifest, Result, eyre, fs,
+	quantitative::audit_manifest::{
+		QuantitativeAuditContext, QuantitativeAuditEvidence, validation,
+	},
+};
+
+pub(super) fn quantitative_audit_evidence(
+	path: Option<&Path>,
+	context: QuantitativeAuditContext<'_>,
+) -> Result<QuantitativeAuditEvidence> {
+	let Some(path) = path else {
+		return Ok(QuantitativeAuditEvidence {
+			held_out: false,
+			leakage_audited: false,
+			audit_manifest_id: None,
+		});
+	};
+	let raw = fs::read_to_string(path)?;
+	let manifest = serde_json::from_str::<QuantitativeAuditManifest>(&raw).map_err(|err| {
+		eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display())
+	})?;
+
+	validation::validate_quantitative_audit_manifest(&manifest, path, context)?;
+
+	Ok(QuantitativeAuditEvidence {
+		held_out: manifest.held_out,
+		leakage_audited: manifest.leakage_audited,
+		audit_manifest_id: Some(manifest.manifest_id),
+	})
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
new file mode 100644
index 00000000..e99d5a9c
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
@@ -0,0 +1,83 @@
+use crate::{
+	ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest,
+	RealWorldJob, Result, eyre,
+	quantitative::{
+		self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA,
+		audit_manifest::{QuantitativeAuditContext, artifacts, validation},
+		metrics,
+	},
+};
+
+pub(crate) fn quantitative_audit_manifest_from_jobs(
+	jobs: &[RealWorldJob],
+	args: &ExportQuantitativeAuditManifestArgs,
+) -> Result<QuantitativeAuditManifest> {
+	let product = args.product.trim();
+	let adapter_id = args.adapter_id.trim();
+
+	if product.is_empty() || adapter_id.is_empty() {
+		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
+	}
+
+	let corpus_id = quantitative::quantitative_corpus_id(jobs);
+	let ranking_query_count = metrics::ranking_query_count(jobs);
+	let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs);
+	let manifest = QuantitativeAuditManifest {
+		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)),
+		run_id: args.run_id.clone(),
+		corpus_id,
+		product: product.to_string(),
+		adapter_id: adapter_id.to_string(),
+		held_out: args.held_out,
+		leakage_audited: args.leakage_audited,
+		sample_size: jobs.len(),
+		ranking_query_count,
+		explicit_qrel_query_count,
+		query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
+		controls: args.controls.clone(),
+		artifacts: vec![QuantitativeAuditArtifact {
+			role: "product_runtime_fixtures".to_string(),
+			path: artifacts::audit_artifact_display_path(args.fixtures.as_path()),
+			sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?,
+		}],
+		claim_boundary: quantitative_audit_claim_boundary(args),
+	};
+
+	validation::validate_quantitative_audit_manifest(
+		&manifest,
+		args.fixtures.as_path(),
+		QuantitativeAuditContext {
+			run_id: args.run_id.as_str(),
+			corpus_id: manifest.corpus_id.as_str(),
+			product,
+			adapter_id,
+			source_jobs: jobs,
+			ranking_query_count: manifest.ranking_query_count,
+			explicit_qrel_query_count: manifest.explicit_qrel_query_count,
+		},
+	)?;
+
+	Ok(manifest)
+}
+
+fn quantitative_audit_claim_boundary(args: &ExportQuantitativeAuditManifestArgs) -> String {
+	args.claim_boundary.clone().unwrap_or_else(|| {
+		if args.held_out || args.leakage_audited {
+			concat!(
+				"Audit manifest supplied by operator; runner validates run/corpus/product/",
+				"adapter/count/query-id/artifact bindings before opening row gates."
+			)
+			.to_string()
+		} else {
+			concat!(
+				"Diagnostic audit manifest binds the current product-runtime fixture set to ",
+				"query ids and counts, but it does not prove held-out or leakage-audited status."
+			)
+			.to_string()
+		}
+	})
+}

From 7d8c5efd18218953bf0318a26dcafc9b227c04fa Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:13:50 -0400
Subject: [PATCH 26/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product manifest flow","authority":"manual"}

---
 .../quantitative/product_manifest.rs          | 114 +-----------------
 .../quantitative/product_manifest/export.rs   |  61 ++++++++++
 .../product_manifest/export/identity.rs       |  23 ++++
 .../product_manifest/export/rows.rs           |  55 +++++++++
 .../quantitative/product_manifest/import.rs   |  32 +++++
 5 files changed, 176 insertions(+), 109 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
index ad9a2dee..4cd8b6c0 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs
@@ -1,118 +1,14 @@
+mod export;
+mod import;
 mod validation;
 
-use crate::{
-	ExportQuantitativeProductManifestArgs, Path, QuantitativeProductManifest, REPORT_SCHEMA,
-	RealWorldReport, Result, eyre, fs, quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA,
-};
+pub(crate) use self::export::quantitative_product_manifest_from_report;
 
-pub(crate) fn quantitative_product_manifest_from_report(
-	report: &RealWorldReport,
-	args: &ExportQuantitativeProductManifestArgs,
-) -> Result<QuantitativeProductManifest> {
-	if report.schema != REPORT_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {REPORT_SCHEMA}.",
-			args.report.display(),
-			report.schema
-		));
-	}
-
-	let source_row =
-		report.quantitative_scoreboard.rows.first().ok_or_else(|| {
-			eyre::eyre!("{} has no quantitative product row.", args.report.display())
-		})?;
-	let source_product = source_row.product.as_str();
-	let source_adapter_id = source_row.adapter_id.as_str();
-	let product = args.product.as_deref().unwrap_or(source_product).trim();
-	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
-	let adapter_name =
-		args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim();
-
-	if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() {
-		return Err(eyre::eyre!(
-			"{} cannot export an incomplete quantitative product identity.",
-			args.report.display()
-		));
-	}
-	if product == "ELF" {
-		return Err(eyre::eyre!(
-			"{} exports product ELF; use --product for external product manifest exports.",
-			args.report.display()
-		));
-	}
-
-	let mut row = source_row.clone();
-
-	row.product = product.to_string();
-	row.adapter_id = adapter_id.to_string();
-	row.adapter_name = adapter_name.to_string();
-	row.claim_boundary = concat!(
-		"Exported from a generated real_world_job_report quantitative row; ",
-		"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
-	)
-	.to_string();
-
-	let mut per_query_rows = Vec::new();
-
-	for row in &report.quantitative_scoreboard.per_query_rows {
-		if row.product != source_product || row.adapter_id != source_adapter_id {
-			continue;
-		}
-
-		let mut row = row.clone();
-
-		row.product = product.to_string();
-		row.adapter_id = adapter_id.to_string();
-		row.claim_boundary = concat!(
-			"Exported from generated report per-query quantitative evidence; ",
-			"import does not relax paired-significance or leaderboard gates."
-		)
-		.to_string();
-
-		per_query_rows.push(row);
-	}
-
-	let manifest = QuantitativeProductManifest {
-		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
-		manifest_id: args
-			.manifest_id
-			.clone()
-			.unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)),
-		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
-		rows: vec![row],
-		per_query_rows,
-	};
-
-	validation::validate_quantitative_product_manifest(
-		&manifest,
-		&args.report,
-		manifest.corpus_id.as_str(),
-	)?;
-
-	Ok(manifest)
-}
+use crate::{Path, QuantitativeProductManifest, Result};
 
 pub(super) fn quantitative_product_manifest(
 	path: Option<&Path>,
 	corpus_id: &str,
 ) -> Result<QuantitativeProductManifest> {
-	let Some(path) = path else {
-		return Ok(QuantitativeProductManifest::default());
-	};
-	let raw = fs::read_to_string(path)?;
-	let mut manifest =
-		serde_json::from_str::<QuantitativeProductManifest>(&raw).map_err(|err| {
-			eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display())
-		})?;
-
-	for row in &mut manifest.rows {
-		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
-	}
-	for row in &mut manifest.per_query_rows {
-		row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone());
-	}
-
-	validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
-
-	Ok(manifest)
+	import::quantitative_product_manifest(path, corpus_id)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs
new file mode 100644
index 00000000..ac105d5a
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs
@@ -0,0 +1,61 @@
+mod identity;
+mod rows;
+
+use crate::{
+	ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, REPORT_SCHEMA,
+	RealWorldReport, Result, eyre,
+	quantitative::{QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, product_manifest::validation},
+};
+
+/// Builds a one-product manifest from the report's first scoreboard row, with the identity
+/// taken from `args` overrides or that row; fails on schema mismatch, no rows, or validation.
+pub(crate) fn quantitative_product_manifest_from_report(
+	report: &RealWorldReport,
+	args: &ExportQuantitativeProductManifestArgs,
+) -> Result<QuantitativeProductManifest> {
+	if report.schema != REPORT_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {REPORT_SCHEMA}.",
+			args.report.display(),
+			report.schema
+		));
+	}
+
+	let Some(source_row) = report.quantitative_scoreboard.rows.first() else {
+		return Err(eyre::eyre!("{} has no quantitative product row.", args.report.display()));
+	};
+	let source_product = source_row.product.as_str();
+	let source_adapter_id = source_row.adapter_id.as_str();
+	let source_adapter_name = source_row.adapter_name.as_str();
+	let product = args.product.as_deref().unwrap_or(source_product).trim();
+	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
+	let adapter_name = args.adapter_name.as_deref().unwrap_or(source_adapter_name).trim();
+
+	identity::validate_export_identity(args, product, adapter_id, adapter_name)?;
+
+	let manifest_id = match &args.manifest_id {
+		Some(manifest_id) => manifest_id.clone(),
+		None => format!("{}-quantitative-product-manifest", report.run_id),
+	};
+	let manifest = QuantitativeProductManifest {
+		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
+		manifest_id,
+		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
+		rows: vec![rows::exported_product_row(source_row, product, adapter_id, adapter_name)],
+		per_query_rows: rows::exported_per_query_rows(
+			report,
+			source_product,
+			source_adapter_id,
+			product,
+			adapter_id,
+		),
+	};
+
+	validation::validate_quantitative_product_manifest(
+		&manifest,
+		&args.report,
+		manifest.corpus_id.as_str(),
+	)?;
+
+	Ok(manifest)
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs
new file mode 100644
index 00000000..4f1f6453
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs
@@ -0,0 +1,23 @@
+use crate::{ExportQuantitativeProductManifestArgs, Result, eyre};
+
+/// Rejects an export identity that has an empty field or that names the product `ELF`.
+pub(super) fn validate_export_identity(
+	args: &ExportQuantitativeProductManifestArgs,
+	product: &str,
+	adapter_id: &str,
+	adapter_name: &str,
+) -> Result<()> {
+	if [product, adapter_id, adapter_name].iter().any(|field| field.is_empty()) {
+		Err(eyre::eyre!(
+			"{} cannot export an incomplete quantitative product identity.",
+			args.report.display()
+		))
+	} else if product == "ELF" {
+		Err(eyre::eyre!(
+			"{} exports product ELF; use --product for external product manifest exports.",
+			args.report.display()
+		))
+	} else {
+		Ok(())
+	}
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs
new file mode 100644
index 00000000..2e1923db
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs
@@ -0,0 +1,55 @@
+use crate::{QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldReport};
+
+/// Clones `source_row` under the exported identity and stamps the export claim boundary.
+pub(super) fn exported_product_row(
+	source_row: &QuantitativeBenchmarkRow,
+	product: &str,
+	adapter_id: &str,
+	adapter_name: &str,
+) -> QuantitativeBenchmarkRow {
+	QuantitativeBenchmarkRow {
+		product: product.to_string(),
+		adapter_id: adapter_id.to_string(),
+		adapter_name: adapter_name.to_string(),
+		claim_boundary: concat!(
+			"Exported from a generated real_world_job_report quantitative row; ",
+			"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
+		)
+		.to_string(),
+		..source_row.clone()
+	}
+}
+
+/// Exports each per-query row whose product and adapter id equal the given source values.
+pub(super) fn exported_per_query_rows(
+	report: &RealWorldReport,
+	source_product: &str,
+	source_adapter_id: &str,
+	product: &str,
+	adapter_id: &str,
+) -> Vec<QuantitativePerQueryRow> {
+	let from_source = |row: &&QuantitativePerQueryRow| {
+		row.product == source_product && row.adapter_id == source_adapter_id
+	};
+	let source_rows = report.quantitative_scoreboard.per_query_rows.iter().filter(from_source);
+
+	source_rows.map(|row| exported_per_query_row(row, product, adapter_id)).collect()
+}
+
+/// Clones one per-query row under the exported identity with the per-query export boundary.
+fn exported_per_query_row(
+	source_row: &QuantitativePerQueryRow,
+	product: &str,
+	adapter_id: &str,
+) -> QuantitativePerQueryRow {
+	QuantitativePerQueryRow {
+		product: product.to_string(),
+		adapter_id: adapter_id.to_string(),
+		claim_boundary: concat!(
+			"Exported from generated report per-query quantitative evidence; ",
+			"import does not relax paired-significance or leaderboard gates."
+		)
+		.to_string(),
+		..source_row.clone()
+	}
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs
new file mode 100644
index 00000000..12df9a92
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs
@@ -0,0 +1,32 @@
+use crate::{
+	Path, QuantitativeProductManifest, Result, eyre, fs, quantitative::product_manifest::validation,
+};
+
+/// Reads and validates the manifest at `path`; a `None` path yields the default manifest.
+pub(super) fn quantitative_product_manifest(
+	path: Option<&Path>,
+	corpus_id: &str,
+) -> Result<QuantitativeProductManifest> {
+	let path = match path {
+		Some(path) => path,
+		None => return Ok(QuantitativeProductManifest::default()),
+	};
+	let raw = fs::read_to_string(path)?;
+	let mut manifest: QuantitativeProductManifest = serde_json::from_str(&raw).map_err(|err| {
+		eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display())
+	})?;
+
+	populate_source_manifest_corpus_ids(&mut manifest);
+	validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?;
+
+	Ok(manifest)
+}
+
+/// Fills each row's missing `source_manifest_corpus_id` with the manifest's own `corpus_id`.
+fn populate_source_manifest_corpus_ids(manifest: &mut QuantitativeProductManifest) {
+	let QuantitativeProductManifest { corpus_id, rows, per_query_rows, .. } = manifest;
+	let row_ids = rows.iter_mut().map(|row| &mut row.source_manifest_corpus_id);
+	let query_ids = per_query_rows.iter_mut().map(|row| &mut row.source_manifest_corpus_id);
+
+	row_ids.chain(query_ids).for_each(|id| *id = id.take().or_else(|| Some(corpus_id.clone())));
+}

From 44cea2fe5c93b23d99d5975a31370e7541ea862b Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:21:10 -0400
Subject: [PATCH 27/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative aggregate denominators","authority":"manual"}

---
 .../quantitative/metrics/aggregate.rs         | 49 ++-----------------
 .../metrics/aggregate/denominators.rs         | 33 +++++++++++++
 .../quantitative/metrics/aggregate/names.rs   | 16 ++++++
 3 files changed, 53 insertions(+), 45 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
index 4d737d85..b61ee782 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
@@ -1,4 +1,6 @@
 mod confidence;
+mod denominators;
+mod names;
 
 use crate::{
 	BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting,
@@ -7,7 +9,7 @@ use crate::{
 
 pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
 	let mut sums = BTreeMap::<String, (f64, usize)>::new();
-	let mut metrics = quantitative_metric_names()
+	let mut metrics = names::quantitative_metric_names()
 		.into_iter()
 		.map(|metric| (metric, None))
 		.collect::<BTreeMap<_, _>>();
@@ -49,31 +51,7 @@ pub(super) fn aggregate_metric_states(
 }
 
 pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
-	let mut denominators = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		denominators.insert(
-			format!("recall_at_{k}"),
-			sum_per_query_denominator(rows, &format!("recall_at_{k}")),
-		);
-		denominators.insert(
-			format!("precision_at_{k}"),
-			sum_per_query_denominator(rows, &format!("precision_at_{k}")),
-		);
-		denominators.insert(
-			format!("success_at_{k}"),
-			sum_per_query_denominator(rows, &format!("success_at_{k}")),
-		);
-	}
-
-	denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr"));
-	denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5"));
-	denominators.insert(
-		"average_precision".to_string(),
-		sum_per_query_denominator(rows, "average_precision"),
-	);
-
-	denominators
+	denominators::aggregate_denominators(rows)
 }
 
 pub(super) fn aggregate_confidence_intervals(
@@ -81,22 +59,3 @@ pub(super) fn aggregate_confidence_intervals(
 ) -> BTreeMap<String, QuantitativeConfidenceInterval> {
 	confidence::aggregate_confidence_intervals(rows)
 }
-
-fn quantitative_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
-		metrics.push(metric.to_string());
-	}
-
-	metrics
-}
-
-fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
-	rows.iter().filter_map(|row| row.denominators.get(metric)).sum()
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs
new file mode 100644
index 00000000..3ddd044f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs
@@ -0,0 +1,33 @@
+use crate::{BTreeMap, QuantitativePerQueryRow, quantitative::QUANTITATIVE_K_VALUES};
+
+/// Totals the per-query denominators of every quantitative metric across `rows`.
+///
+/// The map always contains `recall_at_{k}`, `precision_at_{k}`, and `success_at_{k}` for each
+/// `k` in `QUANTITATIVE_K_VALUES`, plus `mrr`, `ndcg_at_5`, and `average_precision`.
+pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
+	let mut metric_names = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metric_names.extend([
+			format!("recall_at_{k}"),
+			format!("precision_at_{k}"),
+			format!("success_at_{k}"),
+		]);
+	}
+
+	metric_names.extend(["mrr", "ndcg_at_5", "average_precision"].map(str::to_string));
+
+	metric_names
+		.into_iter()
+		.map(|metric| {
+			let total = sum_per_query_denominator(rows, metric.as_str());
+
+			(metric, total)
+		})
+		.collect()
+}
+
+/// Sums the `metric` denominator over `rows`; a row without that entry contributes nothing.
+fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize {
+	rows.iter().map(|row| row.denominators.get(metric).copied().unwrap_or(0)).sum()
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs
new file mode 100644
index 00000000..90055feb
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs
@@ -0,0 +1,16 @@
+use crate::quantitative::QUANTITATIVE_K_VALUES;
+
+/// Lists the name of every quantitative metric.
+///
+/// For each `k` in `QUANTITATIVE_K_VALUES` the list holds `recall_at_{k}`, `precision_at_{k}`,
+/// and `success_at_{k}`, in that order.
+///
+/// `mrr`, `ndcg_at_5`, and `average_precision` come last, after all of the per-`k` names.
+pub(super) fn quantitative_metric_names() -> Vec<String> {
+	let per_k_names = QUANTITATIVE_K_VALUES.into_iter().flat_map(|k| {
+		[format!("recall_at_{k}"), format!("precision_at_{k}"), format!("success_at_{k}")]
+	});
+	let fixed_names = ["mrr", "ndcg_at_5", "average_precision"].into_iter().map(str::to_string);
+
+	per_k_names.chain(fixed_names).collect()
+}

From b61fbd5eb6f72b74fe65b4b02c79cec458b40219 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:24:44 -0400
Subject: [PATCH 28/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query evidence mapping","authority":"manual"}

---
 .../quantitative/metrics/per_query.rs         | 32 +++----------------
 .../metrics/per_query/evidence.rs             | 29 +++++++++++++++++
 2 files changed, 33 insertions(+), 28 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
index fbbce5db..cb184dc9 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
@@ -1,7 +1,8 @@
+mod evidence;
 mod query_metrics;
 
 use crate::{
-	BTreeMap, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
+	JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
 	quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, scoring,
 };
 
@@ -28,7 +29,7 @@ fn quantitative_per_query_row(
 	evidence_class: &str,
 	adapter_id: &str,
 ) -> QuantitativePerQueryRow {
-	let relevance = relevance_grades(source_job, job);
+	let relevance = evidence::relevance_grades(source_job, job);
 	let candidates = scoring::produced_evidence_order(source_job);
 	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
 	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
@@ -47,7 +48,7 @@ fn quantitative_per_query_row(
 		result_state: formatting::status_str(job.status).to_string(),
 		expected_relevant_count: positive_relevance_count,
 		candidate_count: candidates.len(),
-		qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(),
+		qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(),
 		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
 		product: "ELF".to_string(),
 		adapter_id: adapter_id.to_string(),
@@ -60,28 +61,3 @@ fn quantitative_per_query_row(
 		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
 	}
 }
-
-fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap<String, f64> {
-	let explicit = source_job
-		.expected_answer
-		.relevance_judgments
-		.iter()
-		.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
-		.collect::<BTreeMap<_, _>>();
-
-	if !explicit.is_empty() {
-		return explicit;
-	}
-
-	job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
-}
-
-fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
-	if !source_job.expected_answer.relevance_judgments.is_empty() {
-		"explicit_qrels"
-	} else if empty {
-		"not_encoded"
-	} else {
-		"expected_evidence_fallback"
-	}
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs
new file mode 100644
index 00000000..1a13fac2
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs
@@ -0,0 +1,29 @@
+use crate::{BTreeMap, JobReport, RealWorldJob};
+
+/// Maps evidence ids to relevance grades for one query.
+///
+/// Explicit `relevance_judgments` on the source job win and keep their own grades, with no
+/// filtering by grade. Without any explicit judgments, every `expected_evidence` entry of the
+/// job report is graded `1.0`.
+pub(super) fn relevance_grades(
+	source_job: &RealWorldJob,
+	job: &JobReport,
+) -> BTreeMap<String, f64> {
+	let judgments = &source_job.expected_answer.relevance_judgments;
+
+	if judgments.is_empty() {
+		job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
+	} else {
+		judgments.iter().map(|judgment| (judgment.evidence_id.clone(), judgment.grade)).collect()
+	}
+}
+
+/// Labels where a query's relevance grades came from: explicit judgments on the source job,
+/// nothing at all when `empty` is set, or otherwise the expected-evidence fallback.
+pub(super) fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str {
+	match source_job.expected_answer.relevance_judgments.is_empty() {
+		false => "explicit_qrels",
+		true if empty => "not_encoded",
+		true => "expected_evidence_fallback",
+	}
+}

From 91c4600a0c8ab6ae86f3aea3a8d3ef2952fbce5f Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:28:28 -0400
Subject: [PATCH 29/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report row counts","authority":"manual"}

---
 .../quantitative/report/row.rs                | 11 +++++-----
 .../quantitative/report/row/query_counts.rs   | 21 +++++++++++++++++++
 2 files changed, 26 insertions(+), 6 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
index d3f8b232..71c66266 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
@@ -1,3 +1,5 @@
+mod query_counts;
+
 use crate::{
 	QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result,
 	quantitative::{
@@ -28,12 +30,9 @@ pub(super) fn current_quantitative_row(
 		evidence_class,
 		input.adapter.adapter_id.as_str(),
 	);
-	let ranking_query_count = per_query_rows
-		.iter()
-		.filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0)
-		.count();
-	let explicit_qrel_query_count =
-		per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count();
+	let query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice());
+	let ranking_query_count = query_counts.ranking_query_count;
+	let explicit_qrel_query_count = query_counts.explicit_qrel_query_count;
 	let metric_comparable = ranking_query_count > 0;
 	let result_state = quantitative::quantitative_result_state(input.summary);
 	let audit_evidence = audit_manifest::quantitative_audit_evidence(
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs
new file mode 100644
index 00000000..12632f0a
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs
@@ -0,0 +1,21 @@
+use crate::QuantitativePerQueryRow;
+
+/// Tallies computed over a slice of per-query rows.
+pub(super) struct QuantitativeQueryCounts {
+	pub(super) ranking_query_count: usize,
+	pub(super) explicit_qrel_query_count: usize,
+}
+
+/// Counts rows with both candidates and expected relevant evidence, and rows with explicit qrels.
+pub(super) fn quantitative_query_counts(
+	per_query_rows: &[QuantitativePerQueryRow],
+) -> QuantitativeQueryCounts {
+	let count = |keep: fn(&QuantitativePerQueryRow) -> bool| {
+		per_query_rows.iter().filter(|row| keep(row)).count()
+	};
+	let ranking_query_count =
+		count(|row| row.candidate_count > 0 && row.expected_relevant_count > 0);
+	let explicit_qrel_query_count = count(|row| row.qrel_source == "explicit_qrels");
+
+	QuantitativeQueryCounts { ranking_query_count, explicit_qrel_query_count }
+}

From 353c953f78fea4d0eac7f46b16e9f7178bdc24ab Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:32:14 -0400
Subject: [PATCH 30/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative ranking query detection","authority":"manual"}

---
 .../quantitative/metrics/ranking.rs           | 39 ++-----------------
 .../quantitative/metrics/ranking/queries.rs   | 38 ++++++++++++++++++
 2 files changed, 42 insertions(+), 35 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
index 918a8613..340a7115 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
@@ -1,9 +1,11 @@
-use crate::{BTreeMap, BTreeSet, RealWorldJob, ReportSummary, scoring};
+mod queries;
+
+use crate::{BTreeSet, RealWorldJob, ReportSummary};
 
 pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
 	source_jobs
 		.iter()
-		.filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job))
+		.filter(|job| queries::is_ranking_query(job))
 		.map(|job| job.job_id.as_str())
 		.collect()
 }
@@ -48,36 +50,3 @@ pub(super) fn ranking_coverage_state(
 pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
 	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
 }
-
-fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
-	if !source_job.expected_answer.relevance_judgments.is_empty() {
-		return source_job
-			.expected_answer
-			.relevance_judgments
-			.iter()
-			.filter(|judgment| judgment.grade > 0.0)
-			.map(|judgment| (judgment.evidence_id.clone(), judgment.grade))
-			.collect();
-	}
-
-	source_job
-		.required_evidence
-		.iter()
-		.filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain"))
-		.map(|evidence| (evidence.evidence_id.clone(), 1.0))
-		.collect()
-}
-
-fn ranking_query_attempted(job: &RealWorldJob) -> bool {
-	if !scoring::produced_evidence_order(job).is_empty() {
-		return true;
-	}
-
-	let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else {
-		return false;
-	};
-
-	answer.trace_explainability.as_ref().is_some_and(|trace| {
-		trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
-	}) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0)
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs
new file mode 100644
index 00000000..8ada5678
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs
@@ -0,0 +1,38 @@
+use crate::{BTreeMap, RealWorldJob, scoring};
+
+/// Tells whether `job` has positive ranking grades and shows signs of a retrieval attempt.
+pub(super) fn is_ranking_query(job: &RealWorldJob) -> bool {
+	let has_relevant_evidence = !ranking_relevance_grades(job).is_empty();
+
+	has_relevant_evidence && ranking_query_attempted(job)
+}
+
+/// Collects the grades that decide ranking eligibility: explicit judgments graded above zero,
+/// or, when the job has no judgments at all, `cite`/`use`/`explain` evidence at `1.0`.
+fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap<String, f64> {
+	let judgments = &source_job.expected_answer.relevance_judgments;
+	let positive = judgments.iter().filter(|judgment| judgment.grade > 0.0);
+	let required = source_job.required_evidence.iter().filter(|evidence| {
+		matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")
+	});
+
+	if judgments.is_empty() {
+		required.map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect()
+	} else {
+		positive.map(|judgment| (judgment.evidence_id.clone(), judgment.grade)).collect()
+	}
+}
+
+/// Reports whether a retrieval attempt is visible for `job`: `scoring::produced_evidence_order`
+/// is non-empty, or the adapter answer has a `live_adapter.retrieve` trace stage together
+/// with a finite, positive `latency_ms`.
+fn ranking_query_attempted(job: &RealWorldJob) -> bool {
+	let answer = job.corpus.adapter_response.as_ref().map(|response| &response.answer);
+
+	!scoring::produced_evidence_order(job).is_empty()
+		|| answer.is_some_and(|answer| {
+			answer.trace_explainability.as_ref().is_some_and(|trace| {
+				trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve")
+			}) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0)
+		})
+}

From 9a640111b7e4a40cb80d2fd1a932c0b9ae54e7e3 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:35:28 -0400
Subject: [PATCH 31/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative confidence interval helpers","authority":"manual"}

---
 .../metrics/aggregate/confidence.rs           | 74 ++-----------------
 .../metrics/aggregate/confidence/rates.rs     | 39 ++++++++++
 .../metrics/aggregate/confidence/wilson.rs    | 22 ++++++
 3 files changed, 69 insertions(+), 66 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs
index e1db5fb8..2a454bdc 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs
@@ -1,82 +1,24 @@
-use crate::{
-	BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting,
-	quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z},
-};
+mod rates;
+mod wilson;
+
+use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow};
 
 pub(super) fn aggregate_confidence_intervals(
 	rows: &[QuantitativePerQueryRow],
 ) -> BTreeMap<String, QuantitativeConfidenceInterval> {
 	let mut confidence_intervals = BTreeMap::new();
 
-	for metric in rate_metric_names() {
-		let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str());
+	for metric in rates::rate_metric_names() {
+		let (numerator, denominator) =
+			rates::aggregate_rate_numerator_denominator(rows, metric.as_str());
 
 		if denominator > 0 {
 			confidence_intervals.insert(
 				metric,
-				wilson_confidence_interval(numerator.min(denominator), denominator),
+				wilson::wilson_confidence_interval(numerator.min(denominator), denominator),
 			);
 		}
 	}
 
 	confidence_intervals
 }
-
-fn rate_metric_names() -> Vec<String> {
-	let mut metrics = Vec::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		metrics.push(format!("recall_at_{k}"));
-		metrics.push(format!("precision_at_{k}"));
-		metrics.push(format!("success_at_{k}"));
-	}
-
-	metrics
-}
-
-fn aggregate_rate_numerator_denominator(
-	rows: &[QuantitativePerQueryRow],
-	metric: &str,
-) -> (usize, usize) {
-	let mut numerator = 0;
-	let mut denominator = 0;
-
-	for row in rows {
-		let Some(value) = row.metrics.get(metric).and_then(|value| *value) else {
-			continue;
-		};
-		let Some(row_denominator) = row.denominators.get(metric).copied() else {
-			continue;
-		};
-
-		if row_denominator == 0 {
-			continue;
-		}
-
-		denominator += row_denominator;
-		numerator += (value * row_denominator as f64).round() as usize;
-	}
-
-	(numerator, denominator)
-}
-
-fn wilson_confidence_interval(
-	numerator: usize,
-	denominator: usize,
-) -> QuantitativeConfidenceInterval {
-	let n = denominator as f64;
-	let p = numerator as f64 / n;
-	let z2 = WILSON_95_Z * WILSON_95_Z;
-	let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n);
-	let half_width =
-		WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n);
-
-	QuantitativeConfidenceInterval {
-		method: "wilson_score".to_string(),
-		confidence: 0.95,
-		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
-		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
-		numerator,
-		denominator,
-	}
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs
new file mode 100644
index 00000000..4cfb3b7f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs
@@ -0,0 +1,39 @@
+use crate::{QuantitativePerQueryRow, quantitative::QUANTITATIVE_K_VALUES};
+
+pub(super) fn rate_metric_names() -> Vec<String> {
+	let mut metrics = Vec::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		metrics.push(format!("recall_at_{k}"));
+		metrics.push(format!("precision_at_{k}"));
+		metrics.push(format!("success_at_{k}"));
+	}
+
+	metrics
+}
+
+pub(super) fn aggregate_rate_numerator_denominator(
+	rows: &[QuantitativePerQueryRow],
+	metric: &str,
+) -> (usize, usize) {
+	let mut numerator = 0;
+	let mut denominator = 0;
+
+	for row in rows {
+		let Some(value) = row.metrics.get(metric).and_then(|value| *value) else {
+			continue;
+		};
+		let Some(row_denominator) = row.denominators.get(metric).copied() else {
+			continue;
+		};
+
+		if row_denominator == 0 {
+			continue;
+		}
+
+		denominator += row_denominator;
+		numerator += (value * row_denominator as f64).round() as usize;
+	}
+
+	(numerator, denominator)
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs
new file mode 100644
index 00000000..99c3029d
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs
@@ -0,0 +1,22 @@
+use crate::{QuantitativeConfidenceInterval, formatting, quantitative::WILSON_95_Z};
+
+pub(super) fn wilson_confidence_interval(
+	numerator: usize,
+	denominator: usize,
+) -> QuantitativeConfidenceInterval {
+	let n = denominator as f64;
+	let p = numerator as f64 / n;
+	let z2 = WILSON_95_Z * WILSON_95_Z;
+	let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n);
+	let half_width =
+		WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n);
+
+	QuantitativeConfidenceInterval {
+		method: "wilson_score".to_string(),
+		confidence: 0.95,
+		lower: formatting::round3((center - half_width).clamp(0.0, 1.0)),
+		upper: formatting::round3((center + half_width).clamp(0.0, 1.0)),
+		numerator,
+		denominator,
+	}
+}

From 52c4ae07992adc65e50d5f063b331cbb71ba1330 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:38:16 -0400
Subject: [PATCH 32/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit export helpers","authority":"manual"}

---
 .../quantitative/audit_manifest/export.rs     | 29 ++++---------------
 .../audit_manifest/export/claim_boundary.rs   | 21 ++++++++++++++
 .../audit_manifest/export/identity.rs         |  9 ++++++
 3 files changed, 36 insertions(+), 23 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
index e99d5a9c..795960e0 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
@@ -1,6 +1,9 @@
+mod claim_boundary;
+mod identity;
+
 use crate::{
 	ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest,
-	RealWorldJob, Result, eyre,
+	RealWorldJob, Result,
 	quantitative::{
 		self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA,
 		audit_manifest::{QuantitativeAuditContext, artifacts, validation},
@@ -15,9 +18,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 	let product = args.product.trim();
 	let adapter_id = args.adapter_id.trim();
 
-	if product.is_empty() || adapter_id.is_empty() {
-		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
-	}
+	identity::validate_audit_export_identity(product, adapter_id)?;
 
 	let corpus_id = quantitative::quantitative_corpus_id(jobs);
 	let ranking_query_count = metrics::ranking_query_count(jobs);
@@ -44,7 +45,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 			path: artifacts::audit_artifact_display_path(args.fixtures.as_path()),
 			sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?,
 		}],
-		claim_boundary: quantitative_audit_claim_boundary(args),
+		claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args),
 	};
 
 	validation::validate_quantitative_audit_manifest(
@@ -63,21 +64,3 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 
 	Ok(manifest)
 }
-
-fn quantitative_audit_claim_boundary(args: &ExportQuantitativeAuditManifestArgs) -> String {
-	args.claim_boundary.clone().unwrap_or_else(|| {
-		if args.held_out || args.leakage_audited {
-			concat!(
-				"Audit manifest supplied by operator; runner validates run/corpus/product/",
-				"adapter/count/query-id/artifact bindings before opening row gates."
-			)
-			.to_string()
-		} else {
-			concat!(
-				"Diagnostic audit manifest binds the current product-runtime fixture set to ",
-				"query ids and counts, but it does not prove held-out or leakage-audited status."
-			)
-			.to_string()
-		}
-	})
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs
new file mode 100644
index 00000000..3d572c61
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs
@@ -0,0 +1,21 @@
+use crate::ExportQuantitativeAuditManifestArgs;
+
+pub(super) fn quantitative_audit_claim_boundary(
+	args: &ExportQuantitativeAuditManifestArgs,
+) -> String {
+	args.claim_boundary.clone().unwrap_or_else(|| {
+		if args.held_out || args.leakage_audited {
+			concat!(
+				"Audit manifest supplied by operator; runner validates run/corpus/product/",
+				"adapter/count/query-id/artifact bindings before opening row gates."
+			)
+			.to_string()
+		} else {
+			concat!(
+				"Diagnostic audit manifest binds the current product-runtime fixture set to ",
+				"query ids and counts, but it does not prove held-out or leakage-audited status."
+			)
+			.to_string()
+		}
+	})
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs
new file mode 100644
index 00000000..872da0e6
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs
@@ -0,0 +1,9 @@
+use crate::{Result, eyre};
+
+pub(super) fn validate_audit_export_identity(product: &str, adapter_id: &str) -> Result<()> {
+	if product.is_empty() || adapter_id.is_empty() {
+		return Err(eyre::eyre!("quantitative audit export requires product and adapter_id."));
+	}
+
+	Ok(())
+}

From f8959df71695a9bd15d9bd4199746393554851a8 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:47:00 -0400
Subject: [PATCH 33/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report row assembly","authority":"manual"}

---
 .../quantitative/report/row.rs                | 46 ++++--------
 .../quantitative/report/row/benchmark_row.rs  | 71 +++++++++++++++++++
 2 files changed, 84 insertions(+), 33 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
index 71c66266..868863fe 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
@@ -1,12 +1,13 @@
+mod benchmark_row;
 mod query_counts;
 
 use crate::{
 	QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result,
 	quantitative::{
-		self, QUANTITATIVE_ROW_CLAIM_BOUNDARY,
+		self,
 		audit_manifest::{self, QuantitativeAuditContext},
 		metrics,
-		report::QuantitativeReportInput,
+		report::{QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput},
 	},
 };
 
@@ -55,39 +56,18 @@ pub(super) fn current_quantitative_row(
 		metric_comparable,
 		&audit_evidence,
 	);
-	let row = QuantitativeBenchmarkRow {
-		product: "ELF".to_string(),
-		adapter_id: input.adapter.adapter_id.clone(),
-		adapter_name: input.adapter.name.clone(),
-		suite: quantitative::quantitative_suite_id(input.jobs),
-		evidence_class: evidence_class.to_string(),
-		source_manifest_corpus_id: Some(corpus_id.clone()),
-		result_state: result_state.to_string(),
-		comparable: metric_comparable,
-		metric_comparable,
-		leaderboard_eligible,
-		held_out: audit_evidence.held_out,
-		leakage_audited: audit_evidence.leakage_audited,
-		audit_manifest_id: audit_evidence.audit_manifest_id,
-		fixture_regression_only: evidence_class == "fixture_backed",
-		sample_size: input.jobs.len(),
+	let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput {
+		input,
+		corpus_id: corpus_id.as_str(),
+		evidence_class,
+		per_query_rows: per_query_rows.as_slice(),
 		ranking_query_count,
-		ranking_coverage_state: metrics::ranking_coverage_state(
-			input.summary,
-			input.source_jobs.len(),
-			ranking_query_count,
-		)
-		.to_string(),
-		ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(),
-		qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
-			.to_string(),
 		explicit_qrel_query_count,
-		metrics: metrics::aggregate_metrics(per_query_rows.as_slice()),
-		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
-		denominators: metrics::aggregate_denominators(per_query_rows.as_slice()),
-		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()),
-		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
-	};
+		metric_comparable,
+		result_state,
+		audit_evidence,
+		leaderboard_eligible,
+	});
 
 	Ok(CurrentQuantitativeRow {
 		corpus_id,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs
new file mode 100644
index 00000000..53198ae6
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs
@@ -0,0 +1,71 @@
+use crate::{
+	QuantitativeBenchmarkRow, QuantitativePerQueryRow,
+	quantitative::{
+		self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, audit_manifest::QuantitativeAuditEvidence, metrics,
+		report::QuantitativeReportInput,
+	},
+};
+
+pub(super) struct QuantitativeBenchmarkRowInput<'a, 'b> {
+	pub(super) input: &'a QuantitativeReportInput<'b>,
+	pub(super) corpus_id: &'a str,
+	pub(super) evidence_class: &'a str,
+	pub(super) per_query_rows: &'a [QuantitativePerQueryRow],
+	pub(super) ranking_query_count: usize,
+	pub(super) explicit_qrel_query_count: usize,
+	pub(super) metric_comparable: bool,
+	pub(super) result_state: &'a str,
+	pub(super) audit_evidence: QuantitativeAuditEvidence,
+	pub(super) leaderboard_eligible: bool,
+}
+
+pub(super) fn quantitative_benchmark_row(
+	row_input: QuantitativeBenchmarkRowInput<'_, '_>,
+) -> QuantitativeBenchmarkRow {
+	let QuantitativeBenchmarkRowInput {
+		input,
+		corpus_id,
+		evidence_class,
+		per_query_rows,
+		ranking_query_count,
+		explicit_qrel_query_count,
+		metric_comparable,
+		result_state,
+		audit_evidence,
+		leaderboard_eligible,
+	} = row_input;
+
+	QuantitativeBenchmarkRow {
+		product: "ELF".to_string(),
+		adapter_id: input.adapter.adapter_id.clone(),
+		adapter_name: input.adapter.name.clone(),
+		suite: quantitative::quantitative_suite_id(input.jobs),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.to_string()),
+		result_state: result_state.to_string(),
+		comparable: metric_comparable,
+		metric_comparable,
+		leaderboard_eligible,
+		held_out: audit_evidence.held_out,
+		leakage_audited: audit_evidence.leakage_audited,
+		audit_manifest_id: audit_evidence.audit_manifest_id,
+		fixture_regression_only: evidence_class == "fixture_backed",
+		sample_size: input.jobs.len(),
+		ranking_query_count,
+		ranking_coverage_state: metrics::ranking_coverage_state(
+			input.summary,
+			input.source_jobs.len(),
+			ranking_query_count,
+		)
+		.to_string(),
+		ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(),
+		qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
+			.to_string(),
+		explicit_qrel_query_count,
+		metrics: metrics::aggregate_metrics(per_query_rows),
+		metric_states: metrics::aggregate_metric_states(result_state, metric_comparable),
+		denominators: metrics::aggregate_denominators(per_query_rows),
+		confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows),
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	}
+}

From c01c488f46655082c4708967fb89ce6e0127f009 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:50:18 -0400
Subject: [PATCH 34/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative average precision metric","authority":"manual"}

---
 .../per_query/query_metrics/ranking.rs        | 26 +++--------------
 .../ranking/average_precision.rs              | 28 +++++++++++++++++++
 2 files changed, 32 insertions(+), 22 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
index 515bfaed..5abea808 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
@@ -1,4 +1,6 @@
-use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics};
+mod average_precision;
+
+use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics};
 
 pub(super) fn reciprocal_rank(
 	candidates: &[String],
@@ -54,25 +56,5 @@ pub(super) fn average_precision(
 	candidates: &[String],
 	relevance: &BTreeMap<String, f64>,
 ) -> Option<f64> {
-	let positive_count = query_metrics::positive_qrel_count(relevance);
-
-	if positive_count == 0 {
-		return None;
-	}
-
-	let mut hit_count = 0;
-	let mut precision_sum = 0.0;
-	let mut seen = BTreeSet::new();
-
-	for (index, candidate) in candidates.iter().enumerate() {
-		if !seen.insert(candidate.as_str()) {
-			continue;
-		}
-		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
-			hit_count += 1;
-			precision_sum += hit_count as f64 / (index + 1) as f64;
-		}
-	}
-
-	Some(precision_sum / positive_count as f64)
+	average_precision::average_precision(candidates, relevance)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs
new file mode 100644
index 00000000..13c196ca
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs
@@ -0,0 +1,28 @@
+use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics};
+
+pub(super) fn average_precision(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> Option<f64> {
+	let positive_count = query_metrics::positive_qrel_count(relevance);
+
+	if positive_count == 0 {
+		return None;
+	}
+
+	let mut hit_count = 0;
+	let mut precision_sum = 0.0;
+	let mut seen = BTreeSet::new();
+
+	for (index, candidate) in candidates.iter().enumerate() {
+		if !seen.insert(candidate.as_str()) {
+			continue;
+		}
+		if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) {
+			hit_count += 1;
+			precision_sum += hit_count as f64 / (index + 1) as f64;
+		}
+	}
+
+	Some(precision_sum / positive_count as f64)
+}

From cc23d1aa3c78c377dadf9995e5b1d15fafc70f31 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 14:56:23 -0400
Subject: [PATCH 35/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report audit gates","authority":"manual"}

---
 .../quantitative/report/row.rs                | 29 ++++--------
 .../quantitative/report/row/audit_gates.rs    | 45 +++++++++++++++++++
 2 files changed, 53 insertions(+), 21 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
index 868863fe..8599700a 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
@@ -1,12 +1,11 @@
+mod audit_gates;
 mod benchmark_row;
 mod query_counts;
 
 use crate::{
 	QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result,
 	quantitative::{
-		self,
-		audit_manifest::{self, QuantitativeAuditContext},
-		metrics,
+		self, metrics,
 		report::{QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput},
 	},
 };
@@ -36,26 +35,14 @@ pub(super) fn current_quantitative_row(
 	let explicit_qrel_query_count = query_counts.explicit_qrel_query_count;
 	let metric_comparable = ranking_query_count > 0;
 	let result_state = quantitative::quantitative_result_state(input.summary);
-	let audit_evidence = audit_manifest::quantitative_audit_evidence(
-		input.audit_manifest_path,
-		QuantitativeAuditContext {
-			run_id: input.run_id,
-			corpus_id: corpus_id.as_str(),
-			product: "ELF",
-			adapter_id: input.adapter.adapter_id.as_str(),
-			source_jobs: input.source_jobs,
-			ranking_query_count,
-			explicit_qrel_query_count,
-		},
-	)?;
-	let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible(
+	let audit_gates = audit_gates::quantitative_audit_gates(
+		input,
+		corpus_id.as_str(),
 		evidence_class,
-		input.source_jobs.len(),
 		ranking_query_count,
 		explicit_qrel_query_count,
 		metric_comparable,
-		&audit_evidence,
-	);
+	)?;
 	let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput {
 		input,
 		corpus_id: corpus_id.as_str(),
@@ -65,8 +52,8 @@ pub(super) fn current_quantitative_row(
 		explicit_qrel_query_count,
 		metric_comparable,
 		result_state,
-		audit_evidence,
-		leaderboard_eligible,
+		audit_evidence: audit_gates.audit_evidence,
+		leaderboard_eligible: audit_gates.leaderboard_eligible,
 	});
 
 	Ok(CurrentQuantitativeRow {
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs
new file mode 100644
index 00000000..31d2ddee
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs
@@ -0,0 +1,45 @@
+use crate::{
+	Result,
+	quantitative::{
+		self,
+		audit_manifest::{self, QuantitativeAuditContext, QuantitativeAuditEvidence},
+		report::QuantitativeReportInput,
+	},
+};
+
+pub(super) struct QuantitativeAuditGates {
+	pub(super) audit_evidence: QuantitativeAuditEvidence,
+	pub(super) leaderboard_eligible: bool,
+}
+
+pub(super) fn quantitative_audit_gates(
+	input: &QuantitativeReportInput<'_>,
+	corpus_id: &str,
+	evidence_class: &str,
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+	metric_comparable: bool,
+) -> Result<QuantitativeAuditGates> {
+	let audit_evidence = audit_manifest::quantitative_audit_evidence(
+		input.audit_manifest_path,
+		QuantitativeAuditContext {
+			run_id: input.run_id,
+			corpus_id,
+			product: "ELF",
+			adapter_id: input.adapter.adapter_id.as_str(),
+			source_jobs: input.source_jobs,
+			ranking_query_count,
+			explicit_qrel_query_count,
+		},
+	)?;
+	let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible(
+		evidence_class,
+		input.source_jobs.len(),
+		ranking_query_count,
+		explicit_qrel_query_count,
+		metric_comparable,
+		&audit_evidence,
+	);
+
+	Ok(QuantitativeAuditGates { audit_evidence, leaderboard_eligible })
+}

From dad67b4b40c29f6eefcb8b4d0d50054a86140421 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:01:19 -0400
Subject: [PATCH 36/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report data types","authority":"manual"}

---
 .../quantitative_reports.rs                   | 140 ++----------------
 .../quantitative_reports/audit.rs             |  29 ++++
 .../quantitative_reports/benchmark.rs         |  89 +++++++++++
 .../quantitative_reports/product.rs           |  12 ++
 4 files changed, 142 insertions(+), 128 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
index ded35360..a3bff704 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs
@@ -1,128 +1,12 @@
-use crate::{BTreeMap, Deserialize, Serialize};
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(super) struct QuantitativeBenchmarkReport {
-	pub(super) schema: String,
-	pub(super) generated_at: String,
-	pub(super) corpus_id: String,
-	pub(super) k_values: Vec<usize>,
-	pub(super) rows: Vec<QuantitativeBenchmarkRow>,
-	#[serde(default)]
-	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
-	#[serde(default)]
-	pub(super) metrics_not_encoded: Vec<String>,
-	pub(super) controls: QuantitativeBenchmarkControls,
-	pub(super) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(super) struct QuantitativeBenchmarkRow {
-	pub(super) product: String,
-	pub(super) adapter_id: String,
-	pub(super) adapter_name: String,
-	pub(super) suite: String,
-	pub(super) evidence_class: String,
-	pub(super) source_manifest_corpus_id: Option<String>,
-	pub(super) result_state: String,
-	pub(super) comparable: bool,
-	pub(super) metric_comparable: bool,
-	pub(super) leaderboard_eligible: bool,
-	pub(super) held_out: bool,
-	pub(super) leakage_audited: bool,
-	pub(super) audit_manifest_id: Option<String>,
-	pub(super) fixture_regression_only: bool,
-	pub(super) sample_size: usize,
-	pub(super) ranking_query_count: usize,
-	pub(super) ranking_coverage_state: String,
-	pub(super) ranked_candidate_source: String,
-	pub(super) qrel_source: String,
-	pub(super) explicit_qrel_query_count: usize,
-	pub(super) metrics: BTreeMap<String, Option<f64>>,
-	pub(super) metric_states: BTreeMap<String, String>,
-	pub(super) denominators: BTreeMap<String, usize>,
-	#[serde(default)]
-	pub(super) confidence_intervals: BTreeMap<String, QuantitativeConfidenceInterval>,
-	pub(super) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(super) struct QuantitativePerQueryRow {
-	pub(super) job_id: String,
-	pub(super) suite: String,
-	pub(super) evidence_class: String,
-	pub(super) source_manifest_corpus_id: Option<String>,
-	pub(super) result_state: String,
-	pub(super) expected_relevant_count: usize,
-	pub(super) candidate_count: usize,
-	pub(super) qrel_source: String,
-	pub(super) relevance_grade_sum: f64,
-	pub(super) product: String,
-	pub(super) adapter_id: String,
-	pub(super) metrics: BTreeMap<String, Option<f64>>,
-	pub(super) metric_states: BTreeMap<String, String>,
-	pub(super) denominators: BTreeMap<String, usize>,
-	pub(super) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(super) struct QuantitativeBenchmarkControls {
-	pub(super) same_corpus_required: bool,
-	pub(super) same_task_required: bool,
-	pub(super) ranked_candidates_required_for_ranking_metrics: bool,
-	pub(super) explicit_relevance_judgments_required_for_leaderboard: bool,
-	pub(super) minimum_query_count_for_leaderboard: usize,
-	pub(super) current_query_count: usize,
-	pub(super) current_ranking_query_count: usize,
-	pub(super) current_explicit_qrel_query_count: usize,
-	pub(super) leaderboard_claim_allowed: bool,
-	pub(super) leakage_control: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(super) struct QuantitativeConfidenceInterval {
-	pub(super) method: String,
-	pub(super) confidence: f64,
-	pub(super) lower: f64,
-	pub(super) upper: f64,
-	pub(super) numerator: usize,
-	pub(super) denominator: usize,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(super) struct QuantitativeProductManifest {
-	pub(super) schema: String,
-	pub(super) manifest_id: String,
-	pub(super) corpus_id: String,
-	#[serde(default)]
-	pub(super) rows: Vec<QuantitativeBenchmarkRow>,
-	#[serde(default)]
-	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
-}
-
-#[derive(Clone, Debug, Deserialize, Serialize)]
-pub(super) struct QuantitativeAuditManifest {
-	pub(super) schema: String,
-	pub(super) manifest_id: String,
-	pub(super) run_id: String,
-	pub(super) corpus_id: String,
-	pub(super) product: String,
-	pub(super) adapter_id: String,
-	pub(super) held_out: bool,
-	pub(super) leakage_audited: bool,
-	pub(super) sample_size: usize,
-	pub(super) ranking_query_count: usize,
-	pub(super) explicit_qrel_query_count: usize,
-	pub(super) query_ids: Vec<String>,
-	#[serde(default)]
-	pub(super) controls: Vec<String>,
-	#[serde(default)]
-	pub(super) artifacts: Vec<QuantitativeAuditArtifact>,
-	pub(super) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Deserialize, Serialize)]
-pub(super) struct QuantitativeAuditArtifact {
-	pub(super) role: String,
-	pub(super) path: String,
-	pub(super) sha256: String,
-}
+mod audit;
+mod benchmark;
+mod product;
+
+pub(crate) use self::{
+	audit::{QuantitativeAuditArtifact, QuantitativeAuditManifest},
+	benchmark::{
+		QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow,
+		QuantitativeConfidenceInterval, QuantitativePerQueryRow,
+	},
+	product::QuantitativeProductManifest,
+};
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs
new file mode 100644
index 00000000..4b2ce584
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs
@@ -0,0 +1,29 @@
+use crate::{Deserialize, Serialize};
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub(crate) struct QuantitativeAuditManifest {
+	pub(crate) schema: String,
+	pub(crate) manifest_id: String,
+	pub(crate) run_id: String,
+	pub(crate) corpus_id: String,
+	pub(crate) product: String,
+	pub(crate) adapter_id: String,
+	pub(crate) held_out: bool,
+	pub(crate) leakage_audited: bool,
+	pub(crate) sample_size: usize,
+	pub(crate) ranking_query_count: usize,
+	pub(crate) explicit_qrel_query_count: usize,
+	pub(crate) query_ids: Vec<String>,
+	#[serde(default)]
+	pub(crate) controls: Vec<String>,
+	#[serde(default)]
+	pub(crate) artifacts: Vec<QuantitativeAuditArtifact>,
+	pub(crate) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Deserialize, Serialize)]
+pub(crate) struct QuantitativeAuditArtifact {
+	pub(crate) role: String,
+	pub(crate) path: String,
+	pub(crate) sha256: String,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs
new file mode 100644
index 00000000..7dfc1c88
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs
@@ -0,0 +1,89 @@
+use crate::{BTreeMap, Deserialize, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeBenchmarkReport {
+	pub(crate) schema: String,
+	pub(crate) generated_at: String,
+	pub(crate) corpus_id: String,
+	pub(crate) k_values: Vec<usize>,
+	pub(crate) rows: Vec<QuantitativeBenchmarkRow>,
+	#[serde(default)]
+	pub(crate) per_query_rows: Vec<QuantitativePerQueryRow>,
+	#[serde(default)]
+	pub(crate) metrics_not_encoded: Vec<String>,
+	pub(crate) controls: QuantitativeBenchmarkControls,
+	pub(crate) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeBenchmarkRow {
+	pub(crate) product: String,
+	pub(crate) adapter_id: String,
+	pub(crate) adapter_name: String,
+	pub(crate) suite: String,
+	pub(crate) evidence_class: String,
+	pub(crate) source_manifest_corpus_id: Option<String>,
+	pub(crate) result_state: String,
+	pub(crate) comparable: bool,
+	pub(crate) metric_comparable: bool,
+	pub(crate) leaderboard_eligible: bool,
+	pub(crate) held_out: bool,
+	pub(crate) leakage_audited: bool,
+	pub(crate) audit_manifest_id: Option<String>,
+	pub(crate) fixture_regression_only: bool,
+	pub(crate) sample_size: usize,
+	pub(crate) ranking_query_count: usize,
+	pub(crate) ranking_coverage_state: String,
+	pub(crate) ranked_candidate_source: String,
+	pub(crate) qrel_source: String,
+	pub(crate) explicit_qrel_query_count: usize,
+	pub(crate) metrics: BTreeMap<String, Option<f64>>,
+	pub(crate) metric_states: BTreeMap<String, String>,
+	pub(crate) denominators: BTreeMap<String, usize>,
+	#[serde(default)]
+	pub(crate) confidence_intervals: BTreeMap<String, QuantitativeConfidenceInterval>,
+	pub(crate) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativePerQueryRow {
+	pub(crate) job_id: String,
+	pub(crate) suite: String,
+	pub(crate) evidence_class: String,
+	pub(crate) source_manifest_corpus_id: Option<String>,
+	pub(crate) result_state: String,
+	pub(crate) expected_relevant_count: usize,
+	pub(crate) candidate_count: usize,
+	pub(crate) qrel_source: String,
+	pub(crate) relevance_grade_sum: f64,
+	pub(crate) product: String,
+	pub(crate) adapter_id: String,
+	pub(crate) metrics: BTreeMap<String, Option<f64>>,
+	pub(crate) metric_states: BTreeMap<String, String>,
+	pub(crate) denominators: BTreeMap<String, usize>,
+	pub(crate) claim_boundary: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeBenchmarkControls {
+	pub(crate) same_corpus_required: bool,
+	pub(crate) same_task_required: bool,
+	pub(crate) ranked_candidates_required_for_ranking_metrics: bool,
+	pub(crate) explicit_relevance_judgments_required_for_leaderboard: bool,
+	pub(crate) minimum_query_count_for_leaderboard: usize,
+	pub(crate) current_query_count: usize,
+	pub(crate) current_ranking_query_count: usize,
+	pub(crate) current_explicit_qrel_query_count: usize,
+	pub(crate) leaderboard_claim_allowed: bool,
+	pub(crate) leakage_control: String,
+}
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeConfidenceInterval {
+	pub(crate) method: String,
+	pub(crate) confidence: f64,
+	pub(crate) lower: f64,
+	pub(crate) upper: f64,
+	pub(crate) numerator: usize,
+	pub(crate) denominator: usize,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs
new file mode 100644
index 00000000..efc5c357
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs
@@ -0,0 +1,12 @@
+use crate::{Deserialize, QuantitativeBenchmarkRow, QuantitativePerQueryRow, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeProductManifest {
+	pub(crate) schema: String,
+	pub(crate) manifest_id: String,
+	pub(crate) corpus_id: String,
+	#[serde(default)]
+	pub(crate) rows: Vec<QuantitativeBenchmarkRow>,
+	#[serde(default)]
+	pub(crate) per_query_rows: Vec<QuantitativePerQueryRow>,
+}

From e36caa61c671200deaefb65056f1d5691a03d239 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:05:29 -0400
Subject: [PATCH 37/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative benchmark report types","authority":"manual"}

---
 .../quantitative_reports/benchmark.rs         | 100 ++----------------
 .../benchmark/confidence.rs                   |  11 ++
 .../benchmark/controls.rs                     |  15 +++
 .../benchmark/per_query.rs                    |  20 ++++
 .../quantitative_reports/benchmark/report.rs  |  19 ++++
 .../quantitative_reports/benchmark/row.rs     |  31 ++++++
 6 files changed, 107 insertions(+), 89 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs
index 7dfc1c88..50d36ff1 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs
@@ -1,89 +1,11 @@
-use crate::{BTreeMap, Deserialize, Serialize};
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(crate) struct QuantitativeBenchmarkReport {
-	pub(crate) schema: String,
-	pub(crate) generated_at: String,
-	pub(crate) corpus_id: String,
-	pub(crate) k_values: Vec<usize>,
-	pub(crate) rows: Vec<QuantitativeBenchmarkRow>,
-	#[serde(default)]
-	pub(crate) per_query_rows: Vec<QuantitativePerQueryRow>,
-	#[serde(default)]
-	pub(crate) metrics_not_encoded: Vec<String>,
-	pub(crate) controls: QuantitativeBenchmarkControls,
-	pub(crate) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(crate) struct QuantitativeBenchmarkRow {
-	pub(crate) product: String,
-	pub(crate) adapter_id: String,
-	pub(crate) adapter_name: String,
-	pub(crate) suite: String,
-	pub(crate) evidence_class: String,
-	pub(crate) source_manifest_corpus_id: Option<String>,
-	pub(crate) result_state: String,
-	pub(crate) comparable: bool,
-	pub(crate) metric_comparable: bool,
-	pub(crate) leaderboard_eligible: bool,
-	pub(crate) held_out: bool,
-	pub(crate) leakage_audited: bool,
-	pub(crate) audit_manifest_id: Option<String>,
-	pub(crate) fixture_regression_only: bool,
-	pub(crate) sample_size: usize,
-	pub(crate) ranking_query_count: usize,
-	pub(crate) ranking_coverage_state: String,
-	pub(crate) ranked_candidate_source: String,
-	pub(crate) qrel_source: String,
-	pub(crate) explicit_qrel_query_count: usize,
-	pub(crate) metrics: BTreeMap<String, Option<f64>>,
-	pub(crate) metric_states: BTreeMap<String, String>,
-	pub(crate) denominators: BTreeMap<String, usize>,
-	#[serde(default)]
-	pub(crate) confidence_intervals: BTreeMap<String, QuantitativeConfidenceInterval>,
-	pub(crate) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(crate) struct QuantitativePerQueryRow {
-	pub(crate) job_id: String,
-	pub(crate) suite: String,
-	pub(crate) evidence_class: String,
-	pub(crate) source_manifest_corpus_id: Option<String>,
-	pub(crate) result_state: String,
-	pub(crate) expected_relevant_count: usize,
-	pub(crate) candidate_count: usize,
-	pub(crate) qrel_source: String,
-	pub(crate) relevance_grade_sum: f64,
-	pub(crate) product: String,
-	pub(crate) adapter_id: String,
-	pub(crate) metrics: BTreeMap<String, Option<f64>>,
-	pub(crate) metric_states: BTreeMap<String, String>,
-	pub(crate) denominators: BTreeMap<String, usize>,
-	pub(crate) claim_boundary: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(crate) struct QuantitativeBenchmarkControls {
-	pub(crate) same_corpus_required: bool,
-	pub(crate) same_task_required: bool,
-	pub(crate) ranked_candidates_required_for_ranking_metrics: bool,
-	pub(crate) explicit_relevance_judgments_required_for_leaderboard: bool,
-	pub(crate) minimum_query_count_for_leaderboard: usize,
-	pub(crate) current_query_count: usize,
-	pub(crate) current_ranking_query_count: usize,
-	pub(crate) current_explicit_qrel_query_count: usize,
-	pub(crate) leaderboard_claim_allowed: bool,
-	pub(crate) leakage_control: String,
-}
-
-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
-pub(crate) struct QuantitativeConfidenceInterval {
-	pub(crate) method: String,
-	pub(crate) confidence: f64,
-	pub(crate) lower: f64,
-	pub(crate) upper: f64,
-	pub(crate) numerator: usize,
-	pub(crate) denominator: usize,
-}
+mod confidence;
+mod controls;
+mod per_query;
+mod report;
+mod row;
+
+pub(crate) use self::{
+	confidence::QuantitativeConfidenceInterval, controls::QuantitativeBenchmarkControls,
+	per_query::QuantitativePerQueryRow, report::QuantitativeBenchmarkReport,
+	row::QuantitativeBenchmarkRow,
+};
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs
new file mode 100644
index 00000000..7a3da458
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs
@@ -0,0 +1,11 @@
+use crate::{Deserialize, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeConfidenceInterval {
+	pub(crate) method: String,
+	pub(crate) confidence: f64,
+	pub(crate) lower: f64,
+	pub(crate) upper: f64,
+	pub(crate) numerator: usize,
+	pub(crate) denominator: usize,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs
new file mode 100644
index 00000000..1e8ea05f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs
@@ -0,0 +1,15 @@
+use crate::{Deserialize, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeBenchmarkControls {
+	pub(crate) same_corpus_required: bool,
+	pub(crate) same_task_required: bool,
+	pub(crate) ranked_candidates_required_for_ranking_metrics: bool,
+	pub(crate) explicit_relevance_judgments_required_for_leaderboard: bool,
+	pub(crate) minimum_query_count_for_leaderboard: usize,
+	pub(crate) current_query_count: usize,
+	pub(crate) current_ranking_query_count: usize,
+	pub(crate) current_explicit_qrel_query_count: usize,
+	pub(crate) leaderboard_claim_allowed: bool,
+	pub(crate) leakage_control: String,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs
new file mode 100644
index 00000000..35ce6d6f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs
@@ -0,0 +1,20 @@
+use crate::{BTreeMap, Deserialize, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativePerQueryRow {
+	pub(crate) job_id: String,
+	pub(crate) suite: String,
+	pub(crate) evidence_class: String,
+	pub(crate) source_manifest_corpus_id: Option<String>,
+	pub(crate) result_state: String,
+	pub(crate) expected_relevant_count: usize,
+	pub(crate) candidate_count: usize,
+	pub(crate) qrel_source: String,
+	pub(crate) relevance_grade_sum: f64,
+	pub(crate) product: String,
+	pub(crate) adapter_id: String,
+	pub(crate) metrics: BTreeMap<String, Option<f64>>,
+	pub(crate) metric_states: BTreeMap<String, String>,
+	pub(crate) denominators: BTreeMap<String, usize>,
+	pub(crate) claim_boundary: String,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs
new file mode 100644
index 00000000..1a57e138
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs
@@ -0,0 +1,19 @@
+use crate::{
+	Deserialize, QuantitativeBenchmarkControls, QuantitativeBenchmarkRow, QuantitativePerQueryRow,
+	Serialize,
+};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeBenchmarkReport {
+	pub(crate) schema: String,
+	pub(crate) generated_at: String,
+	pub(crate) corpus_id: String,
+	pub(crate) k_values: Vec<usize>,
+	pub(crate) rows: Vec<QuantitativeBenchmarkRow>,
+	#[serde(default)]
+	pub(crate) per_query_rows: Vec<QuantitativePerQueryRow>,
+	#[serde(default)]
+	pub(crate) metrics_not_encoded: Vec<String>,
+	pub(crate) controls: QuantitativeBenchmarkControls,
+	pub(crate) claim_boundary: String,
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs
new file mode 100644
index 00000000..cdef9042
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs
@@ -0,0 +1,31 @@
+use crate::{BTreeMap, Deserialize, QuantitativeConfidenceInterval, Serialize};
+
+#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+pub(crate) struct QuantitativeBenchmarkRow {
+	pub(crate) product: String,
+	pub(crate) adapter_id: String,
+	pub(crate) adapter_name: String,
+	pub(crate) suite: String,
+	pub(crate) evidence_class: String,
+	pub(crate) source_manifest_corpus_id: Option<String>,
+	pub(crate) result_state: String,
+	pub(crate) comparable: bool,
+	pub(crate) metric_comparable: bool,
+	pub(crate) leaderboard_eligible: bool,
+	pub(crate) held_out: bool,
+	pub(crate) leakage_audited: bool,
+	pub(crate) audit_manifest_id: Option<String>,
+	pub(crate) fixture_regression_only: bool,
+	pub(crate) sample_size: usize,
+	pub(crate) ranking_query_count: usize,
+	pub(crate) ranking_coverage_state: String,
+	pub(crate) ranked_candidate_source: String,
+	pub(crate) qrel_source: String,
+	pub(crate) explicit_qrel_query_count: usize,
+	pub(crate) metrics: BTreeMap<String, Option<f64>>,
+	pub(crate) metric_states: BTreeMap<String, String>,
+	pub(crate) denominators: BTreeMap<String, usize>,
+	#[serde(default)]
+	pub(crate) confidence_intervals: BTreeMap<String, QuantitativeConfidenceInterval>,
+	pub(crate) claim_boundary: String,
+}

From b3c5a5013875016094ab06f4a47af4f15c768ac8 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:09:42 -0400
Subject: [PATCH 38/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product row validation","authority":"manual"}

---
 .../validation/rows/product.rs                | 67 ++-----------------
 .../validation/rows/product/identity.rs       | 34 ++++++++++
 .../validation/rows/product/leaderboard.rs    | 31 +++++++++
 3 files changed, 72 insertions(+), 60 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs
index 913b0628..ac009d59 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs
@@ -1,7 +1,7 @@
-use crate::{
-	Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre,
-	quantitative::MIN_LEADERBOARD_QUERY_COUNT,
-};
+mod identity;
+mod leaderboard;
+
+use crate::{Path, QuantitativeProductManifest, Result};
 
 pub(super) fn validate_quantitative_product_rows(
 	manifest: &QuantitativeProductManifest,
@@ -9,65 +9,12 @@ pub(super) fn validate_quantitative_product_rows(
 	corpus_id: &str,
 ) -> Result<()> {
 	for row in &manifest.rows {
-		if row.product == "ELF" {
-			return Err(eyre::eyre!(
-				"{} quantitative product manifest must not inject ELF self rows.",
-				path.display()
-			));
-		}
-		if row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.adapter_name.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative product row.",
-				path.display()
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
+		identity::validate_product_row_identity(path, row, corpus_id)?;
+
 		if row.leaderboard_eligible {
-			validate_leaderboard_eligible_product_row(path, row)?;
+			leaderboard::validate_leaderboard_eligible_product_row(path, row)?;
 		}
 	}
 
 	Ok(())
 }
-
-fn validate_leaderboard_eligible_product_row(
-	path: &Path,
-	row: &QuantitativeBenchmarkRow,
-) -> Result<()> {
-	let has_audit_manifest_id = row
-		.audit_manifest_id
-		.as_deref()
-		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
-
-	if row.evidence_class != "live_real_world"
-		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
-		|| row.ranking_query_count != row.sample_size
-		|| row.explicit_qrel_query_count != row.ranking_query_count
-		|| !row.held_out
-		|| !row.leakage_audited
-		|| !has_audit_manifest_id
-	{
-		return Err(eyre::eyre!(
-			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
-			path.display(),
-			row.product,
-			row.adapter_id
-		));
-	}
-
-	Ok(())
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs
new file mode 100644
index 00000000..5dd82465
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs
@@ -0,0 +1,34 @@
+use crate::{Path, QuantitativeBenchmarkRow, Result, eyre};
+
+pub(super) fn validate_product_row_identity(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+	corpus_id: &str,
+) -> Result<()> {
+	if row.product == "ELF" {
+		return Err(eyre::eyre!(
+			"{} quantitative product manifest must not inject ELF self rows.",
+			path.display()
+		));
+	}
+	if row.product.trim().is_empty()
+		|| row.adapter_id.trim().is_empty()
+		|| row.adapter_name.trim().is_empty()
+		|| row.suite.trim().is_empty()
+		|| row.evidence_class.trim().is_empty()
+		|| row.result_state.trim().is_empty()
+	{
+		return Err(eyre::eyre!("{} has an incomplete quantitative product row.", path.display()));
+	}
+	if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+		return Err(eyre::eyre!(
+			"{} row {}:{} is not same-corpus {}.",
+			path.display(),
+			row.product,
+			row.adapter_id,
+			corpus_id
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs
new file mode 100644
index 00000000..e5f76ae2
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs
@@ -0,0 +1,31 @@
+use crate::{
+	Path, QuantitativeBenchmarkRow, Result, eyre, quantitative::MIN_LEADERBOARD_QUERY_COUNT,
+};
+
+pub(super) fn validate_leaderboard_eligible_product_row(
+	path: &Path,
+	row: &QuantitativeBenchmarkRow,
+) -> Result<()> {
+	let has_audit_manifest_id = row
+		.audit_manifest_id
+		.as_deref()
+		.is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty());
+
+	if row.evidence_class != "live_real_world"
+		|| row.sample_size < MIN_LEADERBOARD_QUERY_COUNT
+		|| row.ranking_query_count != row.sample_size
+		|| row.explicit_qrel_query_count != row.ranking_query_count
+		|| !row.held_out
+		|| !row.leakage_audited
+		|| !has_audit_manifest_id
+	{
+		return Err(eyre::eyre!(
+			"{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.",
+			path.display(),
+			row.product,
+			row.adapter_id
+		));
+	}
+
+	Ok(())
+}

From fa6ebe9932c114ea66f23f5eea92afe10c5db12c Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:14:38 -0400
Subject: [PATCH 39/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit identity validation","authority":"manual"}

---
 .../audit_manifest/validation/identity.rs     | 69 ++-----------------
 .../validation/identity/context.rs            | 63 +++++++++++++++++
 .../validation/identity/schema.rs             | 21 ++++++
 3 files changed, 90 insertions(+), 63 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs
index 461e9eb6..6444cdea 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs
@@ -1,6 +1,8 @@
+mod context;
+mod schema;
+
 use crate::{
-	Path, QuantitativeAuditManifest, Result, eyre,
-	quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, audit_manifest::QuantitativeAuditContext},
+	Path, QuantitativeAuditManifest, Result, quantitative::audit_manifest::QuantitativeAuditContext,
 };
 
 pub(super) fn validate_quantitative_audit_identity(
@@ -8,66 +10,7 @@ pub(super) fn validate_quantitative_audit_identity(
 	path: &Path,
 	context: &QuantitativeAuditContext<'_>,
 ) -> Result<()> {
-	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
-		return Err(eyre::eyre!(
-			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
-			path.display(),
-			manifest.schema
-		));
-	}
-	if manifest.manifest_id.trim().is_empty() {
-		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
-	}
-	if manifest.run_id != context.run_id {
-		return Err(eyre::eyre!(
-			"{} has run_id {}, expected {}.",
-			path.display(),
-			manifest.run_id,
-			context.run_id
-		));
-	}
-	if manifest.corpus_id != context.corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected {}.",
-			path.display(),
-			manifest.corpus_id,
-			context.corpus_id
-		));
-	}
-	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
-		return Err(eyre::eyre!(
-			"{} has product {}:{} but current row is {}:{}.",
-			path.display(),
-			manifest.product,
-			manifest.adapter_id,
-			context.product,
-			context.adapter_id
-		));
-	}
-	if manifest.sample_size != context.source_jobs.len() {
-		return Err(eyre::eyre!(
-			"{} has sample_size {}, expected {}.",
-			path.display(),
-			manifest.sample_size,
-			context.source_jobs.len()
-		));
-	}
-	if manifest.ranking_query_count != context.ranking_query_count {
-		return Err(eyre::eyre!(
-			"{} has ranking_query_count {}, expected {}.",
-			path.display(),
-			manifest.ranking_query_count,
-			context.ranking_query_count
-		));
-	}
-	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
-		return Err(eyre::eyre!(
-			"{} has explicit_qrel_query_count {}, expected {}.",
-			path.display(),
-			manifest.explicit_qrel_query_count,
-			context.explicit_qrel_query_count
-		));
-	}
+	schema::validate_quantitative_audit_schema(manifest, path)?;
 
-	Ok(())
+	context::validate_quantitative_audit_context(manifest, path, context)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs
new file mode 100644
index 00000000..d11c8636
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs
@@ -0,0 +1,63 @@
+use crate::{
+	Path, QuantitativeAuditManifest, Result, eyre,
+	quantitative::audit_manifest::QuantitativeAuditContext,
+};
+
+pub(super) fn validate_quantitative_audit_context(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: &QuantitativeAuditContext<'_>,
+) -> Result<()> {
+	if manifest.run_id != context.run_id {
+		return Err(eyre::eyre!(
+			"{} has run_id {}, expected {}.",
+			path.display(),
+			manifest.run_id,
+			context.run_id
+		));
+	}
+	if manifest.corpus_id != context.corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected {}.",
+			path.display(),
+			manifest.corpus_id,
+			context.corpus_id
+		));
+	}
+	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
+		return Err(eyre::eyre!(
+			"{} has product {}:{} but current row is {}:{}.",
+			path.display(),
+			manifest.product,
+			manifest.adapter_id,
+			context.product,
+			context.adapter_id
+		));
+	}
+	if manifest.sample_size != context.source_jobs.len() {
+		return Err(eyre::eyre!(
+			"{} has sample_size {}, expected {}.",
+			path.display(),
+			manifest.sample_size,
+			context.source_jobs.len()
+		));
+	}
+	if manifest.ranking_query_count != context.ranking_query_count {
+		return Err(eyre::eyre!(
+			"{} has ranking_query_count {}, expected {}.",
+			path.display(),
+			manifest.ranking_query_count,
+			context.ranking_query_count
+		));
+	}
+	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
+		return Err(eyre::eyre!(
+			"{} has explicit_qrel_query_count {}, expected {}.",
+			path.display(),
+			manifest.explicit_qrel_query_count,
+			context.explicit_qrel_query_count
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs
new file mode 100644
index 00000000..f288eeba
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs
@@ -0,0 +1,21 @@
+use crate::{
+	Path, QuantitativeAuditManifest, Result, eyre, quantitative::QUANTITATIVE_AUDIT_MANIFEST_SCHEMA,
+};
+
+pub(super) fn validate_quantitative_audit_schema(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> {
+	if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA {
+		return Err(eyre::eyre!(
+			"{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.",
+			path.display(),
+			manifest.schema
+		));
+	}
+	if manifest.manifest_id.trim().is_empty() {
+		return Err(eyre::eyre!("{} has an empty manifest_id.", path.display()));
+	}
+
+	Ok(())
+}

From e1dfc21ef5494b1e9a8b5bc7d40ca59e4b335206 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:19:32 -0400
Subject: [PATCH 40/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit digest paths","authority":"manual"}

---
 .../audit_manifest/artifacts/digest.rs        | 36 +++----------------
 .../audit_manifest/artifacts/digest/paths.rs  | 31 ++++++++++++++++
 2 files changed, 35 insertions(+), 32 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs
index bb75c802..d87860d9 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs
@@ -1,4 +1,6 @@
-use crate::{Path, PathBuf, Result, fs};
+mod paths;
+
+use crate::{Path, Result, fs};
 
 pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) -> Result<String> {
 	let mut hasher = blake3::Hasher::new();
@@ -13,7 +15,7 @@ pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path)
 		return Ok(hasher.finalize().to_hex().to_string());
 	}
 
-	let paths = audit_fixture_paths(path)?;
+	let paths = paths::audit_fixture_paths(path)?;
 
 	for fixture in paths {
 		let relative = fixture
@@ -27,36 +29,6 @@ pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path)
 	Ok(hasher.finalize().to_hex().to_string())
 }
 
-fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
-	let mut paths = Vec::new();
-
-	collect_audit_fixture_paths(path, &mut paths)?;
-
-	paths.sort();
-
-	Ok(paths)
-}
-
-fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
-	if path.is_file() {
-		paths.push(path.to_path_buf());
-
-		return Ok(());
-	}
-
-	for entry in fs::read_dir(path)? {
-		let entry_path = entry?.path();
-
-		if entry_path.is_dir() {
-			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
-		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
-			paths.push(entry_path);
-		}
-	}
-
-	Ok(())
-}
-
 fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> {
 	hasher.update(logical_path.as_bytes());
 	hasher.update(b"\0");
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs
new file mode 100644
index 00000000..a7ba276c
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs
@@ -0,0 +1,31 @@
+use crate::{Path, PathBuf, Result, fs};
+
+pub(super) fn audit_fixture_paths(path: &Path) -> Result<Vec<PathBuf>> {
+	let mut paths = Vec::new();
+
+	collect_audit_fixture_paths(path, &mut paths)?;
+
+	paths.sort();
+
+	Ok(paths)
+}
+
+fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec<PathBuf>) -> Result<()> {
+	if path.is_file() {
+		paths.push(path.to_path_buf());
+
+		return Ok(());
+	}
+
+	for entry in fs::read_dir(path)? {
+		let entry_path = entry?.path();
+
+		if entry_path.is_dir() {
+			collect_audit_fixture_paths(entry_path.as_path(), paths)?;
+		} else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") {
+			paths.push(entry_path);
+		}
+	}
+
+	Ok(())
+}

From 788ab64d3db9a123d72fb51b7635596c513707ae Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:25:15 -0400
Subject: [PATCH 41/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit manifest assembly","authority":"manual"}

---
 .../quantitative/audit_manifest/export.rs     | 38 ++--------------
 .../audit_manifest/export/manifest.rs         | 45 +++++++++++++++++++
 2 files changed, 49 insertions(+), 34 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
index 795960e0..6b23ccfa 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs
@@ -1,14 +1,10 @@
 mod claim_boundary;
 mod identity;
+mod manifest;
 
 use crate::{
-	ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest,
-	RealWorldJob, Result,
-	quantitative::{
-		self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA,
-		audit_manifest::{QuantitativeAuditContext, artifacts, validation},
-		metrics,
-	},
+	ExportQuantitativeAuditManifestArgs, QuantitativeAuditManifest, RealWorldJob, Result,
+	quantitative::audit_manifest::{QuantitativeAuditContext, validation},
 };
 
 pub(crate) fn quantitative_audit_manifest_from_jobs(
@@ -20,33 +16,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs(
 
 	identity::validate_audit_export_identity(product, adapter_id)?;
 
-	let corpus_id = quantitative::quantitative_corpus_id(jobs);
-	let ranking_query_count = metrics::ranking_query_count(jobs);
-	let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs);
-	let manifest = QuantitativeAuditManifest {
-		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
-		manifest_id: args
-			.manifest_id
-			.clone()
-			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)),
-		run_id: args.run_id.clone(),
-		corpus_id,
-		product: product.to_string(),
-		adapter_id: adapter_id.to_string(),
-		held_out: args.held_out,
-		leakage_audited: args.leakage_audited,
-		sample_size: jobs.len(),
-		ranking_query_count,
-		explicit_qrel_query_count,
-		query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
-		controls: args.controls.clone(),
-		artifacts: vec![QuantitativeAuditArtifact {
-			role: "product_runtime_fixtures".to_string(),
-			path: artifacts::audit_artifact_display_path(args.fixtures.as_path()),
-			sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?,
-		}],
-		claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args),
-	};
+	let manifest = manifest::quantitative_audit_manifest(jobs, args, product, adapter_id)?;
 
 	validation::validate_quantitative_audit_manifest(
 		&manifest,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs
new file mode 100644
index 00000000..dad5a99e
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs
@@ -0,0 +1,45 @@
+use crate::{
+	ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest,
+	RealWorldJob, Result,
+	quantitative::{
+		self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA,
+		audit_manifest::{artifacts, export::claim_boundary},
+		metrics,
+	},
+};
+
+pub(super) fn quantitative_audit_manifest(
+	jobs: &[RealWorldJob],
+	args: &ExportQuantitativeAuditManifestArgs,
+	product: &str,
+	adapter_id: &str,
+) -> Result<QuantitativeAuditManifest> {
+	let corpus_id = quantitative::quantitative_corpus_id(jobs);
+	let ranking_query_count = metrics::ranking_query_count(jobs);
+	let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs);
+
+	Ok(QuantitativeAuditManifest {
+		schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)),
+		run_id: args.run_id.clone(),
+		corpus_id,
+		product: product.to_string(),
+		adapter_id: adapter_id.to_string(),
+		held_out: args.held_out,
+		leakage_audited: args.leakage_audited,
+		sample_size: jobs.len(),
+		ranking_query_count,
+		explicit_qrel_query_count,
+		query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(),
+		controls: args.controls.clone(),
+		artifacts: vec![QuantitativeAuditArtifact {
+			role: "product_runtime_fixtures".to_string(),
+			path: artifacts::audit_artifact_display_path(args.fixtures.as_path()),
+			sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?,
+		}],
+		claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args),
+	})
+}

From b54faa73ec205b795be3e7c106207f3c604856ef Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:28:23 -0400
Subject: [PATCH 42/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative benchmark row input","authority":"manual"}

---
 .../quantitative/report/row/benchmark_row.rs  | 24 +++++--------------
 .../report/row/benchmark_row/input.rs         | 17 +++++++++++++
 2 files changed, 23 insertions(+), 18 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs
index 53198ae6..4b8b2e31 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs
@@ -1,24 +1,12 @@
+mod input;
+
+pub(super) use self::input::QuantitativeBenchmarkRowInput;
+
 use crate::{
-	QuantitativeBenchmarkRow, QuantitativePerQueryRow,
-	quantitative::{
-		self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, audit_manifest::QuantitativeAuditEvidence, metrics,
-		report::QuantitativeReportInput,
-	},
+	QuantitativeBenchmarkRow,
+	quantitative::{self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, metrics},
 };
 
-pub(super) struct QuantitativeBenchmarkRowInput<'a, 'b> {
-	pub(super) input: &'a QuantitativeReportInput<'b>,
-	pub(super) corpus_id: &'a str,
-	pub(super) evidence_class: &'a str,
-	pub(super) per_query_rows: &'a [QuantitativePerQueryRow],
-	pub(super) ranking_query_count: usize,
-	pub(super) explicit_qrel_query_count: usize,
-	pub(super) metric_comparable: bool,
-	pub(super) result_state: &'a str,
-	pub(super) audit_evidence: QuantitativeAuditEvidence,
-	pub(super) leaderboard_eligible: bool,
-}
-
 pub(super) fn quantitative_benchmark_row(
 	row_input: QuantitativeBenchmarkRowInput<'_, '_>,
 ) -> QuantitativeBenchmarkRow {
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs
new file mode 100644
index 00000000..a8e3f96a
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs
@@ -0,0 +1,17 @@
+use crate::{
+	QuantitativePerQueryRow,
+	quantitative::{audit_manifest::QuantitativeAuditEvidence, report::QuantitativeReportInput},
+};
+
+pub(in crate::quantitative::report::row) struct QuantitativeBenchmarkRowInput<'a, 'b> {
+	pub(in crate::quantitative::report::row) input: &'a QuantitativeReportInput<'b>,
+	pub(in crate::quantitative::report::row) corpus_id: &'a str,
+	pub(in crate::quantitative::report::row) evidence_class: &'a str,
+	pub(in crate::quantitative::report::row) per_query_rows: &'a [QuantitativePerQueryRow],
+	pub(in crate::quantitative::report::row) ranking_query_count: usize,
+	pub(in crate::quantitative::report::row) explicit_qrel_query_count: usize,
+	pub(in crate::quantitative::report::row) metric_comparable: bool,
+	pub(in crate::quantitative::report::row) result_state: &'a str,
+	pub(in crate::quantitative::report::row) audit_evidence: QuantitativeAuditEvidence,
+	pub(in crate::quantitative::report::row) leaderboard_eligible: bool,
+}

From 49325f60648101e030f8becee65a410e1a9afa8f Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:33:24 -0400
Subject: [PATCH 43/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product export manifest","authority":"manual"}

---
 .../quantitative/product_manifest/export.rs   | 37 ++-------------
 .../product_manifest/export/manifest.rs       | 46 +++++++++++++++++++
 .../product_manifest/export/source.rs         | 37 +++++++++++++++
 3 files changed, 87 insertions(+), 33 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs
index ac105d5a..d72509f8 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs
@@ -1,10 +1,11 @@
 mod identity;
+mod manifest;
 mod rows;
+mod source;
 
 use crate::{
 	ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, REPORT_SCHEMA,
-	RealWorldReport, Result, eyre,
-	quantitative::{QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, product_manifest::validation},
+	RealWorldReport, Result, eyre, quantitative::product_manifest::validation,
 };
 
 pub(crate) fn quantitative_product_manifest_from_report(
@@ -19,37 +20,7 @@ pub(crate) fn quantitative_product_manifest_from_report(
 		));
 	}
 
-	let source_row =
-		report.quantitative_scoreboard.rows.first().ok_or_else(|| {
-			eyre::eyre!("{} has no quantitative product row.", args.report.display())
-		})?;
-	let source_product = source_row.product.as_str();
-	let source_adapter_id = source_row.adapter_id.as_str();
-	let product = args.product.as_deref().unwrap_or(source_product).trim();
-	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
-	let adapter_name =
-		args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim();
-
-	identity::validate_export_identity(args, product, adapter_id, adapter_name)?;
-
-	let row = rows::exported_product_row(source_row, product, adapter_id, adapter_name);
-	let per_query_rows = rows::exported_per_query_rows(
-		report,
-		source_product,
-		source_adapter_id,
-		product,
-		adapter_id,
-	);
-	let manifest = QuantitativeProductManifest {
-		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
-		manifest_id: args
-			.manifest_id
-			.clone()
-			.unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)),
-		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
-		rows: vec![row],
-		per_query_rows,
-	};
+	let manifest = manifest::quantitative_product_manifest(report, args)?;
 
 	validation::validate_quantitative_product_manifest(
 		&manifest,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs
new file mode 100644
index 00000000..592cb19f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs
@@ -0,0 +1,46 @@
+use crate::{
+	ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, RealWorldReport, Result,
+	quantitative::{
+		QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA,
+		product_manifest::export::{identity, rows, source},
+	},
+};
+
+pub(super) fn quantitative_product_manifest(
+	report: &RealWorldReport,
+	args: &ExportQuantitativeProductManifestArgs,
+) -> Result<QuantitativeProductManifest> {
+	let source = source::product_export_identity(report, args)?;
+
+	identity::validate_export_identity(
+		args,
+		source.product,
+		source.adapter_id,
+		source.adapter_name,
+	)?;
+
+	let row = rows::exported_product_row(
+		source.row,
+		source.product,
+		source.adapter_id,
+		source.adapter_name,
+	);
+	let per_query_rows = rows::exported_per_query_rows(
+		report,
+		source.source_product,
+		source.source_adapter_id,
+		source.product,
+		source.adapter_id,
+	);
+
+	Ok(QuantitativeProductManifest {
+		schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(),
+		manifest_id: args
+			.manifest_id
+			.clone()
+			.unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)),
+		corpus_id: report.quantitative_scoreboard.corpus_id.clone(),
+		rows: vec![row],
+		per_query_rows,
+	})
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs
new file mode 100644
index 00000000..6a3b7ed9
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs
@@ -0,0 +1,37 @@
+use crate::{
+	ExportQuantitativeProductManifestArgs, QuantitativeBenchmarkRow, RealWorldReport, Result, eyre,
+};
+
+pub(super) struct ProductExportIdentity<'report> {
+	pub(super) row: &'report QuantitativeBenchmarkRow,
+	pub(super) source_product: &'report str,
+	pub(super) source_adapter_id: &'report str,
+	pub(super) product: &'report str,
+	pub(super) adapter_id: &'report str,
+	pub(super) adapter_name: &'report str,
+}
+
+pub(super) fn product_export_identity<'report>(
+	report: &'report RealWorldReport,
+	args: &'report ExportQuantitativeProductManifestArgs,
+) -> Result<ProductExportIdentity<'report>> {
+	let source_row =
+		report.quantitative_scoreboard.rows.first().ok_or_else(|| {
+			eyre::eyre!("{} has no quantitative product row.", args.report.display())
+		})?;
+	let source_product = source_row.product.as_str();
+	let source_adapter_id = source_row.adapter_id.as_str();
+	let product = args.product.as_deref().unwrap_or(source_product).trim();
+	let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim();
+	let adapter_name =
+		args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim();
+
+	Ok(ProductExportIdentity {
+		row: source_row,
+		source_product,
+		source_adapter_id,
+		product,
+		adapter_id,
+		adapter_name,
+	})
+}

From 449927f2fa427d3a1764ce49e6b8278fb695358f Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:38:41 -0400
Subject: [PATCH 44/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit artifact validation","authority":"manual"}

---
 .../quantitative/audit_manifest/artifacts.rs  | 62 ++-----------------
 .../audit_manifest/artifacts/validation.rs    | 20 ++++++
 .../artifacts/validation/digest.rs            | 33 ++++++++++
 .../artifacts/validation/fields.rs            | 26 ++++++++
 4 files changed, 84 insertions(+), 57 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
index 25a0bbb0..855af455 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs
@@ -1,60 +1,8 @@
 mod digest;
 mod paths;
+mod validation;
 
-pub(super) use self::{digest::fixture_path_digest, paths::audit_artifact_display_path};
-
-use crate::{Path, QuantitativeAuditManifest, Result, eyre};
-
-pub(super) fn validate_quantitative_audit_artifacts(
-	manifest: &QuantitativeAuditManifest,
-	path: &Path,
-) -> Result<()> {
-	if manifest.artifacts.is_empty() {
-		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
-	}
-
-	for artifact in &manifest.artifacts {
-		if artifact.role.trim().is_empty()
-			|| artifact.path.trim().is_empty()
-			|| artifact.sha256.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative audit artifact.",
-				path.display()
-			));
-		}
-		if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit())
-		{
-			return Err(eyre::eyre!(
-				"{} artifact {} has invalid sha256 digest {}.",
-				path.display(),
-				artifact.role,
-				artifact.sha256
-			));
-		}
-
-		let artifact_path =
-			paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
-		let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| {
-			eyre::eyre!(
-				"{} artifact {} could not be digested at {}: {err}",
-				path.display(),
-				artifact.role,
-				artifact_path.display()
-			)
-		})?;
-
-		if actual != artifact.sha256 {
-			return Err(eyre::eyre!(
-				"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
-				path.display(),
-				artifact.role,
-				artifact_path.display(),
-				artifact.sha256,
-				actual
-			));
-		}
-	}
-
-	Ok(())
-}
+pub(super) use self::{
+	digest::fixture_path_digest, paths::audit_artifact_display_path,
+	validation::validate_quantitative_audit_artifacts,
+};
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs
new file mode 100644
index 00000000..21c5e7bb
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs
@@ -0,0 +1,20 @@
+mod digest;
+mod fields;
+
+use crate::{Path, QuantitativeAuditManifest, Result, eyre};
+
+pub(in crate::quantitative::audit_manifest) fn validate_quantitative_audit_artifacts(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+) -> Result<()> {
+	if manifest.artifacts.is_empty() {
+		return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display()));
+	}
+
+	for artifact in &manifest.artifacts {
+		fields::validate_audit_artifact_fields(path, artifact)?;
+		digest::validate_audit_artifact_digest(path, artifact)?;
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs
new file mode 100644
index 00000000..e6af0f61
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs
@@ -0,0 +1,33 @@
+use crate::{
+	Path, QuantitativeAuditArtifact, Result, eyre,
+	quantitative::audit_manifest::artifacts::{digest, paths},
+};
+
+pub(super) fn validate_audit_artifact_digest(
+	path: &Path,
+	artifact: &QuantitativeAuditArtifact,
+) -> Result<()> {
+	let artifact_path =
+		paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str());
+	let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| {
+		eyre::eyre!(
+			"{} artifact {} could not be digested at {}: {err}",
+			path.display(),
+			artifact.role,
+			artifact_path.display()
+		)
+	})?;
+
+	if actual != artifact.sha256 {
+		return Err(eyre::eyre!(
+			"{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.",
+			path.display(),
+			artifact.role,
+			artifact_path.display(),
+			artifact.sha256,
+			actual
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs
new file mode 100644
index 00000000..af6c149c
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs
@@ -0,0 +1,26 @@
+use crate::{Path, QuantitativeAuditArtifact, Result, eyre};
+
+pub(super) fn validate_audit_artifact_fields(
+	path: &Path,
+	artifact: &QuantitativeAuditArtifact,
+) -> Result<()> {
+	if artifact.role.trim().is_empty()
+		|| artifact.path.trim().is_empty()
+		|| artifact.sha256.trim().is_empty()
+	{
+		return Err(eyre::eyre!(
+			"{} has an incomplete quantitative audit artifact.",
+			path.display()
+		));
+	}
+	if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) {
+		return Err(eyre::eyre!(
+			"{} artifact {} has invalid sha256 digest {}.",
+			path.display(),
+			artifact.role,
+			artifact.sha256
+		));
+	}
+
+	Ok(())
+}

From 779dbed7360c27fa4ecc101896e38c284a24126a Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:41:42 -0400
Subject: [PATCH 45/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative ranking metrics","authority":"manual"}

---
 .../per_query/query_metrics/ranking.rs        | 43 +++----------------
 .../per_query/query_metrics/ranking/ndcg.rs   | 33 ++++++++++++++
 .../query_metrics/ranking/reciprocal_rank.rs  | 19 ++++++++
 3 files changed, 57 insertions(+), 38 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
index 5abea808..e9d7dbf7 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs
@@ -1,23 +1,14 @@
 mod average_precision;
+mod ndcg;
+mod reciprocal_rank;
 
-use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics};
+use crate::BTreeMap;
 
 pub(super) fn reciprocal_rank(
 	candidates: &[String],
 	relevance: &BTreeMap<String, f64>,
 ) -> Option<f64> {
-	if query_metrics::positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	Some(
-		candidates
-			.iter()
-			.position(|candidate| {
-				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
-			})
-			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
-	)
+	reciprocal_rank::reciprocal_rank(candidates, relevance)
 }
 
 pub(super) fn ndcg_at_k(
@@ -25,31 +16,7 @@ pub(super) fn ndcg_at_k(
 	relevance: &BTreeMap<String, f64>,
 	k: usize,
 ) -> Option<f64> {
-	if query_metrics::positive_qrel_count(relevance) == 0 {
-		return None;
-	}
-
-	let dcg = candidates
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, candidate)| {
-			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
-				/ ((index + 2) as f64).log2()
-		})
-		.sum::<f64>();
-	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
-
-	ideal.sort_by(|left, right| right.total_cmp(left));
-
-	let idcg = ideal
-		.iter()
-		.take(k)
-		.enumerate()
-		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
-		.sum::<f64>();
-
-	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
+	ndcg::ndcg_at_k(candidates, relevance, k)
 }
 
 pub(super) fn average_precision(
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs
new file mode 100644
index 00000000..540d2f66
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs
@@ -0,0 +1,33 @@
+use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics};
+
+pub(super) fn ndcg_at_k(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+	k: usize,
+) -> Option<f64> {
+	if query_metrics::positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	let dcg = candidates
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, candidate)| {
+			relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0)
+				/ ((index + 2) as f64).log2()
+		})
+		.sum::<f64>();
+	let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::<Vec<_>>();
+
+	ideal.sort_by(|left, right| right.total_cmp(left));
+
+	let idcg = ideal
+		.iter()
+		.take(k)
+		.enumerate()
+		.map(|(index, grade)| grade / ((index + 2) as f64).log2())
+		.sum::<f64>();
+
+	Some(if idcg > 0.0 { dcg / idcg } else { 0.0 })
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs
new file mode 100644
index 00000000..99956367
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs
@@ -0,0 +1,19 @@
+use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics};
+
+pub(super) fn reciprocal_rank(
+	candidates: &[String],
+	relevance: &BTreeMap<String, f64>,
+) -> Option<f64> {
+	if query_metrics::positive_qrel_count(relevance) == 0 {
+		return None;
+	}
+
+	Some(
+		candidates
+			.iter()
+			.position(|candidate| {
+				relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)
+			})
+			.map_or(0.0, |index| 1.0 / (index + 1) as f64),
+	)
+}

From 119287f70cfa6fd42a7e71ae0f1707ef557bdb68 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:46:04 -0400
Subject: [PATCH 46/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query row assembly","authority":"manual"}

---
 .../quantitative/metrics/per_query.rs         | 48 ++-----------------
 .../quantitative/metrics/per_query/row.rs     | 48 +++++++++++++++++++
 2 files changed, 51 insertions(+), 45 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
index cb184dc9..2f8de046 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
@@ -1,10 +1,8 @@
 mod evidence;
 mod query_metrics;
+mod row;
 
-use crate::{
-	JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
-	quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, scoring,
-};
+use crate::{JobReport, QuantitativePerQueryRow, RealWorldJob};
 
 pub(super) fn quantitative_per_query_rows(
 	source_jobs: &[RealWorldJob],
@@ -17,47 +15,7 @@ pub(super) fn quantitative_per_query_rows(
 		.iter()
 		.zip(jobs.iter())
 		.map(|(source_job, job)| {
-			quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id)
+			row::quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id)
 		})
 		.collect()
 }
-
-fn quantitative_per_query_row(
-	source_job: &RealWorldJob,
-	job: &JobReport,
-	corpus_id: &str,
-	evidence_class: &str,
-	adapter_id: &str,
-) -> QuantitativePerQueryRow {
-	let relevance = evidence::relevance_grades(source_job, job);
-	let candidates = scoring::produced_evidence_order(source_job);
-	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
-	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
-	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
-		"not_encoded"
-	} else {
-		formatting::status_str(job.status)
-	};
-	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
-
-	QuantitativePerQueryRow {
-		job_id: job.job_id.clone(),
-		suite: job.suite_id.clone(),
-		evidence_class: evidence_class.to_string(),
-		source_manifest_corpus_id: Some(corpus_id.to_string()),
-		result_state: formatting::status_str(job.status).to_string(),
-		expected_relevant_count: positive_relevance_count,
-		candidate_count: candidates.len(),
-		qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(),
-		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
-		product: "ELF".to_string(),
-		adapter_id: adapter_id.to_string(),
-		metrics,
-		metric_states,
-		denominators: query_metrics::per_query_denominators(
-			candidates.len(),
-			positive_relevance_count,
-		),
-		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
-	}
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs
new file mode 100644
index 00000000..2a892850
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs
@@ -0,0 +1,48 @@
+use crate::{
+	JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
+	quantitative::{
+		QUANTITATIVE_ROW_CLAIM_BOUNDARY,
+		metrics::per_query::{evidence, query_metrics},
+	},
+	scoring,
+};
+
+pub(super) fn quantitative_per_query_row(
+	source_job: &RealWorldJob,
+	job: &JobReport,
+	corpus_id: &str,
+	evidence_class: &str,
+	adapter_id: &str,
+) -> QuantitativePerQueryRow {
+	let relevance = evidence::relevance_grades(source_job, job);
+	let candidates = scoring::produced_evidence_order(source_job);
+	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
+	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
+	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
+		"not_encoded"
+	} else {
+		formatting::status_str(job.status)
+	};
+	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
+
+	QuantitativePerQueryRow {
+		job_id: job.job_id.clone(),
+		suite: job.suite_id.clone(),
+		evidence_class: evidence_class.to_string(),
+		source_manifest_corpus_id: Some(corpus_id.to_string()),
+		result_state: formatting::status_str(job.status).to_string(),
+		expected_relevant_count: positive_relevance_count,
+		candidate_count: candidates.len(),
+		qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(),
+		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
+		product: "ELF".to_string(),
+		adapter_id: adapter_id.to_string(),
+		metrics,
+		metric_states,
+		denominators: query_metrics::per_query_denominators(
+			candidates.len(),
+			positive_relevance_count,
+		),
+		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
+	}
+}

From a2282cac363d29b7a1cbb373d161389bcdb57693 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:48:48 -0400
Subject: [PATCH 47/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative aggregate metric assembly","authority":"manual"}

---
 .../quantitative/metrics/aggregate.rs         | 43 +++----------------
 .../quantitative/metrics/aggregate/metrics.rs | 27 ++++++++++++
 .../quantitative/metrics/aggregate/states.rs  | 20 +++++++++
 3 files changed, 52 insertions(+), 38 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
index b61ee782..9e899d64 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
@@ -1,53 +1,20 @@
 mod confidence;
 mod denominators;
+mod metrics;
 mod names;
+mod states;
 
-use crate::{
-	BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting,
-	quantitative::QUANTITATIVE_K_VALUES,
-};
+use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow};
 
 pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
-	let mut sums = BTreeMap::<String, (f64, usize)>::new();
-	let mut metrics = names::quantitative_metric_names()
-		.into_iter()
-		.map(|metric| (metric, None))
-		.collect::<BTreeMap<_, _>>();
-
-	for row in rows {
-		for (metric, value) in &row.metrics {
-			if let Some(value) = value {
-				let (sum, count) = sums.entry(metric.clone()).or_default();
-
-				*sum += *value;
-				*count += 1;
-			}
-		}
-	}
-	for (metric, (sum, count)) in sums {
-		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
-	}
-
-	metrics
+	metrics::aggregate_metrics(rows)
 }
 
 pub(super) fn aggregate_metric_states(
 	result_state: &str,
 	metric_comparable: bool,
 ) -> BTreeMap<String, String> {
-	let state = if metric_comparable { result_state } else { "not_encoded" };
-	let mut states = BTreeMap::new();
-
-	for k in QUANTITATIVE_K_VALUES {
-		states.insert(format!("recall_at_{k}"), state.to_string());
-		states.insert(format!("precision_at_{k}"), state.to_string());
-		states.insert(format!("success_at_{k}"), state.to_string());
-	}
-	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
-		states.insert(metric.to_string(), state.to_string());
-	}
-
-	states
+	states::aggregate_metric_states(result_state, metric_comparable)
 }
 
 pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs
new file mode 100644
index 00000000..db17c0c1
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs
@@ -0,0 +1,27 @@
+use crate::{
+	BTreeMap, QuantitativePerQueryRow, formatting, quantitative::metrics::aggregate::names,
+};
+
+pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
+	let mut sums = BTreeMap::<String, (f64, usize)>::new();
+	let mut metrics = names::quantitative_metric_names()
+		.into_iter()
+		.map(|metric| (metric, None))
+		.collect::<BTreeMap<_, _>>();
+
+	for row in rows {
+		for (metric, value) in &row.metrics {
+			if let Some(value) = value {
+				let (sum, count) = sums.entry(metric.clone()).or_default();
+
+				*sum += *value;
+				*count += 1;
+			}
+		}
+	}
+	for (metric, (sum, count)) in sums {
+		metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64)));
+	}
+
+	metrics
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs
new file mode 100644
index 00000000..c9f631bb
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs
@@ -0,0 +1,20 @@
+use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES};
+
+pub(super) fn aggregate_metric_states(
+	result_state: &str,
+	metric_comparable: bool,
+) -> BTreeMap<String, String> {
+	let state = if metric_comparable { result_state } else { "not_encoded" };
+	let mut states = BTreeMap::new();
+
+	for k in QUANTITATIVE_K_VALUES {
+		states.insert(format!("recall_at_{k}"), state.to_string());
+		states.insert(format!("precision_at_{k}"), state.to_string());
+		states.insert(format!("success_at_{k}"), state.to_string());
+	}
+	for metric in ["mrr", "ndcg_at_5", "average_precision"] {
+		states.insert(metric.to_string(), state.to_string());
+	}
+
+	states
+}

From 4e6170e3c387c0ece0d7c57397bca832ab3ef7ea Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:51:19 -0400
Subject: [PATCH 48/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report input type","authority":"manual"}

---
 .../quantitative/report.rs                      | 17 ++++-------------
 .../quantitative/report/input.rs                | 12 ++++++++++++
 2 files changed, 16 insertions(+), 13 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
index 331acc70..3922622a 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
@@ -1,23 +1,14 @@
 mod controls;
+mod input;
 mod row;
 
+pub(crate) use self::input::QuantitativeReportInput;
+
 use crate::{
-	AdapterReport, JobReport, Path, QuantitativeBenchmarkReport, RealWorldJob, ReportSummary,
-	Result,
+	QuantitativeBenchmarkReport, Result,
 	quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA, product_manifest},
 };
 
-pub(crate) struct QuantitativeReportInput<'a> {
-	pub(crate) run_id: &'a str,
-	pub(crate) generated_at: &'a str,
-	pub(crate) adapter: &'a AdapterReport,
-	pub(crate) source_jobs: &'a [RealWorldJob],
-	pub(crate) jobs: &'a [JobReport],
-	pub(crate) summary: &'a ReportSummary,
-	pub(crate) product_manifest_path: Option<&'a Path>,
-	pub(crate) audit_manifest_path: Option<&'a Path>,
-}
-
 pub(crate) fn quantitative_scoreboard_report(
 	input: QuantitativeReportInput<'_>,
 ) -> Result<QuantitativeBenchmarkReport> {
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs
new file mode 100644
index 00000000..c4412050
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs
@@ -0,0 +1,12 @@
+use crate::{AdapterReport, JobReport, Path, RealWorldJob, ReportSummary};
+
+pub(crate) struct QuantitativeReportInput<'a> {
+	pub(crate) run_id: &'a str,
+	pub(crate) generated_at: &'a str,
+	pub(crate) adapter: &'a AdapterReport,
+	pub(crate) source_jobs: &'a [RealWorldJob],
+	pub(crate) jobs: &'a [JobReport],
+	pub(crate) summary: &'a ReportSummary,
+	pub(crate) product_manifest_path: Option<&'a Path>,
+	pub(crate) audit_manifest_path: Option<&'a Path>,
+}

From c7aebf9f6e4438e0b9046b33537fc2470f464f7e Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:55:10 -0400
Subject: [PATCH 49/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative audit context validation","authority":"manual"}

---
 .../validation/identity/context.rs            | 58 ++-----------------
 .../validation/identity/context/counts.rs     | 37 ++++++++++++
 .../validation/identity/context/fields.rs     | 39 +++++++++++++
 3 files changed, 82 insertions(+), 52 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs
index d11c8636..1d6be494 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs
@@ -1,6 +1,8 @@
+mod counts;
+mod fields;
+
 use crate::{
-	Path, QuantitativeAuditManifest, Result, eyre,
-	quantitative::audit_manifest::QuantitativeAuditContext,
+	Path, QuantitativeAuditManifest, Result, quantitative::audit_manifest::QuantitativeAuditContext,
 };
 
 pub(super) fn validate_quantitative_audit_context(
@@ -8,56 +10,8 @@ pub(super) fn validate_quantitative_audit_context(
 	path: &Path,
 	context: &QuantitativeAuditContext<'_>,
 ) -> Result<()> {
-	if manifest.run_id != context.run_id {
-		return Err(eyre::eyre!(
-			"{} has run_id {}, expected {}.",
-			path.display(),
-			manifest.run_id,
-			context.run_id
-		));
-	}
-	if manifest.corpus_id != context.corpus_id {
-		return Err(eyre::eyre!(
-			"{} has corpus_id {}, expected {}.",
-			path.display(),
-			manifest.corpus_id,
-			context.corpus_id
-		));
-	}
-	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
-		return Err(eyre::eyre!(
-			"{} has product {}:{} but current row is {}:{}.",
-			path.display(),
-			manifest.product,
-			manifest.adapter_id,
-			context.product,
-			context.adapter_id
-		));
-	}
-	if manifest.sample_size != context.source_jobs.len() {
-		return Err(eyre::eyre!(
-			"{} has sample_size {}, expected {}.",
-			path.display(),
-			manifest.sample_size,
-			context.source_jobs.len()
-		));
-	}
-	if manifest.ranking_query_count != context.ranking_query_count {
-		return Err(eyre::eyre!(
-			"{} has ranking_query_count {}, expected {}.",
-			path.display(),
-			manifest.ranking_query_count,
-			context.ranking_query_count
-		));
-	}
-	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
-		return Err(eyre::eyre!(
-			"{} has explicit_qrel_query_count {}, expected {}.",
-			path.display(),
-			manifest.explicit_qrel_query_count,
-			context.explicit_qrel_query_count
-		));
-	}
+	fields::validate_quantitative_audit_context_fields(manifest, path, context)?;
+	counts::validate_quantitative_audit_context_counts(manifest, path, context)?;
 
 	Ok(())
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs
new file mode 100644
index 00000000..a9e61f1f
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs
@@ -0,0 +1,37 @@
+use crate::{
+	Path, QuantitativeAuditManifest, Result, eyre,
+	quantitative::audit_manifest::QuantitativeAuditContext,
+};
+
+pub(super) fn validate_quantitative_audit_context_counts(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: &QuantitativeAuditContext<'_>,
+) -> Result<()> {
+	if manifest.sample_size != context.source_jobs.len() {
+		return Err(eyre::eyre!(
+			"{} has sample_size {}, expected {}.",
+			path.display(),
+			manifest.sample_size,
+			context.source_jobs.len()
+		));
+	}
+	if manifest.ranking_query_count != context.ranking_query_count {
+		return Err(eyre::eyre!(
+			"{} has ranking_query_count {}, expected {}.",
+			path.display(),
+			manifest.ranking_query_count,
+			context.ranking_query_count
+		));
+	}
+	if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count {
+		return Err(eyre::eyre!(
+			"{} has explicit_qrel_query_count {}, expected {}.",
+			path.display(),
+			manifest.explicit_qrel_query_count,
+			context.explicit_qrel_query_count
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs
new file mode 100644
index 00000000..1b39ccad
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs
@@ -0,0 +1,39 @@
+use crate::{
+	Path, QuantitativeAuditManifest, Result, eyre,
+	quantitative::audit_manifest::QuantitativeAuditContext,
+};
+
+pub(super) fn validate_quantitative_audit_context_fields(
+	manifest: &QuantitativeAuditManifest,
+	path: &Path,
+	context: &QuantitativeAuditContext<'_>,
+) -> Result<()> {
+	if manifest.run_id != context.run_id {
+		return Err(eyre::eyre!(
+			"{} has run_id {}, expected {}.",
+			path.display(),
+			manifest.run_id,
+			context.run_id
+		));
+	}
+	if manifest.corpus_id != context.corpus_id {
+		return Err(eyre::eyre!(
+			"{} has corpus_id {}, expected {}.",
+			path.display(),
+			manifest.corpus_id,
+			context.corpus_id
+		));
+	}
+	if manifest.product != context.product || manifest.adapter_id != context.adapter_id {
+		return Err(eyre::eyre!(
+			"{} has product {}:{} but current row is {}:{}.",
+			path.display(),
+			manifest.product,
+			manifest.adapter_id,
+			context.product,
+			context.adapter_id
+		));
+	}
+
+	Ok(())
+}

From a7a45db230703a2b2f8a4fe0fc1b98a86ec1e15a Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 15:58:24 -0400
Subject: [PATCH 50/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative row basis assembly","authority":"manual"}

---
 .../quantitative/report/row.rs                | 53 +++++++------------
 .../quantitative/report/row/basis.rs          | 41 ++++++++++++++
 2 files changed, 61 insertions(+), 33 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
index 8599700a..ee420902 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs
@@ -1,12 +1,12 @@
 mod audit_gates;
+mod basis;
 mod benchmark_row;
 mod query_counts;
 
 use crate::{
 	QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result,
-	quantitative::{
-		self, metrics,
-		report::{QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput},
+	quantitative::report::{
+		QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput,
 	},
 };
 
@@ -21,46 +21,33 @@ pub(super) struct CurrentQuantitativeRow {
 pub(super) fn current_quantitative_row(
 	input: &QuantitativeReportInput<'_>,
 ) -> Result<CurrentQuantitativeRow> {
-	let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs);
-	let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs);
-	let per_query_rows = metrics::quantitative_per_query_rows(
-		input.source_jobs,
-		input.jobs,
-		corpus_id.as_str(),
-		evidence_class,
-		input.adapter.adapter_id.as_str(),
-	);
-	let query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice());
-	let ranking_query_count = query_counts.ranking_query_count;
-	let explicit_qrel_query_count = query_counts.explicit_qrel_query_count;
-	let metric_comparable = ranking_query_count > 0;
-	let result_state = quantitative::quantitative_result_state(input.summary);
+	let basis = basis::quantitative_row_basis(input);
 	let audit_gates = audit_gates::quantitative_audit_gates(
 		input,
-		corpus_id.as_str(),
-		evidence_class,
-		ranking_query_count,
-		explicit_qrel_query_count,
-		metric_comparable,
+		basis.corpus_id.as_str(),
+		basis.evidence_class,
+		basis.ranking_query_count,
+		basis.explicit_qrel_query_count,
+		basis.metric_comparable,
 	)?;
 	let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput {
 		input,
-		corpus_id: corpus_id.as_str(),
-		evidence_class,
-		per_query_rows: per_query_rows.as_slice(),
-		ranking_query_count,
-		explicit_qrel_query_count,
-		metric_comparable,
-		result_state,
+		corpus_id: basis.corpus_id.as_str(),
+		evidence_class: basis.evidence_class,
+		per_query_rows: basis.per_query_rows.as_slice(),
+		ranking_query_count: basis.ranking_query_count,
+		explicit_qrel_query_count: basis.explicit_qrel_query_count,
+		metric_comparable: basis.metric_comparable,
+		result_state: basis.result_state,
 		audit_evidence: audit_gates.audit_evidence,
 		leaderboard_eligible: audit_gates.leaderboard_eligible,
 	});
 
 	Ok(CurrentQuantitativeRow {
-		corpus_id,
+		corpus_id: basis.corpus_id,
 		row,
-		per_query_rows,
-		ranking_query_count,
-		explicit_qrel_query_count,
+		per_query_rows: basis.per_query_rows,
+		ranking_query_count: basis.ranking_query_count,
+		explicit_qrel_query_count: basis.explicit_qrel_query_count,
 	})
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs
new file mode 100644
index 00000000..0f1a7e47
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs
@@ -0,0 +1,41 @@
+use crate::{
+	QuantitativePerQueryRow,
+	quantitative::{
+		self, metrics,
+		report::{QuantitativeReportInput, row::query_counts},
+	},
+};
+
+pub(super) struct QuantitativeRowBasis {
+	pub(super) corpus_id: String,
+	pub(super) evidence_class: &'static str,
+	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
+	pub(super) ranking_query_count: usize,
+	pub(super) explicit_qrel_query_count: usize,
+	pub(super) metric_comparable: bool,
+	pub(super) result_state: &'static str,
+}
+
+pub(super) fn quantitative_row_basis(input: &QuantitativeReportInput<'_>) -> QuantitativeRowBasis {
+	let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs);
+	let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs);
+	let per_query_rows = metrics::quantitative_per_query_rows(
+		input.source_jobs,
+		input.jobs,
+		corpus_id.as_str(),
+		evidence_class,
+		input.adapter.adapter_id.as_str(),
+	);
+	let query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice());
+	let ranking_query_count = query_counts.ranking_query_count;
+
+	QuantitativeRowBasis {
+		corpus_id,
+		evidence_class,
+		per_query_rows,
+		ranking_query_count,
+		explicit_qrel_query_count: query_counts.explicit_qrel_query_count,
+		metric_comparable: ranking_query_count > 0,
+		result_state: quantitative::quantitative_result_state(input.summary),
+	}
+}

From e941669c207697ab26a2cb0fc1dfb99dc26285f8 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:04:20 -0400
Subject: [PATCH 51/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative product export rows","authority":"manual"}

---
 .../product_manifest/export/rows.rs           | 57 +------------------
 .../product_manifest/export/rows/per_query.rs | 35 ++++++++++++
 .../product_manifest/export/rows/product.rs   | 21 +++++++
 3 files changed, 59 insertions(+), 54 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs
index 2e1923db..e29f4f74 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs
@@ -1,55 +1,4 @@
-use crate::{QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldReport};
+mod per_query;
+mod product;
 
-pub(super) fn exported_product_row(
-	source_row: &QuantitativeBenchmarkRow,
-	product: &str,
-	adapter_id: &str,
-	adapter_name: &str,
-) -> QuantitativeBenchmarkRow {
-	let mut row = source_row.clone();
-
-	row.product = product.to_string();
-	row.adapter_id = adapter_id.to_string();
-	row.adapter_name = adapter_name.to_string();
-	row.claim_boundary = concat!(
-		"Exported from a generated real_world_job_report quantitative row; ",
-		"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
-	)
-	.to_string();
-
-	row
-}
-
-pub(super) fn exported_per_query_rows(
-	report: &RealWorldReport,
-	source_product: &str,
-	source_adapter_id: &str,
-	product: &str,
-	adapter_id: &str,
-) -> Vec<QuantitativePerQueryRow> {
-	report
-		.quantitative_scoreboard
-		.per_query_rows
-		.iter()
-		.filter(|row| row.product == source_product && row.adapter_id == source_adapter_id)
-		.map(|row| exported_per_query_row(row, product, adapter_id))
-		.collect()
-}
-
-fn exported_per_query_row(
-	source_row: &QuantitativePerQueryRow,
-	product: &str,
-	adapter_id: &str,
-) -> QuantitativePerQueryRow {
-	let mut row = source_row.clone();
-
-	row.product = product.to_string();
-	row.adapter_id = adapter_id.to_string();
-	row.claim_boundary = concat!(
-		"Exported from generated report per-query quantitative evidence; ",
-		"import does not relax paired-significance or leaderboard gates."
-	)
-	.to_string();
-
-	row
-}
+pub(super) use self::{per_query::exported_per_query_rows, product::exported_product_row};
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs
new file mode 100644
index 00000000..fcc61d9e
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs
@@ -0,0 +1,35 @@
+use crate::{QuantitativePerQueryRow, RealWorldReport};
+
+pub(in crate::quantitative::product_manifest::export) fn exported_per_query_rows(
+	report: &RealWorldReport,
+	source_product: &str,
+	source_adapter_id: &str,
+	product: &str,
+	adapter_id: &str,
+) -> Vec<QuantitativePerQueryRow> {
+	report
+		.quantitative_scoreboard
+		.per_query_rows
+		.iter()
+		.filter(|row| row.product == source_product && row.adapter_id == source_adapter_id)
+		.map(|row| exported_per_query_row(row, product, adapter_id))
+		.collect()
+}
+
+fn exported_per_query_row(
+	source_row: &QuantitativePerQueryRow,
+	product: &str,
+	adapter_id: &str,
+) -> QuantitativePerQueryRow {
+	let mut row = source_row.clone();
+
+	row.product = product.to_string();
+	row.adapter_id = adapter_id.to_string();
+	row.claim_boundary = concat!(
+		"Exported from generated report per-query quantitative evidence; ",
+		"import does not relax paired-significance or leaderboard gates."
+	)
+	.to_string();
+
+	row
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs
new file mode 100644
index 00000000..2551c2ff
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs
@@ -0,0 +1,21 @@
+use crate::QuantitativeBenchmarkRow;
+
+pub(in crate::quantitative::product_manifest::export) fn exported_product_row(
+	source_row: &QuantitativeBenchmarkRow,
+	product: &str,
+	adapter_id: &str,
+	adapter_name: &str,
+) -> QuantitativeBenchmarkRow {
+	let mut row = source_row.clone();
+
+	row.product = product.to_string();
+	row.adapter_id = adapter_id.to_string();
+	row.adapter_name = adapter_name.to_string();
+	row.claim_boundary = concat!(
+		"Exported from a generated real_world_job_report quantitative row; ",
+		"import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates."
+	)
+	.to_string();
+
+	row
+}

From 52764643af7ff6ad4c9bac1db31c12b8cbe85306 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:07:29 -0400
Subject: [PATCH 52/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative report imports","authority":"manual"}

---
 .../quantitative/report.rs                    | 15 +++++------
 .../quantitative/report/imported.rs           | 27 +++++++++++++++++++
 2 files changed, 34 insertions(+), 8 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
index 3922622a..08b4b84a 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs
@@ -1,4 +1,5 @@
 mod controls;
+mod imported;
 mod input;
 mod row;
 
@@ -6,24 +7,22 @@ pub(crate) use self::input::QuantitativeReportInput;
 
 use crate::{
 	QuantitativeBenchmarkReport, Result,
-	quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA, product_manifest},
+	quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA},
 };
 
 pub(crate) fn quantitative_scoreboard_report(
 	input: QuantitativeReportInput<'_>,
 ) -> Result<QuantitativeBenchmarkReport> {
 	let current_row = row::current_quantitative_row(&input)?;
-	let product_manifest = product_manifest::quantitative_product_manifest(
+	let imported_rows = imported::imported_quantitative_rows(
 		input.product_manifest_path,
 		current_row.corpus_id.as_str(),
 	)?;
-	let imported_row_count = product_manifest.rows.len();
-	let imported_per_query_count = product_manifest.per_query_rows.len();
 	let mut rows = vec![current_row.row];
 	let mut merged_per_query_rows = current_row.per_query_rows;
 
-	rows.extend(product_manifest.rows);
-	merged_per_query_rows.extend(product_manifest.per_query_rows);
+	rows.extend(imported_rows.rows);
+	merged_per_query_rows.extend(imported_rows.per_query_rows);
 
 	let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2;
 	let controls = controls::quantitative_benchmark_controls(
@@ -41,8 +40,8 @@ pub(crate) fn quantitative_scoreboard_report(
 		rows,
 		per_query_rows: merged_per_query_rows,
 		metrics_not_encoded: quantitative::quantitative_metrics_not_encoded(
-			imported_row_count,
-			imported_per_query_count,
+			imported_rows.row_count,
+			imported_rows.per_query_count,
 		),
 		controls,
 		claim_boundary: concat!(
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs
new file mode 100644
index 00000000..2b2a2515
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs
@@ -0,0 +1,27 @@
+use crate::{
+	Path, QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, quantitative::product_manifest,
+};
+
+pub(super) struct ImportedQuantitativeRows {
+	pub(super) rows: Vec<QuantitativeBenchmarkRow>,
+	pub(super) per_query_rows: Vec<QuantitativePerQueryRow>,
+	pub(super) row_count: usize,
+	pub(super) per_query_count: usize,
+}
+
+pub(super) fn imported_quantitative_rows(
+	product_manifest_path: Option<&Path>,
+	corpus_id: &str,
+) -> Result<ImportedQuantitativeRows> {
+	let product_manifest =
+		product_manifest::quantitative_product_manifest(product_manifest_path, corpus_id)?;
+	let row_count = product_manifest.rows.len();
+	let per_query_count = product_manifest.per_query_rows.len();
+
+	Ok(ImportedQuantitativeRows {
+		rows: product_manifest.rows,
+		per_query_rows: product_manifest.per_query_rows,
+		row_count,
+		per_query_count,
+	})
+}

From 80680ac36cda4a1d22efce08c87e18202ce406e2 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:10:34 -0400
Subject: [PATCH 53/58] {"schema":"decodex/commit/1","summary":"Slim
 quantitative metrics facade","authority":"manual"}

---
 .../quantitative/metrics.rs                   | 75 +++----------------
 .../quantitative/metrics/aggregate.rs         | 12 ++-
 .../quantitative/metrics/per_query.rs         |  2 +-
 .../quantitative/metrics/ranking.rs           | 12 +--
 4 files changed, 25 insertions(+), 76 deletions(-)

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
index 779329f6..6ee91f58 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs
@@ -2,69 +2,14 @@ mod aggregate;
 mod per_query;
 mod ranking;
 
-use crate::{
-	BTreeMap, BTreeSet, JobReport, QuantitativeConfidenceInterval, QuantitativePerQueryRow,
-	RealWorldJob, ReportSummary,
+pub(super) use self::{
+	aggregate::{
+		aggregate_confidence_intervals, aggregate_denominators, aggregate_metric_states,
+		aggregate_metrics,
+	},
+	per_query::quantitative_per_query_rows,
+	ranking::{
+		aggregate_qrel_source, explicit_qrel_query_count, ranked_candidate_source,
+		ranking_coverage_state, ranking_query_count, ranking_query_ids,
+	},
 };
-
-pub(super) fn quantitative_per_query_rows(
-	source_jobs: &[RealWorldJob],
-	jobs: &[JobReport],
-	corpus_id: &str,
-	evidence_class: &str,
-	adapter_id: &str,
-) -> Vec<QuantitativePerQueryRow> {
-	per_query::quantitative_per_query_rows(source_jobs, jobs, corpus_id, evidence_class, adapter_id)
-}
-
-pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
-	aggregate::aggregate_metrics(rows)
-}
-
-pub(super) fn aggregate_metric_states(
-	result_state: &str,
-	metric_comparable: bool,
-) -> BTreeMap<String, String> {
-	aggregate::aggregate_metric_states(result_state, metric_comparable)
-}
-
-pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
-	aggregate::aggregate_denominators(rows)
-}
-
-pub(super) fn aggregate_confidence_intervals(
-	rows: &[QuantitativePerQueryRow],
-) -> BTreeMap<String, QuantitativeConfidenceInterval> {
-	aggregate::aggregate_confidence_intervals(rows)
-}
-
-pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
-	ranking::ranking_query_ids(source_jobs)
-}
-
-pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	ranking::ranking_query_count(source_jobs)
-}
-
-pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	ranking::explicit_qrel_query_count(source_jobs)
-}
-
-pub(super) fn aggregate_qrel_source(
-	ranking_query_count: usize,
-	explicit_qrel_query_count: usize,
-) -> &'static str {
-	ranking::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count)
-}
-
-pub(super) fn ranking_coverage_state(
-	summary: &ReportSummary,
-	source_job_count: usize,
-	ranking_query_count: usize,
-) -> &'static str {
-	ranking::ranking_coverage_state(summary, source_job_count, ranking_query_count)
-}
-
-pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
-	ranking::ranked_candidate_source(ranking_query_count)
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
index 9e899d64..992201a6 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs
@@ -6,22 +6,26 @@ mod states;
 
 use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow};
 
-pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, Option<f64>> {
+pub(in crate::quantitative) fn aggregate_metrics(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, Option<f64>> {
 	metrics::aggregate_metrics(rows)
 }
 
-pub(super) fn aggregate_metric_states(
+pub(in crate::quantitative) fn aggregate_metric_states(
 	result_state: &str,
 	metric_comparable: bool,
 ) -> BTreeMap<String, String> {
 	states::aggregate_metric_states(result_state, metric_comparable)
 }
 
-pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap<String, usize> {
+pub(in crate::quantitative) fn aggregate_denominators(
+	rows: &[QuantitativePerQueryRow],
+) -> BTreeMap<String, usize> {
 	denominators::aggregate_denominators(rows)
 }
 
-pub(super) fn aggregate_confidence_intervals(
+pub(in crate::quantitative) fn aggregate_confidence_intervals(
 	rows: &[QuantitativePerQueryRow],
 ) -> BTreeMap<String, QuantitativeConfidenceInterval> {
 	confidence::aggregate_confidence_intervals(rows)
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
index 2f8de046..1c1bf433 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs
@@ -4,7 +4,7 @@ mod row;
 
 use crate::{JobReport, QuantitativePerQueryRow, RealWorldJob};
 
-pub(super) fn quantitative_per_query_rows(
+pub(in crate::quantitative) fn quantitative_per_query_rows(
 	source_jobs: &[RealWorldJob],
 	jobs: &[JobReport],
 	corpus_id: &str,
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
index 340a7115..ab045e46 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
@@ -2,7 +2,7 @@ mod queries;
 
 use crate::{BTreeSet, RealWorldJob, ReportSummary};
 
-pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
+pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
 	source_jobs
 		.iter()
 		.filter(|job| queries::is_ranking_query(job))
@@ -10,15 +10,15 @@ pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str>
 		.collect()
 }
 
-pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
+pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
 	ranking_query_ids(source_jobs).len()
 }
 
-pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
+pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
 	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
 }
 
-pub(super) fn aggregate_qrel_source(
+pub(in crate::quantitative) fn aggregate_qrel_source(
 	ranking_query_count: usize,
 	explicit_qrel_query_count: usize,
 ) -> &'static str {
@@ -33,7 +33,7 @@ pub(super) fn aggregate_qrel_source(
 	}
 }
 
-pub(super) fn ranking_coverage_state(
+pub(in crate::quantitative) fn ranking_coverage_state(
 	summary: &ReportSummary,
 	source_job_count: usize,
 	ranking_query_count: usize,
@@ -47,6 +47,6 @@ pub(super) fn ranking_coverage_state(
 	}
 }
 
-pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
+pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
 	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
 }

From cf61ab3107ca5b9be15b74b576f73dfbcc2238aa Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:15:16 -0400
Subject: [PATCH 54/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative ranking helpers","authority":"manual"}

---
 .../quantitative/metrics/ranking.rs           | 58 +++----------------
 .../quantitative/metrics/ranking/counts.rs    | 17 ++++++
 .../quantitative/metrics/ranking/coverage.rs  | 19 ++++++
 .../quantitative/metrics/ranking/qrels.rs     | 14 +++++
 4 files changed, 58 insertions(+), 50 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
index ab045e46..6805ca30 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs
@@ -1,52 +1,10 @@
+mod counts;
+mod coverage;
+mod qrels;
 mod queries;
 
-use crate::{BTreeSet, RealWorldJob, ReportSummary};
-
-pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
-	source_jobs
-		.iter()
-		.filter(|job| queries::is_ranking_query(job))
-		.map(|job| job.job_id.as_str())
-		.collect()
-}
-
-pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	ranking_query_ids(source_jobs).len()
-}
-
-pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
-	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
-}
-
-pub(in crate::quantitative) fn aggregate_qrel_source(
-	ranking_query_count: usize,
-	explicit_qrel_query_count: usize,
-) -> &'static str {
-	if ranking_query_count == 0 {
-		"not_encoded"
-	} else if explicit_qrel_query_count == ranking_query_count {
-		"explicit_qrels"
-	} else if explicit_qrel_query_count == 0 {
-		"expected_evidence_fallback"
-	} else {
-		"mixed"
-	}
-}
-
-pub(in crate::quantitative) fn ranking_coverage_state(
-	summary: &ReportSummary,
-	source_job_count: usize,
-	ranking_query_count: usize,
-) -> &'static str {
-	if ranking_query_count == 0 {
-		"not_encoded"
-	} else if ranking_query_count == source_job_count && summary.not_encoded == 0 {
-		"complete"
-	} else {
-		"partial_coverage"
-	}
-}
-
-pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
-	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
-}
+pub(in crate::quantitative) use self::{
+	counts::{explicit_qrel_query_count, ranking_query_count, ranking_query_ids},
+	coverage::{ranked_candidate_source, ranking_coverage_state},
+	qrels::aggregate_qrel_source,
+};
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs
new file mode 100644
index 00000000..c8dd4408
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs
@@ -0,0 +1,17 @@
+use crate::{BTreeSet, RealWorldJob, quantitative::metrics::ranking::queries};
+
+pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> {
+	source_jobs
+		.iter()
+		.filter(|job| queries::is_ranking_query(job))
+		.map(|job| job.job_id.as_str())
+		.collect()
+}
+
+pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	ranking_query_ids(source_jobs).len()
+}
+
+pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize {
+	source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count()
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs
new file mode 100644
index 00000000..eb419d40
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs
@@ -0,0 +1,19 @@
+use crate::ReportSummary;
+
+pub(in crate::quantitative) fn ranking_coverage_state(
+	summary: &ReportSummary,
+	source_job_count: usize,
+	ranking_query_count: usize,
+) -> &'static str {
+	if ranking_query_count == 0 {
+		"not_encoded"
+	} else if ranking_query_count == source_job_count && summary.not_encoded == 0 {
+		"complete"
+	} else {
+		"partial_coverage"
+	}
+}
+
+pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str {
+	if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" }
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs
new file mode 100644
index 00000000..9b5c3daa
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs
@@ -0,0 +1,14 @@
+pub(in crate::quantitative) fn aggregate_qrel_source(
+	ranking_query_count: usize,
+	explicit_qrel_query_count: usize,
+) -> &'static str {
+	if ranking_query_count == 0 {
+		"not_encoded"
+	} else if explicit_qrel_query_count == ranking_query_count {
+		"explicit_qrels"
+	} else if explicit_qrel_query_count == 0 {
+		"expected_evidence_fallback"
+	} else {
+		"mixed"
+	}
+}

From c919308265f79a648740ddf3c80b4d2f5d769442 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:20:02 -0400
Subject: [PATCH 55/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query validation identity","authority":"manual"}

---
 .../validation/rows/per_query.rs              | 35 ++--------
 .../validation/rows/per_query/identity.rs     | 66 +++++++++++++++++++
 2 files changed, 70 insertions(+), 31 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs
index 4e720a68..12dc5508 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs
@@ -1,4 +1,6 @@
-use crate::{BTreeSet, Path, QuantitativeProductManifest, Result, eyre};
+mod identity;
+
+use crate::{BTreeSet, Path, QuantitativeProductManifest, Result};
 
 pub(super) fn validate_quantitative_per_query_rows(
 	manifest: &QuantitativeProductManifest,
@@ -12,36 +14,7 @@ pub(super) fn validate_quantitative_per_query_rows(
 		.collect::<BTreeSet<_>>();
 
 	for row in &manifest.per_query_rows {
-		if row.job_id.trim().is_empty()
-			|| row.suite.trim().is_empty()
-			|| row.evidence_class.trim().is_empty()
-			|| row.result_state.trim().is_empty()
-			|| row.product.trim().is_empty()
-			|| row.adapter_id.trim().is_empty()
-			|| row.qrel_source.trim().is_empty()
-		{
-			return Err(eyre::eyre!(
-				"{} has an incomplete quantitative per-query product row.",
-				path.display()
-			));
-		}
-		if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} has no matching product row.",
-				path.display(),
-				row.product,
-				row.adapter_id
-			));
-		}
-		if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-			return Err(eyre::eyre!(
-				"{} per-query row {}:{} is not same-corpus {}.",
-				path.display(),
-				row.product,
-				row.adapter_id,
-				corpus_id
-			));
-		}
+		identity::validate_per_query_row_identity(path, row, &row_keys, corpus_id)?;
 	}
 
 	Ok(())
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs
new file mode 100644
index 00000000..509f471e
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs
@@ -0,0 +1,66 @@
+use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre};
+
+pub(super) fn validate_per_query_row_identity(
+	path: &Path,
+	row: &QuantitativePerQueryRow,
+	row_keys: &BTreeSet<(&str, &str)>,
+	corpus_id: &str,
+) -> Result<()> {
+	validate_complete_per_query_row(path, row)?;
+	validate_matching_product_row(path, row, row_keys)?;
+
+	validate_same_corpus_per_query_row(path, row, corpus_id)
+}
+
+fn validate_complete_per_query_row(path: &Path, row: &QuantitativePerQueryRow) -> Result<()> {
+	if row.job_id.trim().is_empty()
+		|| row.suite.trim().is_empty()
+		|| row.evidence_class.trim().is_empty()
+		|| row.result_state.trim().is_empty()
+		|| row.product.trim().is_empty()
+		|| row.adapter_id.trim().is_empty()
+		|| row.qrel_source.trim().is_empty()
+	{
+		return Err(eyre::eyre!(
+			"{} has an incomplete quantitative per-query product row.",
+			path.display()
+		));
+	}
+
+	Ok(())
+}
+
+fn validate_matching_product_row(
+	path: &Path,
+	row: &QuantitativePerQueryRow,
+	row_keys: &BTreeSet<(&str, &str)>,
+) -> Result<()> {
+	if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
+		return Err(eyre::eyre!(
+			"{} per-query row {}:{} has no matching product row.",
+			path.display(),
+			row.product,
+			row.adapter_id
+		));
+	}
+
+	Ok(())
+}
+
+fn validate_same_corpus_per_query_row(
+	path: &Path,
+	row: &QuantitativePerQueryRow,
+	corpus_id: &str,
+) -> Result<()> {
+	if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+		return Err(eyre::eyre!(
+			"{} per-query row {}:{} is not same-corpus {}.",
+			path.display(),
+			row.product,
+			row.adapter_id,
+			corpus_id
+		));
+	}
+
+	Ok(())
+}

From ff093e59ca54b98177f3c01873e4511c3d1bd721 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:22:48 -0400
Subject: [PATCH 56/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query identity checks","authority":"manual"}

---
 .../validation/rows/per_query/identity.rs     | 65 +++----------------
 .../rows/per_query/identity/corpus.rs         | 19 ++++++
 .../rows/per_query/identity/fields.rs         | 22 +++++++
 .../rows/per_query/identity/product.rs        | 18 +++++
 4 files changed, 67 insertions(+), 57 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs
index 509f471e..737e869e 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs
@@ -1,66 +1,17 @@
-use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre};
+mod corpus;
+mod fields;
+mod product;
 
-pub(super) fn validate_per_query_row_identity(
-	path: &Path,
-	row: &QuantitativePerQueryRow,
-	row_keys: &BTreeSet<(&str, &str)>,
-	corpus_id: &str,
-) -> Result<()> {
-	validate_complete_per_query_row(path, row)?;
-	validate_matching_product_row(path, row, row_keys)?;
-
-	validate_same_corpus_per_query_row(path, row, corpus_id)
-}
-
-fn validate_complete_per_query_row(path: &Path, row: &QuantitativePerQueryRow) -> Result<()> {
-	if row.job_id.trim().is_empty()
-		|| row.suite.trim().is_empty()
-		|| row.evidence_class.trim().is_empty()
-		|| row.result_state.trim().is_empty()
-		|| row.product.trim().is_empty()
-		|| row.adapter_id.trim().is_empty()
-		|| row.qrel_source.trim().is_empty()
-	{
-		return Err(eyre::eyre!(
-			"{} has an incomplete quantitative per-query product row.",
-			path.display()
-		));
-	}
+use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result};
 
-	Ok(())
-}
-
-fn validate_matching_product_row(
+pub(super) fn validate_per_query_row_identity(
 	path: &Path,
 	row: &QuantitativePerQueryRow,
 	row_keys: &BTreeSet<(&str, &str)>,
-) -> Result<()> {
-	if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
-		return Err(eyre::eyre!(
-			"{} per-query row {}:{} has no matching product row.",
-			path.display(),
-			row.product,
-			row.adapter_id
-		));
-	}
-
-	Ok(())
-}
-
-fn validate_same_corpus_per_query_row(
-	path: &Path,
-	row: &QuantitativePerQueryRow,
 	corpus_id: &str,
 ) -> Result<()> {
-	if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
-		return Err(eyre::eyre!(
-			"{} per-query row {}:{} is not same-corpus {}.",
-			path.display(),
-			row.product,
-			row.adapter_id,
-			corpus_id
-		));
-	}
+	fields::validate_complete_per_query_row(path, row)?;
+	product::validate_matching_product_row(path, row, row_keys)?;
 
-	Ok(())
+	corpus::validate_same_corpus_per_query_row(path, row, corpus_id)
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs
new file mode 100644
index 00000000..45d0c11c
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs
@@ -0,0 +1,19 @@
+use crate::{Path, QuantitativePerQueryRow, Result, eyre};
+
+pub(super) fn validate_same_corpus_per_query_row(
+	path: &Path,
+	row: &QuantitativePerQueryRow,
+	corpus_id: &str,
+) -> Result<()> {
+	if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) {
+		return Err(eyre::eyre!(
+			"{} per-query row {}:{} is not same-corpus {}.",
+			path.display(),
+			row.product,
+			row.adapter_id,
+			corpus_id
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs
new file mode 100644
index 00000000..049614f1
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs
@@ -0,0 +1,22 @@
+use crate::{Path, QuantitativePerQueryRow, Result, eyre};
+
+pub(super) fn validate_complete_per_query_row(
+	path: &Path,
+	row: &QuantitativePerQueryRow,
+) -> Result<()> {
+	if row.job_id.trim().is_empty()
+		|| row.suite.trim().is_empty()
+		|| row.evidence_class.trim().is_empty()
+		|| row.result_state.trim().is_empty()
+		|| row.product.trim().is_empty()
+		|| row.adapter_id.trim().is_empty()
+		|| row.qrel_source.trim().is_empty()
+	{
+		return Err(eyre::eyre!(
+			"{} has an incomplete quantitative per-query product row.",
+			path.display()
+		));
+	}
+
+	Ok(())
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs
new file mode 100644
index 00000000..dfed81b1
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs
@@ -0,0 +1,18 @@
+use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre};
+
+pub(super) fn validate_matching_product_row(
+	path: &Path,
+	row: &QuantitativePerQueryRow,
+	row_keys: &BTreeSet<(&str, &str)>,
+) -> Result<()> {
+	if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) {
+		return Err(eyre::eyre!(
+			"{} per-query row {}:{} has no matching product row.",
+			path.display(),
+			row.product,
+			row.adapter_id
+		));
+	}
+
+	Ok(())
+}

From 5fe18db2423263f3338adcd501d916e83e906990 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:26:11 -0400
Subject: [PATCH 57/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query row basis","authority":"manual"}

---
 .../quantitative/metrics/per_query/row.rs     | 36 ++++-------
 .../metrics/per_query/row/basis.rs            | 60 +++++++++++++++++++
 2 files changed, 71 insertions(+), 25 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs
index 2a892850..7378fd72 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs
@@ -1,10 +1,8 @@
+mod basis;
+
 use crate::{
 	JobReport, QuantitativePerQueryRow, RealWorldJob, formatting,
-	quantitative::{
-		QUANTITATIVE_ROW_CLAIM_BOUNDARY,
-		metrics::per_query::{evidence, query_metrics},
-	},
-	scoring,
+	quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY,
 };
 
 pub(super) fn quantitative_per_query_row(
@@ -14,16 +12,7 @@ pub(super) fn quantitative_per_query_row(
 	evidence_class: &str,
 	adapter_id: &str,
 ) -> QuantitativePerQueryRow {
-	let relevance = evidence::relevance_grades(source_job, job);
-	let candidates = scoring::produced_evidence_order(source_job);
-	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
-	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
-	let metric_state = if positive_relevance_count == 0 || candidates.is_empty() {
-		"not_encoded"
-	} else {
-		formatting::status_str(job.status)
-	};
-	let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect();
+	let basis = basis::quantitative_per_query_row_basis(source_job, job);
 
 	QuantitativePerQueryRow {
 		job_id: job.job_id.clone(),
@@ -31,18 +20,15 @@ pub(super) fn quantitative_per_query_row(
 		evidence_class: evidence_class.to_string(),
 		source_manifest_corpus_id: Some(corpus_id.to_string()),
 		result_state: formatting::status_str(job.status).to_string(),
-		expected_relevant_count: positive_relevance_count,
-		candidate_count: candidates.len(),
-		qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(),
-		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
+		expected_relevant_count: basis.positive_relevance_count,
+		candidate_count: basis.candidate_count,
+		qrel_source: basis.qrel_source,
+		relevance_grade_sum: basis.relevance_grade_sum,
 		product: "ELF".to_string(),
 		adapter_id: adapter_id.to_string(),
-		metrics,
-		metric_states,
-		denominators: query_metrics::per_query_denominators(
-			candidates.len(),
-			positive_relevance_count,
-		),
+		metrics: basis.metrics,
+		metric_states: basis.metric_states,
+		denominators: basis.denominators,
 		claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(),
 	}
 }
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs
new file mode 100644
index 00000000..34db9c8b
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs
@@ -0,0 +1,60 @@
+use crate::{
+	BTreeMap, JobReport, RealWorldJob, formatting,
+	quantitative::metrics::per_query::{evidence, query_metrics},
+	scoring,
+};
+
+pub(super) struct QuantitativePerQueryRowBasis {
+	pub(super) positive_relevance_count: usize,
+	pub(super) candidate_count: usize,
+	pub(super) qrel_source: String,
+	pub(super) relevance_grade_sum: f64,
+	pub(super) metrics: BTreeMap<String, Option<f64>>,
+	pub(super) metric_states: BTreeMap<String, String>,
+	pub(super) denominators: BTreeMap<String, usize>,
+}
+
+pub(super) fn quantitative_per_query_row_basis(
+	source_job: &RealWorldJob,
+	job: &JobReport,
+) -> QuantitativePerQueryRowBasis {
+	let relevance = evidence::relevance_grades(source_job, job);
+	let candidates = scoring::produced_evidence_order(source_job);
+	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
+	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
+	let candidate_count = candidates.len();
+	let metric_states = per_query_metric_states(
+		metrics.keys(),
+		positive_relevance_count,
+		candidate_count,
+		formatting::status_str(job.status),
+	);
+
+	QuantitativePerQueryRowBasis {
+		positive_relevance_count,
+		candidate_count,
+		qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(),
+		relevance_grade_sum: formatting::round3(relevance.values().sum::<f64>()),
+		metrics,
+		metric_states,
+		denominators: query_metrics::per_query_denominators(
+			candidate_count,
+			positive_relevance_count,
+		),
+	}
+}
+
+fn per_query_metric_states<'a>(
+	metric_names: impl Iterator<Item = &'a String>,
+	positive_relevance_count: usize,
+	candidate_count: usize,
+	result_state: &str,
+) -> BTreeMap<String, String> {
+	let metric_state = if positive_relevance_count == 0 || candidate_count == 0 {
+		"not_encoded"
+	} else {
+		result_state
+	};
+
+	metric_names.map(|key| (key.clone(), metric_state.to_string())).collect()
+}

From 3626f64f2a9bb8d0976ce9c45d5d96ca93166885 Mon Sep 17 00:00:00 2001
From: Yvette Carlisle <y@acg.box>
Date: Wed, 1 Jul 2026 16:28:27 -0400
Subject: [PATCH 58/58] {"schema":"decodex/commit/1","summary":"Split
 quantitative per-query metric states","authority":"manual"}

---
 .../metrics/per_query/row/basis.rs            | 19 +++----------------
 .../metrics/per_query/row/basis/states.rs     | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 16 deletions(-)
 create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs
index 34db9c8b..42ed6323 100644
--- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs
@@ -1,3 +1,5 @@
+mod states;
+
 use crate::{
 	BTreeMap, JobReport, RealWorldJob, formatting,
 	quantitative::metrics::per_query::{evidence, query_metrics},
@@ -23,7 +25,7 @@ pub(super) fn quantitative_per_query_row_basis(
 	let positive_relevance_count = query_metrics::positive_qrel_count(&relevance);
 	let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance);
 	let candidate_count = candidates.len();
-	let metric_states = per_query_metric_states(
+	let metric_states = states::per_query_metric_states(
 		metrics.keys(),
 		positive_relevance_count,
 		candidate_count,
@@ -43,18 +45,3 @@ pub(super) fn quantitative_per_query_row_basis(
 		),
 	}
 }
-
-fn per_query_metric_states<'a>(
-	metric_names: impl Iterator<Item = &'a String>,
-	positive_relevance_count: usize,
-	candidate_count: usize,
-	result_state: &str,
-) -> BTreeMap<String, String> {
-	let metric_state = if positive_relevance_count == 0 || candidate_count == 0 {
-		"not_encoded"
-	} else {
-		result_state
-	};
-
-	metric_names.map(|key| (key.clone(), metric_state.to_string())).collect()
-}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs
new file mode 100644
index 00000000..7c987253
--- /dev/null
+++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs
@@ -0,0 +1,16 @@
+use crate::BTreeMap;
+
+pub(super) fn per_query_metric_states<'a>(
+	metric_names: impl Iterator<Item = &'a String>,
+	positive_relevance_count: usize,
+	candidate_count: usize,
+	result_state: &str,
+) -> BTreeMap<String, String> {
+	let metric_state = if positive_relevance_count == 0 || candidate_count == 0 {
+		"not_encoded"
+	} else {
+		result_state
+	};
+
+	metric_names.map(|key| (key.clone(), metric_state.to_string())).collect()
+}