hack-ink · yvette-carlisle · Jul 2, 2026 · Jul 1, 2026 · Jul 1, 2026 · Jul 1, 2026
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
@@ -51,6 +51,12 @@ pub(super) struct RunArgs {
 	/// Skip loading the real-world external adapter coverage manifest.
 	#[arg(long)]
 	pub(super) skip_external_adapter_manifest: bool,
+	/// Optional same-corpus quantitative product manifest to merge into the report.
+	#[arg(long, value_name = "FILE")]
+	pub(super) quantitative_product_manifest: Option<PathBuf>,
+	/// Optional audit manifest proving the current quantitative row's held-out/leakage gates.
+	#[arg(long, value_name = "FILE")]
+	pub(super) quantitative_audit_manifest: Option<PathBuf>,
 }
 
 #[derive(Debug, Parser)]
@@ -63,9 +69,69 @@ pub(super) struct PublishArgs {
 	pub(super) out: Option<PathBuf>,
 }
 
+#[derive(Debug, Parser)]
+pub(super) struct ExportQuantitativeProductManifestArgs {
+	/// Generated real_world_job JSON report to export.
+	#[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)]
+	pub(super) report: PathBuf,
+	/// Write product manifest JSON to this file. Omit to print to stdout.
+	#[arg(long, value_name = "FILE")]
+	pub(super) out: Option<PathBuf>,
+	/// Stable manifest id. Defaults to <run_id>-quantitative-product-manifest.
+	#[arg(long)]
+	pub(super) manifest_id: Option<String>,
+	/// Override the exported product name.
+	#[arg(long)]
+	pub(super) product: Option<String>,
+	/// Override the exported adapter id.
+	#[arg(long)]
+	pub(super) adapter_id: Option<String>,
+	/// Override the exported adapter name.
+	#[arg(long)]
+	pub(super) adapter_name: Option<String>,
+}
+
+#[derive(Debug, Parser)]
+pub(super) struct ExportQuantitativeAuditManifestArgs {
+	/// Fixture file or directory containing current product-runtime real_world_job outputs.
+	#[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)]
+	pub(super) fixtures: PathBuf,
+	/// Write audit manifest JSON to this file. Omit to print to stdout.
+	#[arg(long, value_name = "FILE")]
+	pub(super) out: Option<PathBuf>,
+	/// Stable run id that the audit manifest is allowed to attest.
+	#[arg(long, default_value = DEFAULT_RUN_ID)]
+	pub(super) run_id: String,
+	/// Stable manifest id. Defaults to <run_id>-quantitative-audit-manifest.
+	#[arg(long)]
+	pub(super) manifest_id: Option<String>,
+	/// Product name for the current row.
+	#[arg(long, default_value = "ELF")]
+	pub(super) product: String,
+	/// Adapter id for the current row.
+	#[arg(long, default_value = DEFAULT_ADAPTER_ID)]
+	pub(super) adapter_id: String,
+	/// Mark the current row as held-out only when query ids were locked before runtime.
+	#[arg(long)]
+	pub(super) held_out: bool,
+	/// Mark the current row as leakage audited only when runtime inputs excluded answers/qrels.
+	#[arg(long)]
+	pub(super) leakage_audited: bool,
+	/// Audit control string. Repeat for multiple controls.
+	#[arg(long = "control")]
+	pub(super) controls: Vec<String>,
+	/// Claim boundary recorded in the audit manifest.
+	#[arg(long)]
+	pub(super) claim_boundary: Option<String>,
+}
+
 #[derive(Debug, Subcommand)]
 #[command(rename_all = "kebab")]
 pub(super) enum Command {
+	/// Export a quantitative audit manifest for the current fixture set.
+	ExportQuantitativeAuditManifest(ExportQuantitativeAuditManifestArgs),
+	/// Export the primary quantitative row as a reusable product manifest.
+	ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs),
 	/// Parse and score real_world_job fixtures, then emit a JSON report.
 	Run(RunArgs),
 	/// Render Markdown from a generated real_world_job JSON report.

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
@@ -1,7 +1,8 @@
 use crate::{
-	AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path,
-	PathBuf, PrivateCorpusRedaction, PublishArgs, REPORT_SCHEMA, RealWorldJob, RealWorldReport,
-	Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
+	AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile,
+	ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, OffsetDateTime,
+	Path, PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA,
+	RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
 };
 
 pub(super) fn run_command(args: RunArgs) -> Result<()> {
@@ -20,6 +21,27 @@ pub(super) fn publish_command(args: PublishArgs) -> Result<()> {
 	write_or_print(args.out.as_deref(), markdown.as_str())
 }
 
+/// Exports the product manifest derived from the JSON report at `args.report` as pretty JSON.
+pub(super) fn export_quantitative_product_manifest_command(
+	args: ExportQuantitativeProductManifestArgs,
+) -> Result<()> {
+	let report: RealWorldReport = serde_json::from_str(&fs::read_to_string(&args.report)?)?;
+	let manifest = crate::quantitative_product_manifest_from_report(&report, &args)?;
+	let rendered = serde_json::to_string_pretty(&manifest)?;
+
+	write_or_print(args.out.as_deref(), rendered.as_str())
+}
+
+/// Exports the audit manifest built from the fixtures at `args.fixtures` as pretty JSON.
+pub(super) fn export_quantitative_audit_manifest_command(
+	args: ExportQuantitativeAuditManifestArgs,
+) -> Result<()> {
+	let fixtures = load_jobs(&args.fixtures)?;
+	let manifest = crate::quantitative_audit_manifest_from_jobs(fixtures.as_slice(), &args)?;
+
+	write_or_print(args.out.as_deref(), serde_json::to_string_pretty(&manifest)?.as_str())
+}
+
 fn load_jobs(path: &Path) -> Result<Vec<RealWorldJob>> {
 	let paths = fixture_paths(path)?;
 	let mut jobs = Vec::with_capacity(paths.len());
@@ -103,16 +125,29 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
 	)?;
 	let scoreboard = crate::scoreboard_report(jobs, &job_reports, &summary, &external_adapters);
 	let operational_evidence = crate::operational_evidence_report(jobs, &job_reports);
+	let adapter = adapter_report(args)?;
+	let generated_at = OffsetDateTime::now_utc().format(&Rfc3339)?;
+	let quantitative_scoreboard = crate::quantitative_scoreboard_report(QuantitativeReportInput {
+		run_id: args.run_id.as_str(),
+		generated_at: generated_at.as_str(),
+		adapter: &adapter,
+		source_jobs: jobs,
+		jobs: &job_reports,
+		summary: &summary,
+		product_manifest_path: args.quantitative_product_manifest.as_deref(),
+		audit_manifest_path: args.quantitative_audit_manifest.as_deref(),
+	})?;
 
 	Ok(RealWorldReport {
 		schema: REPORT_SCHEMA.to_string(),
 		run_id: args.run_id.clone(),
-		generated_at: OffsetDateTime::now_utc().format(&Rfc3339)?,
+		generated_at,
 		runner_version: VERSION.to_string(),
 		corpus_profile: corpus_profile(jobs),
-		adapter: adapter_report(args)?,
+		adapter,
 		scoreboard,
 		operational_evidence,
+		quantitative_scoreboard,
 		external_adapters,
 		capture_integration: capture_integration_report(jobs),
 		summary,

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs
@@ -87,6 +87,8 @@ pub(super) struct ExpectedAnswer {
 	pub(super) must_not_include: Vec<String>,
 	#[serde(default)]
 	pub(super) evidence_links: BTreeMap<String, EvidenceLink>,
+	#[serde(default)]
+	pub(super) relevance_judgments: Vec<RelevanceJudgment>,
 	pub(super) answer_type: String,
 	#[serde(default)]
 	pub(super) accepted_alternates: Vec<Value>,
@@ -96,6 +98,13 @@ pub(super) struct ExpectedAnswer {
 	pub(super) requires_refusal: bool,
 }
 
+#[derive(Debug, Deserialize)]
+pub(super) struct RelevanceJudgment {
+	pub(super) evidence_id: String,
+	#[serde(default = "default_relevance_grade")]
+	pub(super) grade: f64,
+}
+
 #[derive(Debug, Deserialize)]
 pub(super) struct RequiredEvidence {
 	pub(super) evidence_id: String,
@@ -250,3 +259,7 @@ pub(super) struct AdapterResponse {
 	pub(super) answer: ProducedAnswer,
 	pub(super) consolidation: Option<ConsolidationFixture>,
 }
+
+fn default_relevance_grade() -> f64 {
+	1.0
+}
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
@@ -16,6 +16,8 @@ mod job_reports;
 mod markdown;
 mod operational;
 mod operational_reports;
+mod quantitative;
+mod quantitative_reports;
 mod recovery;
 mod report_root;
 mod scoreboard;
@@ -49,7 +51,10 @@ use artifacts::{
 	WorkJournalNextStepArtifact, WorkJournalReadbackArtifact, WorkJournalRejectedOptionArtifact,
 	WorkJournalWhereStoppedArtifact,
 };
-use cli::{Args, Command, PublishArgs, RunArgs};
+use cli::{
+	Args, Command, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs,
+	PublishArgs, RunArgs,
+};
 use diagnostic_reports::{
 	OperatorDebugEvidence, OperatorUxGap, TraceExplainability, TraceStageExplainability,
 };
@@ -84,6 +89,15 @@ use operational_reports::{
 	OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport,
 	OperationalResourceSummary,
 };
+use quantitative::{
+	QuantitativeReportInput, quantitative_audit_manifest_from_jobs,
+	quantitative_product_manifest_from_report, quantitative_scoreboard_report,
+};
+use quantitative_reports::{
+	QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls,
+	QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval,
+	QuantitativePerQueryRow, QuantitativeProductManifest,
+};
 use report_root::RealWorldReport;
 use scoreboard::scoreboard_report;
 use scoreboard_reports::{
@@ -167,6 +181,10 @@ fn main() -> Result<()> {
 	color_eyre::install()?;
 
 	match Args::parse().command {
+		Command::ExportQuantitativeAuditManifest(args) =>
+			commands::export_quantitative_audit_manifest_command(args),
+		Command::ExportQuantitativeProductManifest(args) =>
+			commands::export_quantitative_product_manifest_command(args),
 		Command::Run(args) => commands::run_command(args),
 		Command::Publish(args) => commands::publish_command(args),
 	}

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs
@@ -6,6 +6,7 @@ mod followups;
 mod header;
 mod jobs;
 mod operational;
+mod quantitative;
 mod scoreboard;
 mod trace;
 
@@ -16,9 +17,9 @@ use crate::{
 	AdapterScenarioJudgment, AdapterSource, AdapterStatusCounts, AdapterSuiteCoverage, CostReport,
 	DEFAULT_ADAPTER_BEHAVIOR, EvolutionJobReport, ExternalAdapterReport, KnowledgeSummary,
 	MemorySummaryReport, OperatorDebugEvidence, OperatorUxGap, ProactiveBriefSummaryReport,
-	RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, ScenarioOutcomeCounts,
-	ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, ScoreboardRow,
-	TraceExplainability, WorkContinuitySummaryReport,
+	QuantitativeBenchmarkRow, RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES,
+	ScenarioOutcomeCounts, ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport,
+	ScoreboardRow, TraceExplainability, WorkContinuitySummaryReport,
 	formatting::{
 		adapter_status_str, round3, scenario_comparison_outcome_str, status_str,
 		trace_failure_stage,
@@ -32,6 +33,7 @@ pub(super) fn render_markdown(report: &RealWorldReport, report_path: &Path) -> S
 
 	self::header::render_markdown_header(&mut out, report, report_path.as_str());
 	self::scoreboard::render_markdown_scoreboard(&mut out, report);
+	self::quantitative::render_markdown_quantitative_scoreboard(&mut out, report);
 	self::operational::render_markdown_operational_evidence(&mut out, report);
 	self::adapters::render_markdown_external_adapters(&mut out, report);
 	self::adapters::render_markdown_capture_integration(&mut out, report);

diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs
@@ -0,0 +1,95 @@
+use crate::markdown::{self, QuantitativeBenchmarkRow, RealWorldReport};
+
+/// Appends the "Quantitative Benchmark Report" Markdown section for `report` to `out`.
+///
+/// Writes nothing when `report.quantitative_scoreboard.schema` is empty. Otherwise emits an
+/// explanatory paragraph, a bullet summary (schema, corpus, k values, query counts, leaderboard
+/// flag, claim boundary), a table with one line per scoreboard row, and — only when
+/// `metrics_not_encoded` is non-empty — a trailing list of those metric names.
+pub(super) fn render_markdown_quantitative_scoreboard(out: &mut String, report: &RealWorldReport) {
+	let scoreboard = &report.quantitative_scoreboard;
+
+	if scoreboard.schema.is_empty() {
+		return;
+	}
+
+	let controls = &scoreboard.controls;
+	let k_values =
+		scoreboard.k_values.iter().map(|k| k.to_string()).collect::<Vec<_>>().join(", ");
+
+	out.push_str("## Quantitative Benchmark Report\n\n");
+	out.push_str(concat!(
+		"Quantitative rows expose ranking metrics and their claim controls. ",
+		"Fixture-backed rows verify benchmark mechanics; leaderboard claims require explicit qrels, ",
+		"enough queries, and leakage controls.\n\n"
+	));
+	out.push_str(&format!("- Schema: `{}`\n", markdown::md_inline(scoreboard.schema.as_str())));
+	out.push_str(&format!("- Corpus: `{}`\n", markdown::md_inline(scoreboard.corpus_id.as_str())));
+	out.push_str(&format!("- k values: `{}`\n", markdown::md_inline(k_values.as_str())));
+	out.push_str(&format!(
+		"- Ranking queries: `{}` of `{}`; explicit-qrel queries: `{}`\n",
+		controls.current_ranking_query_count,
+		controls.current_query_count,
+		controls.current_explicit_qrel_query_count
+	));
+	out.push_str(&format!(
+		"- Leaderboard claim allowed: `{}`\n",
+		controls.leaderboard_claim_allowed
+	));
+	out.push_str(&format!(
+		"- Claim boundary: {}\n\n",
+		markdown::md_cell(scoreboard.claim_boundary.as_str())
+	));
+	// Header and alignment rows share one literal; numeric columns are right-aligned (`---:`).
+	out.push_str(concat!(
+		"| Product | State | Evidence | Qrels | Sample | Ranking Queries | Recall@5 | ",
+		"Precision@5 | MRR | nDCG@5 | AP | Leaderboard |\n",
+		"| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n"
+	));
+
+	for row in &scoreboard.rows {
+		// Metric cells in column order; each falls back to `n/a` inside `quantitative_metric`.
+		let [recall, precision, mrr, ndcg, average_precision] =
+			["recall_at_5", "precision_at_5", "mrr", "ndcg_at_5", "average_precision"]
+				.map(|metric| quantitative_metric(row, metric));
+
+		out.push_str(&format!(
+			"| {} | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | `{}` |\n",
+			markdown::md_cell(row.product.as_str()),
+			markdown::md_inline(row.result_state.as_str()),
+			markdown::md_inline(row.evidence_class.as_str()),
+			markdown::md_inline(row.qrel_source.as_str()),
+			row.sample_size,
+			row.ranking_query_count,
+			recall,
+			precision,
+			mrr,
+			ndcg,
+			average_precision,
+			row.leaderboard_eligible
+		));
+	}
+
+	if scoreboard.metrics_not_encoded.is_empty() {
+		return;
+	}
+
+	out.push_str("\nMetrics not encoded:\n");
+
+	for metric in &scoreboard.metrics_not_encoded {
+		out.push_str(&format!("- `{}`\n", markdown::md_inline(metric.as_str())));
+	}
+
+	out.push('\n');
+}
+
+/// Formats the `metric` entry of `row.metrics` as a Markdown table cell.
+///
+/// Returns the value passed through `markdown::round3` and wrapped in backticks, or the
+/// literal `` `n/a` `` when the key is absent or maps to `None`.
+fn quantitative_metric(row: &QuantitativeBenchmarkRow, metric: &str) -> String {
+	match row.metrics.get(metric).and_then(|value| *value) {
+		Some(value) => format!("`{}`", markdown::round3(value)),
+		None => "`n/a`".to_string(),
+	}
+}