Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
c013899
{"schema":"decodex/commit/1","summary":"Port explicit qrel benchmark …
yvette-carlisle Jul 1, 2026
33d6615
{"schema":"decodex/commit/1","summary":"Port quantitative benchmark r…
yvette-carlisle Jul 1, 2026
a92363b
{"schema":"decodex/commit/1","summary":"Port quantitative product man…
yvette-carlisle Jul 1, 2026
4ee6bae
{"schema":"decodex/commit/1","summary":"Port quantitative audit manif…
yvette-carlisle Jul 1, 2026
8c95885
{"schema":"decodex/commit/1","summary":"Add quantitative rate confide…
yvette-carlisle Jul 1, 2026
486c476
{"schema":"decodex/commit/1","summary":"Clean up split makefile comme…
yvette-carlisle Jul 1, 2026
d766be8
{"schema":"decodex/commit/1","summary":"Split quantitative product ma…
yvette-carlisle Jul 1, 2026
5b60c39
{"schema":"decodex/commit/1","summary":"Split quantitative audit mani…
yvette-carlisle Jul 1, 2026
a083844
{"schema":"decodex/commit/1","summary":"Split quantitative metric hel…
yvette-carlisle Jul 1, 2026
dee9e0c
{"schema":"decodex/commit/1","summary":"Split quantitative benchmark …
yvette-carlisle Jul 1, 2026
f65b0e2
{"schema":"decodex/commit/1","summary":"Split quantitative metric sub…
yvette-carlisle Jul 1, 2026
ce6f82c
{"schema":"decodex/commit/1","summary":"Split quantitative audit arti…
yvette-carlisle Jul 1, 2026
06ec4c1
{"schema":"decodex/commit/1","summary":"Split quantitative product va…
yvette-carlisle Jul 1, 2026
c869f8b
{"schema":"decodex/commit/1","summary":"Split quantitative audit vali…
yvette-carlisle Jul 1, 2026
6601725
{"schema":"decodex/commit/1","summary":"Split quantitative report ass…
yvette-carlisle Jul 1, 2026
6261914
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
69af28a
{"schema":"decodex/commit/1","summary":"Split quantitative aggregate …
yvette-carlisle Jul 1, 2026
0d546f8
{"schema":"decodex/commit/1","summary":"Split quantitative product ma…
yvette-carlisle Jul 1, 2026
e19440a
{"schema":"decodex/commit/1","summary":"Split quantitative product ro…
yvette-carlisle Jul 1, 2026
974489b
{"schema":"decodex/commit/1","summary":"Split quantitative product ro…
yvette-carlisle Jul 1, 2026
e1fc0e4
{"schema":"decodex/commit/1","summary":"Split quantitative report ass…
yvette-carlisle Jul 1, 2026
00148a8
{"schema":"decodex/commit/1","summary":"Split quantitative audit arti…
yvette-carlisle Jul 1, 2026
48781e6
{"schema":"decodex/commit/1","summary":"Split quantitative audit vali…
yvette-carlisle Jul 1, 2026
c478442
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
5dd6220
{"schema":"decodex/commit/1","summary":"Split quantitative audit mani…
yvette-carlisle Jul 1, 2026
7d8c5ef
{"schema":"decodex/commit/1","summary":"Split quantitative product ma…
yvette-carlisle Jul 1, 2026
44cea2f
{"schema":"decodex/commit/1","summary":"Split quantitative aggregate …
yvette-carlisle Jul 1, 2026
b61fbd5
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
91c4600
{"schema":"decodex/commit/1","summary":"Split quantitative report row…
yvette-carlisle Jul 1, 2026
353c953
{"schema":"decodex/commit/1","summary":"Split quantitative ranking qu…
yvette-carlisle Jul 1, 2026
9a64011
{"schema":"decodex/commit/1","summary":"Split quantitative confidence…
yvette-carlisle Jul 1, 2026
52c4ae0
{"schema":"decodex/commit/1","summary":"Split quantitative audit expo…
yvette-carlisle Jul 1, 2026
f8959df
{"schema":"decodex/commit/1","summary":"Split quantitative report row…
yvette-carlisle Jul 1, 2026
c01c488
{"schema":"decodex/commit/1","summary":"Split quantitative average pr…
yvette-carlisle Jul 1, 2026
cc23d1a
{"schema":"decodex/commit/1","summary":"Split quantitative report aud…
yvette-carlisle Jul 1, 2026
dad67b4
{"schema":"decodex/commit/1","summary":"Split quantitative report dat…
yvette-carlisle Jul 1, 2026
e36caa6
{"schema":"decodex/commit/1","summary":"Split quantitative benchmark …
yvette-carlisle Jul 1, 2026
b3c5a50
{"schema":"decodex/commit/1","summary":"Split quantitative product ro…
yvette-carlisle Jul 1, 2026
fa6ebe9
{"schema":"decodex/commit/1","summary":"Split quantitative audit iden…
yvette-carlisle Jul 1, 2026
e1dfc21
{"schema":"decodex/commit/1","summary":"Split quantitative audit dige…
yvette-carlisle Jul 1, 2026
788ab64
{"schema":"decodex/commit/1","summary":"Split quantitative audit mani…
yvette-carlisle Jul 1, 2026
b54faa7
{"schema":"decodex/commit/1","summary":"Split quantitative benchmark …
yvette-carlisle Jul 1, 2026
49325f6
{"schema":"decodex/commit/1","summary":"Split quantitative product ex…
yvette-carlisle Jul 1, 2026
449927f
{"schema":"decodex/commit/1","summary":"Split quantitative audit arti…
yvette-carlisle Jul 1, 2026
779dbed
{"schema":"decodex/commit/1","summary":"Split quantitative ranking me…
yvette-carlisle Jul 1, 2026
119287f
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
a2282ca
{"schema":"decodex/commit/1","summary":"Split quantitative aggregate …
yvette-carlisle Jul 1, 2026
4e6170e
{"schema":"decodex/commit/1","summary":"Split quantitative report inp…
yvette-carlisle Jul 1, 2026
c7aebf9
{"schema":"decodex/commit/1","summary":"Split quantitative audit cont…
yvette-carlisle Jul 1, 2026
a7a45db
{"schema":"decodex/commit/1","summary":"Split quantitative row basis …
yvette-carlisle Jul 1, 2026
e941669
{"schema":"decodex/commit/1","summary":"Split quantitative product ex…
yvette-carlisle Jul 1, 2026
5276464
{"schema":"decodex/commit/1","summary":"Split quantitative report imp…
yvette-carlisle Jul 1, 2026
80680ac
{"schema":"decodex/commit/1","summary":"Slim quantitative metrics fac…
yvette-carlisle Jul 1, 2026
cf61ab3
{"schema":"decodex/commit/1","summary":"Split quantitative ranking he…
yvette-carlisle Jul 1, 2026
c919308
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
ff093e5
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
5fe18db
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
3626f64
{"schema":"decodex/commit/1","summary":"Split quantitative per-query …
yvette-carlisle Jul 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,12 @@ pub(super) struct RunArgs {
/// Skip loading the real-world external adapter coverage manifest.
#[arg(long)]
pub(super) skip_external_adapter_manifest: bool,
/// Optional same-corpus quantitative product manifest to merge into the report.
#[arg(long, value_name = "FILE")]
pub(super) quantitative_product_manifest: Option<PathBuf>,
/// Optional audit manifest proving the current quantitative row's held-out/leakage gates.
#[arg(long, value_name = "FILE")]
pub(super) quantitative_audit_manifest: Option<PathBuf>,
}

#[derive(Debug, Parser)]
Expand All @@ -63,9 +69,69 @@ pub(super) struct PublishArgs {
pub(super) out: Option<PathBuf>,
}

// Arguments for the `export-quantitative-product-manifest` subcommand (the
// `Command::ExportQuantitativeProductManifest` variant carries this struct).
//
// The `///` comments on the fields below are picked up by clap's derive as the
// per-flag help text, so rewording them changes user-visible `--help` output.
// Every `Option` field is an override: when it is `None`, the value is chosen
// by the export code rather than by this struct.
#[derive(Debug, Parser)]
pub(super) struct ExportQuantitativeProductManifestArgs {
// Input report; falls back to `DEFAULT_REPORT_PATH` (declared outside this struct).
/// Generated real_world_job JSON report to export.
#[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)]
pub(super) report: PathBuf,
// Output destination; `None` means no `--out` flag was given.
/// Write product manifest JSON to this file. Omit to print to stdout.
#[arg(long, value_name = "FILE")]
pub(super) out: Option<PathBuf>,
/// Stable manifest id. Defaults to <run_id>-quantitative-product-manifest.
#[arg(long)]
pub(super) manifest_id: Option<String>,
/// Override the exported product name.
#[arg(long)]
pub(super) product: Option<String>,
/// Override the exported adapter id.
#[arg(long)]
pub(super) adapter_id: Option<String>,
/// Override the exported adapter name.
#[arg(long)]
pub(super) adapter_name: Option<String>,
}

// Arguments for the `export-quantitative-audit-manifest` subcommand (the
// `Command::ExportQuantitativeAuditManifest` variant carries this struct).
//
// The `///` comments on the fields below are picked up by clap's derive as the
// per-flag help text, so rewording them changes user-visible `--help` output.
#[derive(Debug, Parser)]
pub(super) struct ExportQuantitativeAuditManifestArgs {
// Fixture input; falls back to `DEFAULT_FIXTURE_PATH` (declared outside this struct).
/// Fixture file or directory containing current product-runtime real_world_job outputs.
#[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)]
pub(super) fixtures: PathBuf,
// Output destination; `None` means no `--out` flag was given.
/// Write audit manifest JSON to this file. Omit to print to stdout.
#[arg(long, value_name = "FILE")]
pub(super) out: Option<PathBuf>,
// Falls back to `DEFAULT_RUN_ID` (declared outside this struct).
/// Stable run id that the audit manifest is allowed to attest.
#[arg(long, default_value = DEFAULT_RUN_ID)]
pub(super) run_id: String,
/// Stable manifest id. Defaults to <run_id>-quantitative-audit-manifest.
#[arg(long)]
pub(super) manifest_id: Option<String>,
/// Product name for the current row.
#[arg(long, default_value = "ELF")]
pub(super) product: String,
// Falls back to `DEFAULT_ADAPTER_ID` (declared outside this struct).
/// Adapter id for the current row.
#[arg(long, default_value = DEFAULT_ADAPTER_ID)]
pub(super) adapter_id: String,
// Plain boolean switch: the attestation is opt-in via `--held-out`.
/// Mark the current row as held-out only when query ids were locked before runtime.
#[arg(long)]
pub(super) held_out: bool,
// Plain boolean switch: the attestation is opt-in via `--leakage-audited`.
/// Mark the current row as leakage audited only when runtime inputs excluded answers/qrels.
#[arg(long)]
pub(super) leakage_audited: bool,
// The flag is spelled `--control` (singular) even though the field is `controls`;
// each occurrence appends one entry to the vector.
/// Audit control string. Repeat for multiple controls.
#[arg(long = "control")]
pub(super) controls: Vec<String>,
/// Claim boundary recorded in the audit manifest.
#[arg(long)]
pub(super) claim_boundary: Option<String>,
}

#[derive(Debug, Subcommand)]
#[command(rename_all = "kebab")]
pub(super) enum Command {
/// Export a quantitative audit manifest for the current fixture set.
ExportQuantitativeAuditManifest(ExportQuantitativeAuditManifestArgs),
/// Export the primary quantitative row as a reusable product manifest.
ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs),
/// Parse and score real_world_job fixtures, then emit a JSON report.
Run(RunArgs),
/// Render Markdown from a generated real_world_job JSON report.
Expand Down
45 changes: 40 additions & 5 deletions apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use crate::{
AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path,
PathBuf, PrivateCorpusRedaction, PublishArgs, REPORT_SCHEMA, RealWorldJob, RealWorldReport,
Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile,
ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, OffsetDateTime,
Path, PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA,
RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs,
};

pub(super) fn run_command(args: RunArgs) -> Result<()> {
Expand All @@ -20,6 +21,27 @@ pub(super) fn publish_command(args: PublishArgs) -> Result<()> {
write_or_print(args.out.as_deref(), markdown.as_str())
}

/// Runs the product-manifest export subcommand.
///
/// Reads the JSON report at `args.report`, deserializes it into a
/// [`RealWorldReport`], derives the product manifest from it, and hands the
/// pretty-printed JSON to `write_or_print` together with the optional `args.out`.
///
/// # Errors
///
/// Propagates any failure from reading the file, parsing the report, building
/// the manifest, serializing it, or emitting the output.
pub(super) fn export_quantitative_product_manifest_command(
    args: ExportQuantitativeProductManifestArgs,
) -> Result<()> {
    let report_text = fs::read_to_string(&args.report)?;
    let parsed_report: RealWorldReport = serde_json::from_str(report_text.as_str())?;

    let product_manifest = crate::quantitative_product_manifest_from_report(&parsed_report, &args)?;
    let rendered = serde_json::to_string_pretty(&product_manifest)?;
    let destination = args.out.as_deref();

    write_or_print(destination, rendered.as_str())
}

/// Runs the audit-manifest export subcommand.
///
/// Loads the fixture jobs found under `args.fixtures`, derives the audit
/// manifest from them, and hands the pretty-printed JSON to `write_or_print`
/// together with the optional `args.out`.
///
/// # Errors
///
/// Propagates any failure from loading fixtures, building the manifest,
/// serializing it, or emitting the output.
pub(super) fn export_quantitative_audit_manifest_command(
    args: ExportQuantitativeAuditManifestArgs,
) -> Result<()> {
    let fixture_jobs = load_jobs(&args.fixtures)?;

    let audit_manifest = crate::quantitative_audit_manifest_from_jobs(&fixture_jobs[..], &args)?;
    let rendered = serde_json::to_string_pretty(&audit_manifest)?;
    let destination = args.out.as_deref();

    write_or_print(destination, rendered.as_str())
}

fn load_jobs(path: &Path) -> Result<Vec<RealWorldJob>> {
let paths = fixture_paths(path)?;
let mut jobs = Vec::with_capacity(paths.len());
Expand Down Expand Up @@ -103,16 +125,29 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result<RealWorldReport
)?;
let scoreboard = crate::scoreboard_report(jobs, &job_reports, &summary, &external_adapters);
let operational_evidence = crate::operational_evidence_report(jobs, &job_reports);
let adapter = adapter_report(args)?;
let generated_at = OffsetDateTime::now_utc().format(&Rfc3339)?;
let quantitative_scoreboard = crate::quantitative_scoreboard_report(QuantitativeReportInput {
run_id: args.run_id.as_str(),
generated_at: generated_at.as_str(),
adapter: &adapter,
source_jobs: jobs,
jobs: &job_reports,
summary: &summary,
product_manifest_path: args.quantitative_product_manifest.as_deref(),
audit_manifest_path: args.quantitative_audit_manifest.as_deref(),
})?;

Ok(RealWorldReport {
schema: REPORT_SCHEMA.to_string(),
run_id: args.run_id.clone(),
generated_at: OffsetDateTime::now_utc().format(&Rfc3339)?,
generated_at,
runner_version: VERSION.to_string(),
corpus_profile: corpus_profile(jobs),
adapter: adapter_report(args)?,
adapter,
scoreboard,
operational_evidence,
quantitative_scoreboard,
external_adapters,
capture_integration: capture_integration_report(jobs),
summary,
Expand Down
13 changes: 13 additions & 0 deletions apps/elf-eval/src/bin/real_world_job_benchmark/fixtures.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ pub(super) struct ExpectedAnswer {
pub(super) must_not_include: Vec<String>,
#[serde(default)]
pub(super) evidence_links: BTreeMap<String, EvidenceLink>,
#[serde(default)]
pub(super) relevance_judgments: Vec<RelevanceJudgment>,
pub(super) answer_type: String,
#[serde(default)]
pub(super) accepted_alternates: Vec<Value>,
Expand All @@ -96,6 +98,13 @@ pub(super) struct ExpectedAnswer {
pub(super) requires_refusal: bool,
}

/// One entry of `ExpectedAnswer::relevance_judgments`: a piece of evidence
/// paired with a numeric relevance grade.
#[derive(Debug, Deserialize)]
pub(super) struct RelevanceJudgment {
/// Identifier of the judged evidence item. Required in the input (no serde default).
pub(super) evidence_id: String,
/// Relevance grade. When the field is missing from the input, serde fills it
/// by calling `default_relevance_grade`.
// NOTE(review): presumably consumed as a graded-relevance gain by the ranking
// metrics — confirm against the scorer before relying on a particular scale.
#[serde(default = "default_relevance_grade")]
pub(super) grade: f64,
}

#[derive(Debug, Deserialize)]
pub(super) struct RequiredEvidence {
pub(super) evidence_id: String,
Expand Down Expand Up @@ -250,3 +259,7 @@ pub(super) struct AdapterResponse {
pub(super) answer: ProducedAnswer,
pub(super) consolidation: Option<ConsolidationFixture>,
}

/// Serde fallback for `RelevanceJudgment::grade` when the field is omitted
/// from a fixture.
fn default_relevance_grade() -> f64 {
    const FALLBACK_GRADE: f64 = 1.0;

    FALLBACK_GRADE
}
20 changes: 19 additions & 1 deletion apps/elf-eval/src/bin/real_world_job_benchmark/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ mod job_reports;
mod markdown;
mod operational;
mod operational_reports;
mod quantitative;
mod quantitative_reports;
mod recovery;
mod report_root;
mod scoreboard;
Expand Down Expand Up @@ -49,7 +51,10 @@ use artifacts::{
WorkJournalNextStepArtifact, WorkJournalReadbackArtifact, WorkJournalRejectedOptionArtifact,
WorkJournalWhereStoppedArtifact,
};
use cli::{Args, Command, PublishArgs, RunArgs};
use cli::{
Args, Command, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs,
PublishArgs, RunArgs,
};
use diagnostic_reports::{
OperatorDebugEvidence, OperatorUxGap, TraceExplainability, TraceStageExplainability,
};
Expand Down Expand Up @@ -84,6 +89,15 @@ use operational_reports::{
OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport,
OperationalResourceSummary,
};
use quantitative::{
QuantitativeReportInput, quantitative_audit_manifest_from_jobs,
quantitative_product_manifest_from_report, quantitative_scoreboard_report,
};
use quantitative_reports::{
QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls,
QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval,
QuantitativePerQueryRow, QuantitativeProductManifest,
};
use report_root::RealWorldReport;
use scoreboard::scoreboard_report;
use scoreboard_reports::{
Expand Down Expand Up @@ -167,6 +181,10 @@ fn main() -> Result<()> {
color_eyre::install()?;

match Args::parse().command {
Command::ExportQuantitativeAuditManifest(args) =>
commands::export_quantitative_audit_manifest_command(args),
Command::ExportQuantitativeProductManifest(args) =>
commands::export_quantitative_product_manifest_command(args),
Command::Run(args) => commands::run_command(args),
Command::Publish(args) => commands::publish_command(args),
}
Expand Down
8 changes: 5 additions & 3 deletions apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ mod followups;
mod header;
mod jobs;
mod operational;
mod quantitative;
mod scoreboard;
mod trace;

Expand All @@ -16,9 +17,9 @@ use crate::{
AdapterScenarioJudgment, AdapterSource, AdapterStatusCounts, AdapterSuiteCoverage, CostReport,
DEFAULT_ADAPTER_BEHAVIOR, EvolutionJobReport, ExternalAdapterReport, KnowledgeSummary,
MemorySummaryReport, OperatorDebugEvidence, OperatorUxGap, ProactiveBriefSummaryReport,
RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, ScenarioOutcomeCounts,
ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, ScoreboardRow,
TraceExplainability, WorkContinuitySummaryReport,
QuantitativeBenchmarkRow, RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES,
ScenarioOutcomeCounts, ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport,
ScoreboardRow, TraceExplainability, WorkContinuitySummaryReport,
formatting::{
adapter_status_str, round3, scenario_comparison_outcome_str, status_str,
trace_failure_stage,
Expand All @@ -32,6 +33,7 @@ pub(super) fn render_markdown(report: &RealWorldReport, report_path: &Path) -> S

self::header::render_markdown_header(&mut out, report, report_path.as_str());
self::scoreboard::render_markdown_scoreboard(&mut out, report);
self::quantitative::render_markdown_quantitative_scoreboard(&mut out, report);
self::operational::render_markdown_operational_evidence(&mut out, report);
self::adapters::render_markdown_external_adapters(&mut out, report);
self::adapters::render_markdown_capture_integration(&mut out, report);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
use crate::markdown::{self, QuantitativeBenchmarkRow, RealWorldReport};

/// Appends the "Quantitative Benchmark Report" Markdown section to `out`.
///
/// The section consists of a heading, a short explanatory paragraph, a bullet
/// list describing the scoreboard (schema, corpus, k values, query counts,
/// leaderboard flag, claim boundary), one table row per scoreboard row, and an
/// optional trailing list of metrics that are not encoded.
///
/// Nothing is written when the scoreboard's `schema` string is empty.
pub(super) fn render_markdown_quantitative_scoreboard(out: &mut String, report: &RealWorldReport) {
    const TABLE_HEADER: [&str; 2] = [
        "| Product | State | Evidence | Qrels | Sample | Ranking Queries | Recall@5 | ",
        "Precision@5 | MRR | nDCG@5 | AP | Leaderboard |\n",
    ];
    const TABLE_RULE: &str =
        "| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n";
    // Metric keys looked up per row, in the same order as the table columns.
    const METRIC_COLUMNS: [&str; 5] =
        ["recall_at_5", "precision_at_5", "mrr", "ndcg_at_5", "average_precision"];

    let board = &report.quantitative_scoreboard;

    // An empty schema string means there is no quantitative section to render.
    if board.schema.is_empty() {
        return;
    }

    let controls = &board.controls;
    let k_list = board.k_values.iter().map(|k| k.to_string()).collect::<Vec<String>>().join(", ");

    out.push_str("## Quantitative Benchmark Report\n\n");
    out.push_str("Quantitative rows expose ranking metrics and their claim controls. ");
    out.push_str(
        "Fixture-backed rows verify benchmark mechanics; leaderboard claims require explicit qrels, ",
    );
    out.push_str("enough queries, and leakage controls.\n\n");

    // Summary bullets, emitted in a fixed order.
    let bullets = [
        format!("- Schema: `{}`\n", markdown::md_inline(board.schema.as_str())),
        format!("- Corpus: `{}`\n", markdown::md_inline(board.corpus_id.as_str())),
        format!("- k values: `{}`\n", markdown::md_inline(k_list.as_str())),
        format!(
            "- Ranking queries: `{}` of `{}`; explicit-qrel queries: `{}`\n",
            controls.current_ranking_query_count,
            controls.current_query_count,
            controls.current_explicit_qrel_query_count
        ),
        format!("- Leaderboard claim allowed: `{}`\n", controls.leaderboard_claim_allowed),
        format!("- Claim boundary: {}\n\n", markdown::md_cell(board.claim_boundary.as_str())),
    ];

    for bullet in &bullets {
        out.push_str(bullet.as_str());
    }

    for piece in TABLE_HEADER {
        out.push_str(piece);
    }

    out.push_str(TABLE_RULE);

    for row in &board.rows {
        let [recall, precision, mrr, ndcg, average_precision] =
            METRIC_COLUMNS.map(|name| quantitative_metric(row, name));
        let line = format!(
            "| {} | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | `{}` |\n",
            markdown::md_cell(row.product.as_str()),
            markdown::md_inline(row.result_state.as_str()),
            markdown::md_inline(row.evidence_class.as_str()),
            markdown::md_inline(row.qrel_source.as_str()),
            row.sample_size,
            row.ranking_query_count,
            recall,
            precision,
            mrr,
            ndcg,
            average_precision,
            row.leaderboard_eligible
        );

        out.push_str(line.as_str());
    }

    // The trailing list is only present when at least one metric is missing.
    if board.metrics_not_encoded.is_empty() {
        return;
    }

    out.push_str("\nMetrics not encoded:\n");

    for missing in &board.metrics_not_encoded {
        let item = format!("- `{}`\n", markdown::md_inline(missing.as_str()));

        out.push_str(item.as_str());
    }

    out.push('\n');
}

/// Formats one metric of `row` as a Markdown table cell.
///
/// Returns the value rounded via `markdown::round3` and wrapped in backticks
/// when `metric` is present in `row.metrics` with a value; returns `` `n/a` ``
/// when the key is missing or its value is `None`.
fn quantitative_metric(row: &QuantitativeBenchmarkRow, metric: &str) -> String {
    match row.metrics.get(metric) {
        Some(Some(score)) => format!("`{}`", markdown::round3(*score)),
        // Missing key and explicit `None` both render as "not available".
        _ => "`n/a`".to_string(),
    }
}
Loading