diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs index e1bc6f32..bae29a2e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs @@ -51,6 +51,12 @@ pub(super) struct RunArgs { /// Skip loading the real-world external adapter coverage manifest. #[arg(long)] pub(super) skip_external_adapter_manifest: bool, + /// Optional same-corpus quantitative product manifest to merge into the report. + #[arg(long, value_name = "FILE")] + pub(super) quantitative_product_manifest: Option, + /// Optional audit manifest proving the current quantitative row's held-out/leakage gates. + #[arg(long, value_name = "FILE")] + pub(super) quantitative_audit_manifest: Option, } #[derive(Debug, Parser)] @@ -63,9 +69,69 @@ pub(super) struct PublishArgs { pub(super) out: Option, } +#[derive(Debug, Parser)] +pub(super) struct ExportQuantitativeProductManifestArgs { + /// Generated real_world_job JSON report to export. + #[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)] + pub(super) report: PathBuf, + /// Write product manifest JSON to this file. Omit to print to stdout. + #[arg(long, value_name = "FILE")] + pub(super) out: Option, + /// Stable manifest id. Defaults to -quantitative-product-manifest. + #[arg(long)] + pub(super) manifest_id: Option, + /// Override the exported product name. + #[arg(long)] + pub(super) product: Option, + /// Override the exported adapter id. + #[arg(long)] + pub(super) adapter_id: Option, + /// Override the exported adapter name. + #[arg(long)] + pub(super) adapter_name: Option, +} + +#[derive(Debug, Parser)] +pub(super) struct ExportQuantitativeAuditManifestArgs { + /// Fixture file or directory containing current product-runtime real_world_job outputs. + #[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)] + pub(super) fixtures: PathBuf, + /// Write audit manifest JSON to this file. 
Omit to print to stdout. + #[arg(long, value_name = "FILE")] + pub(super) out: Option, + /// Stable run id that the audit manifest is allowed to attest. + #[arg(long, default_value = DEFAULT_RUN_ID)] + pub(super) run_id: String, + /// Stable manifest id. Defaults to -quantitative-audit-manifest. + #[arg(long)] + pub(super) manifest_id: Option, + /// Product name for the current row. + #[arg(long, default_value = "ELF")] + pub(super) product: String, + /// Adapter id for the current row. + #[arg(long, default_value = DEFAULT_ADAPTER_ID)] + pub(super) adapter_id: String, + /// Mark the current row as held-out only when query ids were locked before runtime. + #[arg(long)] + pub(super) held_out: bool, + /// Mark the current row as leakage audited only when runtime inputs excluded answers/qrels. + #[arg(long)] + pub(super) leakage_audited: bool, + /// Audit control string. Repeat for multiple controls. + #[arg(long = "control")] + pub(super) controls: Vec, + /// Claim boundary recorded in the audit manifest. + #[arg(long)] + pub(super) claim_boundary: Option, +} + #[derive(Debug, Subcommand)] #[command(rename_all = "kebab")] pub(super) enum Command { + /// Export a quantitative audit manifest for the current fixture set. + ExportQuantitativeAuditManifest(ExportQuantitativeAuditManifestArgs), + /// Export the primary quantitative row as a reusable product manifest. + ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs), /// Parse and score real_world_job fixtures, then emit a JSON report. Run(RunArgs), /// Render Markdown from a generated real_world_job JSON report. 
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs index 91dc476f..a151e6da 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs @@ -1,7 +1,8 @@ use crate::{ - AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path, - PathBuf, PrivateCorpusRedaction, PublishArgs, REPORT_SCHEMA, RealWorldJob, RealWorldReport, - Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, + AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, + ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, OffsetDateTime, + Path, PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, + RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, }; pub(super) fn run_command(args: RunArgs) -> Result<()> { @@ -20,6 +21,27 @@ pub(super) fn publish_command(args: PublishArgs) -> Result<()> { write_or_print(args.out.as_deref(), markdown.as_str()) } +pub(super) fn export_quantitative_product_manifest_command( + args: ExportQuantitativeProductManifestArgs, +) -> Result<()> { + let raw = fs::read_to_string(&args.report)?; + let report = serde_json::from_str::(&raw)?; + let manifest = crate::quantitative_product_manifest_from_report(&report, &args)?; + let json = serde_json::to_string_pretty(&manifest)?; + + write_or_print(args.out.as_deref(), json.as_str()) +} + +pub(super) fn export_quantitative_audit_manifest_command( + args: ExportQuantitativeAuditManifestArgs, +) -> Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let manifest = crate::quantitative_audit_manifest_from_jobs(jobs.as_slice(), &args)?; + let json = serde_json::to_string_pretty(&manifest)?; + + write_or_print(args.out.as_deref(), json.as_str()) +} + fn load_jobs(path: &Path) -> Result> { let paths = fixture_paths(path)?; let mut jobs = 
Vec::with_capacity(paths.len()); @@ -103,16 +125,29 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result, #[serde(default)] pub(super) evidence_links: BTreeMap, + #[serde(default)] + pub(super) relevance_judgments: Vec, pub(super) answer_type: String, #[serde(default)] pub(super) accepted_alternates: Vec, @@ -96,6 +98,13 @@ pub(super) struct ExpectedAnswer { pub(super) requires_refusal: bool, } +#[derive(Debug, Deserialize)] +pub(super) struct RelevanceJudgment { + pub(super) evidence_id: String, + #[serde(default = "default_relevance_grade")] + pub(super) grade: f64, +} + #[derive(Debug, Deserialize)] pub(super) struct RequiredEvidence { pub(super) evidence_id: String, @@ -250,3 +259,7 @@ pub(super) struct AdapterResponse { pub(super) answer: ProducedAnswer, pub(super) consolidation: Option, } + +fn default_relevance_grade() -> f64 { + 1.0 +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs index 9815886f..dc77d8f0 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs @@ -16,6 +16,8 @@ mod job_reports; mod markdown; mod operational; mod operational_reports; +mod quantitative; +mod quantitative_reports; mod recovery; mod report_root; mod scoreboard; @@ -49,7 +51,10 @@ use artifacts::{ WorkJournalNextStepArtifact, WorkJournalReadbackArtifact, WorkJournalRejectedOptionArtifact, WorkJournalWhereStoppedArtifact, }; -use cli::{Args, Command, PublishArgs, RunArgs}; +use cli::{ + Args, Command, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, + PublishArgs, RunArgs, +}; use diagnostic_reports::{ OperatorDebugEvidence, OperatorUxGap, TraceExplainability, TraceStageExplainability, }; @@ -84,6 +89,15 @@ use operational_reports::{ OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport, OperationalResourceSummary, }; +use quantitative::{ + 
QuantitativeReportInput, quantitative_audit_manifest_from_jobs, + quantitative_product_manifest_from_report, quantitative_scoreboard_report, +}; +use quantitative_reports::{ + QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls, + QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, + QuantitativePerQueryRow, QuantitativeProductManifest, +}; use report_root::RealWorldReport; use scoreboard::scoreboard_report; use scoreboard_reports::{ @@ -167,6 +181,10 @@ fn main() -> Result<()> { color_eyre::install()?; match Args::parse().command { + Command::ExportQuantitativeAuditManifest(args) => + commands::export_quantitative_audit_manifest_command(args), + Command::ExportQuantitativeProductManifest(args) => + commands::export_quantitative_product_manifest_command(args), Command::Run(args) => commands::run_command(args), Command::Publish(args) => commands::publish_command(args), } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs index 36f9dba6..68bcb12a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs @@ -6,6 +6,7 @@ mod followups; mod header; mod jobs; mod operational; +mod quantitative; mod scoreboard; mod trace; @@ -16,9 +17,9 @@ use crate::{ AdapterScenarioJudgment, AdapterSource, AdapterStatusCounts, AdapterSuiteCoverage, CostReport, DEFAULT_ADAPTER_BEHAVIOR, EvolutionJobReport, ExternalAdapterReport, KnowledgeSummary, MemorySummaryReport, OperatorDebugEvidence, OperatorUxGap, ProactiveBriefSummaryReport, - RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, ScenarioOutcomeCounts, - ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, ScoreboardRow, - TraceExplainability, WorkContinuitySummaryReport, + QuantitativeBenchmarkRow, RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, + 
ScenarioOutcomeCounts, ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, + ScoreboardRow, TraceExplainability, WorkContinuitySummaryReport, formatting::{ adapter_status_str, round3, scenario_comparison_outcome_str, status_str, trace_failure_stage, @@ -32,6 +33,7 @@ pub(super) fn render_markdown(report: &RealWorldReport, report_path: &Path) -> S self::header::render_markdown_header(&mut out, report, report_path.as_str()); self::scoreboard::render_markdown_scoreboard(&mut out, report); + self::quantitative::render_markdown_quantitative_scoreboard(&mut out, report); self::operational::render_markdown_operational_evidence(&mut out, report); self::adapters::render_markdown_external_adapters(&mut out, report); self::adapters::render_markdown_capture_integration(&mut out, report); diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs new file mode 100644 index 00000000..1c3ec195 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs @@ -0,0 +1,84 @@ +use crate::markdown::{self, QuantitativeBenchmarkRow, RealWorldReport}; + +pub(super) fn render_markdown_quantitative_scoreboard(out: &mut String, report: &RealWorldReport) { + let scoreboard = &report.quantitative_scoreboard; + + if scoreboard.schema.is_empty() { + return; + } + + out.push_str("## Quantitative Benchmark Report\n\n"); + out.push_str(concat!( + "Quantitative rows expose ranking metrics and their claim controls. 
", + "Fixture-backed rows verify benchmark mechanics; leaderboard claims require explicit qrels, ", + "enough queries, and leakage controls.\n\n" + )); + out.push_str(&format!("- Schema: `{}`\n", markdown::md_inline(scoreboard.schema.as_str()))); + out.push_str(&format!("- Corpus: `{}`\n", markdown::md_inline(scoreboard.corpus_id.as_str()))); + out.push_str(&format!( + "- k values: `{}`\n", + markdown::md_inline( + scoreboard + .k_values + .iter() + .map(usize::to_string) + .collect::>() + .join(", ") + .as_str() + ) + )); + out.push_str(&format!( + "- Ranking queries: `{}` of `{}`; explicit-qrel queries: `{}`\n", + scoreboard.controls.current_ranking_query_count, + scoreboard.controls.current_query_count, + scoreboard.controls.current_explicit_qrel_query_count + )); + out.push_str(&format!( + "- Leaderboard claim allowed: `{}`\n", + scoreboard.controls.leaderboard_claim_allowed + )); + out.push_str(&format!( + "- Claim boundary: {}\n\n", + markdown::md_cell(scoreboard.claim_boundary.as_str()) + )); + out.push_str("| Product | State | Evidence | Qrels | Sample | Ranking Queries | Recall@5 | "); + out.push_str("Precision@5 | MRR | nDCG@5 | AP | Leaderboard |\n"); + out.push_str( + "| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n", + ); + + for row in &scoreboard.rows { + out.push_str(&format!( + "| {} | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | `{}` |\n", + markdown::md_cell(row.product.as_str()), + markdown::md_inline(row.result_state.as_str()), + markdown::md_inline(row.evidence_class.as_str()), + markdown::md_inline(row.qrel_source.as_str()), + row.sample_size, + row.ranking_query_count, + quantitative_metric(row, "recall_at_5"), + quantitative_metric(row, "precision_at_5"), + quantitative_metric(row, "mrr"), + quantitative_metric(row, "ndcg_at_5"), + quantitative_metric(row, "average_precision"), + row.leaderboard_eligible + )); + } + + if !scoreboard.metrics_not_encoded.is_empty() { + out.push_str("\nMetrics 
not encoded:\n"); + + for metric in &scoreboard.metrics_not_encoded { + out.push_str(&format!("- `{}`\n", markdown::md_inline(metric.as_str()))); + } + + out.push('\n'); + } +} + +fn quantitative_metric(row: &QuantitativeBenchmarkRow, metric: &str) -> String { + row.metrics + .get(metric) + .and_then(|value| *value) + .map_or_else(|| "`n/a`".to_string(), |value| format!("`{}`", markdown::round3(value))) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs new file mode 100644 index 00000000..4032c770 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -0,0 +1,118 @@ +mod audit_manifest; +mod metrics; +mod product_manifest; +mod report; + +pub(super) use self::{ + audit_manifest::quantitative_audit_manifest_from_jobs, + product_manifest::quantitative_product_manifest_from_report, + report::{QuantitativeReportInput, quantitative_scoreboard_report}, +}; + +use self::audit_manifest::QuantitativeAuditEvidence; +use crate::{AdapterReport, BTreeSet, JobReport, RealWorldJob, ReportSummary}; + +const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; +const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str = + "elf.agent_memory_quantitative_product_manifest/v1"; +const QUANTITATIVE_AUDIT_MANIFEST_SCHEMA: &str = "elf.agent_memory_quantitative_audit_manifest/v1"; +const REQUIRED_HELD_OUT_AUDIT_CONTROL: &str = "query_ids_locked_before_product_runtime"; +const REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL: &str = + "product_runtime_did_not_receive_expected_answers_or_qrels"; +const REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL: &str = + "ranked_candidates_emitted_by_product_runtime"; +const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10]; +const MIN_LEADERBOARD_QUERY_COUNT: usize = 30; +const WILSON_95_Z: f64 = 1.959963984540054; +const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!( + "Quantitative metrics are bounded to this generated 
report. ", + "Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims." +); + +fn quantitative_metrics_not_encoded( + imported_row_count: usize, + imported_per_query_count: usize, +) -> Vec { + let mut metrics = + vec!["paired_significance".to_string(), "audit_manifest_validation".to_string()]; + + if imported_row_count == 0 { + metrics.push("external_product_manifest_import".to_string()); + } + if imported_row_count > 0 && imported_per_query_count == 0 { + metrics.push("imported_product_per_query_rows".to_string()); + } + + metrics +} + +fn quantitative_corpus_id(source_jobs: &[RealWorldJob]) -> String { + let ids = source_jobs.iter().map(|job| job.corpus.corpus_id.as_str()).collect::>(); + + if ids.len() == 1 { + ids.into_iter().next().unwrap_or("unknown").to_string() + } else { + "mixed".to_string() + } +} + +fn quantitative_suite_id(jobs: &[JobReport]) -> String { + let suites = jobs.iter().map(|job| job.suite_id.as_str()).collect::>(); + + if suites.len() == 1 { + suites.into_iter().next().unwrap_or("unknown").to_string() + } else { + "mixed".to_string() + } +} + +fn quantitative_result_state(summary: &ReportSummary) -> &'static str { + if summary.unsupported_claim > 0 { + "unsupported_claim" + } else if summary.wrong_result > 0 { + "wrong_result" + } else if summary.incomplete > 0 { + "incomplete" + } else if summary.blocked > 0 { + "blocked" + } else if summary.not_encoded > 0 { + "not_encoded" + } else { + "pass" + } +} + +fn quantitative_evidence_class(adapter: &AdapterReport, jobs: &[JobReport]) -> &'static str { + if adapter.behavior == "live_real_world_adapter" { + "live_real_world" + } else if jobs.iter().any(|job| job.operational_evidence_tier == "private_corpus") { + "private_corpus" + } else if jobs.iter().any(|job| job.operational_evidence_tier == "provider_backed") { + "provider_backed" + } else if adapter.behavior.contains("public_proxy") { + "public_proxy" + } else { + "fixture_backed" + } +} + +fn 
quantitative_row_leaderboard_eligible( + evidence_class: &str, + sample_size: usize, + ranking_query_count: usize, + explicit_qrel_query_count: usize, + metric_comparable: bool, + audit_evidence: &QuantitativeAuditEvidence, +) -> bool { + metric_comparable + && evidence_class == "live_real_world" + && sample_size >= MIN_LEADERBOARD_QUERY_COUNT + && ranking_query_count == sample_size + && explicit_qrel_query_count == ranking_query_count + && audit_evidence.held_out + && audit_evidence.leakage_audited + && audit_evidence + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs new file mode 100644 index 00000000..01f7e463 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -0,0 +1,31 @@ +mod artifacts; +mod evidence; +mod export; +mod validation; + +pub(crate) use self::export::quantitative_audit_manifest_from_jobs; + +use crate::{Path, RealWorldJob, Result}; + +pub(super) struct QuantitativeAuditContext<'a> { + pub(super) run_id: &'a str, + pub(super) corpus_id: &'a str, + pub(super) product: &'a str, + pub(super) adapter_id: &'a str, + pub(super) source_jobs: &'a [RealWorldJob], + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, +} + +pub(super) struct QuantitativeAuditEvidence { + pub(super) held_out: bool, + pub(super) leakage_audited: bool, + pub(super) audit_manifest_id: Option, +} + +pub(super) fn quantitative_audit_evidence( + path: Option<&Path>, + context: QuantitativeAuditContext<'_>, +) -> Result { + evidence::quantitative_audit_evidence(path, context) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs new 
file mode 100644 index 00000000..855af455 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs @@ -0,0 +1,8 @@ +mod digest; +mod paths; +mod validation; + +pub(super) use self::{ + digest::fixture_path_digest, paths::audit_artifact_display_path, + validation::validate_quantitative_audit_artifacts, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs new file mode 100644 index 00000000..d87860d9 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs @@ -0,0 +1,39 @@ +mod paths; + +use crate::{Path, Result, fs}; + +pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) -> Result { + let mut hasher = blake3::Hasher::new(); + + if path.is_file() { + hash_fixture_file( + path, + path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), + &mut hasher, + )?; + + return Ok(hasher.finalize().to_hex().to_string()); + } + + let paths = paths::audit_fixture_paths(path)?; + + for fixture in paths { + let relative = fixture + .strip_prefix(path) + .map(|relative| relative.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); + + hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; + } + + Ok(hasher.finalize().to_hex().to_string()) +} + +fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { + hasher.update(logical_path.as_bytes()); + hasher.update(b"\0"); + hasher.update(&fs::read(path)?); + hasher.update(b"\0"); + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs new file mode 100644 
index 00000000..a7ba276c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs @@ -0,0 +1,31 @@ +use crate::{Path, PathBuf, Result, fs}; + +pub(super) fn audit_fixture_paths(path: &Path) -> Result> { + let mut paths = Vec::new(); + + collect_audit_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + if path.is_file() { + paths.push(path.to_path_buf()); + + return Ok(()); + } + + for entry in fs::read_dir(path)? { + let entry_path = entry?.path(); + + if entry_path.is_dir() { + collect_audit_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs new file mode 100644 index 00000000..3dd15d54 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs @@ -0,0 +1,35 @@ +use std::env; + +use crate::{Path, PathBuf}; + +pub(in crate::quantitative::audit_manifest) fn audit_artifact_display_path(path: &Path) -> String { + let display_path = if path.is_absolute() { + env::current_dir() + .ok() + .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) + .unwrap_or_else(|| path.to_path_buf()) + } else { + path.to_path_buf() + }; + + display_path.to_string_lossy().replace('\\', "/") +} + +pub(super) fn resolve_quantitative_audit_artifact_path( + manifest_path: &Path, + artifact_path: &str, +) -> PathBuf { + let raw = PathBuf::from(artifact_path); + + if raw.is_absolute() { + return raw; + } + + let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); + + if cwd_path.exists() { + return cwd_path; + } 
+ + manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs new file mode 100644 index 00000000..21c5e7bb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs @@ -0,0 +1,20 @@ +mod digest; +mod fields; + +use crate::{Path, QuantitativeAuditManifest, Result, eyre}; + +pub(in crate::quantitative::audit_manifest) fn validate_quantitative_audit_artifacts( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if manifest.artifacts.is_empty() { + return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); + } + + for artifact in &manifest.artifacts { + fields::validate_audit_artifact_fields(path, artifact)?; + digest::validate_audit_artifact_digest(path, artifact)?; + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs new file mode 100644 index 00000000..e6af0f61 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs @@ -0,0 +1,33 @@ +use crate::{ + Path, QuantitativeAuditArtifact, Result, eyre, + quantitative::audit_manifest::artifacts::{digest, paths}, +}; + +pub(super) fn validate_audit_artifact_digest( + path: &Path, + artifact: &QuantitativeAuditArtifact, +) -> Result<()> { + let artifact_path = + paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); + let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| { + eyre::eyre!( + "{} artifact {} could not be digested at {}: {err}", + path.display(), + artifact.role, + 
artifact_path.display() + ) + })?; + + if actual != artifact.sha256 { + return Err(eyre::eyre!( + "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", + path.display(), + artifact.role, + artifact_path.display(), + artifact.sha256, + actual + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs new file mode 100644 index 00000000..af6c149c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs @@ -0,0 +1,26 @@ +use crate::{Path, QuantitativeAuditArtifact, Result, eyre}; + +pub(super) fn validate_audit_artifact_fields( + path: &Path, + artifact: &QuantitativeAuditArtifact, +) -> Result<()> { + if artifact.role.trim().is_empty() + || artifact.path.trim().is_empty() + || artifact.sha256.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative audit artifact.", + path.display() + )); + } + if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) { + return Err(eyre::eyre!( + "{} artifact {} has invalid sha256 digest {}.", + path.display(), + artifact.role, + artifact.sha256 + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs new file mode 100644 index 00000000..f9b2e0d4 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs @@ -0,0 +1,31 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, fs, + quantitative::audit_manifest::{ + QuantitativeAuditContext, QuantitativeAuditEvidence, validation, + }, +}; + +pub(super) fn quantitative_audit_evidence( + path: Option<&Path>, + context: QuantitativeAuditContext<'_>, 
+) -> Result { + let Some(path) = path else { + return Ok(QuantitativeAuditEvidence { + held_out: false, + leakage_audited: false, + audit_manifest_id: None, + }); + }; + let raw = fs::read_to_string(path)?; + let manifest = serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) + })?; + + validation::validate_quantitative_audit_manifest(&manifest, path, context)?; + + Ok(QuantitativeAuditEvidence { + held_out: manifest.held_out, + leakage_audited: manifest.leakage_audited, + audit_manifest_id: Some(manifest.manifest_id), + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs new file mode 100644 index 00000000..6b23ccfa --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs @@ -0,0 +1,36 @@ +mod claim_boundary; +mod identity; +mod manifest; + +use crate::{ + ExportQuantitativeAuditManifestArgs, QuantitativeAuditManifest, RealWorldJob, Result, + quantitative::audit_manifest::{QuantitativeAuditContext, validation}, +}; + +pub(crate) fn quantitative_audit_manifest_from_jobs( + jobs: &[RealWorldJob], + args: &ExportQuantitativeAuditManifestArgs, +) -> Result { + let product = args.product.trim(); + let adapter_id = args.adapter_id.trim(); + + identity::validate_audit_export_identity(product, adapter_id)?; + + let manifest = manifest::quantitative_audit_manifest(jobs, args, product, adapter_id)?; + + validation::validate_quantitative_audit_manifest( + &manifest, + args.fixtures.as_path(), + QuantitativeAuditContext { + run_id: args.run_id.as_str(), + corpus_id: manifest.corpus_id.as_str(), + product, + adapter_id, + source_jobs: jobs, + ranking_query_count: manifest.ranking_query_count, + explicit_qrel_query_count: manifest.explicit_qrel_query_count, + }, + )?; + + Ok(manifest) +} diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs new file mode 100644 index 00000000..3d572c61 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs @@ -0,0 +1,21 @@ +use crate::ExportQuantitativeAuditManifestArgs; + +pub(super) fn quantitative_audit_claim_boundary( + args: &ExportQuantitativeAuditManifestArgs, +) -> String { + args.claim_boundary.clone().unwrap_or_else(|| { + if args.held_out || args.leakage_audited { + concat!( + "Audit manifest supplied by operator; runner validates run/corpus/product/", + "adapter/count/query-id/artifact bindings before opening row gates." + ) + .to_string() + } else { + concat!( + "Diagnostic audit manifest binds the current product-runtime fixture set to ", + "query ids and counts, but it does not prove held-out or leakage-audited status." 
+ ) + .to_string() + } + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs new file mode 100644 index 00000000..872da0e6 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs @@ -0,0 +1,9 @@ +use crate::{Result, eyre}; + +pub(super) fn validate_audit_export_identity(product: &str, adapter_id: &str) -> Result<()> { + if product.is_empty() || adapter_id.is_empty() { + return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs new file mode 100644 index 00000000..dad5a99e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs @@ -0,0 +1,45 @@ +use crate::{ + ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest, + RealWorldJob, Result, + quantitative::{ + self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, + audit_manifest::{artifacts, export::claim_boundary}, + metrics, + }, +}; + +pub(super) fn quantitative_audit_manifest( + jobs: &[RealWorldJob], + args: &ExportQuantitativeAuditManifestArgs, + product: &str, + adapter_id: &str, +) -> Result { + let corpus_id = quantitative::quantitative_corpus_id(jobs); + let ranking_query_count = metrics::ranking_query_count(jobs); + let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs); + + Ok(QuantitativeAuditManifest { + schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), + run_id: args.run_id.clone(), + corpus_id, + product: 
product.to_string(), + adapter_id: adapter_id.to_string(), + held_out: args.held_out, + leakage_audited: args.leakage_audited, + sample_size: jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), + controls: args.controls.clone(), + artifacts: vec![QuantitativeAuditArtifact { + role: "product_runtime_fixtures".to_string(), + path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), + sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, + }], + claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args), + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs new file mode 100644 index 00000000..5a37d191 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs @@ -0,0 +1,20 @@ +mod controls; +mod identity; +mod queries; + +use crate::{ + Path, QuantitativeAuditManifest, Result, + quantitative::audit_manifest::{QuantitativeAuditContext, artifacts}, +}; + +pub(super) fn validate_quantitative_audit_manifest( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: QuantitativeAuditContext<'_>, +) -> Result<()> { + identity::validate_quantitative_audit_identity(manifest, path, &context)?; + queries::validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; + controls::validate_quantitative_audit_controls(manifest, path)?; + + artifacts::validate_quantitative_audit_artifacts(manifest, path) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs new file mode 100644 index 00000000..9b15c1ae --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs @@ -0,0 +1,42 @@ +use crate::{ + BTreeSet, Path, QuantitativeAuditManifest, Result, eyre, + quantitative::{ + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, REQUIRED_HELD_OUT_AUDIT_CONTROL, + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + }, +}; + +pub(super) fn validate_quantitative_audit_controls( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + let controls = manifest.controls.iter().map(String::as_str).collect::>(); + + if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { + return Err(eyre::eyre!( + "{} marks held_out=true without required control {}.", + path.display(), + REQUIRED_HELD_OUT_AUDIT_CONTROL + )); + } + if manifest.leakage_audited + && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) + || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) + { + return Err(eyre::eyre!( + "{} marks leakage_audited=true without required controls {} and {}.", + path.display(), + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL + )); + } + if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() + { + return Err(eyre::eyre!( + "{} marks audit controls true but has an empty claim_boundary.", + path.display() + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs new file mode 100644 index 00000000..6444cdea --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs @@ -0,0 +1,16 @@ +mod context; +mod schema; + +use crate::{ + Path, QuantitativeAuditManifest, Result, quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn validate_quantitative_audit_identity( + manifest: 
&QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + schema::validate_quantitative_audit_schema(manifest, path)?; + + context::validate_quantitative_audit_context(manifest, path, context) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs new file mode 100644 index 00000000..1d6be494 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs @@ -0,0 +1,17 @@ +mod counts; +mod fields; + +use crate::{ + Path, QuantitativeAuditManifest, Result, quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn validate_quantitative_audit_context( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + fields::validate_quantitative_audit_context_fields(manifest, path, context)?; + counts::validate_quantitative_audit_context_counts(manifest, path, context)?; + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs new file mode 100644 index 00000000..a9e61f1f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs @@ -0,0 +1,37 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, + quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn validate_quantitative_audit_context_counts( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, 
expected {}.", + path.display(), + manifest.sample_size, + context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs new file mode 100644 index 00000000..1b39ccad --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs @@ -0,0 +1,39 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, + quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn validate_quantitative_audit_context_fields( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + + Ok(()) +} diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs new file mode 100644 index 00000000..f288eeba --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs @@ -0,0 +1,21 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, quantitative::QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, +}; + +pub(super) fn validate_quantitative_audit_schema( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs new file mode 100644 index 00000000..9910b436 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs @@ -0,0 +1,29 @@ +use crate::{ + BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre, quantitative::metrics, +}; + +pub(super) fn validate_quantitative_audit_query_ids( + manifest: &QuantitativeAuditManifest, + path: &Path, + source_jobs: &[RealWorldJob], +) -> Result<()> { + let expected = metrics::ranking_query_ids(source_jobs); + let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); + + if actual.len() != manifest.query_ids.len() { + return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); + } + if actual != expected { + let missing = 
expected.difference(&actual).copied().collect::>(); + let extra = actual.difference(&expected).copied().collect::>(); + + return Err(eyre::eyre!( + "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", + path.display(), + missing, + extra + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs new file mode 100644 index 00000000..6ee91f58 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs @@ -0,0 +1,15 @@ +mod aggregate; +mod per_query; +mod ranking; + +pub(super) use self::{ + aggregate::{ + aggregate_confidence_intervals, aggregate_denominators, aggregate_metric_states, + aggregate_metrics, + }, + per_query::quantitative_per_query_rows, + ranking::{ + aggregate_qrel_source, explicit_qrel_query_count, ranked_candidate_source, + ranking_coverage_state, ranking_query_count, ranking_query_ids, + }, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs new file mode 100644 index 00000000..992201a6 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs @@ -0,0 +1,32 @@ +mod confidence; +mod denominators; +mod metrics; +mod names; +mod states; + +use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow}; + +pub(in crate::quantitative) fn aggregate_metrics( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap> { + metrics::aggregate_metrics(rows) +} + +pub(in crate::quantitative) fn aggregate_metric_states( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + states::aggregate_metric_states(result_state, metric_comparable) +} + +pub(in crate::quantitative) fn aggregate_denominators( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + 
denominators::aggregate_denominators(rows) +} + +pub(in crate::quantitative) fn aggregate_confidence_intervals( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + confidence::aggregate_confidence_intervals(rows) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs new file mode 100644 index 00000000..2a454bdc --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs @@ -0,0 +1,24 @@ +mod rates; +mod wilson; + +use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow}; + +pub(super) fn aggregate_confidence_intervals( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + let mut confidence_intervals = BTreeMap::new(); + + for metric in rates::rate_metric_names() { + let (numerator, denominator) = + rates::aggregate_rate_numerator_denominator(rows, metric.as_str()); + + if denominator > 0 { + confidence_intervals.insert( + metric, + wilson::wilson_confidence_interval(numerator.min(denominator), denominator), + ); + } + } + + confidence_intervals +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs new file mode 100644 index 00000000..4cfb3b7f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs @@ -0,0 +1,39 @@ +use crate::{QuantitativePerQueryRow, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn rate_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + + metrics +} + +pub(super) fn aggregate_rate_numerator_denominator( + rows: 
&[QuantitativePerQueryRow], + metric: &str, +) -> (usize, usize) { + let mut numerator = 0; + let mut denominator = 0; + + for row in rows { + let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { + continue; + }; + let Some(row_denominator) = row.denominators.get(metric).copied() else { + continue; + }; + + if row_denominator == 0 { + continue; + } + + denominator += row_denominator; + numerator += (value * row_denominator as f64).round() as usize; + } + + (numerator, denominator) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs new file mode 100644 index 00000000..99c3029d --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs @@ -0,0 +1,22 @@ +use crate::{QuantitativeConfidenceInterval, formatting, quantitative::WILSON_95_Z}; + +pub(super) fn wilson_confidence_interval( + numerator: usize, + denominator: usize, +) -> QuantitativeConfidenceInterval { + let n = denominator as f64; + let p = numerator as f64 / n; + let z2 = WILSON_95_Z * WILSON_95_Z; + let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); + let half_width = + WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); + + QuantitativeConfidenceInterval { + method: "wilson_score".to_string(), + confidence: 0.95, + lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), + upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), + numerator, + denominator, + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs new file mode 100644 index 00000000..3ddd044f --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs @@ -0,0 +1,33 @@ +use crate::{BTreeMap, QuantitativePerQueryRow, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert( + format!("recall_at_{k}"), + sum_per_query_denominator(rows, &format!("recall_at_{k}")), + ); + denominators.insert( + format!("precision_at_{k}"), + sum_per_query_denominator(rows, &format!("precision_at_{k}")), + ); + denominators.insert( + format!("success_at_{k}"), + sum_per_query_denominator(rows, &format!("success_at_{k}")), + ); + } + + denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); + denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); + denominators.insert( + "average_precision".to_string(), + sum_per_query_denominator(rows, "average_precision"), + ); + + denominators +} + +fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { + rows.iter().filter_map(|row| row.denominators.get(metric)).sum() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs new file mode 100644 index 00000000..db17c0c1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs @@ -0,0 +1,27 @@ +use crate::{ + BTreeMap, QuantitativePerQueryRow, formatting, quantitative::metrics::aggregate::names, +}; + +pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { + let mut sums = BTreeMap::::new(); + let mut metrics = names::quantitative_metric_names() + .into_iter() + .map(|metric| (metric, None)) + .collect::>(); + + for row in rows { + for (metric, value) in &row.metrics { + if let 
Some(value) = value { + let (sum, count) = sums.entry(metric.clone()).or_default(); + + *sum += *value; + *count += 1; + } + } + } + for (metric, (sum, count)) in sums { + metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); + } + + metrics +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs new file mode 100644 index 00000000..90055feb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs @@ -0,0 +1,16 @@ +use crate::quantitative::QUANTITATIVE_K_VALUES; + +pub(super) fn quantitative_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + metrics.push(metric.to_string()); + } + + metrics +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs new file mode 100644 index 00000000..c9f631bb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs @@ -0,0 +1,20 @@ +use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn aggregate_metric_states( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + let state = if metric_comparable { result_state } else { "not_encoded" }; + let mut states = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + states.insert(format!("recall_at_{k}"), state.to_string()); + states.insert(format!("precision_at_{k}"), state.to_string()); + states.insert(format!("success_at_{k}"), state.to_string()); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + 
states.insert(metric.to_string(), state.to_string()); + } + + states +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs new file mode 100644 index 00000000..1c1bf433 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs @@ -0,0 +1,21 @@ +mod evidence; +mod query_metrics; +mod row; + +use crate::{JobReport, QuantitativePerQueryRow, RealWorldJob}; + +pub(in crate::quantitative) fn quantitative_per_query_rows( + source_jobs: &[RealWorldJob], + jobs: &[JobReport], + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> Vec { + source_jobs + .iter() + .zip(jobs.iter()) + .map(|(source_job, job)| { + row::quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) + }) + .collect() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs new file mode 100644 index 00000000..1a13fac2 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs @@ -0,0 +1,29 @@ +use crate::{BTreeMap, JobReport, RealWorldJob}; + +pub(super) fn relevance_grades( + source_job: &RealWorldJob, + job: &JobReport, +) -> BTreeMap { + let explicit = source_job + .expected_answer + .relevance_judgments + .iter() + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect::>(); + + if !explicit.is_empty() { + return explicit; + } + + job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() +} + +pub(super) fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { + if !source_job.expected_answer.relevance_judgments.is_empty() { + "explicit_qrels" + } else if empty { + "not_encoded" + } else { + "expected_evidence_fallback" + } +} diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs new file mode 100644 index 00000000..6685aa6e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs @@ -0,0 +1,35 @@ +mod denominators; +mod ranking; +mod relevance; + +pub(super) use self::{denominators::per_query_denominators, relevance::positive_qrel_count}; + +use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn per_query_metrics( + candidates: &[String], + relevance: &BTreeMap, +) -> BTreeMap> { + let mut metrics = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + let relevant_at_k = relevance::relevant_at_k(candidates, relevance, *k); + + metrics.insert( + format!("recall_at_{k}"), + relevance::rate(relevant_at_k, positive_qrel_count(relevance)), + ); + metrics.insert(format!("precision_at_{k}"), relevance::rate(relevant_at_k, *k)); + metrics.insert( + format!("success_at_{k}"), + Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), + ); + } + + metrics.insert("mrr".to_string(), ranking::reciprocal_rank(candidates, relevance)); + metrics.insert("ndcg_at_5".to_string(), ranking::ndcg_at_k(candidates, relevance, 5)); + metrics + .insert("average_precision".to_string(), ranking::average_precision(candidates, relevance)); + + metrics +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs new file mode 100644 index 00000000..7ef22bc8 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs @@ -0,0 +1,21 @@ +use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES}; + +pub(in crate::quantitative::metrics::per_query) fn 
per_query_denominators( + candidate_count: usize, + expected_relevant_count: usize, +) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert(format!("recall_at_{k}"), expected_relevant_count); + denominators.insert(format!("precision_at_{k}"), *k); + denominators.insert(format!("success_at_{k}"), 1); + } + + denominators.insert("mrr".to_string(), expected_relevant_count); + denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); + denominators.insert("average_precision".to_string(), expected_relevant_count); + denominators.insert("candidate_count".to_string(), candidate_count); + + denominators +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs new file mode 100644 index 00000000..e9d7dbf7 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs @@ -0,0 +1,27 @@ +mod average_precision; +mod ndcg; +mod reciprocal_rank; + +use crate::BTreeMap; + +pub(super) fn reciprocal_rank( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + reciprocal_rank::reciprocal_rank(candidates, relevance) +} + +pub(super) fn ndcg_at_k( + candidates: &[String], + relevance: &BTreeMap, + k: usize, +) -> Option { + ndcg::ndcg_at_k(candidates, relevance, k) +} + +pub(super) fn average_precision( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + average_precision::average_precision(candidates, relevance) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs new file mode 100644 index 00000000..13c196ca --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs @@ -0,0 +1,28 @@ +use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn average_precision( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + let positive_count = query_metrics::positive_qrel_count(relevance); + + if positive_count == 0 { + return None; + } + + let mut hit_count = 0; + let mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / positive_count as f64) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs new file mode 100644 index 00000000..540d2f66 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs @@ -0,0 +1,33 @@ +use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn ndcg_at_k( + candidates: &[String], + relevance: &BTreeMap, + k: usize, +) -> Option { + if query_metrics::positive_qrel_count(relevance) == 0 { + return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as 
f64).log2()) + .sum::(); + + Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs new file mode 100644 index 00000000..99956367 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs @@ -0,0 +1,19 @@ +use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn reciprocal_rank( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + if query_metrics::positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs new file mode 100644 index 00000000..a3644eb1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs @@ -0,0 +1,23 @@ +use crate::{BTreeMap, formatting}; + +pub(in crate::quantitative::metrics::per_query) fn positive_qrel_count( + relevance: &BTreeMap, +) -> usize { + relevance.values().filter(|grade| **grade > 0.0).count() +} + +pub(super) fn relevant_at_k( + candidates: &[String], + relevance: &BTreeMap, + k: usize, +) -> usize { + candidates + .iter() + .take(k) + .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) + .count() +} + +pub(super) fn rate(numerator: usize, denominator: usize) -> Option { + (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as 
f64)) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs new file mode 100644 index 00000000..7378fd72 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs @@ -0,0 +1,34 @@ +mod basis; + +use crate::{ + JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, + quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, +}; + +pub(super) fn quantitative_per_query_row( + source_job: &RealWorldJob, + job: &JobReport, + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let basis = basis::quantitative_per_query_row_basis(source_job, job); + + QuantitativePerQueryRow { + job_id: job.job_id.clone(), + suite: job.suite_id.clone(), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: formatting::status_str(job.status).to_string(), + expected_relevant_count: basis.positive_relevance_count, + candidate_count: basis.candidate_count, + qrel_source: basis.qrel_source, + relevance_grade_sum: basis.relevance_grade_sum, + product: "ELF".to_string(), + adapter_id: adapter_id.to_string(), + metrics: basis.metrics, + metric_states: basis.metric_states, + denominators: basis.denominators, + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs new file mode 100644 index 00000000..42ed6323 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs @@ -0,0 +1,47 @@ +mod states; + +use crate::{ + BTreeMap, JobReport, RealWorldJob, formatting, + quantitative::metrics::per_query::{evidence, query_metrics}, + scoring, +}; + +pub(super) struct 
QuantitativePerQueryRowBasis { + pub(super) positive_relevance_count: usize, + pub(super) candidate_count: usize, + pub(super) qrel_source: String, + pub(super) relevance_grade_sum: f64, + pub(super) metrics: BTreeMap>, + pub(super) metric_states: BTreeMap, + pub(super) denominators: BTreeMap, +} + +pub(super) fn quantitative_per_query_row_basis( + source_job: &RealWorldJob, + job: &JobReport, +) -> QuantitativePerQueryRowBasis { + let relevance = evidence::relevance_grades(source_job, job); + let candidates = scoring::produced_evidence_order(source_job); + let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); + let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); + let candidate_count = candidates.len(); + let metric_states = states::per_query_metric_states( + metrics.keys(), + positive_relevance_count, + candidate_count, + formatting::status_str(job.status), + ); + + QuantitativePerQueryRowBasis { + positive_relevance_count, + candidate_count, + qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(), + relevance_grade_sum: formatting::round3(relevance.values().sum::()), + metrics, + metric_states, + denominators: query_metrics::per_query_denominators( + candidate_count, + positive_relevance_count, + ), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs new file mode 100644 index 00000000..7c987253 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs @@ -0,0 +1,16 @@ +use crate::BTreeMap; + +pub(super) fn per_query_metric_states<'a>( + metric_names: impl Iterator, + positive_relevance_count: usize, + candidate_count: usize, + result_state: &str, +) -> BTreeMap { + let metric_state = if positive_relevance_count == 0 || candidate_count == 0 { + "not_encoded" 
+ } else { + result_state + }; + + metric_names.map(|key| (key.clone(), metric_state.to_string())).collect() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs new file mode 100644 index 00000000..6805ca30 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs @@ -0,0 +1,10 @@ +mod counts; +mod coverage; +mod qrels; +mod queries; + +pub(in crate::quantitative) use self::{ + counts::{explicit_qrel_query_count, ranking_query_count, ranking_query_ids}, + coverage::{ranked_candidate_source, ranking_coverage_state}, + qrels::aggregate_qrel_source, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs new file mode 100644 index 00000000..c8dd4408 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs @@ -0,0 +1,17 @@ +use crate::{BTreeSet, RealWorldJob, quantitative::metrics::ranking::queries}; + +pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { + source_jobs + .iter() + .filter(|job| queries::is_ranking_query(job)) + .map(|job| job.job_id.as_str()) + .collect() +} + +pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { + ranking_query_ids(source_jobs).len() +} + +pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { + source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs new file mode 100644 index 00000000..eb419d40 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs @@ -0,0 +1,19 @@ +use crate::ReportSummary; + +pub(in crate::quantitative) fn ranking_coverage_state( + summary: &ReportSummary, + source_job_count: usize, + ranking_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { + "complete" + } else { + "partial_coverage" + } +} + +pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { + if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs new file mode 100644 index 00000000..9b5c3daa --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs @@ -0,0 +1,14 @@ +pub(in crate::quantitative) fn aggregate_qrel_source( + ranking_query_count: usize, + explicit_qrel_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if explicit_qrel_query_count == ranking_query_count { + "explicit_qrels" + } else if explicit_qrel_query_count == 0 { + "expected_evidence_fallback" + } else { + "mixed" + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs new file mode 100644 index 00000000..8ada5678 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs @@ -0,0 +1,38 @@ +use crate::{BTreeMap, RealWorldJob, scoring}; + +pub(super) fn is_ranking_query(job: &RealWorldJob) -> bool { + !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job) +} + +fn ranking_relevance_grades(source_job: 
&RealWorldJob) -> BTreeMap { + if !source_job.expected_answer.relevance_judgments.is_empty() { + return source_job + .expected_answer + .relevance_judgments + .iter() + .filter(|judgment| judgment.grade > 0.0) + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect(); + } + + source_job + .required_evidence + .iter() + .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) + .map(|evidence| (evidence.evidence_id.clone(), 1.0)) + .collect() +} + +fn ranking_query_attempted(job: &RealWorldJob) -> bool { + if !scoring::produced_evidence_order(job).is_empty() { + return true; + } + + let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { + return false; + }; + + answer.trace_explainability.as_ref().is_some_and(|trace| { + trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") + }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs new file mode 100644 index 00000000..4cd8b6c0 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs @@ -0,0 +1,14 @@ +mod export; +mod import; +mod validation; + +pub(crate) use self::export::quantitative_product_manifest_from_report; + +use crate::{Path, QuantitativeProductManifest, Result}; + +pub(super) fn quantitative_product_manifest( + path: Option<&Path>, + corpus_id: &str, +) -> Result { + import::quantitative_product_manifest(path, corpus_id) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs new file mode 100644 index 00000000..d72509f8 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs @@ -0,0 +1,32 @@ +mod identity; +mod manifest; +mod rows; +mod source; + +use crate::{ + ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, REPORT_SCHEMA, + RealWorldReport, Result, eyre, quantitative::product_manifest::validation, +}; + +pub(crate) fn quantitative_product_manifest_from_report( + report: &RealWorldReport, + args: &ExportQuantitativeProductManifestArgs, +) -> Result { + if report.schema != REPORT_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {REPORT_SCHEMA}.", + args.report.display(), + report.schema + )); + } + + let manifest = manifest::quantitative_product_manifest(report, args)?; + + validation::validate_quantitative_product_manifest( + &manifest, + &args.report, + manifest.corpus_id.as_str(), + )?; + + Ok(manifest) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs new file mode 100644 index 00000000..4f1f6453 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs @@ -0,0 +1,23 @@ +use crate::{ExportQuantitativeProductManifestArgs, Result, eyre}; + +pub(super) fn validate_export_identity( + args: &ExportQuantitativeProductManifestArgs, + product: &str, + adapter_id: &str, + adapter_name: &str, +) -> Result<()> { + if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() { + return Err(eyre::eyre!( + "{} cannot export an incomplete quantitative product identity.", + args.report.display() + )); + } + if product == "ELF" { + return Err(eyre::eyre!( + "{} exports product ELF; use --product for external product manifest exports.", + args.report.display() + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs new file mode 100644 index 00000000..592cb19f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs @@ -0,0 +1,46 @@ +use crate::{ + ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, RealWorldReport, Result, + quantitative::{ + QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, + product_manifest::export::{identity, rows, source}, + }, +}; + +pub(super) fn quantitative_product_manifest( + report: &RealWorldReport, + args: &ExportQuantitativeProductManifestArgs, +) -> Result { + let source = source::product_export_identity(report, args)?; + + identity::validate_export_identity( + args, + source.product, + source.adapter_id, + source.adapter_name, + )?; + + let row = rows::exported_product_row( + source.row, + source.product, + source.adapter_id, + source.adapter_name, + ); + let per_query_rows = rows::exported_per_query_rows( + report, + source.source_product, + source.source_adapter_id, + source.product, + source.adapter_id, + ); + + Ok(QuantitativeProductManifest { + schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), + corpus_id: report.quantitative_scoreboard.corpus_id.clone(), + rows: vec![row], + per_query_rows, + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs new file mode 100644 index 00000000..e29f4f74 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs @@ -0,0 +1,4 @@ +mod per_query; +mod product; + +pub(super) use self::{per_query::exported_per_query_rows, product::exported_product_row}; diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs new file mode 100644 index 00000000..fcc61d9e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs @@ -0,0 +1,35 @@ +use crate::{QuantitativePerQueryRow, RealWorldReport}; + +pub(in crate::quantitative::product_manifest::export) fn exported_per_query_rows( + report: &RealWorldReport, + source_product: &str, + source_adapter_id: &str, + product: &str, + adapter_id: &str, +) -> Vec { + report + .quantitative_scoreboard + .per_query_rows + .iter() + .filter(|row| row.product == source_product && row.adapter_id == source_adapter_id) + .map(|row| exported_per_query_row(row, product, adapter_id)) + .collect() +} + +fn exported_per_query_row( + source_row: &QuantitativePerQueryRow, + product: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.claim_boundary = concat!( + "Exported from generated report per-query quantitative evidence; ", + "import does not relax paired-significance or leaderboard gates." 
+ ) + .to_string(); + + row +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs new file mode 100644 index 00000000..2551c2ff --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs @@ -0,0 +1,21 @@ +use crate::QuantitativeBenchmarkRow; + +pub(in crate::quantitative::product_manifest::export) fn exported_product_row( + source_row: &QuantitativeBenchmarkRow, + product: &str, + adapter_id: &str, + adapter_name: &str, +) -> QuantitativeBenchmarkRow { + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.adapter_name = adapter_name.to_string(); + row.claim_boundary = concat!( + "Exported from a generated real_world_job_report quantitative row; ", + "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." 
+ ) + .to_string(); + + row +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs new file mode 100644 index 00000000..6a3b7ed9 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs @@ -0,0 +1,37 @@ +use crate::{ + ExportQuantitativeProductManifestArgs, QuantitativeBenchmarkRow, RealWorldReport, Result, eyre, +}; + +pub(super) struct ProductExportIdentity<'report> { + pub(super) row: &'report QuantitativeBenchmarkRow, + pub(super) source_product: &'report str, + pub(super) source_adapter_id: &'report str, + pub(super) product: &'report str, + pub(super) adapter_id: &'report str, + pub(super) adapter_name: &'report str, +} + +pub(super) fn product_export_identity<'report>( + report: &'report RealWorldReport, + args: &'report ExportQuantitativeProductManifestArgs, +) -> Result> { + let source_row = + report.quantitative_scoreboard.rows.first().ok_or_else(|| { + eyre::eyre!("{} has no quantitative product row.", args.report.display()) + })?; + let source_product = source_row.product.as_str(); + let source_adapter_id = source_row.adapter_id.as_str(); + let product = args.product.as_deref().unwrap_or(source_product).trim(); + let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); + let adapter_name = + args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); + + Ok(ProductExportIdentity { + row: source_row, + source_product, + source_adapter_id, + product, + adapter_id, + adapter_name, + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs new file mode 100644 index 00000000..12df9a92 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs @@ -0,0 +1,32 @@ +use crate::{ + Path, QuantitativeProductManifest, Result, eyre, fs, quantitative::product_manifest::validation, +}; + +pub(super) fn quantitative_product_manifest( + path: Option<&Path>, + corpus_id: &str, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeProductManifest::default()); + }; + let raw = fs::read_to_string(path)?; + let mut manifest = + serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display()) + })?; + + populate_source_manifest_corpus_ids(&mut manifest); + + validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?; + + Ok(manifest) +} + +fn populate_source_manifest_corpus_ids(manifest: &mut QuantitativeProductManifest) { + for row in &mut manifest.rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } + for row in &mut manifest.per_query_rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs new file mode 100644 index 00000000..fe86d636 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs @@ -0,0 +1,40 @@ +mod rows; + +use crate::{ + Path, QuantitativeProductManifest, Result, eyre, + quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, +}; + +pub(super) fn validate_quantitative_product_manifest( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if 
manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.corpus_id != corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected same-corpus {}.", + path.display(), + manifest.corpus_id, + corpus_id + )); + } + if manifest.rows.is_empty() { + return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); + } + + rows::validate_quantitative_product_rows(manifest, path, corpus_id)?; + rows::validate_quantitative_per_query_rows(manifest, path, corpus_id)?; + rows::validate_ranked_row_evidence(manifest, path)?; + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs new file mode 100644 index 00000000..36009dfa --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs @@ -0,0 +1,28 @@ +mod per_query; +mod product; +mod ranking; + +use crate::{Path, QuantitativeProductManifest, Result}; + +pub(super) fn validate_quantitative_product_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + product::validate_quantitative_product_rows(manifest, path, corpus_id) +} + +pub(super) fn validate_quantitative_per_query_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + per_query::validate_quantitative_per_query_rows(manifest, path, corpus_id) +} + +pub(super) fn validate_ranked_row_evidence( + manifest: &QuantitativeProductManifest, + path: &Path, +) -> Result<()> { + ranking::validate_ranked_row_evidence(manifest, path) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs new file 
mode 100644 index 00000000..12dc5508 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs @@ -0,0 +1,21 @@ +mod identity; + +use crate::{BTreeSet, Path, QuantitativeProductManifest, Result}; + +pub(super) fn validate_quantitative_per_query_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + let row_keys = manifest + .rows + .iter() + .map(|row| (row.product.as_str(), row.adapter_id.as_str())) + .collect::>(); + + for row in &manifest.per_query_rows { + identity::validate_per_query_row_identity(path, row, &row_keys, corpus_id)?; + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs new file mode 100644 index 00000000..737e869e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs @@ -0,0 +1,17 @@ +mod corpus; +mod fields; +mod product; + +use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result}; + +pub(super) fn validate_per_query_row_identity( + path: &Path, + row: &QuantitativePerQueryRow, + row_keys: &BTreeSet<(&str, &str)>, + corpus_id: &str, +) -> Result<()> { + fields::validate_complete_per_query_row(path, row)?; + product::validate_matching_product_row(path, row, row_keys)?; + + corpus::validate_same_corpus_per_query_row(path, row, corpus_id) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs new file mode 100644 index 00000000..45d0c11c --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs @@ -0,0 +1,19 @@ +use crate::{Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_same_corpus_per_query_row( + path: &Path, + row: &QuantitativePerQueryRow, + corpus_id: &str, +) -> Result<()> { + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs new file mode 100644 index 00000000..049614f1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs @@ -0,0 +1,22 @@ +use crate::{Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_complete_per_query_row( + path: &Path, + row: &QuantitativePerQueryRow, +) -> Result<()> { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs new file mode 100644 index 00000000..dfed81b1 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs @@ -0,0 +1,18 @@ +use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_matching_product_row( + path: &Path, + row: &QuantitativePerQueryRow, + row_keys: &BTreeSet<(&str, &str)>, +) -> Result<()> { + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs new file mode 100644 index 00000000..ac009d59 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs @@ -0,0 +1,20 @@ +mod identity; +mod leaderboard; + +use crate::{Path, QuantitativeProductManifest, Result}; + +pub(super) fn validate_quantitative_product_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + for row in &manifest.rows { + identity::validate_product_row_identity(path, row, corpus_id)?; + + if row.leaderboard_eligible { + leaderboard::validate_leaderboard_eligible_product_row(path, row)?; + } + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs new file mode 100644 index 00000000..5dd82465 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs @@ -0,0 +1,34 @@ +use crate::{Path, QuantitativeBenchmarkRow, Result, eyre}; + +pub(super) fn 
validate_product_row_identity( + path: &Path, + row: &QuantitativeBenchmarkRow, + corpus_id: &str, +) -> Result<()> { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete quantitative product row.", path.display())); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs new file mode 100644 index 00000000..e5f76ae2 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs @@ -0,0 +1,31 @@ +use crate::{ + Path, QuantitativeBenchmarkRow, Result, eyre, quantitative::MIN_LEADERBOARD_QUERY_COUNT, +}; + +pub(super) fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked 
leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs new file mode 100644 index 00000000..8206e54b --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs @@ -0,0 +1,33 @@ +use crate::{Path, QuantitativeProductManifest, Result, eyre}; + +pub(super) fn validate_ranked_row_evidence( + manifest: &QuantitativeProductManifest, + path: &Path, +) -> Result<()> { + for row in &manifest.rows { + if row.ranking_query_count == 0 { + continue; + } + + let per_query_count = manifest + .per_query_rows + .iter() + .filter(|per_query| { + per_query.product == row.product && per_query.adapter_id == row.adapter_id + }) + .count(); + + if per_query_count < row.ranking_query_count { + return Err(eyre::eyre!( + "{} row {}:{} declares {} ranked queries but only {} per-query rows.", + path.display(), + row.product, + row.adapter_id, + row.ranking_query_count, + per_query_count + )); + } + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs new file mode 100644 index 00000000..08b4b84a --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs @@ -0,0 +1,53 @@ +mod controls; +mod imported; +mod input; +mod row; + +pub(crate) use self::input::QuantitativeReportInput; + +use crate::{ + QuantitativeBenchmarkReport, Result, + quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA}, +}; + +pub(crate) fn quantitative_scoreboard_report( + input: QuantitativeReportInput<'_>, +) 
-> Result { + let current_row = row::current_quantitative_row(&input)?; + let imported_rows = imported::imported_quantitative_rows( + input.product_manifest_path, + current_row.corpus_id.as_str(), + )?; + let mut rows = vec![current_row.row]; + let mut merged_per_query_rows = current_row.per_query_rows; + + rows.extend(imported_rows.rows); + merged_per_query_rows.extend(imported_rows.per_query_rows); + + let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; + let controls = controls::quantitative_benchmark_controls( + &input, + current_row.ranking_query_count, + current_row.explicit_qrel_query_count, + leaderboard_claim_allowed, + ); + + Ok(QuantitativeBenchmarkReport { + schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(), + generated_at: input.generated_at.to_string(), + corpus_id: current_row.corpus_id, + k_values: QUANTITATIVE_K_VALUES.to_vec(), + rows, + per_query_rows: merged_per_query_rows, + metrics_not_encoded: quantitative::quantitative_metrics_not_encoded( + imported_rows.row_count, + imported_rows.per_query_count, + ), + controls, + claim_boundary: concat!( + "Do not convert fixture mechanics, missing explicit qrels, ", + "or partial candidate coverage into product leaderboard claims." 
+ ) + .to_string(), + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs new file mode 100644 index 00000000..78d4b723 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs @@ -0,0 +1,26 @@ +use crate::{ + QuantitativeBenchmarkControls, + quantitative::{MIN_LEADERBOARD_QUERY_COUNT, report::QuantitativeReportInput}, +}; + +pub(super) fn quantitative_benchmark_controls( + input: &QuantitativeReportInput<'_>, + ranking_query_count: usize, + explicit_qrel_query_count: usize, + leaderboard_claim_allowed: bool, +) -> QuantitativeBenchmarkControls { + QuantitativeBenchmarkControls { + same_corpus_required: true, + same_task_required: true, + ranked_candidates_required_for_ranking_metrics: true, + explicit_relevance_judgments_required_for_leaderboard: true, + minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT, + current_query_count: input.source_jobs.len(), + current_ranking_query_count: ranking_query_count, + current_explicit_qrel_query_count: explicit_qrel_query_count, + leaderboard_claim_allowed, + leakage_control: + "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" + .to_string(), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs new file mode 100644 index 00000000..2b2a2515 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs @@ -0,0 +1,27 @@ +use crate::{ + Path, QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, quantitative::product_manifest, +}; + +pub(super) struct ImportedQuantitativeRows { + pub(super) rows: Vec, + pub(super) per_query_rows: Vec, + pub(super) row_count: usize, + pub(super) per_query_count: usize, +} + +pub(super) fn imported_quantitative_rows( + 
product_manifest_path: Option<&Path>, + corpus_id: &str, +) -> Result { + let product_manifest = + product_manifest::quantitative_product_manifest(product_manifest_path, corpus_id)?; + let row_count = product_manifest.rows.len(); + let per_query_count = product_manifest.per_query_rows.len(); + + Ok(ImportedQuantitativeRows { + rows: product_manifest.rows, + per_query_rows: product_manifest.per_query_rows, + row_count, + per_query_count, + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs new file mode 100644 index 00000000..c4412050 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs @@ -0,0 +1,12 @@ +use crate::{AdapterReport, JobReport, Path, RealWorldJob, ReportSummary}; + +pub(crate) struct QuantitativeReportInput<'a> { + pub(crate) run_id: &'a str, + pub(crate) generated_at: &'a str, + pub(crate) adapter: &'a AdapterReport, + pub(crate) source_jobs: &'a [RealWorldJob], + pub(crate) jobs: &'a [JobReport], + pub(crate) summary: &'a ReportSummary, + pub(crate) product_manifest_path: Option<&'a Path>, + pub(crate) audit_manifest_path: Option<&'a Path>, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs new file mode 100644 index 00000000..ee420902 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs @@ -0,0 +1,53 @@ +mod audit_gates; +mod basis; +mod benchmark_row; +mod query_counts; + +use crate::{ + QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, + quantitative::report::{ + QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput, + }, +}; + +pub(super) struct CurrentQuantitativeRow { + pub(super) corpus_id: String, + pub(super) row: QuantitativeBenchmarkRow, + pub(super) per_query_rows: Vec, + pub(super) 
ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, +} + +pub(super) fn current_quantitative_row( + input: &QuantitativeReportInput<'_>, +) -> Result { + let basis = basis::quantitative_row_basis(input); + let audit_gates = audit_gates::quantitative_audit_gates( + input, + basis.corpus_id.as_str(), + basis.evidence_class, + basis.ranking_query_count, + basis.explicit_qrel_query_count, + basis.metric_comparable, + )?; + let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput { + input, + corpus_id: basis.corpus_id.as_str(), + evidence_class: basis.evidence_class, + per_query_rows: basis.per_query_rows.as_slice(), + ranking_query_count: basis.ranking_query_count, + explicit_qrel_query_count: basis.explicit_qrel_query_count, + metric_comparable: basis.metric_comparable, + result_state: basis.result_state, + audit_evidence: audit_gates.audit_evidence, + leaderboard_eligible: audit_gates.leaderboard_eligible, + }); + + Ok(CurrentQuantitativeRow { + corpus_id: basis.corpus_id, + row, + per_query_rows: basis.per_query_rows, + ranking_query_count: basis.ranking_query_count, + explicit_qrel_query_count: basis.explicit_qrel_query_count, + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs new file mode 100644 index 00000000..31d2ddee --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs @@ -0,0 +1,45 @@ +use crate::{ + Result, + quantitative::{ + self, + audit_manifest::{self, QuantitativeAuditContext, QuantitativeAuditEvidence}, + report::QuantitativeReportInput, + }, +}; + +pub(super) struct QuantitativeAuditGates { + pub(super) audit_evidence: QuantitativeAuditEvidence, + pub(super) leaderboard_eligible: bool, +} + +pub(super) fn quantitative_audit_gates( + input: &QuantitativeReportInput<'_>, + corpus_id: &str, + evidence_class: 
&str, + ranking_query_count: usize, + explicit_qrel_query_count: usize, + metric_comparable: bool, +) -> Result { + let audit_evidence = audit_manifest::quantitative_audit_evidence( + input.audit_manifest_path, + QuantitativeAuditContext { + run_id: input.run_id, + corpus_id, + product: "ELF", + adapter_id: input.adapter.adapter_id.as_str(), + source_jobs: input.source_jobs, + ranking_query_count, + explicit_qrel_query_count, + }, + )?; + let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible( + evidence_class, + input.source_jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + metric_comparable, + &audit_evidence, + ); + + Ok(QuantitativeAuditGates { audit_evidence, leaderboard_eligible }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs new file mode 100644 index 00000000..0f1a7e47 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs @@ -0,0 +1,41 @@ +use crate::{ + QuantitativePerQueryRow, + quantitative::{ + self, metrics, + report::{QuantitativeReportInput, row::query_counts}, + }, +}; + +pub(super) struct QuantitativeRowBasis { + pub(super) corpus_id: String, + pub(super) evidence_class: &'static str, + pub(super) per_query_rows: Vec, + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, + pub(super) metric_comparable: bool, + pub(super) result_state: &'static str, +} + +pub(super) fn quantitative_row_basis(input: &QuantitativeReportInput<'_>) -> QuantitativeRowBasis { + let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs); + let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs); + let per_query_rows = metrics::quantitative_per_query_rows( + input.source_jobs, + input.jobs, + corpus_id.as_str(), + evidence_class, + input.adapter.adapter_id.as_str(), + ); + let 
query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice()); + let ranking_query_count = query_counts.ranking_query_count; + + QuantitativeRowBasis { + corpus_id, + evidence_class, + per_query_rows, + ranking_query_count, + explicit_qrel_query_count: query_counts.explicit_qrel_query_count, + metric_comparable: ranking_query_count > 0, + result_state: quantitative::quantitative_result_state(input.summary), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs new file mode 100644 index 00000000..4b8b2e31 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs @@ -0,0 +1,59 @@ +mod input; + +pub(super) use self::input::QuantitativeBenchmarkRowInput; + +use crate::{ + QuantitativeBenchmarkRow, + quantitative::{self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, metrics}, +}; + +pub(super) fn quantitative_benchmark_row( + row_input: QuantitativeBenchmarkRowInput<'_, '_>, +) -> QuantitativeBenchmarkRow { + let QuantitativeBenchmarkRowInput { + input, + corpus_id, + evidence_class, + per_query_rows, + ranking_query_count, + explicit_qrel_query_count, + metric_comparable, + result_state, + audit_evidence, + leaderboard_eligible, + } = row_input; + + QuantitativeBenchmarkRow { + product: "ELF".to_string(), + adapter_id: input.adapter.adapter_id.clone(), + adapter_name: input.adapter.name.clone(), + suite: quantitative::quantitative_suite_id(input.jobs), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: result_state.to_string(), + comparable: metric_comparable, + metric_comparable, + leaderboard_eligible, + held_out: audit_evidence.held_out, + leakage_audited: audit_evidence.leakage_audited, + audit_manifest_id: audit_evidence.audit_manifest_id, + fixture_regression_only: evidence_class == 
"fixture_backed", + sample_size: input.jobs.len(), + ranking_query_count, + ranking_coverage_state: metrics::ranking_coverage_state( + input.summary, + input.source_jobs.len(), + ranking_query_count, + ) + .to_string(), + ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), + qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) + .to_string(), + explicit_qrel_query_count, + metrics: metrics::aggregate_metrics(per_query_rows), + metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), + denominators: metrics::aggregate_denominators(per_query_rows), + confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows), + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs new file mode 100644 index 00000000..a8e3f96a --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs @@ -0,0 +1,17 @@ +use crate::{ + QuantitativePerQueryRow, + quantitative::{audit_manifest::QuantitativeAuditEvidence, report::QuantitativeReportInput}, +}; + +pub(in crate::quantitative::report::row) struct QuantitativeBenchmarkRowInput<'a, 'b> { + pub(in crate::quantitative::report::row) input: &'a QuantitativeReportInput<'b>, + pub(in crate::quantitative::report::row) corpus_id: &'a str, + pub(in crate::quantitative::report::row) evidence_class: &'a str, + pub(in crate::quantitative::report::row) per_query_rows: &'a [QuantitativePerQueryRow], + pub(in crate::quantitative::report::row) ranking_query_count: usize, + pub(in crate::quantitative::report::row) explicit_qrel_query_count: usize, + pub(in crate::quantitative::report::row) metric_comparable: bool, + pub(in crate::quantitative::report::row) result_state: &'a 
str, + pub(in crate::quantitative::report::row) audit_evidence: QuantitativeAuditEvidence, + pub(in crate::quantitative::report::row) leaderboard_eligible: bool, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs new file mode 100644 index 00000000..12632f0a --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs @@ -0,0 +1,21 @@ +use crate::QuantitativePerQueryRow; + +pub(super) struct QuantitativeQueryCounts { + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, +} + +pub(super) fn quantitative_query_counts( + per_query_rows: &[QuantitativePerQueryRow], +) -> QuantitativeQueryCounts { + QuantitativeQueryCounts { + ranking_query_count: per_query_rows + .iter() + .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) + .count(), + explicit_qrel_query_count: per_query_rows + .iter() + .filter(|row| row.qrel_source == "explicit_qrels") + .count(), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs new file mode 100644 index 00000000..a3bff704 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs @@ -0,0 +1,12 @@ +mod audit; +mod benchmark; +mod product; + +pub(crate) use self::{ + audit::{QuantitativeAuditArtifact, QuantitativeAuditManifest}, + benchmark::{ + QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, + QuantitativeConfidenceInterval, QuantitativePerQueryRow, + }, + product::QuantitativeProductManifest, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs new file mode 100644 index 00000000..4b2ce584 --- /dev/null +++ 
/// One artifact entry listed in a quantitative audit manifest.
///
/// All three fields are required when deserializing (none carries `#[serde(default)]`).
#[derive(Clone, Debug, Deserialize, Serialize)]
pub(crate) struct QuantitativeAuditArtifact {
    /// Role label for the artifact. NOTE(review): the set of allowed values is not
    /// visible here; confirm against the manifest exporter.
    pub(crate) role: String,
    /// Artifact location, stored as a plain string.
    pub(crate) path: String,
    /// NOTE(review): presumably the hex-encoded SHA-256 digest of the file at `path`;
    /// confirm where it is computed and whether it is verified on import.
    pub(crate) sha256: String,
}
/// Serialized confidence interval attached to an aggregate metric.
///
/// `Default` yields an empty `method` and zeroed numbers.
#[derive(Clone, Debug, Default, Deserialize, Serialize)]
pub(crate) struct QuantitativeConfidenceInterval {
    /// Name of the interval method. The contract tests in this change expect
    /// `"wilson_score"` for the `*_at_5` metrics.
    pub(crate) method: String,
    /// Confidence level. The contract tests in this change expect `0.95`.
    pub(crate) confidence: f64,
    /// Lower bound of the interval.
    pub(crate) lower: f64,
    /// Upper bound of the interval.
    pub(crate) upper: f64,
    /// NOTE(review): presumably the success count behind the proportion; confirm against
    /// `aggregate_confidence_intervals`.
    pub(crate) numerator: usize,
    /// NOTE(review): presumably the trial count behind the proportion; confirm as above.
    pub(crate) denominator: usize,
}
pub(crate) result_state: String, + pub(crate) expected_relevant_count: usize, + pub(crate) candidate_count: usize, + pub(crate) qrel_source: String, + pub(crate) relevance_grade_sum: f64, + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) metrics: BTreeMap>, + pub(crate) metric_states: BTreeMap, + pub(crate) denominators: BTreeMap, + pub(crate) claim_boundary: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs new file mode 100644 index 00000000..1a57e138 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs @@ -0,0 +1,19 @@ +use crate::{ + Deserialize, QuantitativeBenchmarkControls, QuantitativeBenchmarkRow, QuantitativePerQueryRow, + Serialize, +}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkReport { + pub(crate) schema: String, + pub(crate) generated_at: String, + pub(crate) corpus_id: String, + pub(crate) k_values: Vec, + pub(crate) rows: Vec, + #[serde(default)] + pub(crate) per_query_rows: Vec, + #[serde(default)] + pub(crate) metrics_not_encoded: Vec, + pub(crate) controls: QuantitativeBenchmarkControls, + pub(crate) claim_boundary: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs new file mode 100644 index 00000000..cdef9042 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs @@ -0,0 +1,31 @@ +use crate::{BTreeMap, Deserialize, QuantitativeConfidenceInterval, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkRow { + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) adapter_name: String, + 
pub(crate) suite: String, + pub(crate) evidence_class: String, + pub(crate) source_manifest_corpus_id: Option, + pub(crate) result_state: String, + pub(crate) comparable: bool, + pub(crate) metric_comparable: bool, + pub(crate) leaderboard_eligible: bool, + pub(crate) held_out: bool, + pub(crate) leakage_audited: bool, + pub(crate) audit_manifest_id: Option, + pub(crate) fixture_regression_only: bool, + pub(crate) sample_size: usize, + pub(crate) ranking_query_count: usize, + pub(crate) ranking_coverage_state: String, + pub(crate) ranked_candidate_source: String, + pub(crate) qrel_source: String, + pub(crate) explicit_qrel_query_count: usize, + pub(crate) metrics: BTreeMap>, + pub(crate) metric_states: BTreeMap, + pub(crate) denominators: BTreeMap, + #[serde(default)] + pub(crate) confidence_intervals: BTreeMap, + pub(crate) claim_boundary: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs new file mode 100644 index 00000000..efc5c357 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs @@ -0,0 +1,12 @@ +use crate::{Deserialize, QuantitativeBenchmarkRow, QuantitativePerQueryRow, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeProductManifest { + pub(crate) schema: String, + pub(crate) manifest_id: String, + pub(crate) corpus_id: String, + #[serde(default)] + pub(crate) rows: Vec, + #[serde(default)] + pub(crate) per_query_rows: Vec, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs index 9ee62f1e..797eb2ba 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs @@ -1,7 +1,8 @@ use crate::{ AdapterReport, CaptureIntegrationReport, Deserialize, 
EvolutionSummary, ExternalAdapterSection, - FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction, ReportSummary, - ScoreboardReport, Serialize, SuiteReport, UnsupportedClaimReport, + FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction, + QuantitativeBenchmarkReport, ReportSummary, ScoreboardReport, Serialize, SuiteReport, + UnsupportedClaimReport, }; #[derive(Debug, Deserialize, Serialize)] @@ -17,6 +18,8 @@ pub(super) struct RealWorldReport { #[serde(default)] pub(super) operational_evidence: OperationalEvidenceReport, #[serde(default)] + pub(super) quantitative_scoreboard: QuantitativeBenchmarkReport, + #[serde(default)] pub(super) external_adapters: ExternalAdapterSection, pub(super) capture_integration: CaptureIntegrationReport, pub(super) summary: ReportSummary, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs index 088a8842..2f0f34a7 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs @@ -27,6 +27,10 @@ pub(super) fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { reports::job_report(job, scoring) } +pub(super) fn produced_evidence_order(job: &RealWorldJob) -> Vec { + self::answers::ordered_produced_evidence_ids(self::answers::produced_answer(job)) +} + pub(super) fn score_job(job: &RealWorldJob) -> JobScoring { let answer = self::answers::produced_answer(job); let produced_evidence = self::answers::produced_evidence_ids(answer); diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs index 3e60e5b1..1e2d85ed 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs @@ -61,28 +61,7 @@ pub(super) fn trap_ids_used( .collect() } -fn 
synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { - let _ = job; - - static EMPTY_ANSWER: std::sync::OnceLock = std::sync::OnceLock::new(); - - EMPTY_ANSWER.get_or_init(|| ProducedAnswer { - content: String::new(), - claims: Vec::new(), - evidence_ids: Vec::new(), - pages: Vec::new(), - memory_summaries: Vec::new(), - proactive_briefs: Vec::new(), - scheduled_tasks: Vec::new(), - work_journal_readbacks: Vec::new(), - recovery_drills: Vec::new(), - latency_ms: None, - cost: None, - trace_explainability: None, - }) -} - -fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { +pub(super) fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { let mut seen = BTreeSet::new(); let mut evidence = Vec::new(); @@ -180,6 +159,27 @@ fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { evidence } +fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { + let _ = job; + + static EMPTY_ANSWER: std::sync::OnceLock = std::sync::OnceLock::new(); + + EMPTY_ANSWER.get_or_init(|| ProducedAnswer { + content: String::new(), + claims: Vec::new(), + evidence_ids: Vec::new(), + pages: Vec::new(), + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), + work_journal_readbacks: Vec::new(), + recovery_drills: Vec::new(), + latency_ms: None, + cost: None, + trace_explainability: None, + }) +} + fn push_ordered_evidence( evidence: &mut Vec, seen: &mut BTreeSet, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 6df392ce..6aa5cecb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -20,6 +20,7 @@ #[path = "real_world_job_benchmark/operator_debug.rs"] mod operator_debug; #[path = "real_world_job_benchmark/proactive_brief.rs"] mod proactive_brief; #[path = "real_world_job_benchmark/production_ops.rs"] mod production_ops; +#[path = "real_world_job_benchmark/quantitative.rs"] mod 
quantitative; #[path = "real_world_job_benchmark/recall_debug_reports.rs"] mod recall_debug_reports; #[path = "real_world_job_benchmark/retrieval.rs"] mod retrieval; #[path = "real_world_job_benchmark/root_aggregate.rs"] mod root_aggregate; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs index f5a395c8..dc83515a 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs @@ -38,6 +38,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("# Real-World Job Benchmark Report")); assert!(markdown.contains("work_resume")); assert!(markdown.contains("Capture And Integration Coverage")); + assert!(markdown.contains("Quantitative Benchmark Report")); + assert!(markdown.contains("leaderboard claims require explicit qrels")); + assert!(markdown.contains("| ELF | `pass` | `fixture_backed`")); assert!(markdown.contains("External Adapter Coverage")); assert!(markdown.contains("live-baseline-only")); assert!(markdown.contains("live real-world")); diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs new file mode 100644 index 00000000..9bcc07c8 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -0,0 +1,49 @@ +#[path = "quantitative/audit_manifest.rs"] mod audit_manifest; +#[path = "quantitative/contracts.rs"] mod contracts; +#[path = "quantitative/metrics.rs"] mod metrics; +#[path = "quantitative/product_manifest.rs"] mod product_manifest; + +use std::{path::Path, process::Command}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result { + let output = 
Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Ok(serde_json::from_slice(&output.stdout)?) +} + +fn run_report_with_quantitative_audit(manifest_path: &Path, run_id: &str) -> Result { + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--run-id") + .arg(run_id) + .arg("--quantitative-audit-manifest") + .arg(manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Ok(serde_json::from_slice(&output.stdout)?) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs new file mode 100644 index 00000000..5d8777cd --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -0,0 +1,110 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> { + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-audit-manifest-test-{}", process::id())); + let manifest_path = temp_dir.join("audit-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-audit-manifest") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--out") + .arg(&manifest_path) + .arg("--run-id") + .arg("audit-import-test") + 
.arg("--held-out") + .arg("--leakage-audited") + .arg("--control") + .arg("query_ids_locked_before_product_runtime") + .arg("--control") + .arg("product_runtime_did_not_receive_expected_answers_or_qrels") + .arg("--control") + .arg("ranked_candidates_emitted_by_product_runtime") + .output()?; + + assert!( + export.status.success(), + "quantitative audit export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let manifest = support::load_json(&manifest_path)?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_audit_manifest/v1") + ); + assert_eq!(manifest.pointer("/held_out").and_then(Value::as_bool), Some(true)); + assert_eq!(manifest.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); + assert_eq!( + support::array_at(&manifest, "/query_ids")?.len() as u64, + manifest.pointer("/ranking_query_count").and_then(Value::as_u64).unwrap_or_default() + ); + + let imported = super::run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?; + let row = support::array_at(&imported, "/quantitative_scoreboard/rows")? 
/// Verifies that `run` refuses an audit manifest that was exported under a different run id.
///
/// The manifest is exported with `--run-id audit-import-test` and then imported by a run
/// using `--run-id different-run`. The run must exit unsuccessfully and its stderr must
/// contain `expected different-run`.
#[test]
fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> {
    let temp_dir =
        env::temp_dir().join(format!("elf-quantitative-audit-manifest-run-test-{}", process::id()));
    let manifest_path = temp_dir.join("audit-manifest.json");

    fs::create_dir_all(&temp_dir)?;

    let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
        .arg("export-quantitative-audit-manifest")
        .arg("--fixtures")
        .arg(support::adversarial_quality_fixture_dir())
        .arg("--out")
        .arg(&manifest_path)
        .arg("--run-id")
        .arg("audit-import-test")
        .output()?;

    assert!(
        export.status.success(),
        "quantitative audit export failed: {}",
        String::from_utf8_lossy(&export.stderr)
    );

    let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark"))
        .arg("run")
        .arg("--fixtures")
        .arg(support::adversarial_quality_fixture_dir())
        .arg("--run-id")
        .arg("different-run")
        .arg("--quantitative-audit-manifest")
        .arg(&manifest_path)
        .output()?;
    let stderr = String::from_utf8_lossy(&output.stderr);

    // Include the runner's stderr in both failures so a regression can be diagnosed from
    // the test log alone.
    assert!(
        !output.status.success(),
        "run accepted an audit manifest exported for another run id; stderr: {stderr}"
    );
    assert!(
        stderr.contains("expected different-run"),
        "rejection did not mention the expected run id; stderr: {stderr}"
    );

    // Best-effort cleanup so repeated runs do not accumulate per-PID directories. This is
    // only reached on success, so a failing run keeps its manifest for inspection.
    let _ = fs::remove_dir_all(&temp_dir);

    Ok(())
}
/// Checks the top-level shape of `/quantitative_scoreboard` for the adversarial-quality
/// fixtures, then delegates to the per-row and per-query contract checks.
#[test]
fn adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> {
    let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?;
    let str_at = |pointer: &str| report.pointer(pointer).and_then(Value::as_str);
    let u64_at = |pointer: &str| report.pointer(pointer).and_then(Value::as_u64);
    let expected_k_values =
        vec![Value::from(1), Value::from(3), Value::from(5), Value::from(10)];

    assert_eq!(
        str_at("/quantitative_scoreboard/schema"),
        Some("elf.agent_memory_quantitative_benchmark/v1")
    );
    // The scoreboard must carry the same timestamp as the enclosing report.
    assert_eq!(str_at("/quantitative_scoreboard/generated_at"), str_at("/generated_at"));
    assert_eq!(
        report.pointer("/quantitative_scoreboard/k_values").and_then(Value::as_array),
        Some(&expected_k_values)
    );
    assert_eq!(
        report
            .pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed")
            .and_then(Value::as_bool),
        Some(false)
    );
    assert_eq!(
        u64_at("/quantitative_scoreboard/controls/current_query_count"),
        u64_at("/summary/job_count")
    );

    assert_quantitative_row_contract(&report)?;
    assert_quantitative_per_query_contract(&report)?;

    Ok(())
}
assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/fixture_regression_only").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/ranking_coverage_state").and_then(Value::as_str), Some("complete")); + assert_eq!( + row.pointer("/ranked_candidate_source").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!( + row.pointer("/qrel_source").and_then(Value::as_str), + Some("expected_evidence_fallback") + ); + assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0)); + + for metric in [ + "recall_at_1", + "precision_at_1", + "success_at_1", + "recall_at_5", + "precision_at_5", + "success_at_5", + "mrr", + "ndcg_at_5", + "average_precision", + ] { + assert!(row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64).is_some()); + assert_eq!( + row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str), + Some("pass") + ); + assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some()); + } + for metric in ["recall_at_5", "precision_at_5", "success_at_5"] { + assert_eq!( + row.pointer(&format!("/confidence_intervals/{metric}/method")).and_then(Value::as_str), + Some("wilson_score") + ); + assert_eq!( + row.pointer(&format!("/confidence_intervals/{metric}/confidence")) + .and_then(Value::as_f64), + Some(0.95) + ); + assert!( + row.pointer(&format!("/confidence_intervals/{metric}/denominator")) + .and_then(Value::as_u64) + .is_some() + ); + } + + Ok(()) +} + +fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?; + let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0); + + assert_eq!(rows.len() as u64, job_count); + + for row in rows { + assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!( + 
row.pointer("/qrel_source").and_then(Value::as_str), + Some("expected_evidence_fallback") + ); + assert!(row.pointer("/candidate_count").and_then(Value::as_u64).is_some()); + assert!(row.pointer("/expected_relevant_count").and_then(Value::as_u64).is_some()); + assert!(row.pointer("/metrics/recall_at_5").is_some()); + assert!(row.pointer("/metrics/precision_at_5").is_some()); + assert!(row.pointer("/metrics/ndcg_at_5").is_some()); + assert!(row.pointer("/metrics/average_precision").is_some()); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs new file mode 100644 index 00000000..3b9262a0 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs @@ -0,0 +1,53 @@ +use std::{env, fs, process}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> { + let source_path = + support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json"); + let mut job = serde_json::from_str::<Value>(&fs::read_to_string(source_path)?)?; + + support::set_json_pointer( + &mut job, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["old-provider-note", "current-provider-report"]), + )?; + + job.pointer_mut("/expected_answer") + .and_then(Value::as_object_mut) + .ok_or_else(|| eyre::eyre!("missing expected_answer object"))?
+ .insert( + "relevance_judgments".to_string(), + serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]), + ); + + let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?; + let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?; + + assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); + assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1)); + assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0)); + assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0)); + assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); + assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5)); + assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); + + let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?; + let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?; + + assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); + assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); + assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs new file mode 100644 index 00000000..054e70f3 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs @@ -0,0 
+1,68 @@ +#[path = "product_manifest/export.rs"] mod export; +#[path = "product_manifest/validation.rs"] mod validation; + +use std::{ + env, fs, + path::PathBuf, + process::{self, Command}, +}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +struct ProductManifestPaths { + temp_dir: PathBuf, + report_path: PathBuf, + manifest_path: PathBuf, +} + +fn product_manifest_paths(temp_name: &str, manifest_file: &str) -> ProductManifestPaths { + let temp_dir = env::temp_dir().join(format!("{temp_name}-{}", process::id())); + + ProductManifestPaths { + report_path: temp_dir.join("report.json"), + manifest_path: temp_dir.join(manifest_file), + temp_dir, + } +} + +fn write_adversarial_report(paths: &ProductManifestPaths) -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + + fs::create_dir_all(&paths.temp_dir)?; + fs::write(&paths.report_path, serde_json::to_vec_pretty(&report)?)?; + + Ok(()) +} + +fn export_synthetic_rival_manifest(paths: &ProductManifestPaths) -> Result<()> { + write_adversarial_report(paths)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&paths.report_path) + .arg("--out") + .arg(&paths.manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + .arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + Ok(()) +} + +fn run_report_with_manifest(paths: &ProductManifestPaths) -> Result<Value> { + super::run_report_with_quantitative_manifest(&paths.manifest_path) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs new file mode 100644 index
00000000..d56f2bd7 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs @@ -0,0 +1,73 @@ +use std::process::Command; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-test", + "synthetic-rival-product-manifest.json", + ); + + super::export_synthetic_rival_manifest(&paths)?; + + let manifest = support::load_json(&paths.manifest_path)?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_product_manifest/v1") + ); + assert_eq!( + manifest.pointer("/rows/0/product").and_then(Value::as_str), + Some("Synthetic Rival") + ); + assert_eq!( + manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str), + Some("synthetic_rival") + ); + + let imported = super::run_report_with_manifest(&paths)?; + let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?; + let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?; + + assert_eq!(rows.len(), 2); + assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival")); + assert!(!support::array_contains_str( + &imported, + "/quantitative_scoreboard/metrics_not_encoded", + "external_product_manifest_import" + )?); + assert!( + support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any( + |row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival") + ) + ); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-elf-test", + "elf-product-manifest.json", + ); + + super::write_adversarial_report(&paths)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) 
+ .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&paths.report_path) + .arg("--out") + .arg(&paths.manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs new file mode 100644 index 00000000..e4e302b3 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs @@ -0,0 +1,64 @@ +use std::{fs, process::Command}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-corpus-test", + "wrong-corpus-product-manifest.json", + ); + + super::export_synthetic_rival_manifest(&paths)?; + + let mut manifest = support::load_json(&paths.manifest_path)?; + + support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?; + fs::write(&paths.manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&paths.manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus")); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-per-query-test", + "missing-per-query-product-manifest.json", + ); + + super::export_synthetic_rival_manifest(&paths)?; + + let mut manifest = 
support::load_json(&paths.manifest_path)?; + + support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?; + fs::write(&paths.manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&paths.manifest_path) + .output()?; + + assert!(!output.status.success()); + + let stderr = String::from_utf8_lossy(&output.stderr); + + assert!(stderr.contains("ranked queries but only 0")); + + Ok(()) +} diff --git a/docs/spec/agent_memory_knowledge_system_v1.md b/docs/spec/agent_memory_knowledge_system_v1.md index 35d18ca8..070df71f 100644 --- a/docs/spec/agent_memory_knowledge_system_v1.md +++ b/docs/spec/agent_memory_knowledge_system_v1.md @@ -272,8 +272,7 @@ Repository-native validation is authoritative. docs are validation-ready. - Before a PR handoff or any push that refreshes a PR head, run the registered Decodex workflow gate: `cargo make fmt`, `cargo make lint-fix`, then - `cargo make checks`. In this Makefile tree, `checks` aliases the repo-native - aggregate `check` task. + `cargo make check`. - If a phase changes commands, schemas, config, runtime behavior, status semantics, or benchmark claims, update the owning docs and include drift evidence as required by `docs/policy.md`. diff --git a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md index 5974e4bf..265a71c1 100644 --- a/docs/spec/agent_memory_quantitative_benchmark_v1.md +++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md @@ -1,216 +1,608 @@ --- type: Spec title: "Agent Memory Quantitative Benchmark v1" -description: "Define the public quantitative competitor scoreboard row contract and claim boundaries." 
+description: "Define quantitative same-corpus memory benchmark metrics, formulas, evidence classes, and claim boundaries." resource: docs/spec/agent_memory_quantitative_benchmark_v1.md status: active authority: normative owner: spec -last_verified: 2026-06-27 +last_verified: 2026-06-23 tags: - docs - spec - benchmarking - agent-memory -source_refs: - - XY-1098 - - XY-1120 +source_refs: [] code_refs: + - Makefile.toml + - makefiles/benchmark-memory-a.toml + - makefiles/benchmark-memory-b.toml + - scripts/materialize-explicit-qrels.py + - scripts/real-world-explicit-qrels.sh + - scripts/real-world-docker.sh + - scripts/real-world-live-explicit-qrels.sh + - apps/elf-eval/src/app.rs - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - - apps/elf-eval/tests/real_world_job_benchmark.rs + - apps/elf-eval/fixtures/real_world_memory/p1_closeout/source_candidate_approval_recall.json + - apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json related: + - docs/spec/agent_memory_knowledge_system_v1.md - docs/spec/real_world_agent_memory_benchmark_v1.md - - docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md + - docs/evidence/benchmarking/2026-06-23-p4-quality-hardening-productization-readiness-report.md + - docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md drift_watch: - docs/spec/agent_memory_quantitative_benchmark_v1.md + - Makefile.toml + - makefiles/benchmark-memory-a.toml + - makefiles/benchmark-memory-b.toml + - scripts/materialize-explicit-qrels.py + - scripts/real-world-explicit-qrels.sh + - scripts/real-world-docker.sh + - scripts/real-world-live-explicit-qrels.sh + - docs/spec/agent_memory_knowledge_system_v1.md - docs/spec/real_world_agent_memory_benchmark_v1.md - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json + - apps/elf-eval/src/app.rs + - 
docs/evidence/benchmarking/index.md --- # Agent Memory Quantitative Benchmark v1 -Purpose: Define the public quantitative competitor scoreboard row contract and claim -boundaries. +Purpose: Define the quantitative scoreboard that must sit beside ELF's existing +typed real-world memory benchmark reports. Status: normative -Read this when: You are implementing, validating, or publishing the public -competitor-quality scoreboard for agent memory systems. -Not this document: Real-world job fixture schema, Work Journal behavior, operational -runbooks, or external adapter setup procedures. -Defines: `elf.quality_scoreboard/v1` quantitative rows, metrics, comparability gates, -typed non-pass behavior, and optimization-direction metadata. - -## Scope - -The quantitative scoreboard turns `real_world_job` reports and external adapter -manifest records into public product rows. It is a row-level evidence contract, not a -universal leaderboard. It is allowed to say which metrics are proven for a row, which -competitor strengths remain visible, and which evidence is missing before a row can be -treated as comparable. - -This contract applies to reports with schema `elf.quality_scoreboard/v1`. - -## Scoreboard Report - -A report MUST include: - -- `schema`: exactly `elf.quality_scoreboard/v1`. -- `result_states`: the public row-state enum. -- `evidence_classes`: the public evidence-class enum. -- `metric_basis`: the ranking basis used for retrieval metrics. -- `retrieval_k`: the `k` used for recall, precision, MRR, and nDCG. -- typed non-pass counts and visible typed non-pass states for encoded jobs, external - adapter rows, and the aggregate report. -- evidence-class counts. -- bounded encoded-job and aggregate summary claims. -- `unqualified_win_claim_allowed`, which MUST be `false` when any typed non-pass row - or non-comparable row exists. 
-- `claim_boundary`, a human-readable statement that prevents typed blockers or - fixture-only evidence from becoming broad superiority claims. -- `rows`: one row for ELF plus one row for each tracked external product represented - by the loaded adapter manifest. -- `optimization_roadmap`: concrete next optimization directions derived from missing - row evidence, not from hidden assumptions. - -## Public Row States +Read this when: You are adding or reviewing recall, freshness, update, delete, +expiry, latency, cost, or competitor-comparison metrics for agent memory systems. +Not this document: A finished benchmark report, a claim that current results beat +every competitor, or a replacement for typed non-pass outcome reporting. +Defines: `elf.agent_memory_quantitative_benchmark/v1`, required metric families, +formulas, denominators, evidence classes, comparability rules, and minimum report +rows. + +## Core Rule + +Quantitative memory comparison must measure the exact behavior users care about: +finding the right evidence, using current facts, suppressing stale or deleted facts, +showing citations, and staying within latency/cost/resource bounds. + +A report must not use broad product labels such as "best memory" or "beats OpenKB" +unless the specific metric row is same-corpus, same-task, same-evidence-class, +same-candidate-source, same-denominator, and leaderboard eligible. Typed non-pass +states remain first-class results. + +## Evidence Classes + +Every quantitative row must declare one evidence class: + +| Evidence class | Meaning | Comparable for leaderboard | +| --- | --- | --- | +| `fixture_backed` | Checked-in fixture scored by ELF's runner. | Only against other fixture rows with the same corpus and task. | +| `live_baseline` | Docker-contained baseline or smoke run that may not execute real-world answer jobs. | No, unless the report states the exact same scored task. 
| +| `live_real_world` | Runtime executed the same real-world job prompt and produced scored answer artifacts. | Yes, when same-corpus and same-task. | +| `public_proxy` | Local proxy contract based on public docs or expected artifact shape, not a product runtime. | No product leaderboard claim. | +| `private_corpus` | Operator-owned private corpus with publishable bounded metrics only. | Yes only for private-corpus rows with matching policy. | +| `provider_backed` | Provider credentials/models were used and cost/latency are measured. | Yes only against rows with equivalent provider boundary. | +| `research_gate` | Research-only, blocked, or reference-only evidence. | No. | +| `mixed_evidence` | Aggregate row blends multiple evidence classes. | No; split rows before leaderboard use. | + +## Result States + +Every row must declare one result state: | State | Meaning | | --- | --- | -| `pass` | The row has a scored pass under its evidence class. A pass is comparable only when every comparability gate is also true. | -| `wrong_result` | The adapter or job reached the behavioral check but selected the wrong answer, evidence, lifecycle state, or action. | -| `incomplete` | Setup, build, parse, adapter wiring, or runtime execution did not reach the behavioral check. | -| `blocked` | The row cannot be completed safely without missing credentials, private input, durable runtime integration, Docker evidence, or manual product setup. | -| `not_tested` | No benchmark execution or comparable adapter output exists for the row. | -| `not_encoded` | The suite, scoring dimension, or adapter path is not implemented in the runner. | -| `not_comparable` | The row has useful evidence but lacks one or more required comparability gates, so it must not be used as a product-runtime comparison pass. | -| `unsupported_claim` | The row or source report made a substantive claim not supported by corpus evidence, source refs, or report metadata. | - -`not_comparable` is a public row state only. 
It is not a `real_world_job` status and -must not be written back into job or suite outcome fields. +| `pass` | The metric is measured and meets the row threshold. | +| `wrong_result` | The task ran but selected the wrong answer, wrong evidence, or wrong lifecycle state. | +| `incomplete` | Some required artifacts exist, but the metric denominator is not fully satisfied. | +| `blocked` | Required setup, credentials, corpus, exported artifact, or product readback is missing. | +| `not_encoded` | The adapter or benchmark does not implement this metric. | +| `not_comparable` | A metric exists but evidence class, corpus, task, or denominator differs. | +| `unsupported_claim` | The output makes a claim that the evidence cannot support. | + +Metric states are separate from row result states. A metric state of `measured` +means the denominator is non-zero and the row has no typed non-pass state; it does +not mean the value passed a leaderboard threshold. If the row result is +`blocked`, `wrong_result`, `incomplete`, `not_encoded`, or `unsupported_claim`, +metric states for measured values must inherit that non-pass state. + +Metric states may also use `partial_coverage` when a formula is computable for +some queries but the row lacks full ranked-candidate coverage or the minimum query +count required for leaderboard use. `partial_coverage` values are useful regression +evidence, not product-ranking proof. + +## Retrieval Metrics + +Retrieval metrics apply when a job has relevance labels and an ordered candidate +list. The report must name `k` for every `@k` metric. A row must also declare whether +ranked candidates came from a product/runtime trace or a fixture trace; fixture traces +are formula smoke tests unless the compared product emitted the same artifact shape. +Explicit qrels live in `expected_answer.relevance_judgments` as +`{ "evidence_id": "...", "grade": 0.0 }` records. 
If a legacy fixture omits qrels, +the runner may derive binary relevance from required evidence for regression use, +but that row must expose `qrel_source = expected_evidence_fallback` and must not +become leaderboard eligible. + +`cargo make real-world-memory-explicit-qrels` is the deterministic qrel +materialization command for fixture-mechanics evidence. It derives positive qrels +from checked-in `expected_answer.evidence_links` and `required_evidence`, preserves +existing explicit zero-grade judgments, and leaves unmentioned corpus evidence +unjudged instead of converting it into synthetic negative labels. Its optional +oracle ranked candidates are allowed only to prove metric mechanics; they are not +product-runtime retrieval evidence and cannot satisfy leaderboard runtime, held-out, +or leakage-audit gates. + +`cargo make real-world-memory-live-explicit-qrels` is the current product-runtime +bridge from deterministic qrel materialization to ELF/qmd live adapter scoring. It +must materialize explicit qrels with `--ranked-candidates-source none`, then let +the live adapters emit their own runtime ranked candidates. This command can close +the `qrel_source` gap for product-runtime rows, but it does not itself prove +held-out status, leakage audit status, or clean leaderboard eligibility. 
+ +| Metric | Formula | Required fields | +| --- | --- | --- | +| `recall_at_k` | `relevant_returned_in_top_k / expected_relevant_count` | relevance labels, explicit `ranked_candidate_evidence_ids`, `k` | +| `precision_at_k` | `relevant_returned_in_top_k / k` | ordered candidates, relevance labels, `k` | +| `mrr` | `1 / rank(first_relevant)` or `0` when no relevant item appears | ordered candidates, relevance labels | +| `ndcg_at_k` | `dcg_at_k / ideal_dcg_at_k` using graded relevance when available, binary otherwise | ordered candidates, relevance grades, `k` | +| `map` | Mean of per-query average precision values | ordered candidates, relevance labels | +| `average_precision` | Per-query sum of precision at each relevant hit divided by expected relevant count | ordered candidates, relevance labels | +| `success_at_k` | Query has at least one relevant candidate in the top `k` | ordered candidates, relevance labels, `k` | +| `expected_evidence_recall` | `produced_required_evidence_count / required_evidence_count` | required evidence map, produced evidence ids | +| `citation_coverage` | `claims_with_valid_citation / claims_requiring_citation` | claim list, citation validation result | +| `source_ref_coverage` | `claims_with_valid_source_ref / claims_requiring_source_ref` | source-ref validation result | + +Retrieval metrics must not count redacted, excluded, deleted, expired, unreadable, or +non-captured source spans as relevant current evidence. Such candidates may be +reported separately as historical or diagnostic rows. + +## Memory Lifecycle Metrics + +Memory lifecycle metrics apply to jobs that encode state changes over time. + +| Metric | Formula | What it proves | +| --- | --- | --- | +| `update_correctness_rate` | `jobs_selecting_current_superseding_fact / update_jobs` | New facts replace old facts for current answers. | +| `stale_suppression_rate` | `stale_facts_not_used_as_current / stale_fact_opportunities` | Stale facts do not pollute current answers.
| +| `delete_suppression_rate` | `deleted_or_tombstoned_facts_not_used / delete_opportunities` | Deleted or tombstoned facts do not reappear as current context. | +| `expiry_suppression_rate` | `expired_facts_not_used / expiry_opportunities` | TTL or time-bounded facts are suppressed after expiry. | +| `rollback_readback_rate` | `rollback_events_with_readback / rollback_events_expected` | Rollback and prior versions remain auditable. | +| `history_readback_rate` | `history_events_readable / history_events_expected` | Add, update, ignore, reject, delete, restore, and derived transitions are visible. | +| `contradiction_resolution_rate` | `contradictions_resolved_to_current_supported_answer / contradiction_opportunities` | Mutually inconsistent memories are resolved with current source support instead of arbitrary retrieval order. | + +The denominator must be explicit. A benchmark with no delete jobs must report +`delete_suppression_rate = not_encoded`, not `1.000`. + +## Answer Safety Metrics + +| Metric | Formula | +| --- | --- | +| `unsupported_claim_rate` | `unsupported_claim_count / answer_claim_count` | +| `stale_answer_rate` | `answers_using_stale_fact_as_current / answered_jobs` | +| `hallucinated_evidence_rate` | `citations_not_in_candidate_or_source_set / citation_count` | +| `redaction_leak_count` | Count of private, excluded, or redacted spans surfaced in public output. | +| `irrelevant_context_ratio` | `irrelevant_context_items / returned_context_items` | +| `scope_violation_count` | Count of unreadable cross-scope or grant-violating rows returned. | -## Evidence Classes +Zero values are meaningful only when the denominator is non-zero and the checked row +actually exercises the failure mode. + +## Operational Metrics -| Evidence class | Meaning | +| Metric | Required unit | | --- | --- | -| `fixture_backed` | Checked-in fixtures were scored. This is regression evidence, not live product-runtime evidence. 
| -| `live_baseline` | Docker live-baseline retrieval or lifecycle evidence exists, but the row is not a real-world product-runtime scoreboard pass. | -| `live_real_world` | A live adapter executed real-world job paths and emitted typed outcomes. | -| `research_gate` | Research, source mapping, setup, credential, or resource gates are recorded before fair scoring can run. | - -## Row Fields - -Each `rows[]` entry MUST include: - -- `product_id` and `product_name`. -- `row_source`: stable source label, such as `elf_report` or - `external_adapter_manifest`. -- `evidence_class`. -- `result_state`. -- `comparable`: true only when all comparability gates are satisfied and the row has a - pass state with quantitative metrics. -- comparability gates: - - `same_corpus` - - `source_id_mapped` - - `held_out` - - `leakage_audited` - - `product_runtime` - - `container_digest_identified` -- `metrics`. -- `strengths`: product strengths supported by the row source. -- `weaknesses`: typed weaknesses, blockers, or non-pass evidence from the row source. -- `next_evidence`: row-level evidence needed before the row can become comparable. -- `source_provenance`: bounded source pointers to the input report, adapter record, or - suite records. - -`same_corpus = true` requires positive row evidence that the product or checked-in -adapter is mapped to the benchmark corpus. A blocker sentence that says same-corpus -evidence is missing is not sufficient. A typed same-corpus setup-blocker adapter may -set this gate to true only when its source provenance identifies the intended shared -benchmark corpus and the remaining blocker is runtime/source-id output, not corpus -selection. - -## Metrics - -The `metrics` object MUST include `retrieval`, `lifecycle`, `answer_safety`, -`operations`, and `coverage` sub-objects. - -`retrieval` MUST include: - -- `k`. -- `metric_basis`. -- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg`, or `null` when the row lacks - ranked produced evidence. 
-- `expected_evidence_recall`. -- `citation_source_ref_coverage`. -- matched, total, and produced evidence counts. - -For `metric_basis = "produced_evidence_order"`, ranked retrieval metrics use the -ordered `produced_evidence` list in the scored job output as the retrieved list. -Expected evidence ids are the relevance set. Relevance is binary. `recall_at_k` and -`precision_at_k` use the first `k` produced evidence ids. MRR is reciprocal rank of -the first relevant produced evidence id. nDCG uses binary gains with the ideal DCG -bounded by `min(k, expected_evidence_total)`. - -`lifecycle` MUST include: - -- stale suppression rate and counts. -- update correctness rate and counts. -- delete correctness rate and counts. -- rollback/history readback rate and counts. - -`answer_safety` MUST include: - -- unsupported-claim rate and count. -- stale-answer rate and count. -- hallucinated-evidence rate when measurable. -- redaction leak count. -- irrelevant-context ratio. - -`operations` MUST include: - -- mean latency in milliseconds when measured. -- total cost when cost accounting exists. -- resource-envelope status, encoded job count, and pass count. - -`coverage` MUST include: - -- job count. -- encoded suite count. -- pass count. -- typed non-pass count. -- source-ref coverage. -- evidence coverage. -- evidence class. - -## Comparability Rules - -A row is comparable only when all of the following are true: - -- `same_corpus = true`. -- `source_id_mapped = true`. -- `held_out = true`. -- `leakage_audited = true`. -- `product_runtime = true`. -- `container_digest_identified = true`. -- `result_state = "pass"`. -- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg` are present. - -If any required gate is false, the report MUST set `comparable = false`, add a -specific `next_evidence` entry for each missing gate, and avoid any win, parity, or -rank claim for that row. 
If an otherwise passing row is missing a required gate, the -public row state SHOULD be `not_comparable` so the report is explicit about the -reason no product-runtime comparison claim is allowed. - -## Report Claim Rules - -- A row with `fixture_backed`, `live_baseline`, or `research_gate` evidence MUST NOT - be described as a comparable product-runtime pass. -- A row with `blocked`, `incomplete`, `not_tested`, `not_encoded`, `not_comparable`, - or `unsupported_claim` MUST remain visible as a non-pass row. -- External competitors MUST have either comparable product-runtime evidence or an - explicit typed non-pass/blocker row with source provenance. -- Missing Docker image digest evidence is a blocker for comparability, even if a live - adapter executed. -- Public-proxy, fixture-only, local-mock, diagnostic, blocked, and not-encoded rows - MUST NOT be promoted into universal product superiority claims. -- Optimization direction MUST be tied to row-level `next_evidence`, metrics, or typed - non-pass states. 
+| `ingestion_success_rate` | successful ingested records / records submitted | +| `indexing_coverage` | indexed records or spans / ingestible records or spans | +| `source_id_mapping_coverage` | returned candidates or generated claims mapped to benchmark source ids / candidates or claims requiring mapping | +| `query_latency_p50_ms`, `query_latency_p95_ms`, `query_latency_p99_ms` | milliseconds | +| `ingest_latency_ms` | milliseconds from submitted source to durable ingest acknowledgement | +| `update_propagation_latency_ms` | milliseconds from write/apply/delete to searchable/readable effect | +| `cold_start_recovery_seconds` | seconds | +| `restore_seconds` | seconds | +| `index_rebuild_seconds` | seconds | +| `cost_usd` | USD with input/output token counts where applicable | +| `available_context_token_count` | tokens available in the source corpus or memory store for the query | +| `answer_context_token_count` | tokens supplied to the answering model or final answer context | +| `context_token_efficiency` | `answer_context_token_count / available_context_token_count` | +| `resource_envelope_status` | pass, blocked, incomplete, not_encoded | + +Provider-backed rows must include model/provider identifiers or must remain +`not_comparable`. Fixture zero-cost rows must not imply hosted provider cost. 
+ +## Quantitative Scoreboard Schema + +Reports that implement this spec must emit: + +```json +{ + "schema": "elf.agent_memory_quantitative_benchmark/v1", + "generated_at": "...", + "corpus_id": "...", + "k_values": [1, 3, 5, 10], + "rows": [ + { + "product": "ELF", + "adapter_id": "elf_live_real_world", + "adapter_name": "ELF live real-world", + "suite": "memory_evolution", + "evidence_class": "live_real_world", + "result_state": "pass", + "comparable": true, + "metric_comparable": true, + "leaderboard_eligible": false, + "held_out": false, + "leakage_audited": false, + "audit_manifest_id": null, + "fixture_regression_only": false, + "sample_size": 40, + "ranking_query_count": 40, + "ranking_coverage_state": "measured", + "ranked_candidate_source": "runtime_trace", + "qrel_source": "explicit_qrels", + "explicit_qrel_query_count": 40, + "metrics": { + "recall_at_5": 1.0, + "precision_at_5": 0.6, + "mrr": 1.0, + "ndcg_at_5": 1.0, + "map": 1.0, + "average_precision": 1.0, + "success_at_5": 1.0, + "explicit_qrel_query_coverage": 1.0, + "relevance_judgment_count": 80, + "relevance_grade_sum": 160, + "update_correctness_rate": 1.0, + "stale_suppression_rate": 1.0, + "delete_suppression_rate": 1.0, + "expected_evidence_recall": 1.0, + "unsupported_claim_rate": 0.0, + "stale_answer_rate": 0.0 + }, + "metric_states": { + "recall_at_5": "measured", + "precision_at_5": "measured", + "mrr": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured", + "map": "measured", + "success_at_5": "measured" + }, + "denominators": { + "recall_at_5": 80, + "precision_at_5": 200, + "map": 40, + "success_at_5": 40, + "update_correctness_rate": 2, + "delete_suppression_rate": 1, + "stale_answer_rate": 40 + }, + "confidence_intervals": { + "recall_at_5": { + "method": "wilson_score", + "confidence": 0.95, + "lower": 0.954, + "upper": 1.0, + "numerator": 80, + "denominator": 80 + } + }, + "claim_boundary": "Comparable only against same-corpus live_real_world rows." 
+ } + ], + "per_query_rows": [ + { + "job_id": "memory-evolution-001", + "suite": "memory_evolution", + "evidence_class": "live_real_world", + "result_state": "pass", + "expected_relevant_count": 2, + "candidate_count": 8, + "qrel_source": "explicit_qrels", + "relevance_grade_sum": 4.0, + "product": "ELF", + "adapter_id": "elf_live_real_world", + "metrics": { + "recall_at_5": 1.0, + "precision_at_5": 0.4, + "mrr": 1.0, + "ndcg_at_5": 1.0, + "average_precision": 1.0, + "success_at_5": 1.0 + }, + "metric_states": { + "recall_at_5": "measured", + "precision_at_5": "measured", + "mrr": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured", + "success_at_5": "measured" + }, + "denominators": { + "recall_at_5": 2, + "precision_at_5": 5, + "mrr": 1, + "ndcg_at_5": 1, + "average_precision": 1, + "success_at_5": 1 + } + } + ], + "ablation_rows": [ + { + "product": "ELF", + "adapter_id": "elf_live_real_world", + "ablation_id": "raw_vector", + "job_id": "memory-evolution-001", + "suite": "memory_evolution", + "evidence_class": "live_real_world", + "result_state": "pass", + "candidate_source": "runtime_trace_ablation", + "qrel_source": "explicit_qrels", + "expected_relevant_count": 2, + "candidate_count": 8, + "metrics": { + "recall_at_5": 0.5, + "precision_at_5": 0.2, + "mrr": 0.5, + "ndcg_at_5": 0.62, + "average_precision": 0.5, + "success_at_5": 1.0 + }, + "metric_states": { + "recall_at_5": "measured", + "precision_at_5": "measured", + "mrr": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured", + "success_at_5": "measured" + }, + "denominators": { + "recall_at_5": 2, + "precision_at_5": 5, + "mrr": 1, + "ndcg_at_5": 1, + "average_precision": 1, + "success_at_5": 1 + }, + "claim_boundary": "Ablation rows score explicitly supplied candidate orderings for diagnosis; they are not separate product-runtime rows unless the evidence class and candidate source say so." 
+ } + ], + "significance": { + "method": "exact_two_sided_sign_test_on_same_query_metric_deltas", + "state": "not_encoded_single_product_row", + "eligible": false, + "minimum_paired_query_count": 30, + "comparable_product_row_count": 1, + "paired_query_count": 0, + "comparisons": [], + "ablation_comparisons": [ + { + "comparison_scope": "ablation", + "baseline_id": "raw_vector", + "candidate_id": "governed_memory", + "baseline_product": "raw_vector", + "candidate_product": "governed_memory", + "metric": "ndcg_at_5", + "paired_query_count": 1, + "state": "measured", + "effect_mean": 0.311, + "p_value": 1.0, + "win_count": 1, + "loss_count": 0, + "tie_count": 0 + } + ], + "claim_boundary": "Pairwise wins require at least two leaderboard-eligible rows with same-query per-query metrics; otherwise p-values and win claims stay not encoded." + }, + "leakage_audit": { + "state": "not_leaderboard_eligible", + "held_out": false, + "leakage_audited": false, + "corpus_profile": "synthetic", + "evidence_class": "fixture_backed", + "qrel_source": "explicit_qrels", + "fixture_regression_only": true, + "ranking_coverage_state": "partial_coverage", + "leaderboard_blocking_reasons": [ + "fixture_regression_only", + "insufficient_query_count", + "no_held_out_manifest", + "no_leakage_audit_manifest", + "not_live_real_world", + "ranking_coverage_not_measured" + ], + "claim_boundary": "Held-out and leakage-audit fields are explicit gates; fixture or non-audited rows cannot become public leaderboard evidence by omission." 
+ }, + "non_comparable_rows": [ + { + "product": "VectifyAI PageIndex", + "adapter_id": "pageindex_public_proxy_contract", + "result_state": "not_comparable", + "reason": "public_proxy evidence class; no PageIndex product runtime output" + } + ], + "controls": { + "same_corpus_required": true, + "same_task_required": true, + "same_evidence_class_required": true, + "same_budget_required": true, + "ranked_candidates_required_for_ranking_metrics": true, + "raw_ranked_candidate_artifacts_required": true, + "held_out_or_leakage_audited_required": true, + "explicit_relevance_judgments_required_for_leaderboard": true, + "per_query_rows_required_for_significance": true, + "minimum_query_count_for_leaderboard": 30, + "current_query_count": 40, + "current_ranking_query_count": 40, + "current_explicit_qrel_query_count": 40, + "comparable_product_row_count": 1, + "leaderboard_claim_allowed": false, + "statistical_significance": "not_encoded_until_at_least_two_same-corpus comparable product rows meet minimum query count, full ranking coverage, and explicit qrels", + "uncertainty_reporting": "single-row rates include Wilson 95% confidence intervals; competitor win claims require same-query paired significance over per-query rows.", + "leakage_control": "fixture rows are not public leaderboard proof; current product leaderboard rows require held-out and leakage-audited status plus an audit manifest id." + } +} +``` + +## External Product Row Import + +`real_world_job_benchmark run` may accept an optional +`--quantitative-product-manifest` file when a competitor adapter has already +materialized same-corpus product-runtime rows outside the current ELF fixture run. +The manifest schema is `elf.agent_memory_quantitative_product_manifest/v1`. +Generated reports infer the quantitative row `product` from the external adapter +manifest entry matching `--adapter-id`, with `--product` available only as an +explicit override for old or ad hoc reports. 
+ +Use `real_world_job_benchmark export-quantitative-product-manifest --report +FILE` to derive this manifest from a generated `elf.real_world_job_report/v1` +instead of hand-writing metric rows. The export command copies the report's primary +aggregate row and matching per-query rows, rejects `ELF` self rows, and then runs +the same manifest validation used by import. The live qmd adapter sweep writes +`qmd-quantitative-product-manifest.json` and a combined +`elf-qmd-quantitative-report.json` so the same-corpus qmd row is visible in +`quantitative_scoreboard.rows` when fresh live artifacts exist. + +```json +{ + "schema": "elf.agent_memory_quantitative_product_manifest/v1", + "manifest_id": "qmd-live-real-world-2026-06-23", + "corpus_id": "...same value as quantitative_scoreboard.corpus_id...", + "rows": [ + { + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "held_out": false, + "leakage_audited": false, + "audit_manifest_id": null, + "metrics": { + "recall_at_5": 0.75, + "ndcg_at_5": 0.601, + "average_precision": 0.608 + }, + "metric_states": { + "recall_at_5": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured" + } + } + ], + "per_query_rows": [ + { + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "job_id": "...", + "metrics": { + "recall_at_5": 0.75, + "ndcg_at_5": 0.601, + "average_precision": 0.608 + }, + "metric_states": { + "recall_at_5": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured" + } + } + ] +} +``` + +The runner must reject imported rows unless: + +- the manifest `corpus_id` exactly matches the current scoreboard `corpus_id` +- each `(product, adapter_id)` matches an external adapter manifest record +- the product is not `ELF` +- aggregate rows and per-query rows carry the paired-comparison metrics + `recall_at_5`, `ndcg_at_5`, and `average_precision` +- ranked aggregate rows have at least `ranking_query_count` matching per-query rows + +Imported rows replace the matching 
`non_comparable_rows` entry, but they do not +automatically authorize leaderboard claims. A row marked `leaderboard_eligible` +must also be product-runtime evidence with `result_state = pass`, minimum ranked +query coverage, `ranked_candidate_source = runtime_trace`, `qrel_source = +explicit_qrels`, enough explicit qrels for every ranked query, `held_out = true`, +`leakage_audited = true`, and a non-empty `audit_manifest_id`. The current runner +requires both held-out and leakage-audit fields, plus an audit manifest id, before +an imported product row can remain marked leaderboard eligible. This keeps +hand-written, public-proxy, or non-audited rows from becoming hidden wins. + +## Minimum Rows For P6 + +The first implementation issue after this spec must produce a machine-readable +`quantitative_scoreboard` from `real_world_job_benchmark`. The initial runner row may +calculate ranking metrics only when the fixture or adapter emits explicit +`ranked_candidate_evidence_ids`; otherwise it must mark those metrics +`not_encoded`. If only a subset of queries emits ranked candidates, ranking metrics +must use `partial_coverage` and must not make the row leaderboard eligible. It must +publish metric states, denominators, sample size, ranked query count, per-query rows, +explicit-qrel coverage, qrel source, Wilson 95% intervals for measured or partial +rate metrics, ablation rows for explicitly supplied candidate orderings, diagnostic +ablation pairwise comparisons with exact two-sided sign-test p-values, +paired-significance gating state for product rows, held-out/leakage audit state, and +controls so missing rows cannot become hidden wins. The runner may also import +same-corpus external quantitative product rows through +`elf.agent_memory_quantitative_product_manifest/v1`; this is an adapter artifact +boundary, not a manual scoring exemption. 
It must also keep unimplemented but +required production-memory measures visible as `not_encoded`, including source-id +mapping coverage, ingestion/indexing coverage, contradiction resolution, +propagation latency, and context-token efficiency. + +The full P6 scoreboard must produce rows for: + +- ELF fixture-backed memory authority and knowledge workspace jobs. +- ELF live-real-world retrieval and memory-evolution jobs where artifacts exist. +- qmd live-real-world retrieval/debug rows where artifacts exist. +- mem0/OpenMemory local SDK history/export rows where artifacts exist. +- Honcho rows as typed same-corpus blockers plus `research_gate`/`not_comparable` + external-adapter rows until peer/session outputs, background reasoning artifacts, + source-id mapped search/chat/context results, and token/context efficiency + measures exist for the same corpus. +- PageIndex/OpenKB rows as `blocked` or `not_comparable` until actual product + artifacts exist. +- Letta, OpenViking, Graphiti/Zep, RAGFlow, GraphRAG, and LightRAG rows as + `blocked`, `not_encoded`, or `not_comparable` unless same-corpus product artifacts + are checked in. + +## Research Alignment + +This benchmark contract is aligned with established retrieval and memory-evaluation +practice, but it is not itself a public leaderboard until the controls permit one: + +- BEIR-style retrieval evaluation requires a shared corpus/query/qrels format and + rank-aware metrics such as nDCG@k, MAP, and success@k for comparable retrieval + claims. +- RAGAS-style RAG evaluation separates retrieval context recall/precision from + answer faithfulness and response quality. +- LoCoMo-style memory evaluation shows that long-term memory requires temporal, + multi-session, summarization, and event-grounded reasoning slices, not only + single-turn retrieval. 
+- Production memory comparisons must report token/cost/latency budgets; Mem0's + public benchmark framing treats accuracy, token cost, and latency as coupled + production dimensions. +- Honcho's public docs and benchmark materials position it as reasoning-first + memory with peer/session representations, background reasoning/dreaming, LongMem, + LoCoMo, BEAM, and token-efficiency framing. ELF must treat those as required + benchmark surfaces, not as same-corpus product results, until a Honcho adapter + emits source-id mapped artifacts on the benchmark corpus. +- Scientific comparison requires held-out and leakage-audited corpora with audit + manifest ids, explicit qrels, raw per-query rows, repeated or paired comparable + runs, confidence intervals for single-row estimates, and paired product-row + significance tests before a leaderboard claim is allowed. Ablation pairwise tests + are diagnostic optimization evidence, not product leaderboard evidence. + +## Claim Boundaries + +Allowed: + +- "ELF has measured evidence recall, source-ref coverage, stale suppression, and + update/delete correctness for the rows shown." +- "Product X is not comparable on metric Y because evidence class, corpus, or + product artifact coverage differs." +- "Product X beats ELF on metric Y" only when both rows are same-corpus, + same-evidence-class, same-task, and comparable. + +Not allowed: + +- A fixture-backed pass cannot beat a provider-backed or product-runtime row. +- A public-proxy pass cannot prove PageIndex, OpenKB, hosted memory, provider-backed, + or private-corpus product quality. +- A missing denominator cannot be reported as `1.000`. +- A `blocked`, `not_encoded`, or `not_comparable` row cannot become a win by omission. 
diff --git a/makefiles/benchmark-core.toml b/makefiles/benchmark-core.toml index 02c94349..55243485 100644 --- a/makefiles/benchmark-core.toml +++ b/makefiles/benchmark-core.toml @@ -1,95 +1,8 @@ -# Rust workspace tasks: Benchmark core, baseline, and operator tasks. - -# Rust workspace tasks: Benchmark. - -# Benchmark -# | task | type | cwd | -# | ------------------------------------------ | --------- | --- | -# | baseline-backfill-100k-docker | command | | -# | baseline-backfill-10k-docker | command | | -# | baseline-backfill-docker | command | | -# | baseline-live-docker | command | | -# | baseline-live-report | command | | -# | baseline-production-private | command | | -# | baseline-production-private-addendum | command | | -# | baseline-production-synthetic | command | | -# | baseline-soak-docker | command | | -# | local-agent-loop | command | | -# | openmemory-ui-export-readback | command | | -# | parity-docker | command | | -# | real-world-first-generation-oss | composite | | -# | real-world-first-generation-oss-json | command | | -# | real-world-first-generation-oss-report | command | | -# | real-world-job-operator-ux | composite | | -# | real-world-job-operator-ux-json | command | | -# | real-world-job-operator-ux-live-adapters | command | | -# | real-world-job-operator-ux-report | command | | -# | real-world-memory | composite | | -# | real-world-memory-adversarial-quality | composite | | -# | real-world-memory-adversarial-quality-json | command | | -# | real-world-memory-adversarial-quality-report | command | | -# | real-world-memory-consolidation | composite | | -# | real-world-memory-consolidation-json | command | | -# | real-world-memory-consolidation-report | command | | -# | real-world-memory-p1-closeout | composite | | -# | real-world-memory-p1-closeout-json | command | | -# | real-world-memory-p1-closeout-report | command | | -# | real-world-memory-p4-production-readiness | composite | | -# | real-world-memory-p4-production-readiness-json | command 
| | -# | real-world-memory-p4-production-readiness-report | command | | -# | real-world-memory-p4-quality-hardening-closeout | composite | | -# | real-world-memory-p2-knowledge-closeout | composite | | -# | real-world-memory-core-archival | composite | | -# | real-world-memory-core-archival-json | command | | -# | real-world-memory-core-archival-report | command | | -# | real-world-memory-context-trajectory | composite | | -# | real-world-memory-context-trajectory-json | command | | -# | real-world-memory-context-trajectory-report | command | | -# | real-world-memory-evolution | composite | | -# | real-world-memory-evolution-json | command | | -# | real-world-memory-evolution-report | command | | -# | real-world-memory-graph-rag | composite | | -# | real-world-memory-graph-rag-json | command | | -# | real-world-memory-graph-rag-report | command | | -# | real-world-memory-json | command | | -# | real-world-memory-knowledge | composite | | -# | real-world-memory-knowledge-json | command | | -# | real-world-memory-knowledge-report | command | | -# | real-world-memory-live-adapters | command | | -# | real-world-memory-live-consolidation | command | | -# | real-world-memory-live-knowledge | command | | -# | real-world-memory-mem0-openmemory-letta | composite | | -# | real-world-memory-mem0-openmemory-letta-json | command | | -# | real-world-memory-mem0-openmemory-letta-report | command | | -# | real-world-memory-pageindex-openkb | composite | | -# | real-world-memory-pageindex-openkb-json | command | | -# | real-world-memory-pageindex-openkb-report | command | | -# | real-world-memory-proactive-brief | composite | | -# | real-world-memory-proactive-brief-json | command | | -# | real-world-memory-proactive-brief-report | command | | -# | real-world-memory-production-ops | composite | | -# | real-world-memory-production-ops-json | command | | -# | real-world-memory-production-ops-report | command | | -# | real-world-memory-project-decisions | composite | | -# | 
real-world-memory-project-decisions-json | command | | -# | real-world-memory-project-decisions-report | command | | -# | real-world-memory-quantitative-scoreboard | composite | | -# | real-world-memory-quantitative-scoreboard-json | command | | -# | real-world-memory-quantitative-scoreboard-report | command | | -# | real-world-memory-report | command | | -# | real-world-memory-retrieval | composite | | -# | real-world-memory-retrieval-json | command | | -# | real-world-memory-retrieval-report | command | | -# | real-world-memory-scheduled | composite | | -# | real-world-memory-scheduled-json | command | | -# | real-world-memory-scheduled-report | command | | -# | real-world-memory-service-native-dreaming | command | | -# | real-world-memory-summary | composite | | -# | real-world-memory-summary-json | command | | -# | real-world-memory-summary-report | command | | -# | real-world-memory-work-continuity | composite | | -# | real-world-memory-work-continuity-json | command | | -# | real-world-memory-work-continuity-report | command | | +# Rust workspace tasks: benchmark core, baseline, and operator commands. +# +# Keep long task listings out of comments. `cargo make --list-all-steps` is the +# source for the complete task index, while this file owns only non-sharded +# benchmark commands. [tasks.baseline-backfill-100k-docker] workspace = false diff --git a/makefiles/benchmark-memory-a.toml b/makefiles/benchmark-memory-a.toml index a7063ca4..3f09c7d4 100644 --- a/makefiles/benchmark-memory-a.toml +++ b/makefiles/benchmark-memory-a.toml @@ -1,4 +1,4 @@ -# Rust workspace tasks: Benchmark real-world memory tasks, first half. +# Rust workspace tasks: real-world memory benchmark fixtures A-G. 
[tasks.real-world-memory] workspace = false @@ -364,6 +364,13 @@ args = [ "tmp/real-world-memory/evolution-report.md", ] +[tasks.real-world-memory-explicit-qrels] +workspace = false +command = "bash" +args = [ + "scripts/real-world-explicit-qrels.sh", +] + [tasks.real-world-memory-graph-rag] workspace = false dependencies = [ diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml index 8657bb36..3b47da39 100644 --- a/makefiles/benchmark-memory-b.toml +++ b/makefiles/benchmark-memory-b.toml @@ -1,4 +1,4 @@ -# Rust workspace tasks: Benchmark real-world memory tasks, second half. +# Rust workspace tasks: real-world memory benchmark fixtures K-W and aggregate runners. [tasks.real-world-memory-json] workspace = false @@ -251,6 +251,14 @@ args = [ "memory-live-consolidation", ] +[tasks.real-world-memory-live-explicit-qrels] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-live-explicit-qrels", +] + [tasks.real-world-memory-live-knowledge] workspace = false command = "bash" @@ -678,12 +686,3 @@ args = [ "--out", "tmp/real-world-memory/memory-summary/report.md", ] - -# Check -# | task | type | cwd | -# | ---------------- | --------- | --- | -# | check | composite | | -# | check-docs | command | | -# | check-rust | command | | -# | check-trace-gate | command | | -# | checks | composite | | diff --git a/makefiles/check.toml b/makefiles/check.toml index 5756ac55..c6ab6569 100644 --- a/makefiles/check.toml +++ b/makefiles/check.toml @@ -1,14 +1,5 @@ # Rust workspace tasks: Check. 
-# Check -# | task | type | cwd | -# | ---------------- | --------- | --- | -# | check | composite | | -# | check-docs | command | | -# | check-rust | command | | -# | check-trace-gate | command | | -# | checks | composite | | - [tasks.check] clear = true workspace = false @@ -43,15 +34,3 @@ command = "bash" args = [ "scripts/trace-gate.sh", ] - -[tasks.checks] -workspace = false -dependencies = [ - "check", -] - -# Clean -# | task | type | cwd | -# | -------------------------- | ------- | --- | -# | clean-baseline-live-docker | command | | -# | clean-parity-docker | command | | diff --git a/makefiles/clean.toml b/makefiles/clean.toml index 7fc71c62..bf899af0 100644 --- a/makefiles/clean.toml +++ b/makefiles/clean.toml @@ -1,11 +1,5 @@ # Rust workspace tasks: Clean. -# Clean -# | task | type | cwd | -# | -------------------------- | ------- | --- | -# | clean-baseline-live-docker | command | | -# | clean-parity-docker | command | | - [tasks.clean-baseline-live-docker] workspace = false command = "docker" @@ -29,13 +23,3 @@ args = [ "-v", "--remove-orphans", ] - -# Format -# | task | type | cwd | -# | -------------- | --------- | --- | -# | fmt | composite | | -# | fmt-check | composite | | -# | fmt-rust | command | | -# | fmt-rust-check | extend | | -# | fmt-toml | command | | -# | fmt-toml-check | extend | | diff --git a/makefiles/format.toml b/makefiles/format.toml index e214c216..8046cfb9 100644 --- a/makefiles/format.toml +++ b/makefiles/format.toml @@ -1,15 +1,5 @@ # Rust workspace tasks: Format. 
-# Format -# | task | type | cwd | -# | -------------- | --------- | --- | -# | fmt | composite | | -# | fmt-check | composite | | -# | fmt-rust | command | | -# | fmt-rust-check | extend | | -# | fmt-toml | command | | -# | fmt-toml-check | extend | | - [tasks.fmt] workspace = false dependencies = [ @@ -45,10 +35,3 @@ args = [ "fmt", "--check", ] - -# Lint -# | task | type | cwd | -# | ----------- | --------- | --- | -# | lint | composite | | -# | lint-rust | command | | -# | lint-vstyle | command | | diff --git a/makefiles/lint-fix.toml b/makefiles/lint-fix.toml index 5aada462..aa2f8a4f 100644 --- a/makefiles/lint-fix.toml +++ b/makefiles/lint-fix.toml @@ -1,12 +1,5 @@ # Rust workspace tasks: Lint Fix. -# Lint Fix -# | task | type | cwd | -# | --------------- | --------- | --- | -# | lint-fix | composite | | -# | lint-fix-rust | command | | -# | lint-fix-vstyle | command | | - [tasks.lint-fix] workspace = false dependencies = [ @@ -55,15 +48,3 @@ args = [ "--all-features", "--strict", ] - -# Research -# | task | type | cwd | -# | --------------------------------------- | --------- | --- | -# | external-memory-radar | command | | -# | external-memory-radar-artifact | composite | | -# | external-memory-radar-artifact-json | command | | -# | external-memory-radar-artifact-validate | command | | -# | external-memory-radar-dry-run | composite | | -# | external-memory-radar-dry-run-json | command | | -# | external-memory-radar-dry-run-validate | command | | -# | external-memory-radar-validate | command | | diff --git a/makefiles/lint.toml b/makefiles/lint.toml index 1cedd668..a09517af 100644 --- a/makefiles/lint.toml +++ b/makefiles/lint.toml @@ -1,12 +1,5 @@ # Rust workspace tasks: Lint. 
-# Lint -# | task | type | cwd | -# | ----------- | --------- | --- | -# | lint | composite | | -# | lint-rust | command | | -# | lint-vstyle | command | | - [tasks.lint] workspace = false dependencies = [ @@ -52,10 +45,3 @@ args = [ "--workspace", "--all-features", ] - -# Lint Fix -# | task | type | cwd | -# | --------------- | --------- | --- | -# | lint-fix | composite | | -# | lint-fix-rust | command | | -# | lint-fix-vstyle | command | | diff --git a/makefiles/research.toml b/makefiles/research.toml index 1c9db279..45b5770c 100644 --- a/makefiles/research.toml +++ b/makefiles/research.toml @@ -1,17 +1,5 @@ # Rust workspace tasks: Research. -# Research -# | task | type | cwd | -# | --------------------------------------- | --------- | --- | -# | external-memory-radar | command | | -# | external-memory-radar-artifact | composite | | -# | external-memory-radar-artifact-json | command | | -# | external-memory-radar-artifact-validate | command | | -# | external-memory-radar-dry-run | composite | | -# | external-memory-radar-dry-run-json | command | | -# | external-memory-radar-dry-run-validate | command | | -# | external-memory-radar-validate | command | | - [tasks.external-memory-radar] workspace = false command = "cargo" @@ -127,16 +115,3 @@ args = [ "--cursor", "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json", ] - -# Smoke -# | task | type | cwd | -# | ---------------------------------- | --------- | --- | -# | smoke-graphify-docker-graph-report | command | | -# | smoke-graphiti-zep-docker-temporal | command | | -# | smoke-graphrag-docker | command | | -# | smoke-letta-core-archive-export-readback | command | | -# | smoke-lightrag-docker-context | command | | -# | smoke-ragflow-docker | command | | -# | smoke-real-world-job | composite | | -# | smoke-real-world-job-json | command | | -# | smoke-real-world-job-report | command | | diff --git a/makefiles/smoke.toml b/makefiles/smoke.toml index 88c4e494..43b9874d 100644 --- a/makefiles/smoke.toml 
+++ b/makefiles/smoke.toml @@ -1,18 +1,5 @@ # Rust workspace tasks: Smoke. -# Smoke -# | task | type | cwd | -# | ---------------------------------- | --------- | --- | -# | smoke-graphify-docker-graph-report | command | | -# | smoke-graphiti-zep-docker-temporal | command | | -# | smoke-graphrag-docker | command | | -# | smoke-letta-core-archive-export-readback | command | | -# | smoke-lightrag-docker-context | command | | -# | smoke-ragflow-docker | command | | -# | smoke-real-world-job | composite | | -# | smoke-real-world-job-json | command | | -# | smoke-real-world-job-report | command | | - [tasks.smoke-graphify-docker-graph-report] workspace = false command = "bash" @@ -102,12 +89,3 @@ args = [ "--out", "tmp/real-world-job/real-world-job-smoke-report.md", ] - -# Test -# | task | type | cwd | -# | --------------------- | --------- | --- | -# | test | composite | | -# | test-e2e | command | | -# | test-rust | command | | -# | test-rust-all | command | | -# | test-rust-integration | command | | diff --git a/makefiles/test.toml b/makefiles/test.toml index 4245ab58..9ee899d8 100644 --- a/makefiles/test.toml +++ b/makefiles/test.toml @@ -1,14 +1,5 @@ # Rust workspace tasks: Test. 
-# Test -# | task | type | cwd | -# | --------------------- | --------- | --- | -# | test | composite | | -# | test-e2e | command | | -# | test-rust | command | | -# | test-rust-all | command | | -# | test-rust-integration | command | | - [tasks.test] clear = true workspace = false diff --git a/scripts/materialize-explicit-qrels.py b/scripts/materialize-explicit-qrels.py new file mode 100755 index 00000000..779abd2f --- /dev/null +++ b/scripts/materialize-explicit-qrels.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +"""Generate explicit relevance-judgment fixtures from real-world job fixtures.""" + +from __future__ import annotations + +import argparse +import json +import shutil +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +SCHEMA = "elf.real_world_explicit_qrel_materialization/v1" +JOB_SCHEMA = "elf.real_world_job/v1" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Copy real_world_job fixtures and derive expected_answer.relevance_judgments " + "from checked-in evidence_links/required_evidence." 
+ ) + ) + parser.add_argument("--fixtures", required=True, type=Path, help="Input fixture directory.") + parser.add_argument("--out-fixtures", required=True, type=Path, help="Generated fixture directory.") + parser.add_argument( + "--summary-out", + required=True, + type=Path, + help="Write materialization summary JSON.", + ) + parser.add_argument( + "--ranked-candidates-source", + choices=["none", "oracle"], + default="none", + help="Optionally add fixture-trace ranked candidates ordered by qrel grade.", + ) + parser.add_argument( + "--profile", + choices=["preserve", "generated_public"], + default="preserve", + help="Preserve original corpus profile or mark generated jobs as generated_public.", + ) + parser.add_argument( + "--exclude-without-positive-qrels", + action="store_true", + help="Do not copy job JSON files that have no positive derived qrels.", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Replace existing relevance_judgments instead of preserving explicit grades.", + ) + + return parser.parse_args() + + +def read_json(path: Path) -> Any: + with path.open(encoding="utf-8") as fh: + return json.load(fh) + + +def write_json(path: Path, value: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + json.dump(value, fh, indent=2, sort_keys=False) + fh.write("\n") + + +def stable_unique(values: list[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for value in values: + if value and value not in seen: + seen.add(value) + result.append(value) + + return result + + +def evidence_link_ids(value: Any) -> list[str]: + if isinstance(value, str): + return [value] + if isinstance(value, list): + return [item for item in value if isinstance(item, str)] + + return [] + + +def corpus_evidence_ids(job: dict[str, Any]) -> list[str]: + return [ + item["evidence_id"] + for item in job.get("corpus", {}).get("items", []) + if isinstance(item, dict) and 
isinstance(item.get("evidence_id"), str) + ] + + +def derive_positive_grades(job: dict[str, Any]) -> dict[str, float]: + grades: dict[str, float] = {} + expected = job.get("expected_answer", {}) + + for link in expected.get("evidence_links", {}).values(): + for evidence_id in evidence_link_ids(link): + grades[evidence_id] = max(grades.get(evidence_id, 0.0), 2.0) + + for evidence in job.get("required_evidence", []): + if isinstance(evidence, dict) and isinstance(evidence.get("evidence_id"), str): + grades[evidence["evidence_id"]] = max(grades.get(evidence["evidence_id"], 0.0), 1.0) + + return grades + + +def existing_qrel_grades(job: dict[str, Any]) -> dict[str, float]: + grades: dict[str, float] = {} + expected = job.get("expected_answer", {}) + for judgment in expected.get("relevance_judgments", []): + if not isinstance(judgment, dict) or not isinstance(judgment.get("evidence_id"), str): + continue + grade = judgment.get("grade", 1.0) + if isinstance(grade, (int, float)): + grades[judgment["evidence_id"]] = float(grade) + + return grades + + +def materialized_qrels(job: dict[str, Any], overwrite: bool) -> list[dict[str, Any]]: + evidence_ids = corpus_evidence_ids(job) + grades = derive_positive_grades(job) + + if not overwrite: + grades.update(existing_qrel_grades(job)) + + if not any(grade > 0.0 for grade in grades.values()): + return [] + + return [ + {"evidence_id": evidence_id, "grade": grades.get(evidence_id, 0.0)} + for evidence_id in evidence_ids + if evidence_id in grades + ] + + +def ranked_candidates_from_qrels(qrels: list[dict[str, Any]]) -> list[str]: + return [ + judgment["evidence_id"] + for judgment in sorted( + qrels, + key=lambda judgment: ( + -float(judgment.get("grade", 0.0)), + str(judgment.get("evidence_id", "")), + ), + ) + if judgment.get("evidence_id") + ] + + +def add_oracle_ranked_candidates(job: dict[str, Any], qrels: list[dict[str, Any]]) -> bool: + answer = job.get("corpus", {}).get("adapter_response", {}).get("answer") + if not 
isinstance(answer, dict): + return False + + trace = answer.setdefault("trace_explainability", {}) + trace["ranked_candidate_evidence_ids"] = ranked_candidates_from_qrels(qrels) + trace.setdefault("trace_id", f"{job.get('job_id', 'unknown')}-explicit-qrel-oracle") + + return True + + +def materialize_job( + source: Path, + target: Path, + args: argparse.Namespace, +) -> dict[str, Any]: + job = read_json(source) + if not isinstance(job, dict) or job.get("schema") != JOB_SCHEMA: + shutil.copy2(source, target) + return {"kind": "copied_non_job_json"} + + qrels = materialized_qrels(job, overwrite=args.overwrite) + if not qrels and args.exclude_without_positive_qrels: + return { + "kind": "excluded_without_positive_qrels", + "job_id": job.get("job_id"), + } + + ranked_candidate_added = False + if qrels: + expected = job.setdefault("expected_answer", {}) + had_existing_qrels = bool(expected.get("relevance_judgments")) + expected["relevance_judgments"] = qrels + tags = stable_unique([*job.get("tags", []), "explicit_qrels_generated"]) + job["tags"] = tags + + if args.profile == "generated_public": + job.setdefault("corpus", {})["profile"] = "generated_public" + + if args.ranked_candidates_source == "oracle": + ranked_candidate_added = add_oracle_ranked_candidates(job, qrels) + + write_json(target, job) + return { + "kind": "materialized_job", + "job_id": job.get("job_id"), + "judgment_count": len(qrels), + "positive_judgment_count": sum(1 for judgment in qrels if judgment["grade"] > 0.0), + "zero_grade_judgment_count": sum(1 for judgment in qrels if judgment["grade"] == 0.0), + "unjudged_corpus_evidence_count": len(corpus_evidence_ids(job)) - len(qrels), + "had_existing_qrels": had_existing_qrels, + "ranked_candidate_added": ranked_candidate_added, + } + + shutil.copy2(source, target) + return { + "kind": "copied_without_positive_qrels", + "job_id": job.get("job_id"), + } + + +def materialize(args: argparse.Namespace) -> dict[str, Any]: + if not args.fixtures.is_dir(): + 
raise SystemExit(f"{args.fixtures} is not a directory") + + if args.out_fixtures.exists(): + shutil.rmtree(args.out_fixtures) + args.out_fixtures.mkdir(parents=True) + + records: list[dict[str, Any]] = [] + for source in sorted(args.fixtures.rglob("*")): + rel = source.relative_to(args.fixtures) + target = args.out_fixtures / rel + if source.is_dir(): + target.mkdir(parents=True, exist_ok=True) + continue + if source.suffix == ".json": + records.append(materialize_job(source, target, args)) + else: + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, target) + + materialized = [record for record in records if record["kind"] == "materialized_job"] + excluded = [record for record in records if record["kind"] == "excluded_without_positive_qrels"] + + summary = { + "schema": SCHEMA, + "generated_at": datetime.now(UTC).isoformat().replace("+00:00", "Z"), + "input_fixture_dir": str(args.fixtures), + "output_fixture_dir": str(args.out_fixtures), + "ranked_candidates_source": args.ranked_candidates_source, + "profile": args.profile, + "exclude_without_positive_qrels": args.exclude_without_positive_qrels, + "overwrite": args.overwrite, + "job_count": len(materialized), + "excluded_without_positive_qrels_count": len(excluded), + "judgment_count": sum(record["judgment_count"] for record in materialized), + "positive_judgment_count": sum(record["positive_judgment_count"] for record in materialized), + "zero_grade_judgment_count": sum(record["zero_grade_judgment_count"] for record in materialized), + "unjudged_corpus_evidence_count": sum( + record["unjudged_corpus_evidence_count"] for record in materialized + ), + "existing_qrel_job_count": sum(1 for record in materialized if record["had_existing_qrels"]), + "ranked_candidate_job_count": sum( + 1 for record in materialized if record["ranked_candidate_added"] + ), + "excluded_job_ids": [record.get("job_id") for record in excluded], + "claim_boundary": ( + "Derived qrels are deterministic benchmark labels 
from checked-in evidence links and " + "required_evidence. Unmentioned corpus evidence remains unjudged instead of being " + "converted into synthetic negative labels. Oracle ranked candidates test metric " + "mechanics only; they are not product-runtime retrieval evidence or leaderboard proof." + ), + } + + write_json(args.summary_out, summary) + return summary + + +def main() -> None: + args = parse_args() + summary = materialize(args) + print( + "materialized explicit qrels: " + f"{summary['job_count']} jobs, " + f"{summary['judgment_count']} judgments, " + f"{summary['ranked_candidate_job_count']} ranked-candidate traces" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh index 163c4d1f..8afc80d5 100755 --- a/scripts/real-world-docker.sh +++ b/scripts/real-world-docker.sh @@ -45,6 +45,11 @@ memory-live-adapters) docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb fi docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_REAL_WORLD_LIVE_REPORT_DIR \ + -e ELF_REAL_WORLD_LIVE_FIXTURES \ + -e ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES \ + -e ELF_REAL_WORLD_LIVE_WORK_DIR \ + -e ELF_REAL_WORLD_QMD_DIR \ -e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW \ -e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG \ -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG \ @@ -123,6 +128,15 @@ memory-live-adapters) fi exit "$status" ;; +memory-live-explicit-qrels) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_FIXTURES \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_OPERATOR_DEBUG_FIXTURES \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_WORK_DIR \ + -e ELF_REAL_WORLD_QMD_DIR \ + baseline-runner bash scripts/real-world-live-explicit-qrels.sh + ;; *) echo "unknown real-world Docker profile: $profile" >&2 exit 2 diff --git a/scripts/real-world-explicit-qrels.sh b/scripts/real-world-explicit-qrels.sh 
new file mode 100755 index 00000000..ccd17cf1 --- /dev/null +++ b/scripts/real-world-explicit-qrels.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/explicit-qrels}" +SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" +QREL_FIXTURE_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_OUT_FIXTURES:-${REPORT_DIR}/fixtures}" + +cd "${ROOT_DIR}" + +python3 scripts/materialize-explicit-qrels.py \ + --fixtures "${SOURCE_FIXTURE_DIR}" \ + --out-fixtures "${QREL_FIXTURE_DIR}" \ + --summary-out "${REPORT_DIR}/materialization-summary.json" \ + --ranked-candidates-source oracle \ + --profile generated_public \ + --exclude-without-positive-qrels + +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + run \ + --fixtures "${QREL_FIXTURE_DIR}" \ + --out "${REPORT_DIR}/report.json" \ + --run-id real-world-memory-explicit-qrels \ + --adapter-id fixture_explicit_qrels \ + --adapter-name "Explicit qrel oracle fixture pack" \ + --adapter-behavior explicit_qrel_oracle_fixture \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Generated by scripts/materialize-explicit-qrels.py from checked-in evidence_links and required_evidence; unmentioned corpus evidence remains unjudged; oracle ranked candidates test metric mechanics only." 
+ +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + publish \ + --report "${REPORT_DIR}/report.json" \ + --out "${REPORT_DIR}/report.md" + +echo "Explicit qrel benchmark report:" +echo " ${REPORT_DIR}/materialization-summary.json" +echo " ${REPORT_DIR}/report.json" +echo " ${REPORT_DIR}/report.md" diff --git a/scripts/real-world-live-explicit-qrels.sh b/scripts/real-world-live-explicit-qrels.sh new file mode 100755 index 00000000..35212ac1 --- /dev/null +++ b/scripts/real-world-live-explicit-qrels.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-explicit-qrels}" +SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" +OPERATOR_SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_OPERATOR_DEBUG_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}" +QREL_FIXTURE_DIR="${REPORT_DIR}/explicit-qrel-fixtures" +QREL_OPERATOR_FIXTURE_DIR="${REPORT_DIR}/explicit-qrel-operator-debug-fixtures" +LIVE_REPORT_DIR="${REPORT_DIR}/live-adapters" +LIVE_WORK_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_WORK_DIR:-/bench/real-world-live-explicit-qrels}" + +if [[ ! -f "/.dockerenv" && "${ELF_REAL_WORLD_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live explicit-qrel adapters outside Docker. Use cargo make real-world-memory-live-explicit-qrels." >&2 + exit 1 +fi + +for cmd in bash jq python3; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in live explicit-qrel runner." 
>&2 + exit 1 + fi +done + +cd "${ROOT_DIR}" + +rm -rf "${REPORT_DIR}" +mkdir -p "${REPORT_DIR}" + +python3 scripts/materialize-explicit-qrels.py \ + --fixtures "${SOURCE_FIXTURE_DIR}" \ + --out-fixtures "${QREL_FIXTURE_DIR}" \ + --summary-out "${REPORT_DIR}/memory-materialization-summary.json" \ + --ranked-candidates-source none \ + --profile generated_public \ + --exclude-without-positive-qrels + +python3 scripts/materialize-explicit-qrels.py \ + --fixtures "${OPERATOR_SOURCE_FIXTURE_DIR}" \ + --out-fixtures "${QREL_OPERATOR_FIXTURE_DIR}" \ + --summary-out "${REPORT_DIR}/operator-debug-materialization-summary.json" \ + --ranked-candidates-source none \ + --profile generated_public \ + --exclude-without-positive-qrels + +ELF_REAL_WORLD_LIVE_REPORT_DIR="${LIVE_REPORT_DIR}" \ + ELF_REAL_WORLD_LIVE_FIXTURES="${QREL_FIXTURE_DIR}" \ + ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES="${QREL_OPERATOR_FIXTURE_DIR}" \ + ELF_REAL_WORLD_LIVE_WORK_DIR="${LIVE_WORK_DIR}" \ + ELF_REAL_WORLD_LIVE_ELF_RUN_ID="real-world-memory-live-explicit-qrels-elf" \ + ELF_REAL_WORLD_LIVE_QMD_RUN_ID="real-world-memory-live-explicit-qrels-qmd" \ + ELF_REAL_WORLD_LIVE_COMBINED_RUN_ID="real-world-memory-live-elf-qmd-explicit-qrels-quantitative" \ + bash scripts/real-world-live-adapters.sh + +jq -n \ + --slurpfile memory_summary "${REPORT_DIR}/memory-materialization-summary.json" \ + --slurpfile operator_summary "${REPORT_DIR}/operator-debug-materialization-summary.json" \ + --slurpfile live_summary "${LIVE_REPORT_DIR}/summary.json" \ + '{ + schema: "elf.real_world_live_explicit_qrels_sweep/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR // "tmp/real-world-memory/live-explicit-qrels"), + live_report_dir: "tmp/real-world-memory/live-explicit-qrels/live-adapters", + materialization: { + memory: $memory_summary[0], + operator_debugging_ux: $operator_summary[0] + }, + live_summary: $live_summary[0], + boundary: "Input fixtures have deterministic 
explicit qrels, but ranked candidates are product-runtime traces from the live adapters. This improves qrel-source evidence only; leaderboard claims still require pass rows, full ranked coverage, held-out/leakage audit evidence, and paired significance." + }' >"${REPORT_DIR}/summary.json" + +echo "Live explicit-qrel adapter reports:" +echo " ${REPORT_DIR}/memory-materialization-summary.json" +echo " ${REPORT_DIR}/operator-debug-materialization-summary.json" +echo " ${LIVE_REPORT_DIR}/elf-report.json" +echo " ${LIVE_REPORT_DIR}/qmd-report.json" +echo " ${LIVE_REPORT_DIR}/qmd-quantitative-product-manifest.json" +echo " ${LIVE_REPORT_DIR}/elf-qmd-quantitative-report.json" +echo " ${LIVE_REPORT_DIR}/elf-qmd-quantitative-report.md" +echo " ${REPORT_DIR}/summary.json"