From c013899e0a90bdb18c2a20b9ee22bdd84d408431 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 10:42:59 -0400 Subject: [PATCH 01/58] {"schema":"decodex/commit/1","summary":"Port explicit qrel benchmark rescue slice","authority":"manual"} --- .../agent_memory_quantitative_benchmark_v1.md | 764 +++++++++++++----- makefiles/benchmark-core.toml | 97 +-- makefiles/benchmark-memory-a.toml | 7 + makefiles/benchmark-memory-b.toml | 8 + scripts/materialize-explicit-qrels.py | 290 +++++++ scripts/real-world-docker.sh | 14 + scripts/real-world-explicit-qrels.sh | 39 + scripts/real-world-live-explicit-qrels.sh | 80 ++ 8 files changed, 1021 insertions(+), 278 deletions(-) create mode 100755 scripts/materialize-explicit-qrels.py create mode 100755 scripts/real-world-explicit-qrels.sh create mode 100755 scripts/real-world-live-explicit-qrels.sh diff --git a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md index 5974e4bf..265a71c1 100644 --- a/docs/spec/agent_memory_quantitative_benchmark_v1.md +++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md @@ -1,216 +1,608 @@ --- type: Spec title: "Agent Memory Quantitative Benchmark v1" -description: "Define the public quantitative competitor scoreboard row contract and claim boundaries." +description: "Define quantitative same-corpus memory benchmark metrics, formulas, evidence classes, and claim boundaries." 
resource: docs/spec/agent_memory_quantitative_benchmark_v1.md status: active authority: normative owner: spec -last_verified: 2026-06-27 +last_verified: 2026-06-23 tags: - docs - spec - benchmarking - agent-memory -source_refs: - - XY-1098 - - XY-1120 +source_refs: [] code_refs: + - Makefile.toml + - makefiles/benchmark-memory-a.toml + - makefiles/benchmark-memory-b.toml + - scripts/materialize-explicit-qrels.py + - scripts/real-world-explicit-qrels.sh + - scripts/real-world-docker.sh + - scripts/real-world-live-explicit-qrels.sh + - apps/elf-eval/src/app.rs - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - - apps/elf-eval/tests/real_world_job_benchmark.rs + - apps/elf-eval/fixtures/real_world_memory/p1_closeout/source_candidate_approval_recall.json + - apps/elf-eval/fixtures/real_world_external_adapters/memory_projects_manifest.json related: + - docs/spec/agent_memory_knowledge_system_v1.md - docs/spec/real_world_agent_memory_benchmark_v1.md - - docs/evidence/benchmarking/2026-06-27-public-quantitative-competitor-scoreboard-report.md + - docs/evidence/benchmarking/2026-06-23-p4-quality-hardening-productization-readiness-report.md + - docs/evidence/benchmarking/2026-06-23-p3-competitor-strength-absorption-report.md drift_watch: - docs/spec/agent_memory_quantitative_benchmark_v1.md + - Makefile.toml + - makefiles/benchmark-memory-a.toml + - makefiles/benchmark-memory-b.toml + - scripts/materialize-explicit-qrels.py + - scripts/real-world-explicit-qrels.sh + - scripts/real-world-docker.sh + - scripts/real-world-live-explicit-qrels.sh + - docs/spec/agent_memory_knowledge_system_v1.md - docs/spec/real_world_agent_memory_benchmark_v1.md - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json + - apps/elf-eval/src/app.rs + - docs/evidence/benchmarking/index.md --- # Agent Memory Quantitative Benchmark v1 -Purpose: Define the public quantitative 
competitor scoreboard row contract and claim -boundaries. +Purpose: Define the quantitative scoreboard that must sit beside ELF's existing +typed real-world memory benchmark reports. Status: normative -Read this when: You are implementing, validating, or publishing the public -competitor-quality scoreboard for agent memory systems. -Not this document: Real-world job fixture schema, Work Journal behavior, operational -runbooks, or external adapter setup procedures. -Defines: `elf.quality_scoreboard/v1` quantitative rows, metrics, comparability gates, -typed non-pass behavior, and optimization-direction metadata. - -## Scope - -The quantitative scoreboard turns `real_world_job` reports and external adapter -manifest records into public product rows. It is a row-level evidence contract, not a -universal leaderboard. It is allowed to say which metrics are proven for a row, which -competitor strengths remain visible, and which evidence is missing before a row can be -treated as comparable. - -This contract applies to reports with schema `elf.quality_scoreboard/v1`. - -## Scoreboard Report - -A report MUST include: - -- `schema`: exactly `elf.quality_scoreboard/v1`. -- `result_states`: the public row-state enum. -- `evidence_classes`: the public evidence-class enum. -- `metric_basis`: the ranking basis used for retrieval metrics. -- `retrieval_k`: the `k` used for recall, precision, MRR, and nDCG. -- typed non-pass counts and visible typed non-pass states for encoded jobs, external - adapter rows, and the aggregate report. -- evidence-class counts. -- bounded encoded-job and aggregate summary claims. -- `unqualified_win_claim_allowed`, which MUST be `false` when any typed non-pass row - or non-comparable row exists. -- `claim_boundary`, a human-readable statement that prevents typed blockers or - fixture-only evidence from becoming broad superiority claims. 
-- `rows`: one row for ELF plus one row for each tracked external product represented - by the loaded adapter manifest. -- `optimization_roadmap`: concrete next optimization directions derived from missing - row evidence, not from hidden assumptions. - -## Public Row States +Read this when: You are adding or reviewing recall, freshness, update, delete, +expiry, latency, cost, or competitor-comparison metrics for agent memory systems. +Not this document: A finished benchmark report, a claim that current results beat +every competitor, or a replacement for typed non-pass outcome reporting. +Defines: `elf.agent_memory_quantitative_benchmark/v1`, required metric families, +formulas, denominators, evidence classes, comparability rules, and minimum report +rows. + +## Core Rule + +Quantitative memory comparison must measure the exact behavior users care about: +finding the right evidence, using current facts, suppressing stale or deleted facts, +showing citations, and staying within latency/cost/resource bounds. + +A report must not use broad product labels such as "best memory" or "beats OpenKB" +unless the specific metric row is same-corpus, same-task, same-evidence-class, +same-candidate-source, same-denominator, and leaderboard eligible. Typed non-pass +states remain first-class results. + +## Evidence Classes + +Every quantitative row must declare one evidence class: + +| Evidence class | Meaning | Comparable for leaderboard | +| --- | --- | --- | +| `fixture_backed` | Checked-in fixture scored by ELF's runner. | Only against other fixture rows with the same corpus and task. | +| `live_baseline` | Docker-contained baseline or smoke run that may not execute real-world answer jobs. | No, unless the report states the exact same scored task. | +| `live_real_world` | Runtime executed the same real-world job prompt and produced scored answer artifacts. | Yes, when same-corpus and same-task. 
| +| `public_proxy` | Local proxy contract based on public docs or expected artifact shape, not a product runtime. | No product leaderboard claim. | +| `private_corpus` | Operator-owned private corpus with publishable bounded metrics only. | Yes only for private-corpus rows with matching policy. | +| `provider_backed` | Provider credentials/models were used and cost/latency are measured. | Yes only against rows with equivalent provider boundary. | +| `research_gate` | Research-only, blocked, or reference-only evidence. | No. | +| `mixed_evidence` | Aggregate row blends multiple evidence classes. | No; split rows before leaderboard use. | + +## Result States + +Every row must declare one result state: | State | Meaning | | --- | --- | -| `pass` | The row has a scored pass under its evidence class. A pass is comparable only when every comparability gate is also true. | -| `wrong_result` | The adapter or job reached the behavioral check but selected the wrong answer, evidence, lifecycle state, or action. | -| `incomplete` | Setup, build, parse, adapter wiring, or runtime execution did not reach the behavioral check. | -| `blocked` | The row cannot be completed safely without missing credentials, private input, durable runtime integration, Docker evidence, or manual product setup. | -| `not_tested` | No benchmark execution or comparable adapter output exists for the row. | -| `not_encoded` | The suite, scoring dimension, or adapter path is not implemented in the runner. | -| `not_comparable` | The row has useful evidence but lacks one or more required comparability gates, so it must not be used as a product-runtime comparison pass. | -| `unsupported_claim` | The row or source report made a substantive claim not supported by corpus evidence, source refs, or report metadata. | - -`not_comparable` is a public row state only. It is not a `real_world_job` status and -must not be written back into job or suite outcome fields. 
+| `pass` | The metric is measured and meets the row threshold. | +| `wrong_result` | The task ran but selected the wrong answer, wrong evidence, or wrong lifecycle state. | +| `incomplete` | Some required artifacts exist, but the metric denominator is not fully satisfied. | +| `blocked` | Required setup, credentials, corpus, exported artifact, or product readback is missing. | +| `not_encoded` | The adapter or benchmark does not implement this metric. | +| `not_comparable` | A metric exists but evidence class, corpus, task, or denominator differs. | +| `unsupported_claim` | The output makes a claim that the evidence cannot support. | + +Metric states are separate from row result states. A metric state of `measured` +means the denominator is non-zero and the row has no typed non-pass state; it does +not mean the value passed a leaderboard threshold. If the row result is +`blocked`, `wrong_result`, `incomplete`, `not_encoded`, or `unsupported_claim`, +metric states for measured values must inherit that non-pass state. + +Metric states may also use `partial_coverage` when a formula is computable for +some queries but the row lacks full ranked-candidate coverage or the minimum query +count required for leaderboard use. `partial_coverage` values are useful regression +evidence, not product-ranking proof. + +## Retrieval Metrics + +Retrieval metrics apply when a job has relevance labels and an ordered candidate +list. The report must name `k` for every `@k` metric. A row must also declare whether +ranked candidates came from a product/runtime trace or a fixture trace; fixture traces +are formula smoke tests unless the compared product emitted the same artifact shape. +Explicit qrels live in `expected_answer.relevance_judgments` as +`{ "evidence_id": "...", "grade": 0.0 }` records. 
If a legacy fixture omits qrels, +the runner may derive binary relevance from required evidence for regression use, +but that row must expose `qrel_source = expected_evidence_fallback` and must not +become leaderboard eligible. + +`cargo make real-world-memory-explicit-qrels` is the deterministic qrel +materialization command for fixture-mechanics evidence. It derives positive qrels +from checked-in `expected_answer.evidence_links` and `required_evidence`, preserves +existing explicit zero-grade judgments, and leaves unmentioned corpus evidence +unjudged instead of converting it into synthetic negative labels. Its optional +oracle ranked candidates are allowed only to prove metric mechanics; they are not +product-runtime retrieval evidence and cannot satisfy leaderboard runtime, held-out, +or leakage-audit gates. + +`cargo make real-world-memory-live-explicit-qrels` is the current product-runtime +bridge from deterministic qrel materialization to ELF/qmd live adapter scoring. It +must materialize explicit qrels with `--ranked-candidates-source none`, then let +the live adapters emit their own runtime ranked candidates. This command can close +the `qrel_source` gap for product-runtime rows, but it does not itself prove +held-out status, leakage audit status, or clean leaderboard eligibility. 
+ +| Metric | Formula | Required fields | +| --- | --- | --- | +| `recall_at_k` | `relevant_returned_in_top_k / expected_relevant_count` | relevance labels, explicit `ranked_candidate_evidence_ids`, `k` | +| `precision_at_k` | `relevant_returned_in_top_k / k` | ordered candidates, relevance labels | +| `mrr` | `1 / rank(first_relevant)` or `0` when no relevant item appears | ordered candidates, relevance labels | +| `ndcg_at_k` | `dcg_at_k / ideal_dcg_at_k` using graded relevance when available, binary otherwise | ordered candidates, relevance grades | +| `map` | Mean of per-query average precision values | ordered candidates, relevance labels | +| `average_precision` | Per-query sum of precision at each relevant hit divided by expected relevant count | ordered candidates, relevance labels | +| `success_at_k` | Query has at least one relevant candidate in the top `k` | ordered candidates, relevance labels, `k` | +| `expected_evidence_recall` | `produced_required_evidence_count / required_evidence_count` | required evidence map, produced evidence ids | +| `citation_coverage` | `claims_with_valid_citation / claims_requiring_citation` | claim list, citation validation result | +| `source_ref_coverage` | `claims_with_valid_source_ref / claims_requiring_source_ref` | source-ref validation result | + +Retrieval metrics must not count redacted, excluded, deleted, expired, unreadable, or +non-captured source spans as relevant current evidence. Such candidates may be +reported separately as historical or diagnostic rows. + +## Memory Lifecycle Metrics + +Memory lifecycle metrics apply to jobs that encode state changes over time. + +| Metric | Formula | What it proves | +| --- | --- | --- | +| `update_correctness_rate` | `jobs_selecting_current_superseding_fact / update_jobs` | New facts replace old facts for current answers. | +| `stale_suppression_rate` | `stale_facts_not_used_as_current / stale_fact_opportunities` | Stale facts do not pollute current answers. 
| +| `delete_suppression_rate` | `deleted_or_tombstoned_facts_not_used / delete_opportunities` | Deleted or tombstoned facts do not reappear as current context. | +| `expiry_suppression_rate` | `expired_facts_not_used / expiry_opportunities` | TTL or time-bounded facts are suppressed after expiry. | +| `rollback_readback_rate` | `rollback_events_with_readback / rollback_events_expected` | Rollback and prior versions remain auditable. | +| `history_readback_rate` | `history_events_readable / history_events_expected` | Add, update, ignore, reject, delete, restore, and derived transitions are visible. | +| `contradiction_resolution_rate` | `contradictions_resolved_to_current_supported_answer / contradiction_opportunities` | Mutually inconsistent memories are resolved with current source support instead of arbitrary retrieval order. | + +The denominator must be explicit. A benchmark with no delete jobs must report +`delete_suppression_rate = not_encoded`, not `1.000`. + +## Answer Safety Metrics + +| Metric | Formula | +| --- | --- | +| `unsupported_claim_rate` | `unsupported_claim_count / answer_claim_count` | +| `stale_answer_rate` | `answers_using_stale_fact_as_current / answered_jobs` | +| `hallucinated_evidence_rate` | `citations_not_in_candidate_or_source_set / citation_count` | +| `redaction_leak_count` | Count of private, excluded, or redacted spans surfaced in public output. | +| `irrelevant_context_ratio` | `irrelevant_context_items / returned_context_items` | +| `scope_violation_count` | Count of unreadable cross-scope or grant-violating rows returned. | -## Evidence Classes +Zero values are meaningful only when the denominator is non-zero and the checked row +actually exercises the failure mode. + +## Operational Metrics -| Evidence class | Meaning | +| Metric | Required unit | | --- | --- | -| `fixture_backed` | Checked-in fixtures were scored. This is regression evidence, not live product-runtime evidence. 
| -| `live_baseline` | Docker live-baseline retrieval or lifecycle evidence exists, but the row is not a real-world product-runtime scoreboard pass. | -| `live_real_world` | A live adapter executed real-world job paths and emitted typed outcomes. | -| `research_gate` | Research, source mapping, setup, credential, or resource gates are recorded before fair scoring can run. | - -## Row Fields - -Each `rows[]` entry MUST include: - -- `product_id` and `product_name`. -- `row_source`: stable source label, such as `elf_report` or - `external_adapter_manifest`. -- `evidence_class`. -- `result_state`. -- `comparable`: true only when all comparability gates are satisfied and the row has a - pass state with quantitative metrics. -- comparability gates: - - `same_corpus` - - `source_id_mapped` - - `held_out` - - `leakage_audited` - - `product_runtime` - - `container_digest_identified` -- `metrics`. -- `strengths`: product strengths supported by the row source. -- `weaknesses`: typed weaknesses, blockers, or non-pass evidence from the row source. -- `next_evidence`: row-level evidence needed before the row can become comparable. -- `source_provenance`: bounded source pointers to the input report, adapter record, or - suite records. - -`same_corpus = true` requires positive row evidence that the product or checked-in -adapter is mapped to the benchmark corpus. A blocker sentence that says same-corpus -evidence is missing is not sufficient. A typed same-corpus setup-blocker adapter may -set this gate to true only when its source provenance identifies the intended shared -benchmark corpus and the remaining blocker is runtime/source-id output, not corpus -selection. - -## Metrics - -The `metrics` object MUST include `retrieval`, `lifecycle`, `answer_safety`, -`operations`, and `coverage` sub-objects. - -`retrieval` MUST include: - -- `k`. -- `metric_basis`. -- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg`, or `null` when the row lacks - ranked produced evidence. 
-- `expected_evidence_recall`. -- `citation_source_ref_coverage`. -- matched, total, and produced evidence counts. - -For `metric_basis = "produced_evidence_order"`, ranked retrieval metrics use the -ordered `produced_evidence` list in the scored job output as the retrieved list. -Expected evidence ids are the relevance set. Relevance is binary. `recall_at_k` and -`precision_at_k` use the first `k` produced evidence ids. MRR is reciprocal rank of -the first relevant produced evidence id. nDCG uses binary gains with the ideal DCG -bounded by `min(k, expected_evidence_total)`. - -`lifecycle` MUST include: - -- stale suppression rate and counts. -- update correctness rate and counts. -- delete correctness rate and counts. -- rollback/history readback rate and counts. - -`answer_safety` MUST include: - -- unsupported-claim rate and count. -- stale-answer rate and count. -- hallucinated-evidence rate when measurable. -- redaction leak count. -- irrelevant-context ratio. - -`operations` MUST include: - -- mean latency in milliseconds when measured. -- total cost when cost accounting exists. -- resource-envelope status, encoded job count, and pass count. - -`coverage` MUST include: - -- job count. -- encoded suite count. -- pass count. -- typed non-pass count. -- source-ref coverage. -- evidence coverage. -- evidence class. - -## Comparability Rules - -A row is comparable only when all of the following are true: - -- `same_corpus = true`. -- `source_id_mapped = true`. -- `held_out = true`. -- `leakage_audited = true`. -- `product_runtime = true`. -- `container_digest_identified = true`. -- `result_state = "pass"`. -- `recall_at_k`, `precision_at_k`, `mrr`, and `ndcg` are present. - -If any required gate is false, the report MUST set `comparable = false`, add a -specific `next_evidence` entry for each missing gate, and avoid any win, parity, or -rank claim for that row. 
If an otherwise passing row is missing a required gate, the -public row state SHOULD be `not_comparable` so the report is explicit about the -reason no product-runtime comparison claim is allowed. - -## Report Claim Rules - -- A row with `fixture_backed`, `live_baseline`, or `research_gate` evidence MUST NOT - be described as a comparable product-runtime pass. -- A row with `blocked`, `incomplete`, `not_tested`, `not_encoded`, `not_comparable`, - or `unsupported_claim` MUST remain visible as a non-pass row. -- External competitors MUST have either comparable product-runtime evidence or an - explicit typed non-pass/blocker row with source provenance. -- Missing Docker image digest evidence is a blocker for comparability, even if a live - adapter executed. -- Public-proxy, fixture-only, local-mock, diagnostic, blocked, and not-encoded rows - MUST NOT be promoted into universal product superiority claims. -- Optimization direction MUST be tied to row-level `next_evidence`, metrics, or typed - non-pass states. 
+| `ingestion_success_rate` | successful ingested records / records submitted | +| `indexing_coverage` | indexed records or spans / ingestible records or spans | +| `source_id_mapping_coverage` | returned candidates or generated claims mapped to benchmark source ids / candidates or claims requiring mapping | +| `query_latency_p50_ms`, `query_latency_p95_ms`, `query_latency_p99_ms` | milliseconds | +| `ingest_latency_ms` | milliseconds from submitted source to durable ingest acknowledgement | +| `update_propagation_latency_ms` | milliseconds from write/apply/delete to searchable/readable effect | +| `cold_start_recovery_seconds` | seconds | +| `restore_seconds` | seconds | +| `index_rebuild_seconds` | seconds | +| `cost_usd` | USD with input/output token counts where applicable | +| `available_context_token_count` | tokens available in the source corpus or memory store for the query | +| `answer_context_token_count` | tokens supplied to the answering model or final answer context | +| `context_token_efficiency` | `answer_context_token_count / available_context_token_count` | +| `resource_envelope_status` | pass, blocked, incomplete, not_encoded | + +Provider-backed rows must include model/provider identifiers or must remain +`not_comparable`. Fixture zero-cost rows must not imply hosted provider cost. 
+ +## Quantitative Scoreboard Schema + +Reports that implement this spec must emit: + +```json +{ + "schema": "elf.agent_memory_quantitative_benchmark/v1", + "generated_at": "...", + "corpus_id": "...", + "k_values": [1, 3, 5, 10], + "rows": [ + { + "product": "ELF", + "adapter_id": "elf_live_real_world", + "adapter_name": "ELF live real-world", + "suite": "memory_evolution", + "evidence_class": "live_real_world", + "result_state": "pass", + "comparable": true, + "metric_comparable": true, + "leaderboard_eligible": false, + "held_out": false, + "leakage_audited": false, + "audit_manifest_id": null, + "fixture_regression_only": false, + "sample_size": 40, + "ranking_query_count": 40, + "ranking_coverage_state": "measured", + "ranked_candidate_source": "runtime_trace", + "qrel_source": "explicit_qrels", + "explicit_qrel_query_count": 40, + "metrics": { + "recall_at_5": 1.0, + "precision_at_5": 0.4, + "mrr": 1.0, + "ndcg_at_5": 1.0, + "map": 1.0, + "average_precision": 1.0, + "success_at_5": 1.0, + "explicit_qrel_query_coverage": 1.0, + "relevance_judgment_count": 80, + "relevance_grade_sum": 160, + "update_correctness_rate": 1.0, + "stale_suppression_rate": 1.0, + "delete_suppression_rate": 1.0, + "expected_evidence_recall": 1.0, + "unsupported_claim_rate": 0.0, + "stale_answer_rate": 0.0 + }, + "metric_states": { + "recall_at_5": "measured", + "precision_at_5": "measured", + "mrr": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured", + "map": "measured", + "success_at_5": "measured" + }, + "denominators": { + "recall_at_5": 80, + "precision_at_5": 200, + "map": 40, + "success_at_5": 40, + "update_correctness_rate": 2, + "delete_suppression_rate": 1, + "stale_answer_rate": 40 + }, + "confidence_intervals": { + "recall_at_5": { + "method": "wilson_score", + "confidence": 0.95, + "lower": 0.954, + "upper": 1.0, + "numerator": 80, + "denominator": 80 + } + }, + "claim_boundary": "Comparable only against same-corpus live_real_world rows." 
+ } + ], + "per_query_rows": [ + { + "job_id": "memory-evolution-001", + "suite": "memory_evolution", + "evidence_class": "live_real_world", + "result_state": "pass", + "expected_relevant_count": 2, + "candidate_count": 8, + "qrel_source": "explicit_qrels", + "relevance_grade_sum": 4.0, + "product": "ELF", + "adapter_id": "elf_live_real_world", + "metrics": { + "recall_at_5": 1.0, + "precision_at_5": 0.4, + "mrr": 1.0, + "ndcg_at_5": 1.0, + "average_precision": 1.0, + "success_at_5": 1.0 + }, + "metric_states": { + "recall_at_5": "measured", + "precision_at_5": "measured", + "mrr": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured", + "success_at_5": "measured" + }, + "denominators": { + "recall_at_5": 2, + "precision_at_5": 5, + "mrr": 1, + "ndcg_at_5": 1, + "average_precision": 1, + "success_at_5": 1 + } + } + ], + "ablation_rows": [ + { + "product": "ELF", + "adapter_id": "elf_live_real_world", + "ablation_id": "raw_vector", + "job_id": "memory-evolution-001", + "suite": "memory_evolution", + "evidence_class": "live_real_world", + "result_state": "pass", + "candidate_source": "runtime_trace_ablation", + "qrel_source": "explicit_qrels", + "expected_relevant_count": 2, + "candidate_count": 8, + "metrics": { + "recall_at_5": 0.5, + "precision_at_5": 0.2, + "mrr": 0.5, + "ndcg_at_5": 0.62, + "average_precision": 0.5, + "success_at_5": 1.0 + }, + "metric_states": { + "recall_at_5": "measured", + "precision_at_5": "measured", + "mrr": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured", + "success_at_5": "measured" + }, + "denominators": { + "recall_at_5": 2, + "precision_at_5": 5, + "mrr": 1, + "ndcg_at_5": 1, + "average_precision": 1, + "success_at_5": 1 + }, + "claim_boundary": "Ablation rows score explicitly supplied candidate orderings for diagnosis; they are not separate product-runtime rows unless the evidence class and candidate source say so." 
+ } + ], + "significance": { + "method": "exact_two_sided_sign_test_on_same_query_metric_deltas", + "state": "not_encoded_single_product_row", + "eligible": false, + "minimum_paired_query_count": 30, + "comparable_product_row_count": 1, + "paired_query_count": 0, + "comparisons": [], + "ablation_comparisons": [ + { + "comparison_scope": "ablation", + "baseline_id": "raw_vector", + "candidate_id": "governed_memory", + "baseline_product": "raw_vector", + "candidate_product": "governed_memory", + "metric": "ndcg_at_5", + "paired_query_count": 1, + "state": "measured", + "effect_mean": 0.311, + "p_value": 1.0, + "win_count": 1, + "loss_count": 0, + "tie_count": 0 + } + ], + "claim_boundary": "Pairwise wins require at least two leaderboard-eligible rows with same-query per-query metrics; otherwise p-values and win claims stay not encoded." + }, + "leakage_audit": { + "state": "not_leaderboard_eligible", + "held_out": false, + "leakage_audited": false, + "corpus_profile": "synthetic", + "evidence_class": "fixture_backed", + "qrel_source": "explicit_qrels", + "fixture_regression_only": true, + "ranking_coverage_state": "partial_coverage", + "leaderboard_blocking_reasons": [ + "fixture_regression_only", + "insufficient_query_count", + "no_held_out_manifest", + "no_leakage_audit_manifest", + "not_live_real_world", + "ranking_coverage_not_measured" + ], + "claim_boundary": "Held-out and leakage-audit fields are explicit gates; fixture or non-audited rows cannot become public leaderboard evidence by omission." 
+ }, + "non_comparable_rows": [ + { + "product": "VectifyAI PageIndex", + "adapter_id": "pageindex_public_proxy_contract", + "result_state": "not_comparable", + "reason": "public_proxy evidence class; no PageIndex product runtime output" + } + ], + "controls": { + "same_corpus_required": true, + "same_task_required": true, + "same_evidence_class_required": true, + "same_budget_required": true, + "ranked_candidates_required_for_ranking_metrics": true, + "raw_ranked_candidate_artifacts_required": true, + "held_out_or_leakage_audited_required": true, + "explicit_relevance_judgments_required_for_leaderboard": true, + "per_query_rows_required_for_significance": true, + "minimum_query_count_for_leaderboard": 30, + "current_query_count": 40, + "current_ranking_query_count": 40, + "current_explicit_qrel_query_count": 40, + "comparable_product_row_count": 1, + "leaderboard_claim_allowed": false, + "statistical_significance": "not_encoded_until_at_least_two_same-corpus comparable product rows meet minimum query count, full ranking coverage, and explicit qrels", + "uncertainty_reporting": "single-row rates include Wilson 95% confidence intervals; competitor win claims require same-query paired significance over per-query rows.", + "leakage_control": "fixture rows are not public leaderboard proof; current product leaderboard rows require held-out and leakage-audited status plus an audit manifest id." + } +} +``` + +## External Product Row Import + +`real_world_job_benchmark run` may accept an optional +`--quantitative-product-manifest` file when a competitor adapter has already +materialized same-corpus product-runtime rows outside the current ELF fixture run. +The manifest schema is `elf.agent_memory_quantitative_product_manifest/v1`. +Generated reports infer the quantitative row `product` from the external adapter +manifest entry matching `--adapter-id`, with `--product` available only as an +explicit override for old or ad hoc reports. 
+ +Use `real_world_job_benchmark export-quantitative-product-manifest --report +REPORT_PATH` to derive this manifest from a generated `elf.real_world_job_report/v1` +instead of hand-writing metric rows. The export command copies the report's primary +aggregate row and matching per-query rows, rejects `ELF` self rows, and then runs +the same manifest validation used by import. The live qmd adapter sweep writes +`qmd-quantitative-product-manifest.json` and a combined +`elf-qmd-quantitative-report.json` so the same-corpus qmd row is visible in +`quantitative_scoreboard.rows` when fresh live artifacts exist. + +```json +{ + "schema": "elf.agent_memory_quantitative_product_manifest/v1", + "manifest_id": "qmd-live-real-world-2026-06-23", + "corpus_id": "...same value as quantitative_scoreboard.corpus_id...", + "rows": [ + { + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "held_out": false, + "leakage_audited": false, + "audit_manifest_id": null, + "metrics": { + "recall_at_5": 0.75, + "ndcg_at_5": 0.601, + "average_precision": 0.608 + }, + "metric_states": { + "recall_at_5": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured" + } + } + ], + "per_query_rows": [ + { + "product": "qmd", + "adapter_id": "qmd_live_real_world", + "job_id": "...", + "metrics": { + "recall_at_5": 0.75, + "ndcg_at_5": 0.601, + "average_precision": 0.608 + }, + "metric_states": { + "recall_at_5": "measured", + "ndcg_at_5": "measured", + "average_precision": "measured" + } + } + ] +} +``` + +The runner must reject imported rows unless: + +- the manifest `corpus_id` exactly matches the current scoreboard `corpus_id` +- each `(product, adapter_id)` matches an external adapter manifest record +- the product is not `ELF` +- aggregate rows and per-query rows carry the paired-comparison metrics + `recall_at_5`, `ndcg_at_5`, and `average_precision` +- ranked aggregate rows have at least `ranking_query_count` matching per-query rows + +Imported rows replace the matching 
`non_comparable_rows` entry, but they do not +automatically authorize leaderboard claims. A row marked `leaderboard_eligible` +must also be product-runtime evidence with `result_state = pass`, minimum ranked +query coverage, `ranked_candidate_source = runtime_trace`, `qrel_source = +explicit_qrels`, enough explicit qrels for every ranked query, `held_out = true`, +`leakage_audited = true`, and a non-empty `audit_manifest_id`. The current runner +requires both held-out and leakage-audit fields, plus an audit manifest id, before +an imported product row can remain marked leaderboard eligible. This keeps +hand-written, public-proxy, or non-audited rows from becoming hidden wins. + +## Minimum Rows For P6 + +The first implementation issue after this spec must produce a machine-readable +`quantitative_scoreboard` from `real_world_job_benchmark`. The initial runner row may +calculate ranking metrics only when the fixture or adapter emits explicit +`ranked_candidate_evidence_ids`; otherwise it must mark those metrics +`not_encoded`. If only a subset of queries emits ranked candidates, ranking metrics +must use `partial_coverage` and must not make the row leaderboard eligible. It must +publish metric states, denominators, sample size, ranked query count, per-query rows, +explicit-qrel coverage, qrel source, Wilson 95% intervals for measured or partial +rate metrics, ablation rows for explicitly supplied candidate orderings, diagnostic +ablation pairwise comparisons with exact two-sided sign-test p-values, +paired-significance gating state for product rows, held-out/leakage audit state, and +controls so missing rows cannot become hidden wins. The runner may also import +same-corpus external quantitative product rows through +`elf.agent_memory_quantitative_product_manifest/v1`; this is an adapter artifact +boundary, not a manual scoring exemption. 
It must also keep unimplemented but +required production-memory measures visible as `not_encoded`, including source-id +mapping coverage, ingestion/indexing coverage, contradiction resolution, +propagation latency, and context-token efficiency. + +The full P6 scoreboard must produce rows for: + +- ELF fixture-backed memory authority and knowledge workspace jobs. +- ELF live-real-world retrieval and memory-evolution jobs where artifacts exist. +- qmd live-real-world retrieval/debug rows where artifacts exist. +- mem0/OpenMemory local SDK history/export rows where artifacts exist. +- Honcho rows as typed same-corpus blockers plus `research_gate`/`not_comparable` + external-adapter rows until peer/session outputs, background reasoning artifacts, + source-id mapped search/chat/context results, and token/context efficiency + measures exist for the same corpus. +- PageIndex/OpenKB rows as `blocked` or `not_comparable` until actual product + artifacts exist. +- Letta, OpenViking, Graphiti/Zep, RAGFlow, GraphRAG, and LightRAG rows as + `blocked`, `not_encoded`, or `not_comparable` unless same-corpus product artifacts + are checked in. + +## Research Alignment + +This benchmark contract is aligned with established retrieval and memory-evaluation +practice, but it is not itself a public leaderboard until the controls permit one: + +- BEIR-style retrieval evaluation requires a shared corpus/query/qrels format and + rank-aware metrics such as nDCG@k, MAP, and success@k for comparable retrieval + claims. +- RAGAS-style RAG evaluation separates retrieval context recall/precision from + answer faithfulness and response quality. +- LoCoMo-style memory evaluation shows that long-term memory requires temporal, + multi-session, summarization, and event-grounded reasoning slices, not only + single-turn retrieval. 
+- Production memory comparisons must report token/cost/latency budgets; Mem0's + public benchmark framing treats accuracy, token cost, and latency as coupled + production dimensions. +- Honcho's public docs and benchmark materials position it as reasoning-first + memory with peer/session representations, background reasoning/dreaming, LongMem, + LoCoMo, BEAM, and token-efficiency framing. ELF must treat those as required + benchmark surfaces, not as same-corpus product results, until a Honcho adapter + emits source-id mapped artifacts on the benchmark corpus. +- Scientific comparison requires held-out and leakage-audited corpora with audit + manifest ids, explicit qrels, raw per-query rows, repeated or paired comparable + runs, confidence intervals for single-row estimates, and paired product-row + significance tests before a leaderboard claim is allowed. Ablation pairwise tests + are diagnostic optimization evidence, not product leaderboard evidence. + +## Claim Boundaries + +Allowed: + +- "ELF has measured evidence recall, source-ref coverage, stale suppression, and + update/delete correctness for the rows shown." +- "Product X is not comparable on metric Y because evidence class, corpus, or + product artifact coverage differs." +- "Product X beats ELF on metric Y" only when both rows are same-corpus, + same-evidence-class, same-task, and comparable. + +Not allowed: + +- A fixture-backed pass cannot beat a provider-backed or product-runtime row. +- A public-proxy pass cannot prove PageIndex, OpenKB, hosted memory, provider-backed, + or private-corpus product quality. +- A missing denominator cannot be reported as `1.000`. +- A `blocked`, `not_encoded`, or `not_comparable` row cannot become a win by omission. 
diff --git a/makefiles/benchmark-core.toml b/makefiles/benchmark-core.toml index 02c94349..55243485 100644 --- a/makefiles/benchmark-core.toml +++ b/makefiles/benchmark-core.toml @@ -1,95 +1,8 @@ -# Rust workspace tasks: Benchmark core, baseline, and operator tasks. - -# Rust workspace tasks: Benchmark. - -# Benchmark -# | task | type | cwd | -# | ------------------------------------------ | --------- | --- | -# | baseline-backfill-100k-docker | command | | -# | baseline-backfill-10k-docker | command | | -# | baseline-backfill-docker | command | | -# | baseline-live-docker | command | | -# | baseline-live-report | command | | -# | baseline-production-private | command | | -# | baseline-production-private-addendum | command | | -# | baseline-production-synthetic | command | | -# | baseline-soak-docker | command | | -# | local-agent-loop | command | | -# | openmemory-ui-export-readback | command | | -# | parity-docker | command | | -# | real-world-first-generation-oss | composite | | -# | real-world-first-generation-oss-json | command | | -# | real-world-first-generation-oss-report | command | | -# | real-world-job-operator-ux | composite | | -# | real-world-job-operator-ux-json | command | | -# | real-world-job-operator-ux-live-adapters | command | | -# | real-world-job-operator-ux-report | command | | -# | real-world-memory | composite | | -# | real-world-memory-adversarial-quality | composite | | -# | real-world-memory-adversarial-quality-json | command | | -# | real-world-memory-adversarial-quality-report | command | | -# | real-world-memory-consolidation | composite | | -# | real-world-memory-consolidation-json | command | | -# | real-world-memory-consolidation-report | command | | -# | real-world-memory-p1-closeout | composite | | -# | real-world-memory-p1-closeout-json | command | | -# | real-world-memory-p1-closeout-report | command | | -# | real-world-memory-p4-production-readiness | composite | | -# | real-world-memory-p4-production-readiness-json | command 
| | -# | real-world-memory-p4-production-readiness-report | command | | -# | real-world-memory-p4-quality-hardening-closeout | composite | | -# | real-world-memory-p2-knowledge-closeout | composite | | -# | real-world-memory-core-archival | composite | | -# | real-world-memory-core-archival-json | command | | -# | real-world-memory-core-archival-report | command | | -# | real-world-memory-context-trajectory | composite | | -# | real-world-memory-context-trajectory-json | command | | -# | real-world-memory-context-trajectory-report | command | | -# | real-world-memory-evolution | composite | | -# | real-world-memory-evolution-json | command | | -# | real-world-memory-evolution-report | command | | -# | real-world-memory-graph-rag | composite | | -# | real-world-memory-graph-rag-json | command | | -# | real-world-memory-graph-rag-report | command | | -# | real-world-memory-json | command | | -# | real-world-memory-knowledge | composite | | -# | real-world-memory-knowledge-json | command | | -# | real-world-memory-knowledge-report | command | | -# | real-world-memory-live-adapters | command | | -# | real-world-memory-live-consolidation | command | | -# | real-world-memory-live-knowledge | command | | -# | real-world-memory-mem0-openmemory-letta | composite | | -# | real-world-memory-mem0-openmemory-letta-json | command | | -# | real-world-memory-mem0-openmemory-letta-report | command | | -# | real-world-memory-pageindex-openkb | composite | | -# | real-world-memory-pageindex-openkb-json | command | | -# | real-world-memory-pageindex-openkb-report | command | | -# | real-world-memory-proactive-brief | composite | | -# | real-world-memory-proactive-brief-json | command | | -# | real-world-memory-proactive-brief-report | command | | -# | real-world-memory-production-ops | composite | | -# | real-world-memory-production-ops-json | command | | -# | real-world-memory-production-ops-report | command | | -# | real-world-memory-project-decisions | composite | | -# | 
real-world-memory-project-decisions-json | command | | -# | real-world-memory-project-decisions-report | command | | -# | real-world-memory-quantitative-scoreboard | composite | | -# | real-world-memory-quantitative-scoreboard-json | command | | -# | real-world-memory-quantitative-scoreboard-report | command | | -# | real-world-memory-report | command | | -# | real-world-memory-retrieval | composite | | -# | real-world-memory-retrieval-json | command | | -# | real-world-memory-retrieval-report | command | | -# | real-world-memory-scheduled | composite | | -# | real-world-memory-scheduled-json | command | | -# | real-world-memory-scheduled-report | command | | -# | real-world-memory-service-native-dreaming | command | | -# | real-world-memory-summary | composite | | -# | real-world-memory-summary-json | command | | -# | real-world-memory-summary-report | command | | -# | real-world-memory-work-continuity | composite | | -# | real-world-memory-work-continuity-json | command | | -# | real-world-memory-work-continuity-report | command | | +# Rust workspace tasks: benchmark core, baseline, and operator commands. +# +# Keep long task listings out of comments. `cargo make --list-all-steps` is the +# source for the complete task index, while this file owns only non-sharded +# benchmark commands. 
[tasks.baseline-backfill-100k-docker] workspace = false diff --git a/makefiles/benchmark-memory-a.toml b/makefiles/benchmark-memory-a.toml index a7063ca4..a7b5e6c6 100644 --- a/makefiles/benchmark-memory-a.toml +++ b/makefiles/benchmark-memory-a.toml @@ -364,6 +364,13 @@ args = [ "tmp/real-world-memory/evolution-report.md", ] +[tasks.real-world-memory-explicit-qrels] +workspace = false +command = "bash" +args = [ + "scripts/real-world-explicit-qrels.sh", +] + [tasks.real-world-memory-graph-rag] workspace = false dependencies = [ diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml index 8657bb36..95003f90 100644 --- a/makefiles/benchmark-memory-b.toml +++ b/makefiles/benchmark-memory-b.toml @@ -251,6 +251,14 @@ args = [ "memory-live-consolidation", ] +[tasks.real-world-memory-live-explicit-qrels] +workspace = false +command = "bash" +args = [ + "scripts/real-world-docker.sh", + "memory-live-explicit-qrels", +] + [tasks.real-world-memory-live-knowledge] workspace = false command = "bash" diff --git a/scripts/materialize-explicit-qrels.py b/scripts/materialize-explicit-qrels.py new file mode 100755 index 00000000..779abd2f --- /dev/null +++ b/scripts/materialize-explicit-qrels.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +"""Generate explicit relevance-judgment fixtures from real-world job fixtures.""" + +from __future__ import annotations + +import argparse +import json +import shutil +from datetime import UTC, datetime +from pathlib import Path +from typing import Any + + +SCHEMA = "elf.real_world_explicit_qrel_materialization/v1" +JOB_SCHEMA = "elf.real_world_job/v1" + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description=( + "Copy real_world_job fixtures and derive expected_answer.relevance_judgments " + "from checked-in evidence_links/required_evidence." 
+ ) + ) + parser.add_argument("--fixtures", required=True, type=Path, help="Input fixture directory.") + parser.add_argument("--out-fixtures", required=True, type=Path, help="Generated fixture directory.") + parser.add_argument( + "--summary-out", + required=True, + type=Path, + help="Write materialization summary JSON.", + ) + parser.add_argument( + "--ranked-candidates-source", + choices=["none", "oracle"], + default="none", + help="Optionally add fixture-trace ranked candidates ordered by qrel grade.", + ) + parser.add_argument( + "--profile", + choices=["preserve", "generated_public"], + default="preserve", + help="Preserve original corpus profile or mark generated jobs as generated_public.", + ) + parser.add_argument( + "--exclude-without-positive-qrels", + action="store_true", + help="Do not copy job JSON files that have no positive derived qrels.", + ) + parser.add_argument( + "--overwrite", + action="store_true", + help="Replace existing relevance_judgments instead of preserving explicit grades.", + ) + + return parser.parse_args() + + +def read_json(path: Path) -> Any: + with path.open(encoding="utf-8") as fh: + return json.load(fh) + + +def write_json(path: Path, value: Any) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with path.open("w", encoding="utf-8") as fh: + json.dump(value, fh, indent=2, sort_keys=False) + fh.write("\n") + + +def stable_unique(values: list[str]) -> list[str]: + seen: set[str] = set() + result: list[str] = [] + for value in values: + if value and value not in seen: + seen.add(value) + result.append(value) + + return result + + +def evidence_link_ids(value: Any) -> list[str]: + if isinstance(value, str): + return [value] + if isinstance(value, list): + return [item for item in value if isinstance(item, str)] + + return [] + + +def corpus_evidence_ids(job: dict[str, Any]) -> list[str]: + return [ + item["evidence_id"] + for item in job.get("corpus", {}).get("items", []) + if isinstance(item, dict) and 
isinstance(item.get("evidence_id"), str) + ] + + +def derive_positive_grades(job: dict[str, Any]) -> dict[str, float]: + grades: dict[str, float] = {} + expected = job.get("expected_answer", {}) + + for link in expected.get("evidence_links", {}).values(): + for evidence_id in evidence_link_ids(link): + grades[evidence_id] = max(grades.get(evidence_id, 0.0), 2.0) + + for evidence in job.get("required_evidence", []): + if isinstance(evidence, dict) and isinstance(evidence.get("evidence_id"), str): + grades[evidence["evidence_id"]] = max(grades.get(evidence["evidence_id"], 0.0), 1.0) + + return grades + + +def existing_qrel_grades(job: dict[str, Any]) -> dict[str, float]: + grades: dict[str, float] = {} + expected = job.get("expected_answer", {}) + for judgment in expected.get("relevance_judgments", []): + if not isinstance(judgment, dict) or not isinstance(judgment.get("evidence_id"), str): + continue + grade = judgment.get("grade", 1.0) + if isinstance(grade, (int, float)): + grades[judgment["evidence_id"]] = float(grade) + + return grades + + +def materialized_qrels(job: dict[str, Any], overwrite: bool) -> list[dict[str, Any]]: + evidence_ids = corpus_evidence_ids(job) + grades = derive_positive_grades(job) + + if not overwrite: + grades.update(existing_qrel_grades(job)) + + if not any(grade > 0.0 for grade in grades.values()): + return [] + + return [ + {"evidence_id": evidence_id, "grade": grades.get(evidence_id, 0.0)} + for evidence_id in evidence_ids + if evidence_id in grades + ] + + +def ranked_candidates_from_qrels(qrels: list[dict[str, Any]]) -> list[str]: + return [ + judgment["evidence_id"] + for judgment in sorted( + qrels, + key=lambda judgment: ( + -float(judgment.get("grade", 0.0)), + str(judgment.get("evidence_id", "")), + ), + ) + if judgment.get("evidence_id") + ] + + +def add_oracle_ranked_candidates(job: dict[str, Any], qrels: list[dict[str, Any]]) -> bool: + answer = job.get("corpus", {}).get("adapter_response", {}).get("answer") + if not 
isinstance(answer, dict): + return False + + trace = answer.setdefault("trace_explainability", {}) + trace["ranked_candidate_evidence_ids"] = ranked_candidates_from_qrels(qrels) + trace.setdefault("trace_id", f"{job.get('job_id', 'unknown')}-explicit-qrel-oracle") + + return True + + +def materialize_job( + source: Path, + target: Path, + args: argparse.Namespace, +) -> dict[str, Any]: + job = read_json(source) + if not isinstance(job, dict) or job.get("schema") != JOB_SCHEMA: + shutil.copy2(source, target) + return {"kind": "copied_non_job_json"} + + qrels = materialized_qrels(job, overwrite=args.overwrite) + if not qrels and args.exclude_without_positive_qrels: + return { + "kind": "excluded_without_positive_qrels", + "job_id": job.get("job_id"), + } + + ranked_candidate_added = False + if qrels: + expected = job.setdefault("expected_answer", {}) + had_existing_qrels = bool(expected.get("relevance_judgments")) + expected["relevance_judgments"] = qrels + tags = stable_unique([*job.get("tags", []), "explicit_qrels_generated"]) + job["tags"] = tags + + if args.profile == "generated_public": + job.setdefault("corpus", {})["profile"] = "generated_public" + + if args.ranked_candidates_source == "oracle": + ranked_candidate_added = add_oracle_ranked_candidates(job, qrels) + + write_json(target, job) + return { + "kind": "materialized_job", + "job_id": job.get("job_id"), + "judgment_count": len(qrels), + "positive_judgment_count": sum(1 for judgment in qrels if judgment["grade"] > 0.0), + "zero_grade_judgment_count": sum(1 for judgment in qrels if judgment["grade"] == 0.0), + "unjudged_corpus_evidence_count": len(corpus_evidence_ids(job)) - len(qrels), + "had_existing_qrels": had_existing_qrels, + "ranked_candidate_added": ranked_candidate_added, + } + + shutil.copy2(source, target) + return { + "kind": "copied_without_positive_qrels", + "job_id": job.get("job_id"), + } + + +def materialize(args: argparse.Namespace) -> dict[str, Any]: + if not args.fixtures.is_dir(): + 
raise SystemExit(f"{args.fixtures} is not a directory") + + if args.out_fixtures.exists(): + shutil.rmtree(args.out_fixtures) + args.out_fixtures.mkdir(parents=True) + + records: list[dict[str, Any]] = [] + for source in sorted(args.fixtures.rglob("*")): + rel = source.relative_to(args.fixtures) + target = args.out_fixtures / rel + if source.is_dir(): + target.mkdir(parents=True, exist_ok=True) + continue + if source.suffix == ".json": + records.append(materialize_job(source, target, args)) + else: + target.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(source, target) + + materialized = [record for record in records if record["kind"] == "materialized_job"] + excluded = [record for record in records if record["kind"] == "excluded_without_positive_qrels"] + + summary = { + "schema": SCHEMA, + "generated_at": datetime.now(UTC).isoformat().replace("+00:00", "Z"), + "input_fixture_dir": str(args.fixtures), + "output_fixture_dir": str(args.out_fixtures), + "ranked_candidates_source": args.ranked_candidates_source, + "profile": args.profile, + "exclude_without_positive_qrels": args.exclude_without_positive_qrels, + "overwrite": args.overwrite, + "job_count": len(materialized), + "excluded_without_positive_qrels_count": len(excluded), + "judgment_count": sum(record["judgment_count"] for record in materialized), + "positive_judgment_count": sum(record["positive_judgment_count"] for record in materialized), + "zero_grade_judgment_count": sum(record["zero_grade_judgment_count"] for record in materialized), + "unjudged_corpus_evidence_count": sum( + record["unjudged_corpus_evidence_count"] for record in materialized + ), + "existing_qrel_job_count": sum(1 for record in materialized if record["had_existing_qrels"]), + "ranked_candidate_job_count": sum( + 1 for record in materialized if record["ranked_candidate_added"] + ), + "excluded_job_ids": [record.get("job_id") for record in excluded], + "claim_boundary": ( + "Derived qrels are deterministic benchmark labels 
from checked-in evidence links and " + "required_evidence. Unmentioned corpus evidence remains unjudged instead of being " + "converted into synthetic negative labels. Oracle ranked candidates test metric " + "mechanics only; they are not product-runtime retrieval evidence or leaderboard proof." + ), + } + + write_json(args.summary_out, summary) + return summary + + +def main() -> None: + args = parse_args() + summary = materialize(args) + print( + "materialized explicit qrels: " + f"{summary['job_count']} jobs, " + f"{summary['judgment_count']} judgments, " + f"{summary['ranked_candidate_job_count']} ranked-candidate traces" + ) + + +if __name__ == "__main__": + main() diff --git a/scripts/real-world-docker.sh b/scripts/real-world-docker.sh index 163c4d1f..8afc80d5 100755 --- a/scripts/real-world-docker.sh +++ b/scripts/real-world-docker.sh @@ -45,6 +45,11 @@ memory-live-adapters) docker compose -f docker-compose.baseline.yml --profile graphiti-zep up -d graphiti-falkordb fi docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_REAL_WORLD_LIVE_REPORT_DIR \ + -e ELF_REAL_WORLD_LIVE_FIXTURES \ + -e ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES \ + -e ELF_REAL_WORLD_LIVE_WORK_DIR \ + -e ELF_REAL_WORLD_QMD_DIR \ -e ELF_REAL_WORLD_LIVE_ENABLE_RAGFLOW \ -e ELF_REAL_WORLD_LIVE_ENABLE_LIGHTRAG \ -e ELF_REAL_WORLD_LIVE_ENABLE_GRAPHRAG \ @@ -123,6 +128,15 @@ memory-live-adapters) fi exit "$status" ;; +memory-live-explicit-qrels) + docker compose -f docker-compose.baseline.yml run --build --rm \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_FIXTURES \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_OPERATOR_DEBUG_FIXTURES \ + -e ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_WORK_DIR \ + -e ELF_REAL_WORLD_QMD_DIR \ + baseline-runner bash scripts/real-world-live-explicit-qrels.sh + ;; *) echo "unknown real-world Docker profile: $profile" >&2 exit 2 diff --git a/scripts/real-world-explicit-qrels.sh b/scripts/real-world-explicit-qrels.sh 
new file mode 100755 index 00000000..ccd17cf1 --- /dev/null +++ b/scripts/real-world-explicit-qrels.sh @@ -0,0 +1,39 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/explicit-qrels}" +SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" +QREL_FIXTURE_DIR="${ELF_REAL_WORLD_EXPLICIT_QRELS_OUT_FIXTURES:-${REPORT_DIR}/fixtures}" + +cd "${ROOT_DIR}" + +python3 scripts/materialize-explicit-qrels.py \ + --fixtures "${SOURCE_FIXTURE_DIR}" \ + --out-fixtures "${QREL_FIXTURE_DIR}" \ + --summary-out "${REPORT_DIR}/materialization-summary.json" \ + --ranked-candidates-source oracle \ + --profile generated_public \ + --exclude-without-positive-qrels + +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + run \ + --fixtures "${QREL_FIXTURE_DIR}" \ + --out "${REPORT_DIR}/report.json" \ + --run-id real-world-memory-explicit-qrels \ + --adapter-id fixture_explicit_qrels \ + --adapter-name "Explicit qrel oracle fixture pack" \ + --adapter-behavior explicit_qrel_oracle_fixture \ + --adapter-storage-status pass \ + --adapter-runtime-status pass \ + --adapter-notes "Generated by scripts/materialize-explicit-qrels.py from checked-in evidence_links and required_evidence; unmentioned corpus evidence remains unjudged; oracle ranked candidates test metric mechanics only." 
+ +cargo run -p elf-eval --bin real_world_job_benchmark -- \ + publish \ + --report "${REPORT_DIR}/report.json" \ + --out "${REPORT_DIR}/report.md" + +echo "Explicit qrel benchmark report:" +echo " ${REPORT_DIR}/materialization-summary.json" +echo " ${REPORT_DIR}/report.json" +echo " ${REPORT_DIR}/report.md" diff --git a/scripts/real-world-live-explicit-qrels.sh b/scripts/real-world-live-explicit-qrels.sh new file mode 100755 index 00000000..35212ac1 --- /dev/null +++ b/scripts/real-world-live-explicit-qrels.sh @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +REPORT_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR:-${ROOT_DIR}/tmp/real-world-memory/live-explicit-qrels}" +SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_memory}" +OPERATOR_SOURCE_FIXTURE_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_OPERATOR_DEBUG_FIXTURES:-${ROOT_DIR}/apps/elf-eval/fixtures/real_world_job/operator_debugging_ux}" +QREL_FIXTURE_DIR="${REPORT_DIR}/explicit-qrel-fixtures" +QREL_OPERATOR_FIXTURE_DIR="${REPORT_DIR}/explicit-qrel-operator-debug-fixtures" +LIVE_REPORT_DIR="${REPORT_DIR}/live-adapters" +LIVE_WORK_DIR="${ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_WORK_DIR:-/bench/real-world-live-explicit-qrels}" + +if [[ ! -f "/.dockerenv" && "${ELF_REAL_WORLD_LIVE_ALLOW_HOST:-0}" != "1" ]]; then + echo "Refusing to run live explicit-qrel adapters outside Docker. Use cargo make real-world-memory-live-explicit-qrels." >&2 + exit 1 +fi + +for cmd in bash jq python3; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + echo "Missing ${cmd} in live explicit-qrel runner." 
>&2 + exit 1 + fi +done + +cd "${ROOT_DIR}" + +rm -rf "${REPORT_DIR}" +mkdir -p "${REPORT_DIR}" + +python3 scripts/materialize-explicit-qrels.py \ + --fixtures "${SOURCE_FIXTURE_DIR}" \ + --out-fixtures "${QREL_FIXTURE_DIR}" \ + --summary-out "${REPORT_DIR}/memory-materialization-summary.json" \ + --ranked-candidates-source none \ + --profile generated_public \ + --exclude-without-positive-qrels + +python3 scripts/materialize-explicit-qrels.py \ + --fixtures "${OPERATOR_SOURCE_FIXTURE_DIR}" \ + --out-fixtures "${QREL_OPERATOR_FIXTURE_DIR}" \ + --summary-out "${REPORT_DIR}/operator-debug-materialization-summary.json" \ + --ranked-candidates-source none \ + --profile generated_public \ + --exclude-without-positive-qrels + +ELF_REAL_WORLD_LIVE_REPORT_DIR="${LIVE_REPORT_DIR}" \ + ELF_REAL_WORLD_LIVE_FIXTURES="${QREL_FIXTURE_DIR}" \ + ELF_REAL_WORLD_OPERATOR_DEBUG_FIXTURES="${QREL_OPERATOR_FIXTURE_DIR}" \ + ELF_REAL_WORLD_LIVE_WORK_DIR="${LIVE_WORK_DIR}" \ + ELF_REAL_WORLD_LIVE_ELF_RUN_ID="real-world-memory-live-explicit-qrels-elf" \ + ELF_REAL_WORLD_LIVE_QMD_RUN_ID="real-world-memory-live-explicit-qrels-qmd" \ + ELF_REAL_WORLD_LIVE_COMBINED_RUN_ID="real-world-memory-live-elf-qmd-explicit-qrels-quantitative" \ + bash scripts/real-world-live-adapters.sh + +jq -n \ + --slurpfile memory_summary "${REPORT_DIR}/memory-materialization-summary.json" \ + --slurpfile operator_summary "${REPORT_DIR}/operator-debug-materialization-summary.json" \ + --slurpfile live_summary "${LIVE_REPORT_DIR}/summary.json" \ + '{ + schema: "elf.real_world_live_explicit_qrels_sweep/v1", + generated_at: (now | todateiso8601), + artifact_dir: (env.ELF_REAL_WORLD_LIVE_EXPLICIT_QRELS_REPORT_DIR // "tmp/real-world-memory/live-explicit-qrels"), + live_report_dir: "tmp/real-world-memory/live-explicit-qrels/live-adapters", + materialization: { + memory: $memory_summary[0], + operator_debugging_ux: $operator_summary[0] + }, + live_summary: $live_summary[0], + boundary: "Input fixtures have deterministic 
explicit qrels, but ranked candidates are product-runtime traces from the live adapters. This improves qrel-source evidence only; leaderboard claims still require pass rows, full ranked coverage, held-out/leakage audit evidence, and paired significance." + }' >"${REPORT_DIR}/summary.json" + +echo "Live explicit-qrel adapter reports:" +echo " ${REPORT_DIR}/memory-materialization-summary.json" +echo " ${REPORT_DIR}/operator-debug-materialization-summary.json" +echo " ${LIVE_REPORT_DIR}/elf-report.json" +echo " ${LIVE_REPORT_DIR}/qmd-report.json" +echo " ${LIVE_REPORT_DIR}/qmd-quantitative-product-manifest.json" +echo " ${LIVE_REPORT_DIR}/elf-qmd-quantitative-report.json" +echo " ${LIVE_REPORT_DIR}/elf-qmd-quantitative-report.md" +echo " ${REPORT_DIR}/summary.json" From 33d66158079c83419efe48897738800e66ca27d7 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 11:29:00 -0400 Subject: [PATCH 02/58] {"schema":"decodex/commit/1","summary":"Port quantitative benchmark report surface","authority":"manual"} --- .../bin/real_world_job_benchmark/commands.rs | 18 +- .../bin/real_world_job_benchmark/fixtures.rs | 13 + .../src/bin/real_world_job_benchmark/main.rs | 7 + .../bin/real_world_job_benchmark/markdown.rs | 8 +- .../markdown/quantitative.rs | 84 +++ .../real_world_job_benchmark/quantitative.rs | 489 ++++++++++++++++++ .../quantitative_reports.rs | 76 +++ .../real_world_job_benchmark/report_root.rs | 7 +- .../bin/real_world_job_benchmark/scoring.rs | 4 + .../scoring/answers.rs | 44 +- .../tests/real_world_job_benchmark.rs | 1 + .../markdown_rendering_generated.rs | 3 + .../real_world_job_benchmark/quantitative.rs | 160 ++++++ 13 files changed, 883 insertions(+), 31 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs create mode 100644 
apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs index 91dc476f..3e7d4ce1 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs @@ -1,7 +1,7 @@ use crate::{ AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path, - PathBuf, PrivateCorpusRedaction, PublishArgs, REPORT_SCHEMA, RealWorldJob, RealWorldReport, - Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, + PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, + RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, }; pub(super) fn run_command(args: RunArgs) -> Result<()> { @@ -103,16 +103,26 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result, #[serde(default)] pub(super) evidence_links: BTreeMap, + #[serde(default)] + pub(super) relevance_judgments: Vec, pub(super) answer_type: String, #[serde(default)] pub(super) accepted_alternates: Vec, @@ -96,6 +98,13 @@ pub(super) struct ExpectedAnswer { pub(super) requires_refusal: bool, } +#[derive(Debug, Deserialize)] +pub(super) struct RelevanceJudgment { + pub(super) evidence_id: String, + #[serde(default = "default_relevance_grade")] + pub(super) grade: f64, +} + #[derive(Debug, Deserialize)] pub(super) struct RequiredEvidence { pub(super) evidence_id: String, @@ -250,3 +259,7 @@ pub(super) struct AdapterResponse { pub(super) answer: ProducedAnswer, pub(super) consolidation: Option, } + +fn default_relevance_grade() -> f64 { + 1.0 +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs index 9815886f..61715b35 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs @@ -16,6 +16,8 
@@ mod job_reports; mod markdown; mod operational; mod operational_reports; +mod quantitative; +mod quantitative_reports; mod recovery; mod report_root; mod scoreboard; @@ -84,6 +86,11 @@ use operational_reports::{ OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport, OperationalResourceSummary, }; +use quantitative::{QuantitativeReportInput, quantitative_scoreboard_report}; +use quantitative_reports::{ + QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, + QuantitativePerQueryRow, +}; use report_root::RealWorldReport; use scoreboard::scoreboard_report; use scoreboard_reports::{ diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs index 36f9dba6..68bcb12a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs @@ -6,6 +6,7 @@ mod followups; mod header; mod jobs; mod operational; +mod quantitative; mod scoreboard; mod trace; @@ -16,9 +17,9 @@ use crate::{ AdapterScenarioJudgment, AdapterSource, AdapterStatusCounts, AdapterSuiteCoverage, CostReport, DEFAULT_ADAPTER_BEHAVIOR, EvolutionJobReport, ExternalAdapterReport, KnowledgeSummary, MemorySummaryReport, OperatorDebugEvidence, OperatorUxGap, ProactiveBriefSummaryReport, - RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, ScenarioOutcomeCounts, - ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, ScoreboardRow, - TraceExplainability, WorkContinuitySummaryReport, + QuantitativeBenchmarkRow, RealWorldReport, ReportSummary, SCOREBOARD_EVIDENCE_CLASSES, + ScenarioOutcomeCounts, ScenarioPositionCounts, ScheduledMemorySummaryReport, ScoreboardReport, + ScoreboardRow, TraceExplainability, WorkContinuitySummaryReport, formatting::{ adapter_status_str, round3, scenario_comparison_outcome_str, status_str, trace_failure_stage, @@ -32,6 +33,7 @@ pub(super) fn 
render_markdown(report: &RealWorldReport, report_path: &Path) -> S self::header::render_markdown_header(&mut out, report, report_path.as_str()); self::scoreboard::render_markdown_scoreboard(&mut out, report); + self::quantitative::render_markdown_quantitative_scoreboard(&mut out, report); self::operational::render_markdown_operational_evidence(&mut out, report); self::adapters::render_markdown_external_adapters(&mut out, report); self::adapters::render_markdown_capture_integration(&mut out, report); diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs new file mode 100644 index 00000000..1c3ec195 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/quantitative.rs @@ -0,0 +1,84 @@ +use crate::markdown::{self, QuantitativeBenchmarkRow, RealWorldReport}; + +pub(super) fn render_markdown_quantitative_scoreboard(out: &mut String, report: &RealWorldReport) { + let scoreboard = &report.quantitative_scoreboard; + + if scoreboard.schema.is_empty() { + return; + } + + out.push_str("## Quantitative Benchmark Report\n\n"); + out.push_str(concat!( + "Quantitative rows expose ranking metrics and their claim controls. 
", + "Fixture-backed rows verify benchmark mechanics; leaderboard claims require explicit qrels, ", + "enough queries, and leakage controls.\n\n" + )); + out.push_str(&format!("- Schema: `{}`\n", markdown::md_inline(scoreboard.schema.as_str()))); + out.push_str(&format!("- Corpus: `{}`\n", markdown::md_inline(scoreboard.corpus_id.as_str()))); + out.push_str(&format!( + "- k values: `{}`\n", + markdown::md_inline( + scoreboard + .k_values + .iter() + .map(usize::to_string) + .collect::>() + .join(", ") + .as_str() + ) + )); + out.push_str(&format!( + "- Ranking queries: `{}` of `{}`; explicit-qrel queries: `{}`\n", + scoreboard.controls.current_ranking_query_count, + scoreboard.controls.current_query_count, + scoreboard.controls.current_explicit_qrel_query_count + )); + out.push_str(&format!( + "- Leaderboard claim allowed: `{}`\n", + scoreboard.controls.leaderboard_claim_allowed + )); + out.push_str(&format!( + "- Claim boundary: {}\n\n", + markdown::md_cell(scoreboard.claim_boundary.as_str()) + )); + out.push_str("| Product | State | Evidence | Qrels | Sample | Ranking Queries | Recall@5 | "); + out.push_str("Precision@5 | MRR | nDCG@5 | AP | Leaderboard |\n"); + out.push_str( + "| --- | --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | --- |\n", + ); + + for row in &scoreboard.rows { + out.push_str(&format!( + "| {} | `{}` | `{}` | `{}` | `{}` | `{}` | {} | {} | {} | {} | {} | `{}` |\n", + markdown::md_cell(row.product.as_str()), + markdown::md_inline(row.result_state.as_str()), + markdown::md_inline(row.evidence_class.as_str()), + markdown::md_inline(row.qrel_source.as_str()), + row.sample_size, + row.ranking_query_count, + quantitative_metric(row, "recall_at_5"), + quantitative_metric(row, "precision_at_5"), + quantitative_metric(row, "mrr"), + quantitative_metric(row, "ndcg_at_5"), + quantitative_metric(row, "average_precision"), + row.leaderboard_eligible + )); + } + + if !scoreboard.metrics_not_encoded.is_empty() { + out.push_str("\nMetrics 
not encoded:\n"); + + for metric in &scoreboard.metrics_not_encoded { + out.push_str(&format!("- `{}`\n", markdown::md_inline(metric.as_str()))); + } + + out.push('\n'); + } +} + +fn quantitative_metric(row: &QuantitativeBenchmarkRow, metric: &str) -> String { + row.metrics + .get(metric) + .and_then(|value| *value) + .map_or_else(|| "`n/a`".to_string(), |value| format!("`{}`", markdown::round3(value))) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs new file mode 100644 index 00000000..fa96df20 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -0,0 +1,489 @@ +use crate::{ + AdapterReport, BTreeMap, BTreeSet, JobReport, QuantitativeBenchmarkControls, + QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldJob, + ReportSummary, formatting, scoring, +}; + +const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; +const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10]; +const MIN_LEADERBOARD_QUERY_COUNT: usize = 30; +const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!( + "Quantitative metrics are bounded to this generated report. ", + "Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims." 
+); + +pub(super) struct QuantitativeReportInput<'a> { + pub(super) generated_at: &'a str, + pub(super) adapter: &'a AdapterReport, + pub(super) source_jobs: &'a [RealWorldJob], + pub(super) jobs: &'a [JobReport], + pub(super) summary: &'a ReportSummary, +} + +pub(super) fn quantitative_scoreboard_report( + input: QuantitativeReportInput<'_>, +) -> QuantitativeBenchmarkReport { + let corpus_id = quantitative_corpus_id(input.source_jobs); + let evidence_class = quantitative_evidence_class(input.adapter, input.jobs); + let per_query_rows = quantitative_per_query_rows( + input.source_jobs, + input.jobs, + corpus_id.as_str(), + evidence_class, + input.adapter.adapter_id.as_str(), + ); + let ranking_query_count = per_query_rows + .iter() + .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) + .count(); + let explicit_qrel_query_count = + per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); + let metric_comparable = ranking_query_count > 0; + let leaderboard_eligible = false; + let result_state = quantitative_result_state(input.summary); + let row = QuantitativeBenchmarkRow { + product: "ELF".to_string(), + adapter_id: input.adapter.adapter_id.clone(), + adapter_name: input.adapter.name.clone(), + suite: quantitative_suite_id(input.jobs), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.clone()), + result_state: result_state.to_string(), + comparable: metric_comparable, + metric_comparable, + leaderboard_eligible, + held_out: false, + leakage_audited: false, + fixture_regression_only: evidence_class == "fixture_backed", + sample_size: input.jobs.len(), + ranking_query_count, + ranking_coverage_state: ranking_coverage_state( + input.summary, + input.source_jobs.len(), + ranking_query_count, + ) + .to_string(), + ranked_candidate_source: ranked_candidate_source(ranking_query_count).to_string(), + qrel_source: aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) + 
.to_string(), + explicit_qrel_query_count, + metrics: aggregate_metrics(per_query_rows.as_slice()), + metric_states: aggregate_metric_states(result_state, metric_comparable), + denominators: aggregate_denominators(per_query_rows.as_slice()), + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + }; + let controls = QuantitativeBenchmarkControls { + same_corpus_required: true, + same_task_required: true, + ranked_candidates_required_for_ranking_metrics: true, + explicit_relevance_judgments_required_for_leaderboard: true, + minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT, + current_query_count: input.source_jobs.len(), + current_ranking_query_count: ranking_query_count, + current_explicit_qrel_query_count: explicit_qrel_query_count, + leaderboard_claim_allowed: leaderboard_eligible, + leakage_control: + "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" + .to_string(), + }; + + QuantitativeBenchmarkReport { + schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(), + generated_at: input.generated_at.to_string(), + corpus_id, + k_values: QUANTITATIVE_K_VALUES.to_vec(), + rows: vec![row], + per_query_rows, + metrics_not_encoded: vec![ + "paired_significance".to_string(), + "external_product_manifest_import".to_string(), + "audit_manifest_validation".to_string(), + ], + controls, + claim_boundary: concat!( + "Do not convert fixture mechanics, missing explicit qrels, ", + "or partial candidate coverage into product leaderboard claims." 
+ ) + .to_string(), + } +} + +fn quantitative_per_query_rows( + source_jobs: &[RealWorldJob], + jobs: &[JobReport], + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> Vec { + source_jobs + .iter() + .zip(jobs.iter()) + .map(|(source_job, job)| { + quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) + }) + .collect() +} + +fn quantitative_per_query_row( + source_job: &RealWorldJob, + job: &JobReport, + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let relevance = relevance_grades(source_job, job); + let candidates = scoring::produced_evidence_order(source_job); + let positive_relevance_count = positive_qrel_count(&relevance); + let metrics = per_query_metrics(candidates.as_slice(), &relevance); + let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { + "not_encoded" + } else { + formatting::status_str(job.status) + }; + let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); + let denominators = per_query_denominators(candidates.len(), positive_relevance_count); + + QuantitativePerQueryRow { + job_id: job.job_id.clone(), + suite: job.suite_id.clone(), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: formatting::status_str(job.status).to_string(), + expected_relevant_count: positive_relevance_count, + candidate_count: candidates.len(), + qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(), + relevance_grade_sum: formatting::round3(relevance.values().sum::()), + product: "ELF".to_string(), + adapter_id: adapter_id.to_string(), + metrics, + metric_states, + denominators, + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} + +fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap { + let explicit = source_job + .expected_answer + .relevance_judgments + .iter() + .map(|judgment| 
(judgment.evidence_id.clone(), judgment.grade)) + .collect::>(); + + if !explicit.is_empty() { + return explicit; + } + + job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() +} + +fn per_query_metrics( + candidates: &[String], + relevance: &BTreeMap, +) -> BTreeMap> { + let mut metrics = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + let relevant_at_k = relevant_at_k(candidates, relevance, *k); + + metrics + .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); + metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); + metrics.insert( + format!("success_at_{k}"), + Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), + ); + } + + metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); + metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); + metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); + + metrics +} + +fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { + candidates + .iter() + .take(k) + .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) + .count() +} + +fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} + +fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 
0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as f64).log2()) + .sum::(); + + Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) +} + +fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { + let positive_count = positive_qrel_count(relevance); + + if positive_count == 0 { + return None; + } + + let mut hit_count = 0; + let mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / positive_count as f64) +} + +fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { + let mut sums = BTreeMap::::new(); + let mut metrics = quantitative_metric_names() + .into_iter() + .map(|metric| (metric, None)) + .collect::>(); + + for row in rows { + for (metric, value) in &row.metrics { + if let Some(value) = value { + let (sum, count) = sums.entry(metric.clone()).or_default(); + + *sum += *value; + *count += 1; + } + } + } + for (metric, (sum, count)) in sums { + metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); + } + + metrics +} + +fn aggregate_metric_states( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + let state = if metric_comparable { result_state } else { "not_encoded" }; + let mut states = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + states.insert(format!("recall_at_{k}"), state.to_string()); + states.insert(format!("precision_at_{k}"), state.to_string()); + states.insert(format!("success_at_{k}"), state.to_string()); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + states.insert(metric.to_string(), state.to_string()); + } + + 
states +} + +fn quantitative_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + metrics.push(metric.to_string()); + } + + metrics +} + +fn per_query_denominators( + candidate_count: usize, + expected_relevant_count: usize, +) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert(format!("recall_at_{k}"), expected_relevant_count); + denominators.insert(format!("precision_at_{k}"), *k); + denominators.insert(format!("success_at_{k}"), 1); + } + + denominators.insert("mrr".to_string(), expected_relevant_count); + denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); + denominators.insert("average_precision".to_string(), expected_relevant_count); + denominators.insert("candidate_count".to_string(), candidate_count); + + denominators +} + +fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert( + format!("recall_at_{k}"), + sum_per_query_denominator(rows, &format!("recall_at_{k}")), + ); + denominators.insert( + format!("precision_at_{k}"), + sum_per_query_denominator(rows, &format!("precision_at_{k}")), + ); + denominators.insert( + format!("success_at_{k}"), + sum_per_query_denominator(rows, &format!("success_at_{k}")), + ); + } + + denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); + denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); + denominators.insert( + "average_precision".to_string(), + sum_per_query_denominator(rows, "average_precision"), + ); + + denominators +} + +fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { + 
rows.iter().filter_map(|row| row.denominators.get(metric)).sum() +} + +fn quantitative_corpus_id(source_jobs: &[RealWorldJob]) -> String { + let ids = source_jobs.iter().map(|job| job.corpus.corpus_id.as_str()).collect::>(); + + if ids.len() == 1 { + ids.into_iter().next().unwrap_or("unknown").to_string() + } else { + "mixed".to_string() + } +} + +fn quantitative_suite_id(jobs: &[JobReport]) -> String { + let suites = jobs.iter().map(|job| job.suite_id.as_str()).collect::>(); + + if suites.len() == 1 { + suites.into_iter().next().unwrap_or("unknown").to_string() + } else { + "mixed".to_string() + } +} + +fn quantitative_result_state(summary: &ReportSummary) -> &'static str { + if summary.unsupported_claim > 0 { + "unsupported_claim" + } else if summary.wrong_result > 0 { + "wrong_result" + } else if summary.incomplete > 0 { + "incomplete" + } else if summary.blocked > 0 { + "blocked" + } else if summary.not_encoded > 0 { + "not_encoded" + } else { + "pass" + } +} + +fn quantitative_evidence_class(adapter: &AdapterReport, jobs: &[JobReport]) -> &'static str { + if adapter.behavior == "live_real_world_adapter" { + "live_real_world" + } else if jobs.iter().any(|job| job.operational_evidence_tier == "private_corpus") { + "private_corpus" + } else if jobs.iter().any(|job| job.operational_evidence_tier == "provider_backed") { + "provider_backed" + } else if adapter.behavior.contains("public_proxy") { + "public_proxy" + } else { + "fixture_backed" + } +} + +fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { + if !source_job.expected_answer.relevance_judgments.is_empty() { + "explicit_qrels" + } else if empty { + "not_encoded" + } else { + "expected_evidence_fallback" + } +} + +fn aggregate_qrel_source( + ranking_query_count: usize, + explicit_qrel_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if explicit_qrel_query_count == ranking_query_count { + "explicit_qrels" + } else if 
explicit_qrel_query_count == 0 { + "expected_evidence_fallback" + } else { + "mixed" + } +} + +fn ranking_coverage_state( + summary: &ReportSummary, + source_job_count: usize, + ranking_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { + "complete" + } else { + "partial_coverage" + } +} + +fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { + if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } +} + +fn positive_qrel_count(relevance: &BTreeMap) -> usize { + relevance.values().filter(|grade| **grade > 0.0).count() +} + +fn rate(numerator: usize, denominator: usize) -> Option { + (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs new file mode 100644 index 00000000..73f2b1eb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs @@ -0,0 +1,76 @@ +use crate::{BTreeMap, Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(super) struct QuantitativeBenchmarkReport { + pub(super) schema: String, + pub(super) generated_at: String, + pub(super) corpus_id: String, + pub(super) k_values: Vec, + pub(super) rows: Vec, + #[serde(default)] + pub(super) per_query_rows: Vec, + #[serde(default)] + pub(super) metrics_not_encoded: Vec, + pub(super) controls: QuantitativeBenchmarkControls, + pub(super) claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(super) struct QuantitativeBenchmarkRow { + pub(super) product: String, + pub(super) adapter_id: String, + pub(super) adapter_name: String, + pub(super) suite: String, + pub(super) evidence_class: String, + pub(super) source_manifest_corpus_id: Option, + pub(super) 
result_state: String, + pub(super) comparable: bool, + pub(super) metric_comparable: bool, + pub(super) leaderboard_eligible: bool, + pub(super) held_out: bool, + pub(super) leakage_audited: bool, + pub(super) fixture_regression_only: bool, + pub(super) sample_size: usize, + pub(super) ranking_query_count: usize, + pub(super) ranking_coverage_state: String, + pub(super) ranked_candidate_source: String, + pub(super) qrel_source: String, + pub(super) explicit_qrel_query_count: usize, + pub(super) metrics: BTreeMap>, + pub(super) metric_states: BTreeMap, + pub(super) denominators: BTreeMap, + pub(super) claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(super) struct QuantitativePerQueryRow { + pub(super) job_id: String, + pub(super) suite: String, + pub(super) evidence_class: String, + pub(super) source_manifest_corpus_id: Option, + pub(super) result_state: String, + pub(super) expected_relevant_count: usize, + pub(super) candidate_count: usize, + pub(super) qrel_source: String, + pub(super) relevance_grade_sum: f64, + pub(super) product: String, + pub(super) adapter_id: String, + pub(super) metrics: BTreeMap>, + pub(super) metric_states: BTreeMap, + pub(super) denominators: BTreeMap, + pub(super) claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(super) struct QuantitativeBenchmarkControls { + pub(super) same_corpus_required: bool, + pub(super) same_task_required: bool, + pub(super) ranked_candidates_required_for_ranking_metrics: bool, + pub(super) explicit_relevance_judgments_required_for_leaderboard: bool, + pub(super) minimum_query_count_for_leaderboard: usize, + pub(super) current_query_count: usize, + pub(super) current_ranking_query_count: usize, + pub(super) current_explicit_qrel_query_count: usize, + pub(super) leaderboard_claim_allowed: bool, + pub(super) leakage_control: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs index 9ee62f1e..797eb2ba 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/report_root.rs @@ -1,7 +1,8 @@ use crate::{ AdapterReport, CaptureIntegrationReport, Deserialize, EvolutionSummary, ExternalAdapterSection, - FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction, ReportSummary, - ScoreboardReport, Serialize, SuiteReport, UnsupportedClaimReport, + FollowUpReport, JobReport, OperationalEvidenceReport, PrivateCorpusRedaction, + QuantitativeBenchmarkReport, ReportSummary, ScoreboardReport, Serialize, SuiteReport, + UnsupportedClaimReport, }; #[derive(Debug, Deserialize, Serialize)] @@ -17,6 +18,8 @@ pub(super) struct RealWorldReport { #[serde(default)] pub(super) operational_evidence: OperationalEvidenceReport, #[serde(default)] + pub(super) quantitative_scoreboard: QuantitativeBenchmarkReport, + #[serde(default)] pub(super) external_adapters: ExternalAdapterSection, pub(super) capture_integration: CaptureIntegrationReport, pub(super) summary: ReportSummary, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs index 088a8842..2f0f34a7 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs @@ -27,6 +27,10 @@ pub(super) fn job_report(job: &RealWorldJob, scoring: JobScoring) -> JobReport { reports::job_report(job, scoring) } +pub(super) fn produced_evidence_order(job: &RealWorldJob) -> Vec { + self::answers::ordered_produced_evidence_ids(self::answers::produced_answer(job)) +} + pub(super) fn score_job(job: &RealWorldJob) -> JobScoring { let answer = self::answers::produced_answer(job); let produced_evidence = self::answers::produced_evidence_ids(answer); diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs index 3e60e5b1..1e2d85ed 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring/answers.rs @@ -61,28 +61,7 @@ pub(super) fn trap_ids_used( .collect() } -fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { - let _ = job; - - static EMPTY_ANSWER: std::sync::OnceLock = std::sync::OnceLock::new(); - - EMPTY_ANSWER.get_or_init(|| ProducedAnswer { - content: String::new(), - claims: Vec::new(), - evidence_ids: Vec::new(), - pages: Vec::new(), - memory_summaries: Vec::new(), - proactive_briefs: Vec::new(), - scheduled_tasks: Vec::new(), - work_journal_readbacks: Vec::new(), - recovery_drills: Vec::new(), - latency_ms: None, - cost: None, - trace_explainability: None, - }) -} - -fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { +pub(super) fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { let mut seen = BTreeSet::new(); let mut evidence = Vec::new(); @@ -180,6 +159,27 @@ fn ordered_produced_evidence_ids(answer: &ProducedAnswer) -> Vec { evidence } +fn synthetic_answer(job: &RealWorldJob) -> &ProducedAnswer { + let _ = job; + + static EMPTY_ANSWER: std::sync::OnceLock = std::sync::OnceLock::new(); + + EMPTY_ANSWER.get_or_init(|| ProducedAnswer { + content: String::new(), + claims: Vec::new(), + evidence_ids: Vec::new(), + pages: Vec::new(), + memory_summaries: Vec::new(), + proactive_briefs: Vec::new(), + scheduled_tasks: Vec::new(), + work_journal_readbacks: Vec::new(), + recovery_drills: Vec::new(), + latency_ms: None, + cost: None, + trace_explainability: None, + }) +} + fn push_ordered_evidence( evidence: &mut Vec, seen: &mut BTreeSet, diff --git a/apps/elf-eval/tests/real_world_job_benchmark.rs b/apps/elf-eval/tests/real_world_job_benchmark.rs index 6df392ce..6aa5cecb 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark.rs +++ 
b/apps/elf-eval/tests/real_world_job_benchmark.rs @@ -20,6 +20,7 @@ #[path = "real_world_job_benchmark/operator_debug.rs"] mod operator_debug; #[path = "real_world_job_benchmark/proactive_brief.rs"] mod proactive_brief; #[path = "real_world_job_benchmark/production_ops.rs"] mod production_ops; +#[path = "real_world_job_benchmark/quantitative.rs"] mod quantitative; #[path = "real_world_job_benchmark/recall_debug_reports.rs"] mod recall_debug_reports; #[path = "real_world_job_benchmark/retrieval.rs"] mod retrieval; #[path = "real_world_job_benchmark/root_aggregate.rs"] mod root_aggregate; diff --git a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs index f5a395c8..dc83515a 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/markdown_rendering_generated.rs @@ -38,6 +38,9 @@ fn generated_json_report_renders_markdown() -> Result<()> { assert!(markdown.contains("# Real-World Job Benchmark Report")); assert!(markdown.contains("work_resume")); assert!(markdown.contains("Capture And Integration Coverage")); + assert!(markdown.contains("Quantitative Benchmark Report")); + assert!(markdown.contains("leaderboard claims require explicit qrels")); + assert!(markdown.contains("| ELF | `pass` | `fixture_backed`")); assert!(markdown.contains("External Adapter Coverage")); assert!(markdown.contains("live-baseline-only")); assert!(markdown.contains("live real-world")); diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs new file mode 100644 index 00000000..675dbeb3 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -0,0 +1,160 @@ +use std::{env, fs, process}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn 
adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + + assert_eq!( + report.pointer("/quantitative_scoreboard/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_benchmark/v1") + ); + assert_eq!( + report.pointer("/quantitative_scoreboard/generated_at").and_then(Value::as_str), + report.pointer("/generated_at").and_then(Value::as_str) + ); + assert_eq!( + report.pointer("/quantitative_scoreboard/k_values").and_then(Value::as_array), + Some(&vec![Value::from(1), Value::from(3), Value::from(5), Value::from(10),]) + ); + assert_eq!( + report + .pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/quantitative_scoreboard/controls/current_query_count") + .and_then(Value::as_u64), + report.pointer("/summary/job_count").and_then(Value::as_u64) + ); + + assert_quantitative_row_contract(&report)?; + assert_quantitative_per_query_contract(&report)?; + + Ok(()) +} + +#[test] +fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> { + let source_path = + support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json"); + let mut job = serde_json::from_str::(&fs::read_to_string(source_path)?)?; + + support::set_json_pointer( + &mut job, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["old-provider-note", "current-provider-report"]), + )?; + + job.pointer_mut("/expected_answer") + .and_then(Value::as_object_mut) + .ok_or_else(|| eyre::eyre!("missing expected_answer object"))? 
+ .insert( + "relevance_judgments".to_string(), + serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]), + ); + + let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?; + let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?; + + assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); + assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1)); + assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0)); + assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0)); + assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); + assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5)); + assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); + + let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?; + let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?; + + assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); + assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); + assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); + + Ok(()) +} + +fn assert_quantitative_row_contract(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/quantitative_scoreboard/rows")?; + + assert_eq!(rows.len(), 1); + + let row = &rows[0]; + + assert_eq!(row.pointer("/product").and_then(Value::as_str), Some("ELF")); + 
assert_eq!(row.pointer("/adapter_id").and_then(Value::as_str), Some("fixture_smoke")); + assert_eq!(row.pointer("/suite").and_then(Value::as_str), Some("adversarial_quality")); + assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("pass")); + assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/metric_comparable").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/fixture_regression_only").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/ranking_coverage_state").and_then(Value::as_str), Some("complete")); + assert_eq!( + row.pointer("/ranked_candidate_source").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!( + row.pointer("/qrel_source").and_then(Value::as_str), + Some("expected_evidence_fallback") + ); + assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0)); + + for metric in [ + "recall_at_1", + "precision_at_1", + "success_at_1", + "recall_at_5", + "precision_at_5", + "success_at_5", + "mrr", + "ndcg_at_5", + "average_precision", + ] { + assert!(row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64).is_some()); + assert_eq!( + row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str), + Some("pass") + ); + assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some()); + } + + Ok(()) +} + +fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?; + let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0); + + assert_eq!(rows.len() as u64, job_count); + + for row in rows { + 
assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!( + row.pointer("/qrel_source").and_then(Value::as_str), + Some("expected_evidence_fallback") + ); + assert!(row.pointer("/candidate_count").and_then(Value::as_u64).is_some()); + assert!(row.pointer("/expected_relevant_count").and_then(Value::as_u64).is_some()); + assert!(row.pointer("/metrics/recall_at_5").is_some()); + assert!(row.pointer("/metrics/precision_at_5").is_some()); + assert!(row.pointer("/metrics/ndcg_at_5").is_some()); + assert!(row.pointer("/metrics/average_precision").is_some()); + } + + Ok(()) +} From a92363be1975089599bdccfa9c879229e1d19097 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 11:46:21 -0400 Subject: [PATCH 03/58] {"schema":"decodex/commit/1","summary":"Port quantitative product manifest import export","authority":"manual"} --- .../src/bin/real_world_job_benchmark/cli.rs | 27 ++ .../bin/real_world_job_benchmark/commands.rs | 21 +- .../src/bin/real_world_job_benchmark/main.rs | 11 +- .../real_world_job_benchmark/quantitative.rs | 284 +++++++++++++++++- .../quantitative_reports.rs | 11 + .../real_world_job_benchmark/quantitative.rs | 218 +++++++++++++- 6 files changed, 552 insertions(+), 20 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs index e1bc6f32..ddcf4a7e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs @@ -51,6 +51,9 @@ pub(super) struct RunArgs { /// Skip loading the real-world external adapter coverage manifest. #[arg(long)] pub(super) skip_external_adapter_manifest: bool, + /// Optional same-corpus quantitative product manifest to merge into the report. 
+ #[arg(long, value_name = "FILE")] + pub(super) quantitative_product_manifest: Option, } #[derive(Debug, Parser)] @@ -63,9 +66,33 @@ pub(super) struct PublishArgs { pub(super) out: Option, } +#[derive(Debug, Parser)] +pub(super) struct ExportQuantitativeProductManifestArgs { + /// Generated real_world_job JSON report to export. + #[arg(long, value_name = "FILE", default_value = DEFAULT_REPORT_PATH)] + pub(super) report: PathBuf, + /// Write product manifest JSON to this file. Omit to print to stdout. + #[arg(long, value_name = "FILE")] + pub(super) out: Option, + /// Stable manifest id. Defaults to -quantitative-product-manifest. + #[arg(long)] + pub(super) manifest_id: Option, + /// Override the exported product name. + #[arg(long)] + pub(super) product: Option, + /// Override the exported adapter id. + #[arg(long)] + pub(super) adapter_id: Option, + /// Override the exported adapter name. + #[arg(long)] + pub(super) adapter_name: Option, +} + #[derive(Debug, Subcommand)] #[command(rename_all = "kebab")] pub(super) enum Command { + /// Export the primary quantitative row as a reusable product manifest. + ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs), /// Parse and score real_world_job fixtures, then emit a JSON report. Run(RunArgs), /// Render Markdown from a generated real_world_job JSON report. 
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs index 3e7d4ce1..c36fedd4 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs @@ -1,7 +1,8 @@ use crate::{ - AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, OffsetDateTime, Path, - PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, - RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, + AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, + ExportQuantitativeProductManifestArgs, OffsetDateTime, Path, PathBuf, PrivateCorpusRedaction, + PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, RealWorldJob, RealWorldReport, Result, + Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, }; pub(super) fn run_command(args: RunArgs) -> Result<()> { @@ -20,6 +21,17 @@ pub(super) fn publish_command(args: PublishArgs) -> Result<()> { write_or_print(args.out.as_deref(), markdown.as_str()) } +pub(super) fn export_quantitative_product_manifest_command( + args: ExportQuantitativeProductManifestArgs, +) -> Result<()> { + let raw = fs::read_to_string(&args.report)?; + let report = serde_json::from_str::(&raw)?; + let manifest = crate::quantitative_product_manifest_from_report(&report, &args)?; + let json = serde_json::to_string_pretty(&manifest)?; + + write_or_print(args.out.as_deref(), json.as_str()) +} + fn load_jobs(path: &Path) -> Result> { let paths = fixture_paths(path)?; let mut jobs = Vec::with_capacity(paths.len()); @@ -111,7 +123,8 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result Result<()> { color_eyre::install()?; match Args::parse().command { + Command::ExportQuantitativeProductManifest(args) => + commands::export_quantitative_product_manifest_command(args), Command::Run(args) => commands::run_command(args), Command::Publish(args) => 
commands::publish_command(args), } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index fa96df20..51d1c07e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -1,10 +1,13 @@ use crate::{ - AdapterReport, BTreeMap, BTreeSet, JobReport, QuantitativeBenchmarkControls, - QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldJob, - ReportSummary, formatting, scoring, + AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeProductManifestArgs, JobReport, Path, + QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, + QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob, + RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring, }; const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; +const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str = + "elf.agent_memory_quantitative_product_manifest/v1"; const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10]; const MIN_LEADERBOARD_QUERY_COUNT: usize = 30; const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!( @@ -18,11 +21,12 @@ pub(super) struct QuantitativeReportInput<'a> { pub(super) source_jobs: &'a [RealWorldJob], pub(super) jobs: &'a [JobReport], pub(super) summary: &'a ReportSummary, + pub(super) product_manifest_path: Option<&'a Path>, } pub(super) fn quantitative_scoreboard_report( input: QuantitativeReportInput<'_>, -) -> QuantitativeBenchmarkReport { +) -> Result { let corpus_id = quantitative_corpus_id(input.source_jobs); let evidence_class = quantitative_evidence_class(input.adapter, input.jobs); let per_query_rows = quantitative_per_query_rows( @@ -72,6 +76,16 @@ pub(super) fn quantitative_scoreboard_report( denominators: aggregate_denominators(per_query_rows.as_slice()), claim_boundary: 
QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), }; + let product_manifest = + quantitative_product_manifest(input.product_manifest_path, corpus_id.as_str())?; + let imported_row_count = product_manifest.rows.len(); + let imported_per_query_count = product_manifest.per_query_rows.len(); + let mut rows = vec![row]; + let mut merged_per_query_rows = per_query_rows; + + rows.extend(product_manifest.rows); + merged_per_query_rows.extend(product_manifest.per_query_rows); + let controls = QuantitativeBenchmarkControls { same_corpus_required: true, same_task_required: true, @@ -87,25 +101,271 @@ pub(super) fn quantitative_scoreboard_report( .to_string(), }; - QuantitativeBenchmarkReport { + Ok(QuantitativeBenchmarkReport { schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(), generated_at: input.generated_at.to_string(), corpus_id, k_values: QUANTITATIVE_K_VALUES.to_vec(), - rows: vec![row], - per_query_rows, - metrics_not_encoded: vec![ - "paired_significance".to_string(), - "external_product_manifest_import".to_string(), - "audit_manifest_validation".to_string(), - ], + rows, + per_query_rows: merged_per_query_rows, + metrics_not_encoded: quantitative_metrics_not_encoded( + imported_row_count, + imported_per_query_count, + ), controls, claim_boundary: concat!( "Do not convert fixture mechanics, missing explicit qrels, ", "or partial candidate coverage into product leaderboard claims." 
) .to_string(), + }) +} + +pub(super) fn quantitative_product_manifest_from_report( + report: &RealWorldReport, + args: &ExportQuantitativeProductManifestArgs, +) -> Result { + if report.schema != REPORT_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {REPORT_SCHEMA}.", + args.report.display(), + report.schema + )); + } + + let source_row = + report.quantitative_scoreboard.rows.first().ok_or_else(|| { + eyre::eyre!("{} has no quantitative product row.", args.report.display()) + })?; + let source_product = source_row.product.as_str(); + let source_adapter_id = source_row.adapter_id.as_str(); + let product = args.product.as_deref().unwrap_or(source_product).trim(); + let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); + let adapter_name = + args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); + + if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() { + return Err(eyre::eyre!( + "{} cannot export an incomplete quantitative product identity.", + args.report.display() + )); + } + if product == "ELF" { + return Err(eyre::eyre!( + "{} exports product ELF; use --product for external product manifest exports.", + args.report.display() + )); + } + + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.adapter_name = adapter_name.to_string(); + row.claim_boundary = concat!( + "Exported from a generated real_world_job_report quantitative row; ", + "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." 
+ ) + .to_string(); + + let mut per_query_rows = Vec::new(); + + for row in &report.quantitative_scoreboard.per_query_rows { + if row.product != source_product || row.adapter_id != source_adapter_id { + continue; + } + + let mut row = row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.claim_boundary = concat!( + "Exported from generated report per-query quantitative evidence; ", + "import does not relax paired-significance or leaderboard gates." + ) + .to_string(); + + per_query_rows.push(row); } + + let manifest = QuantitativeProductManifest { + schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), + corpus_id: report.quantitative_scoreboard.corpus_id.clone(), + rows: vec![row], + per_query_rows, + }; + + validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?; + + Ok(manifest) +} + +fn quantitative_product_manifest( + path: Option<&Path>, + corpus_id: &str, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeProductManifest::default()); + }; + let raw = fs::read_to_string(path)?; + let mut manifest = + serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display()) + })?; + + for row in &mut manifest.rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } + for row in &mut manifest.per_query_rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } + + validate_quantitative_product_manifest(&manifest, path, corpus_id)?; + + Ok(manifest) +} + +fn validate_quantitative_product_manifest( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, 
expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.corpus_id != corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected same-corpus {}.", + path.display(), + manifest.corpus_id, + corpus_id + )); + } + if manifest.rows.is_empty() { + return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); + } + + let row_keys = manifest + .rows + .iter() + .map(|row| (row.product.as_str(), row.adapter_id.as_str())) + .collect::>(); + + for row in &manifest.rows { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative product row.", + path.display() + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + } + for row in &manifest.per_query_rows { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + 
row.adapter_id + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + } + for row in &manifest.rows { + if row.ranking_query_count == 0 { + continue; + } + + let per_query_count = manifest + .per_query_rows + .iter() + .filter(|per_query| { + per_query.product == row.product && per_query.adapter_id == row.adapter_id + }) + .count(); + + if per_query_count < row.ranking_query_count { + return Err(eyre::eyre!( + "{} row {}:{} declares {} ranked queries but only {} per-query rows.", + path.display(), + row.product, + row.adapter_id, + row.ranking_query_count, + per_query_count + )); + } + } + + Ok(()) +} + +fn quantitative_metrics_not_encoded( + imported_row_count: usize, + imported_per_query_count: usize, +) -> Vec { + let mut metrics = + vec!["paired_significance".to_string(), "audit_manifest_validation".to_string()]; + + if imported_row_count == 0 { + metrics.push("external_product_manifest_import".to_string()); + } + if imported_row_count > 0 && imported_per_query_count == 0 { + metrics.push("imported_product_per_query_rows".to_string()); + } + + metrics } fn quantitative_per_query_rows( diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs index 73f2b1eb..a4552032 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs @@ -74,3 +74,14 @@ pub(super) struct QuantitativeBenchmarkControls { pub(super) leaderboard_claim_allowed: bool, pub(super) leakage_control: String, } + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(super) struct QuantitativeProductManifest { + pub(super) schema: String, + pub(super) manifest_id: String, + pub(super) corpus_id: String, + #[serde(default)] + 
pub(super) rows: Vec, + #[serde(default)] + pub(super) per_query_rows: Vec, +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs index 675dbeb3..b350eb3f 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -1,4 +1,8 @@ -use std::{env, fs, process}; +use std::{ + env, fs, + path::Path, + process::{self, Command}, +}; use color_eyre::{Result, eyre}; use serde_json::Value; @@ -87,6 +91,218 @@ fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> { Ok(()) } +#[test] +fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + .arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let manifest = support::load_json(&manifest_path)?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_product_manifest/v1") + ); + assert_eq!( + manifest.pointer("/rows/0/product").and_then(Value::as_str), + Some("Synthetic Rival") + ); + 
assert_eq!( + manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str), + Some("synthetic_rival") + ); + + let imported = run_report_with_quantitative_manifest(&manifest_path)?; + let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?; + let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?; + + assert_eq!(rows.len(), 2); + assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival")); + assert!(!support::array_contains_str( + &imported, + "/quantitative_scoreboard/metrics_not_encoded", + "external_product_manifest_import" + )?); + assert!( + support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any( + |row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival") + ) + ); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = env::temp_dir() + .join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("elf-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF")); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = env::temp_dir() + .join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id())); + 
let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + .arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let mut manifest = support::load_json(&manifest_path)?; + + support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?; + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus")); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = env::temp_dir() + .join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("missing-per-query-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + 
.arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + .arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let mut manifest = support::load_json(&manifest_path)?; + + support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?; + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + + let stderr = String::from_utf8_lossy(&output.stderr); + + assert!(stderr.contains("ranked queries but only 0")); + + Ok(()) +} + +fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result { + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Ok(serde_json::from_slice(&output.stdout)?) 
+} + fn assert_quantitative_row_contract(report: &Value) -> Result<()> { let rows = support::array_at(report, "/quantitative_scoreboard/rows")?; From 4ee6bae78890b98d3627a6bcfc4f197b0d9f717c Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:01:09 -0400 Subject: [PATCH 04/58] {"schema":"decodex/commit/1","summary":"Port quantitative audit manifest gates","authority":"manual"} --- .../src/bin/real_world_job_benchmark/cli.rs | 39 ++ .../bin/real_world_job_benchmark/commands.rs | 18 +- .../src/bin/real_world_job_benchmark/main.rs | 16 +- .../real_world_job_benchmark/quantitative.rs | 541 +++++++++++++++++- .../quantitative_reports.rs | 29 + .../real_world_job_benchmark/quantitative.rs | 121 ++++ 6 files changed, 748 insertions(+), 16 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs index ddcf4a7e..bae29a2e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/cli.rs @@ -54,6 +54,9 @@ pub(super) struct RunArgs { /// Optional same-corpus quantitative product manifest to merge into the report. #[arg(long, value_name = "FILE")] pub(super) quantitative_product_manifest: Option, + /// Optional audit manifest proving the current quantitative row's held-out/leakage gates. + #[arg(long, value_name = "FILE")] + pub(super) quantitative_audit_manifest: Option, } #[derive(Debug, Parser)] @@ -88,9 +91,45 @@ pub(super) struct ExportQuantitativeProductManifestArgs { pub(super) adapter_name: Option, } +#[derive(Debug, Parser)] +pub(super) struct ExportQuantitativeAuditManifestArgs { + /// Fixture file or directory containing current product-runtime real_world_job outputs. + #[arg(long, value_name = "PATH", default_value = DEFAULT_FIXTURE_PATH)] + pub(super) fixtures: PathBuf, + /// Write audit manifest JSON to this file. Omit to print to stdout. 
+ #[arg(long, value_name = "FILE")] + pub(super) out: Option, + /// Stable run id that the audit manifest is allowed to attest. + #[arg(long, default_value = DEFAULT_RUN_ID)] + pub(super) run_id: String, + /// Stable manifest id. Defaults to -quantitative-audit-manifest. + #[arg(long)] + pub(super) manifest_id: Option, + /// Product name for the current row. + #[arg(long, default_value = "ELF")] + pub(super) product: String, + /// Adapter id for the current row. + #[arg(long, default_value = DEFAULT_ADAPTER_ID)] + pub(super) adapter_id: String, + /// Mark the current row as held-out only when query ids were locked before runtime. + #[arg(long)] + pub(super) held_out: bool, + /// Mark the current row as leakage audited only when runtime inputs excluded answers/qrels. + #[arg(long)] + pub(super) leakage_audited: bool, + /// Audit control string. Repeat for multiple controls. + #[arg(long = "control")] + pub(super) controls: Vec, + /// Claim boundary recorded in the audit manifest. + #[arg(long)] + pub(super) claim_boundary: Option, +} + #[derive(Debug, Subcommand)] #[command(rename_all = "kebab")] pub(super) enum Command { + /// Export a quantitative audit manifest for the current fixture set. + ExportQuantitativeAuditManifest(ExportQuantitativeAuditManifestArgs), /// Export the primary quantitative row as a reusable product manifest. ExportQuantitativeProductManifest(ExportQuantitativeProductManifestArgs), /// Parse and score real_world_job fixtures, then emit a JSON report. 
diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs index c36fedd4..a151e6da 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/commands.rs @@ -1,8 +1,8 @@ use crate::{ AdapterReport, BTreeSet, CaptureIntegrationReport, CorpusProfile, - ExportQuantitativeProductManifestArgs, OffsetDateTime, Path, PathBuf, PrivateCorpusRedaction, - PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, RealWorldJob, RealWorldReport, Result, - Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, + ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, OffsetDateTime, + Path, PathBuf, PrivateCorpusRedaction, PublishArgs, QuantitativeReportInput, REPORT_SCHEMA, + RealWorldJob, RealWorldReport, Result, Rfc3339, RunArgs, TypedStatus, VERSION, eyre, fs, }; pub(super) fn run_command(args: RunArgs) -> Result<()> { @@ -32,6 +32,16 @@ pub(super) fn export_quantitative_product_manifest_command( write_or_print(args.out.as_deref(), json.as_str()) } +pub(super) fn export_quantitative_audit_manifest_command( + args: ExportQuantitativeAuditManifestArgs, +) -> Result<()> { + let jobs = load_jobs(&args.fixtures)?; + let manifest = crate::quantitative_audit_manifest_from_jobs(jobs.as_slice(), &args)?; + let json = serde_json::to_string_pretty(&manifest)?; + + write_or_print(args.out.as_deref(), json.as_str()) +} + fn load_jobs(path: &Path) -> Result> { let paths = fixture_paths(path)?; let mut jobs = Vec::with_capacity(paths.len()); @@ -118,12 +128,14 @@ fn build_report(jobs: &[RealWorldJob], args: &RunArgs) -> Result Result<()> { color_eyre::install()?; match Args::parse().command { + Command::ExportQuantitativeAuditManifest(args) => + commands::export_quantitative_audit_manifest_command(args), Command::ExportQuantitativeProductManifest(args) => commands::export_quantitative_product_manifest_command(args), Command::Run(args) => 
commands::run_command(args), diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index 51d1c07e..f799e9fc 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -1,13 +1,22 @@ +use std::env; + use crate::{ - AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeProductManifestArgs, JobReport, Path, - QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, - QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob, - RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring, + AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, + ExportQuantitativeProductManifestArgs, JobReport, Path, PathBuf, QuantitativeAuditArtifact, + QuantitativeAuditManifest, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, + QuantitativeBenchmarkRow, QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA, + RealWorldJob, RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring, }; const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str = "elf.agent_memory_quantitative_product_manifest/v1"; +const QUANTITATIVE_AUDIT_MANIFEST_SCHEMA: &str = "elf.agent_memory_quantitative_audit_manifest/v1"; +const REQUIRED_HELD_OUT_AUDIT_CONTROL: &str = "query_ids_locked_before_product_runtime"; +const REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL: &str = + "product_runtime_did_not_receive_expected_answers_or_qrels"; +const REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL: &str = + "ranked_candidates_emitted_by_product_runtime"; const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10]; const MIN_LEADERBOARD_QUERY_COUNT: usize = 30; const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!( @@ -16,12 +25,30 @@ const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str 
= concat!( ); pub(super) struct QuantitativeReportInput<'a> { + pub(super) run_id: &'a str, pub(super) generated_at: &'a str, pub(super) adapter: &'a AdapterReport, pub(super) source_jobs: &'a [RealWorldJob], pub(super) jobs: &'a [JobReport], pub(super) summary: &'a ReportSummary, pub(super) product_manifest_path: Option<&'a Path>, + pub(super) audit_manifest_path: Option<&'a Path>, +} + +struct QuantitativeAuditContext<'a> { + run_id: &'a str, + corpus_id: &'a str, + product: &'a str, + adapter_id: &'a str, + source_jobs: &'a [RealWorldJob], + ranking_query_count: usize, + explicit_qrel_query_count: usize, +} + +struct QuantitativeAuditEvidence { + held_out: bool, + leakage_audited: bool, + audit_manifest_id: Option, } pub(super) fn quantitative_scoreboard_report( @@ -43,8 +70,27 @@ pub(super) fn quantitative_scoreboard_report( let explicit_qrel_query_count = per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); let metric_comparable = ranking_query_count > 0; - let leaderboard_eligible = false; let result_state = quantitative_result_state(input.summary); + let audit_evidence = quantitative_audit_evidence( + input.audit_manifest_path, + QuantitativeAuditContext { + run_id: input.run_id, + corpus_id: corpus_id.as_str(), + product: "ELF", + adapter_id: input.adapter.adapter_id.as_str(), + source_jobs: input.source_jobs, + ranking_query_count, + explicit_qrel_query_count, + }, + )?; + let leaderboard_eligible = quantitative_row_leaderboard_eligible( + evidence_class, + input.source_jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + metric_comparable, + &audit_evidence, + ); let row = QuantitativeBenchmarkRow { product: "ELF".to_string(), adapter_id: input.adapter.adapter_id.clone(), @@ -56,8 +102,9 @@ pub(super) fn quantitative_scoreboard_report( comparable: metric_comparable, metric_comparable, leaderboard_eligible, - held_out: false, - leakage_audited: false, + held_out: audit_evidence.held_out, + leakage_audited: 
audit_evidence.leakage_audited, + audit_manifest_id: audit_evidence.audit_manifest_id, fixture_regression_only: evidence_class == "fixture_backed", sample_size: input.jobs.len(), ranking_query_count, @@ -86,6 +133,7 @@ pub(super) fn quantitative_scoreboard_report( rows.extend(product_manifest.rows); merged_per_query_rows.extend(product_manifest.per_query_rows); + let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; let controls = QuantitativeBenchmarkControls { same_corpus_required: true, same_task_required: true, @@ -95,7 +143,7 @@ pub(super) fn quantitative_scoreboard_report( current_query_count: input.source_jobs.len(), current_ranking_query_count: ranking_query_count, current_explicit_qrel_query_count: explicit_qrel_query_count, - leaderboard_claim_allowed: leaderboard_eligible, + leaderboard_claim_allowed, leakage_control: "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" .to_string(), @@ -204,6 +252,303 @@ pub(super) fn quantitative_product_manifest_from_report( Ok(manifest) } +pub(super) fn quantitative_audit_manifest_from_jobs( + jobs: &[RealWorldJob], + args: &ExportQuantitativeAuditManifestArgs, +) -> Result { + let product = args.product.trim(); + let adapter_id = args.adapter_id.trim(); + + if product.is_empty() || adapter_id.is_empty() { + return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); + } + + let corpus_id = quantitative_corpus_id(jobs); + let ranking_query_count = ranking_query_count(jobs); + let explicit_qrel_query_count = explicit_qrel_query_count(jobs); + let manifest = QuantitativeAuditManifest { + schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), + run_id: args.run_id.clone(), + corpus_id, + product: product.to_string(), + adapter_id: adapter_id.to_string(), + held_out: args.held_out, + leakage_audited: 
args.leakage_audited, + sample_size: jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), + controls: args.controls.clone(), + artifacts: vec![QuantitativeAuditArtifact { + role: "product_runtime_fixtures".to_string(), + path: audit_artifact_display_path(args.fixtures.as_path()), + sha256: fixture_path_digest(args.fixtures.as_path())?, + }], + claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| { + if args.held_out || args.leakage_audited { + concat!( + "Audit manifest supplied by operator; runner validates run/corpus/product/", + "adapter/count/query-id/artifact bindings before opening row gates." + ) + .to_string() + } else { + concat!( + "Diagnostic audit manifest binds the current product-runtime fixture set to ", + "query ids and counts, but it does not prove held-out or leakage-audited status." + ) + .to_string() + } + }), + }; + + validate_quantitative_audit_manifest( + &manifest, + args.fixtures.as_path(), + QuantitativeAuditContext { + run_id: args.run_id.as_str(), + corpus_id: manifest.corpus_id.as_str(), + product, + adapter_id, + source_jobs: jobs, + ranking_query_count: manifest.ranking_query_count, + explicit_qrel_query_count: manifest.explicit_qrel_query_count, + }, + )?; + + Ok(manifest) +} + +fn quantitative_audit_evidence( + path: Option<&Path>, + context: QuantitativeAuditContext<'_>, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeAuditEvidence { + held_out: false, + leakage_audited: false, + audit_manifest_id: None, + }); + }; + let raw = fs::read_to_string(path)?; + let manifest = serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) + })?; + + validate_quantitative_audit_manifest(&manifest, path, context)?; + + Ok(QuantitativeAuditEvidence { + held_out: manifest.held_out, + leakage_audited: manifest.leakage_audited, + audit_manifest_id: 
Some(manifest.manifest_id), + }) +} + +fn validate_quantitative_audit_manifest( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, expected {}.", + path.display(), + manifest.sample_size, + context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; + validate_quantitative_audit_controls(manifest, path)?; + + 
validate_quantitative_audit_artifacts(manifest, path) +} + +fn validate_quantitative_audit_query_ids( + manifest: &QuantitativeAuditManifest, + path: &Path, + source_jobs: &[RealWorldJob], +) -> Result<()> { + let expected = ranking_query_ids(source_jobs); + let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); + + if actual.len() != manifest.query_ids.len() { + return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); + } + if actual != expected { + let missing = expected.difference(&actual).copied().collect::>(); + let extra = actual.difference(&expected).copied().collect::>(); + + return Err(eyre::eyre!( + "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", + path.display(), + missing, + extra + )); + } + + Ok(()) +} + +fn validate_quantitative_audit_controls( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + let controls = manifest.controls.iter().map(String::as_str).collect::>(); + + if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { + return Err(eyre::eyre!( + "{} marks held_out=true without required control {}.", + path.display(), + REQUIRED_HELD_OUT_AUDIT_CONTROL + )); + } + if manifest.leakage_audited + && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) + || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) + { + return Err(eyre::eyre!( + "{} marks leakage_audited=true without required controls {} and {}.", + path.display(), + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL + )); + } + if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() + { + return Err(eyre::eyre!( + "{} marks audit controls true but has an empty claim_boundary.", + path.display() + )); + } + + Ok(()) +} + +fn validate_quantitative_audit_artifacts( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if 
manifest.artifacts.is_empty() { + return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); + } + + for artifact in &manifest.artifacts { + if artifact.role.trim().is_empty() + || artifact.path.trim().is_empty() + || artifact.sha256.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative audit artifact.", + path.display() + )); + } + if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) + { + return Err(eyre::eyre!( + "{} artifact {} has invalid sha256 digest {}.", + path.display(), + artifact.role, + artifact.sha256 + )); + } + + let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); + let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| { + eyre::eyre!( + "{} artifact {} could not be digested at {}: {err}", + path.display(), + artifact.role, + artifact_path.display() + ) + })?; + + if actual != artifact.sha256 { + return Err(eyre::eyre!( + "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", + path.display(), + artifact.role, + artifact_path.display(), + artifact.sha256, + actual + )); + } + } + + Ok(()) +} + +fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { + let raw = PathBuf::from(artifact_path); + + if raw.is_absolute() { + return raw; + } + + let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); + + if cwd_path.exists() { + return cwd_path; + } + + manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) +} + fn quantitative_product_manifest( path: Option<&Path>, corpus_id: &str, @@ -290,6 +635,9 @@ fn validate_quantitative_product_manifest( corpus_id )); } + if row.leaderboard_eligible { + validate_leaderboard_eligible_product_row(path, row)?; + } } for row in &manifest.per_query_rows { if row.job_id.trim().is_empty() @@ -351,6 +699,34 @@ fn validate_quantitative_product_manifest( Ok(()) } 
+fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} + fn quantitative_metrics_not_encoded( imported_row_count: usize, imported_per_query_count: usize, @@ -697,6 +1073,155 @@ fn quantitative_evidence_class(adapter: &AdapterReport, jobs: &[JobReport]) -> & } } +fn quantitative_row_leaderboard_eligible( + evidence_class: &str, + sample_size: usize, + ranking_query_count: usize, + explicit_qrel_query_count: usize, + metric_comparable: bool, + audit_evidence: &QuantitativeAuditEvidence, +) -> bool { + metric_comparable + && evidence_class == "live_real_world" + && sample_size >= MIN_LEADERBOARD_QUERY_COUNT + && ranking_query_count == sample_size + && explicit_qrel_query_count == ranking_query_count + && audit_evidence.held_out + && audit_evidence.leakage_audited + && audit_evidence + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()) +} + +fn fixture_path_digest(path: &Path) -> Result { + let mut hasher = blake3::Hasher::new(); + + if path.is_file() { + hash_fixture_file( + path, + path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), + &mut hasher, + )?; + + return Ok(hasher.finalize().to_hex().to_string()); + } + + let paths = 
audit_fixture_paths(path)?; + + for fixture in paths { + let relative = fixture + .strip_prefix(path) + .map(|relative| relative.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); + + hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; + } + + Ok(hasher.finalize().to_hex().to_string()) +} + +fn audit_fixture_paths(path: &Path) -> Result> { + let mut paths = Vec::new(); + + collect_audit_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + if path.is_file() { + paths.push(path.to_path_buf()); + + return Ok(()); + } + + for entry in fs::read_dir(path)? { + let entry_path = entry?.path(); + + if entry_path.is_dir() { + collect_audit_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} + +fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { + hasher.update(logical_path.as_bytes()); + hasher.update(b"\0"); + hasher.update(&fs::read(path)?); + hasher.update(b"\0"); + + Ok(()) +} + +fn audit_artifact_display_path(path: &Path) -> String { + let display_path = if path.is_absolute() { + env::current_dir() + .ok() + .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) + .unwrap_or_else(|| path.to_path_buf()) + } else { + path.to_path_buf() + }; + + display_path.to_string_lossy().replace('\\', "/") +} + +fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { + source_jobs + .iter() + .filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)) + .map(|job| job.job_id.as_str()) + .collect() +} + +fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { + ranking_query_ids(source_jobs).len() +} + +fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { + 
source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() +} + +fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { + if !source_job.expected_answer.relevance_judgments.is_empty() { + return source_job + .expected_answer + .relevance_judgments + .iter() + .filter(|judgment| judgment.grade > 0.0) + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect(); + } + + source_job + .required_evidence + .iter() + .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) + .map(|evidence| (evidence.evidence_id.clone(), 1.0)) + .collect() +} + +fn ranking_query_attempted(job: &RealWorldJob) -> bool { + if !scoring::produced_evidence_order(job).is_empty() { + return true; + } + + let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { + return false; + }; + + answer.trace_explainability.as_ref().is_some_and(|trace| { + trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") + }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) +} + fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { if !source_job.expected_answer.relevance_judgments.is_empty() { "explicit_qrels" diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs index a4552032..6c953802 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs @@ -29,6 +29,7 @@ pub(super) struct QuantitativeBenchmarkRow { pub(super) leaderboard_eligible: bool, pub(super) held_out: bool, pub(super) leakage_audited: bool, + pub(super) audit_manifest_id: Option, pub(super) fixture_regression_only: bool, pub(super) sample_size: usize, pub(super) ranking_query_count: usize, @@ -85,3 +86,31 @@ pub(super) struct 
QuantitativeProductManifest { #[serde(default)] pub(super) per_query_rows: Vec, } + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(super) struct QuantitativeAuditManifest { + pub(super) schema: String, + pub(super) manifest_id: String, + pub(super) run_id: String, + pub(super) corpus_id: String, + pub(super) product: String, + pub(super) adapter_id: String, + pub(super) held_out: bool, + pub(super) leakage_audited: bool, + pub(super) sample_size: usize, + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, + pub(super) query_ids: Vec, + #[serde(default)] + pub(super) controls: Vec, + #[serde(default)] + pub(super) artifacts: Vec, + pub(super) claim_boundary: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(super) struct QuantitativeAuditArtifact { + pub(super) role: String, + pub(super) path: String, + pub(super) sha256: String, +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs index b350eb3f..f2b03d5c 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -285,6 +285,107 @@ fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence( Ok(()) } +#[test] +fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> { + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-audit-manifest-test-{}", process::id())); + let manifest_path = temp_dir.join("audit-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-audit-manifest") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--out") + .arg(&manifest_path) + .arg("--run-id") + .arg("audit-import-test") + .arg("--held-out") + .arg("--leakage-audited") + .arg("--control") + 
.arg("query_ids_locked_before_product_runtime") + .arg("--control") + .arg("product_runtime_did_not_receive_expected_answers_or_qrels") + .arg("--control") + .arg("ranked_candidates_emitted_by_product_runtime") + .output()?; + + assert!( + export.status.success(), + "quantitative audit export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let manifest = support::load_json(&manifest_path)?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_audit_manifest/v1") + ); + assert_eq!(manifest.pointer("/held_out").and_then(Value::as_bool), Some(true)); + assert_eq!(manifest.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); + assert_eq!( + support::array_at(&manifest, "/query_ids")?.len() as u64, + manifest.pointer("/ranking_query_count").and_then(Value::as_u64).unwrap_or_default() + ); + + let imported = run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?; + let row = support::array_at(&imported, "/quantitative_scoreboard/rows")? 
+ .first() + .ok_or_else(|| eyre::eyre!("missing quantitative row"))?; + + assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); + assert_eq!( + row.pointer("/audit_manifest_id").and_then(Value::as_str), + Some("audit-import-test-quantitative-audit-manifest") + ); + assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); + + Ok(()) +} + +#[test] +fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> { + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-audit-manifest-run-test-{}", process::id())); + let manifest_path = temp_dir.join("audit-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-audit-manifest") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--out") + .arg(&manifest_path) + .arg("--run-id") + .arg("audit-import-test") + .output()?; + + assert!( + export.status.success(), + "quantitative audit export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--run-id") + .arg("different-run") + .arg("--quantitative-audit-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("expected different-run")); + + Ok(()) +} + fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -303,6 +404,26 @@ fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result Ok(serde_json::from_slice(&output.stdout)?) 
} +fn run_report_with_quantitative_audit(manifest_path: &Path, run_id: &str) -> Result { + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--run-id") + .arg(run_id) + .arg("--quantitative-audit-manifest") + .arg(manifest_path) + .output()?; + + assert!( + output.status.success(), + "real_world_job runner failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + + Ok(serde_json::from_slice(&output.stdout)?) +} + fn assert_quantitative_row_contract(report: &Value) -> Result<()> { let rows = support::array_at(report, "/quantitative_scoreboard/rows")?; From 8c95885575752d0c44ed6fd512c9c872ba347a97 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:06:38 -0400 Subject: [PATCH 05/58] {"schema":"decodex/commit/1","summary":"Add quantitative rate confidence intervals","authority":"manual"} --- .../src/bin/real_world_job_benchmark/main.rs | 4 +- .../real_world_job_benchmark/quantitative.rs | 85 ++++++++++++++++++- .../quantitative_reports.rs | 12 +++ .../real_world_job_benchmark/quantitative.rs | 16 ++++ 4 files changed, 113 insertions(+), 4 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs index 50fadd82..dc77d8f0 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs @@ -95,8 +95,8 @@ use quantitative::{ }; use quantitative_reports::{ QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls, - QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativePerQueryRow, - QuantitativeProductManifest, + QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, + QuantitativePerQueryRow, QuantitativeProductManifest, }; use report_root::RealWorldReport; use scoreboard::scoreboard_report; diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index f799e9fc..ac782c30 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -4,8 +4,9 @@ use crate::{ AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, ExportQuantitativeProductManifestArgs, JobReport, Path, PathBuf, QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, - QuantitativeBenchmarkRow, QuantitativePerQueryRow, QuantitativeProductManifest, REPORT_SCHEMA, - RealWorldJob, RealWorldReport, ReportSummary, Result, eyre, formatting, fs, scoring, + QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, QuantitativePerQueryRow, + QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob, RealWorldReport, ReportSummary, + Result, eyre, formatting, fs, scoring, }; const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; @@ -19,6 +20,7 @@ const REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL: &str = "ranked_candidates_emitted_by_product_runtime"; const QUANTITATIVE_K_VALUES: &[usize] = &[1, 3, 5, 10]; const MIN_LEADERBOARD_QUERY_COUNT: usize = 30; +const WILSON_95_Z: f64 = 1.959963984540054; const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!( "Quantitative metrics are bounded to this generated report. ", "Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims." 
@@ -121,6 +123,7 @@ pub(super) fn quantitative_scoreboard_report( metrics: aggregate_metrics(per_query_rows.as_slice()), metric_states: aggregate_metric_states(result_state, metric_comparable), denominators: aggregate_denominators(per_query_rows.as_slice()), + confidence_intervals: aggregate_confidence_intervals(per_query_rows.as_slice()), claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), }; let product_manifest = @@ -1019,6 +1022,84 @@ fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap BTreeMap { + let mut confidence_intervals = BTreeMap::new(); + + for metric in rate_metric_names() { + let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); + + if denominator > 0 { + confidence_intervals.insert( + metric, + wilson_confidence_interval(numerator.min(denominator), denominator), + ); + } + } + + confidence_intervals +} + +fn rate_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + + metrics +} + +fn aggregate_rate_numerator_denominator( + rows: &[QuantitativePerQueryRow], + metric: &str, +) -> (usize, usize) { + let mut numerator = 0; + let mut denominator = 0; + + for row in rows { + let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { + continue; + }; + let Some(row_denominator) = row.denominators.get(metric).copied() else { + continue; + }; + + if row_denominator == 0 { + continue; + } + + denominator += row_denominator; + numerator += (value * row_denominator as f64).round() as usize; + } + + (numerator, denominator) +} + +fn wilson_confidence_interval( + numerator: usize, + denominator: usize, +) -> QuantitativeConfidenceInterval { + let n = denominator as f64; + let p = numerator as f64 / n; + let z2 = WILSON_95_Z * WILSON_95_Z; + let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); + let half_width = 
+ WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); + + QuantitativeConfidenceInterval { + method: "wilson_score".to_string(), + confidence: 0.95, + lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), + upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), + numerator, + denominator, + } +} + fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { rows.iter().filter_map(|row| row.denominators.get(metric)).sum() } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs index 6c953802..ded35360 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs @@ -40,6 +40,8 @@ pub(super) struct QuantitativeBenchmarkRow { pub(super) metrics: BTreeMap>, pub(super) metric_states: BTreeMap, pub(super) denominators: BTreeMap, + #[serde(default)] + pub(super) confidence_intervals: BTreeMap, pub(super) claim_boundary: String, } @@ -76,6 +78,16 @@ pub(super) struct QuantitativeBenchmarkControls { pub(super) leakage_control: String, } +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(super) struct QuantitativeConfidenceInterval { + pub(super) method: String, + pub(super) confidence: f64, + pub(super) lower: f64, + pub(super) upper: f64, + pub(super) numerator: usize, + pub(super) denominator: usize, +} + #[derive(Clone, Debug, Default, Deserialize, Serialize)] pub(super) struct QuantitativeProductManifest { pub(super) schema: String, diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs index f2b03d5c..249c48e2 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -469,6 +469,22 @@ fn 
assert_quantitative_row_contract(report: &Value) -> Result<()> { ); assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some()); } + for metric in ["recall_at_5", "precision_at_5", "success_at_5"] { + assert_eq!( + row.pointer(&format!("/confidence_intervals/{metric}/method")).and_then(Value::as_str), + Some("wilson_score") + ); + assert_eq!( + row.pointer(&format!("/confidence_intervals/{metric}/confidence")) + .and_then(Value::as_f64), + Some(0.95) + ); + assert!( + row.pointer(&format!("/confidence_intervals/{metric}/denominator")) + .and_then(Value::as_u64) + .is_some() + ); + } Ok(()) } From 486c476331fcd6fff30e2c8ba2b4b2a3fb482adf Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:12:59 -0400 Subject: [PATCH 06/58] {"schema":"decodex/commit/1","summary":"Clean up split makefile comments","authority":"manual"} --- docs/spec/agent_memory_knowledge_system_v1.md | 3 +-- makefiles/benchmark-memory-a.toml | 2 +- makefiles/benchmark-memory-b.toml | 11 +------- makefiles/check.toml | 21 ---------------- makefiles/clean.toml | 16 ------------ makefiles/format.toml | 17 ------------- makefiles/lint-fix.toml | 19 -------------- makefiles/lint.toml | 14 ----------- makefiles/research.toml | 25 ------------------- makefiles/smoke.toml | 22 ---------------- makefiles/test.toml | 9 ------- 11 files changed, 3 insertions(+), 156 deletions(-) diff --git a/docs/spec/agent_memory_knowledge_system_v1.md b/docs/spec/agent_memory_knowledge_system_v1.md index 35d18ca8..070df71f 100644 --- a/docs/spec/agent_memory_knowledge_system_v1.md +++ b/docs/spec/agent_memory_knowledge_system_v1.md @@ -272,8 +272,7 @@ Repository-native validation is authoritative. docs are validation-ready. - Before a PR handoff or any push that refreshes a PR head, run the registered Decodex workflow gate: `cargo make fmt`, `cargo make lint-fix`, then - `cargo make checks`. In this Makefile tree, `checks` aliases the repo-native - aggregate `check` task. 
+ `cargo make check`. - If a phase changes commands, schemas, config, runtime behavior, status semantics, or benchmark claims, update the owning docs and include drift evidence as required by `docs/policy.md`. diff --git a/makefiles/benchmark-memory-a.toml b/makefiles/benchmark-memory-a.toml index a7b5e6c6..3f09c7d4 100644 --- a/makefiles/benchmark-memory-a.toml +++ b/makefiles/benchmark-memory-a.toml @@ -1,4 +1,4 @@ -# Rust workspace tasks: Benchmark real-world memory tasks, first half. +# Rust workspace tasks: real-world memory benchmark fixtures A-G. [tasks.real-world-memory] workspace = false diff --git a/makefiles/benchmark-memory-b.toml b/makefiles/benchmark-memory-b.toml index 95003f90..3b47da39 100644 --- a/makefiles/benchmark-memory-b.toml +++ b/makefiles/benchmark-memory-b.toml @@ -1,4 +1,4 @@ -# Rust workspace tasks: Benchmark real-world memory tasks, second half. +# Rust workspace tasks: real-world memory benchmark fixtures K-W and aggregate runners. [tasks.real-world-memory-json] workspace = false @@ -686,12 +686,3 @@ args = [ "--out", "tmp/real-world-memory/memory-summary/report.md", ] - -# Check -# | task | type | cwd | -# | ---------------- | --------- | --- | -# | check | composite | | -# | check-docs | command | | -# | check-rust | command | | -# | check-trace-gate | command | | -# | checks | composite | | diff --git a/makefiles/check.toml b/makefiles/check.toml index 5756ac55..c6ab6569 100644 --- a/makefiles/check.toml +++ b/makefiles/check.toml @@ -1,14 +1,5 @@ # Rust workspace tasks: Check. 
-# Check -# | task | type | cwd | -# | ---------------- | --------- | --- | -# | check | composite | | -# | check-docs | command | | -# | check-rust | command | | -# | check-trace-gate | command | | -# | checks | composite | | - [tasks.check] clear = true workspace = false @@ -43,15 +34,3 @@ command = "bash" args = [ "scripts/trace-gate.sh", ] - -[tasks.checks] -workspace = false -dependencies = [ - "check", -] - -# Clean -# | task | type | cwd | -# | -------------------------- | ------- | --- | -# | clean-baseline-live-docker | command | | -# | clean-parity-docker | command | | diff --git a/makefiles/clean.toml b/makefiles/clean.toml index 7fc71c62..bf899af0 100644 --- a/makefiles/clean.toml +++ b/makefiles/clean.toml @@ -1,11 +1,5 @@ # Rust workspace tasks: Clean. -# Clean -# | task | type | cwd | -# | -------------------------- | ------- | --- | -# | clean-baseline-live-docker | command | | -# | clean-parity-docker | command | | - [tasks.clean-baseline-live-docker] workspace = false command = "docker" @@ -29,13 +23,3 @@ args = [ "-v", "--remove-orphans", ] - -# Format -# | task | type | cwd | -# | -------------- | --------- | --- | -# | fmt | composite | | -# | fmt-check | composite | | -# | fmt-rust | command | | -# | fmt-rust-check | extend | | -# | fmt-toml | command | | -# | fmt-toml-check | extend | | diff --git a/makefiles/format.toml b/makefiles/format.toml index e214c216..8046cfb9 100644 --- a/makefiles/format.toml +++ b/makefiles/format.toml @@ -1,15 +1,5 @@ # Rust workspace tasks: Format. 
-# Format -# | task | type | cwd | -# | -------------- | --------- | --- | -# | fmt | composite | | -# | fmt-check | composite | | -# | fmt-rust | command | | -# | fmt-rust-check | extend | | -# | fmt-toml | command | | -# | fmt-toml-check | extend | | - [tasks.fmt] workspace = false dependencies = [ @@ -45,10 +35,3 @@ args = [ "fmt", "--check", ] - -# Lint -# | task | type | cwd | -# | ----------- | --------- | --- | -# | lint | composite | | -# | lint-rust | command | | -# | lint-vstyle | command | | diff --git a/makefiles/lint-fix.toml b/makefiles/lint-fix.toml index 5aada462..aa2f8a4f 100644 --- a/makefiles/lint-fix.toml +++ b/makefiles/lint-fix.toml @@ -1,12 +1,5 @@ # Rust workspace tasks: Lint Fix. -# Lint Fix -# | task | type | cwd | -# | --------------- | --------- | --- | -# | lint-fix | composite | | -# | lint-fix-rust | command | | -# | lint-fix-vstyle | command | | - [tasks.lint-fix] workspace = false dependencies = [ @@ -55,15 +48,3 @@ args = [ "--all-features", "--strict", ] - -# Research -# | task | type | cwd | -# | --------------------------------------- | --------- | --- | -# | external-memory-radar | command | | -# | external-memory-radar-artifact | composite | | -# | external-memory-radar-artifact-json | command | | -# | external-memory-radar-artifact-validate | command | | -# | external-memory-radar-dry-run | composite | | -# | external-memory-radar-dry-run-json | command | | -# | external-memory-radar-dry-run-validate | command | | -# | external-memory-radar-validate | command | | diff --git a/makefiles/lint.toml b/makefiles/lint.toml index 1cedd668..a09517af 100644 --- a/makefiles/lint.toml +++ b/makefiles/lint.toml @@ -1,12 +1,5 @@ # Rust workspace tasks: Lint. 
-# Lint -# | task | type | cwd | -# | ----------- | --------- | --- | -# | lint | composite | | -# | lint-rust | command | | -# | lint-vstyle | command | | - [tasks.lint] workspace = false dependencies = [ @@ -52,10 +45,3 @@ args = [ "--workspace", "--all-features", ] - -# Lint Fix -# | task | type | cwd | -# | --------------- | --------- | --- | -# | lint-fix | composite | | -# | lint-fix-rust | command | | -# | lint-fix-vstyle | command | | diff --git a/makefiles/research.toml b/makefiles/research.toml index 1c9db279..45b5770c 100644 --- a/makefiles/research.toml +++ b/makefiles/research.toml @@ -1,17 +1,5 @@ # Rust workspace tasks: Research. -# Research -# | task | type | cwd | -# | --------------------------------------- | --------- | --- | -# | external-memory-radar | command | | -# | external-memory-radar-artifact | composite | | -# | external-memory-radar-artifact-json | command | | -# | external-memory-radar-artifact-validate | command | | -# | external-memory-radar-dry-run | composite | | -# | external-memory-radar-dry-run-json | command | | -# | external-memory-radar-dry-run-validate | command | | -# | external-memory-radar-validate | command | | - [tasks.external-memory-radar] workspace = false command = "cargo" @@ -127,16 +115,3 @@ args = [ "--cursor", "apps/elf-eval/fixtures/external_memory_pattern_radar/cursor.json", ] - -# Smoke -# | task | type | cwd | -# | ---------------------------------- | --------- | --- | -# | smoke-graphify-docker-graph-report | command | | -# | smoke-graphiti-zep-docker-temporal | command | | -# | smoke-graphrag-docker | command | | -# | smoke-letta-core-archive-export-readback | command | | -# | smoke-lightrag-docker-context | command | | -# | smoke-ragflow-docker | command | | -# | smoke-real-world-job | composite | | -# | smoke-real-world-job-json | command | | -# | smoke-real-world-job-report | command | | diff --git a/makefiles/smoke.toml b/makefiles/smoke.toml index 88c4e494..43b9874d 100644 --- a/makefiles/smoke.toml 
+++ b/makefiles/smoke.toml @@ -1,18 +1,5 @@ # Rust workspace tasks: Smoke. -# Smoke -# | task | type | cwd | -# | ---------------------------------- | --------- | --- | -# | smoke-graphify-docker-graph-report | command | | -# | smoke-graphiti-zep-docker-temporal | command | | -# | smoke-graphrag-docker | command | | -# | smoke-letta-core-archive-export-readback | command | | -# | smoke-lightrag-docker-context | command | | -# | smoke-ragflow-docker | command | | -# | smoke-real-world-job | composite | | -# | smoke-real-world-job-json | command | | -# | smoke-real-world-job-report | command | | - [tasks.smoke-graphify-docker-graph-report] workspace = false command = "bash" @@ -102,12 +89,3 @@ args = [ "--out", "tmp/real-world-job/real-world-job-smoke-report.md", ] - -# Test -# | task | type | cwd | -# | --------------------- | --------- | --- | -# | test | composite | | -# | test-e2e | command | | -# | test-rust | command | | -# | test-rust-all | command | | -# | test-rust-integration | command | | diff --git a/makefiles/test.toml b/makefiles/test.toml index 4245ab58..9ee899d8 100644 --- a/makefiles/test.toml +++ b/makefiles/test.toml @@ -1,14 +1,5 @@ # Rust workspace tasks: Test. 
-# Test -# | task | type | cwd | -# | --------------------- | --------- | --- | -# | test | composite | | -# | test-e2e | command | | -# | test-rust | command | | -# | test-rust-all | command | | -# | test-rust-integration | command | | - [tasks.test] clear = true workspace = false From d766be86f1e9ec33cde7fec9518420e262707a6f Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:17:00 -0400 Subject: [PATCH 07/58] {"schema":"decodex/commit/1","summary":"Split quantitative product manifests","authority":"manual"} --- .../real_world_job_benchmark/quantitative.rs | 277 +----------------- .../quantitative/product_manifest.rs | 267 +++++++++++++++++ 2 files changed, 277 insertions(+), 267 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index ac782c30..80fd746d 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -1,14 +1,18 @@ use std::env; +mod product_manifest; + +pub(super) use product_manifest::quantitative_product_manifest_from_report; + use crate::{ - AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, - ExportQuantitativeProductManifestArgs, JobReport, Path, PathBuf, QuantitativeAuditArtifact, - QuantitativeAuditManifest, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, - QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, QuantitativePerQueryRow, - QuantitativeProductManifest, REPORT_SCHEMA, RealWorldJob, RealWorldReport, ReportSummary, - Result, eyre, formatting, fs, scoring, + AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, JobReport, Path, + PathBuf, QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls, + QuantitativeBenchmarkReport, 
QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, + QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, eyre, formatting, fs, scoring, }; +use product_manifest::quantitative_product_manifest; + const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str = "elf.agent_memory_quantitative_product_manifest/v1"; @@ -172,89 +176,6 @@ pub(super) fn quantitative_scoreboard_report( }) } -pub(super) fn quantitative_product_manifest_from_report( - report: &RealWorldReport, - args: &ExportQuantitativeProductManifestArgs, -) -> Result { - if report.schema != REPORT_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {REPORT_SCHEMA}.", - args.report.display(), - report.schema - )); - } - - let source_row = - report.quantitative_scoreboard.rows.first().ok_or_else(|| { - eyre::eyre!("{} has no quantitative product row.", args.report.display()) - })?; - let source_product = source_row.product.as_str(); - let source_adapter_id = source_row.adapter_id.as_str(); - let product = args.product.as_deref().unwrap_or(source_product).trim(); - let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); - let adapter_name = - args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); - - if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() { - return Err(eyre::eyre!( - "{} cannot export an incomplete quantitative product identity.", - args.report.display() - )); - } - if product == "ELF" { - return Err(eyre::eyre!( - "{} exports product ELF; use --product for external product manifest exports.", - args.report.display() - )); - } - - let mut row = source_row.clone(); - - row.product = product.to_string(); - row.adapter_id = adapter_id.to_string(); - row.adapter_name = adapter_name.to_string(); - row.claim_boundary = concat!( - "Exported from a generated real_world_job_report quantitative row; ", - "import remains subject 
to same-corpus, per-query, explicit-qrel, and leaderboard gates." - ) - .to_string(); - - let mut per_query_rows = Vec::new(); - - for row in &report.quantitative_scoreboard.per_query_rows { - if row.product != source_product || row.adapter_id != source_adapter_id { - continue; - } - - let mut row = row.clone(); - - row.product = product.to_string(); - row.adapter_id = adapter_id.to_string(); - row.claim_boundary = concat!( - "Exported from generated report per-query quantitative evidence; ", - "import does not relax paired-significance or leaderboard gates." - ) - .to_string(); - - per_query_rows.push(row); - } - - let manifest = QuantitativeProductManifest { - schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), - manifest_id: args - .manifest_id - .clone() - .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), - corpus_id: report.quantitative_scoreboard.corpus_id.clone(), - rows: vec![row], - per_query_rows, - }; - - validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?; - - Ok(manifest) -} - pub(super) fn quantitative_audit_manifest_from_jobs( jobs: &[RealWorldJob], args: &ExportQuantitativeAuditManifestArgs, @@ -552,184 +473,6 @@ fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) } -fn quantitative_product_manifest( - path: Option<&Path>, - corpus_id: &str, -) -> Result { - let Some(path) = path else { - return Ok(QuantitativeProductManifest::default()); - }; - let raw = fs::read_to_string(path)?; - let mut manifest = - serde_json::from_str::(&raw).map_err(|err| { - eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display()) - })?; - - for row in &mut manifest.rows { - row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); - } - for row in &mut manifest.per_query_rows { - row.source_manifest_corpus_id.get_or_insert_with(|| 
manifest.corpus_id.clone()); - } - - validate_quantitative_product_manifest(&manifest, path, corpus_id)?; - - Ok(manifest) -} - -fn validate_quantitative_product_manifest( - manifest: &QuantitativeProductManifest, - path: &Path, - corpus_id: &str, -) -> Result<()> { - if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.", - path.display(), - manifest.schema - )); - } - if manifest.manifest_id.trim().is_empty() { - return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); - } - if manifest.corpus_id != corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected same-corpus {}.", - path.display(), - manifest.corpus_id, - corpus_id - )); - } - if manifest.rows.is_empty() { - return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); - } - - let row_keys = manifest - .rows - .iter() - .map(|row| (row.product.as_str(), row.adapter_id.as_str())) - .collect::>(); - - for row in &manifest.rows { - if row.product == "ELF" { - return Err(eyre::eyre!( - "{} quantitative product manifest must not inject ELF self rows.", - path.display() - )); - } - if row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.adapter_name.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative product row.", - path.display() - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - if row.leaderboard_eligible { - validate_leaderboard_eligible_product_row(path, row)?; - } - } - for row in &manifest.per_query_rows { - if row.job_id.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - 
|| row.result_state.trim().is_empty() - || row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.qrel_source.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative per-query product row.", - path.display() - )); - } - if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { - return Err(eyre::eyre!( - "{} per-query row {}:{} has no matching product row.", - path.display(), - row.product, - row.adapter_id - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} per-query row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - } - for row in &manifest.rows { - if row.ranking_query_count == 0 { - continue; - } - - let per_query_count = manifest - .per_query_rows - .iter() - .filter(|per_query| { - per_query.product == row.product && per_query.adapter_id == row.adapter_id - }) - .count(); - - if per_query_count < row.ranking_query_count { - return Err(eyre::eyre!( - "{} row {}:{} declares {} ranked queries but only {} per-query rows.", - path.display(), - row.product, - row.adapter_id, - row.ranking_query_count, - per_query_count - )); - } - } - - Ok(()) -} - -fn validate_leaderboard_eligible_product_row( - path: &Path, - row: &QuantitativeBenchmarkRow, -) -> Result<()> { - let has_audit_manifest_id = row - .audit_manifest_id - .as_deref() - .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); - - if row.evidence_class != "live_real_world" - || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT - || row.ranking_query_count != row.sample_size - || row.explicit_qrel_query_count != row.ranking_query_count - || !row.held_out - || !row.leakage_audited - || !has_audit_manifest_id - { - return Err(eyre::eyre!( - "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", - 
path.display(), - row.product, - row.adapter_id - )); - } - - Ok(()) -} - fn quantitative_metrics_not_encoded( imported_row_count: usize, imported_per_query_count: usize, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs new file mode 100644 index 00000000..ed3844d4 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs @@ -0,0 +1,267 @@ +use crate::{ + BTreeSet, ExportQuantitativeProductManifestArgs, Path, QuantitativeBenchmarkRow, + QuantitativeProductManifest, REPORT_SCHEMA, RealWorldReport, Result, eyre, fs, +}; + +use super::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}; + +pub(crate) fn quantitative_product_manifest_from_report( + report: &RealWorldReport, + args: &ExportQuantitativeProductManifestArgs, +) -> Result { + if report.schema != REPORT_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {REPORT_SCHEMA}.", + args.report.display(), + report.schema + )); + } + + let source_row = + report.quantitative_scoreboard.rows.first().ok_or_else(|| { + eyre::eyre!("{} has no quantitative product row.", args.report.display()) + })?; + let source_product = source_row.product.as_str(); + let source_adapter_id = source_row.adapter_id.as_str(); + let product = args.product.as_deref().unwrap_or(source_product).trim(); + let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); + let adapter_name = + args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); + + if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() { + return Err(eyre::eyre!( + "{} cannot export an incomplete quantitative product identity.", + args.report.display() + )); + } + if product == "ELF" { + return Err(eyre::eyre!( + "{} exports product ELF; use --product for external product manifest exports.", + args.report.display() + )); + } 
+ + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.adapter_name = adapter_name.to_string(); + row.claim_boundary = concat!( + "Exported from a generated real_world_job_report quantitative row; ", + "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." + ) + .to_string(); + + let mut per_query_rows = Vec::new(); + + for row in &report.quantitative_scoreboard.per_query_rows { + if row.product != source_product || row.adapter_id != source_adapter_id { + continue; + } + + let mut row = row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.claim_boundary = concat!( + "Exported from generated report per-query quantitative evidence; ", + "import does not relax paired-significance or leaderboard gates." + ) + .to_string(); + + per_query_rows.push(row); + } + + let manifest = QuantitativeProductManifest { + schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), + corpus_id: report.quantitative_scoreboard.corpus_id.clone(), + rows: vec![row], + per_query_rows, + }; + + validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?; + + Ok(manifest) +} + +pub(super) fn quantitative_product_manifest( + path: Option<&Path>, + corpus_id: &str, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeProductManifest::default()); + }; + let raw = fs::read_to_string(path)?; + let mut manifest = + serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display()) + })?; + + for row in &mut manifest.rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } + for row in &mut manifest.per_query_rows { + row.source_manifest_corpus_id.get_or_insert_with(|| 
manifest.corpus_id.clone()); + } + + validate_quantitative_product_manifest(&manifest, path, corpus_id)?; + + Ok(manifest) +} + +fn validate_quantitative_product_manifest( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.corpus_id != corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected same-corpus {}.", + path.display(), + manifest.corpus_id, + corpus_id + )); + } + if manifest.rows.is_empty() { + return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); + } + + let row_keys = manifest + .rows + .iter() + .map(|row| (row.product.as_str(), row.adapter_id.as_str())) + .collect::>(); + + for row in &manifest.rows { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative product row.", + path.display() + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + if row.leaderboard_eligible { + validate_leaderboard_eligible_product_row(path, row)?; + } + } + for row in &manifest.per_query_rows { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + 
|| row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + } + for row in &manifest.rows { + if row.ranking_query_count == 0 { + continue; + } + + let per_query_count = manifest + .per_query_rows + .iter() + .filter(|per_query| { + per_query.product == row.product && per_query.adapter_id == row.adapter_id + }) + .count(); + + if per_query_count < row.ranking_query_count { + return Err(eyre::eyre!( + "{} row {}:{} declares {} ranked queries but only {} per-query rows.", + path.display(), + row.product, + row.adapter_id, + row.ranking_query_count, + per_query_count + )); + } + } + + Ok(()) +} + +fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + 
path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} From 5b60c392dd6b318ad8fd7b2b7fe3420bb0c25387 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:20:36 -0400 Subject: [PATCH 08/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit manifests","authority":"manual"} --- .../real_world_job_benchmark/quantitative.rs | 404 +----------------- .../quantitative/audit_manifest.rs | 404 ++++++++++++++++++ 2 files changed, 411 insertions(+), 397 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index 80fd746d..ec62228f 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -1,16 +1,18 @@ -use std::env; - +mod audit_manifest; mod product_manifest; +pub(super) use audit_manifest::quantitative_audit_manifest_from_jobs; pub(super) use product_manifest::quantitative_product_manifest_from_report; use crate::{ - AdapterReport, BTreeMap, BTreeSet, ExportQuantitativeAuditManifestArgs, JobReport, Path, - PathBuf, QuantitativeAuditArtifact, QuantitativeAuditManifest, QuantitativeBenchmarkControls, + AdapterReport, BTreeMap, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, - QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, eyre, formatting, fs, scoring, + QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, formatting, scoring, }; +use audit_manifest::{ + QuantitativeAuditContext, QuantitativeAuditEvidence, quantitative_audit_evidence, +}; use product_manifest::quantitative_product_manifest; const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; @@ -41,22 +43,6 @@ pub(super) struct 
QuantitativeReportInput<'a> { pub(super) audit_manifest_path: Option<&'a Path>, } -struct QuantitativeAuditContext<'a> { - run_id: &'a str, - corpus_id: &'a str, - product: &'a str, - adapter_id: &'a str, - source_jobs: &'a [RealWorldJob], - ranking_query_count: usize, - explicit_qrel_query_count: usize, -} - -struct QuantitativeAuditEvidence { - held_out: bool, - leakage_audited: bool, - audit_manifest_id: Option, -} - pub(super) fn quantitative_scoreboard_report( input: QuantitativeReportInput<'_>, ) -> Result { @@ -176,303 +162,6 @@ pub(super) fn quantitative_scoreboard_report( }) } -pub(super) fn quantitative_audit_manifest_from_jobs( - jobs: &[RealWorldJob], - args: &ExportQuantitativeAuditManifestArgs, -) -> Result { - let product = args.product.trim(); - let adapter_id = args.adapter_id.trim(); - - if product.is_empty() || adapter_id.is_empty() { - return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); - } - - let corpus_id = quantitative_corpus_id(jobs); - let ranking_query_count = ranking_query_count(jobs); - let explicit_qrel_query_count = explicit_qrel_query_count(jobs); - let manifest = QuantitativeAuditManifest { - schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), - manifest_id: args - .manifest_id - .clone() - .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), - run_id: args.run_id.clone(), - corpus_id, - product: product.to_string(), - adapter_id: adapter_id.to_string(), - held_out: args.held_out, - leakage_audited: args.leakage_audited, - sample_size: jobs.len(), - ranking_query_count, - explicit_qrel_query_count, - query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), - controls: args.controls.clone(), - artifacts: vec![QuantitativeAuditArtifact { - role: "product_runtime_fixtures".to_string(), - path: audit_artifact_display_path(args.fixtures.as_path()), - sha256: fixture_path_digest(args.fixtures.as_path())?, - }], - claim_boundary: 
args.claim_boundary.clone().unwrap_or_else(|| { - if args.held_out || args.leakage_audited { - concat!( - "Audit manifest supplied by operator; runner validates run/corpus/product/", - "adapter/count/query-id/artifact bindings before opening row gates." - ) - .to_string() - } else { - concat!( - "Diagnostic audit manifest binds the current product-runtime fixture set to ", - "query ids and counts, but it does not prove held-out or leakage-audited status." - ) - .to_string() - } - }), - }; - - validate_quantitative_audit_manifest( - &manifest, - args.fixtures.as_path(), - QuantitativeAuditContext { - run_id: args.run_id.as_str(), - corpus_id: manifest.corpus_id.as_str(), - product, - adapter_id, - source_jobs: jobs, - ranking_query_count: manifest.ranking_query_count, - explicit_qrel_query_count: manifest.explicit_qrel_query_count, - }, - )?; - - Ok(manifest) -} - -fn quantitative_audit_evidence( - path: Option<&Path>, - context: QuantitativeAuditContext<'_>, -) -> Result { - let Some(path) = path else { - return Ok(QuantitativeAuditEvidence { - held_out: false, - leakage_audited: false, - audit_manifest_id: None, - }); - }; - let raw = fs::read_to_string(path)?; - let manifest = serde_json::from_str::(&raw).map_err(|err| { - eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) - })?; - - validate_quantitative_audit_manifest(&manifest, path, context)?; - - Ok(QuantitativeAuditEvidence { - held_out: manifest.held_out, - leakage_audited: manifest.leakage_audited, - audit_manifest_id: Some(manifest.manifest_id), - }) -} - -fn validate_quantitative_audit_manifest( - manifest: &QuantitativeAuditManifest, - path: &Path, - context: QuantitativeAuditContext<'_>, -) -> Result<()> { - if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", - path.display(), - manifest.schema - )); - } - if manifest.manifest_id.trim().is_empty() { - return 
Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); - } - if manifest.run_id != context.run_id { - return Err(eyre::eyre!( - "{} has run_id {}, expected {}.", - path.display(), - manifest.run_id, - context.run_id - )); - } - if manifest.corpus_id != context.corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected {}.", - path.display(), - manifest.corpus_id, - context.corpus_id - )); - } - if manifest.product != context.product || manifest.adapter_id != context.adapter_id { - return Err(eyre::eyre!( - "{} has product {}:{} but current row is {}:{}.", - path.display(), - manifest.product, - manifest.adapter_id, - context.product, - context.adapter_id - )); - } - if manifest.sample_size != context.source_jobs.len() { - return Err(eyre::eyre!( - "{} has sample_size {}, expected {}.", - path.display(), - manifest.sample_size, - context.source_jobs.len() - )); - } - if manifest.ranking_query_count != context.ranking_query_count { - return Err(eyre::eyre!( - "{} has ranking_query_count {}, expected {}.", - path.display(), - manifest.ranking_query_count, - context.ranking_query_count - )); - } - if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { - return Err(eyre::eyre!( - "{} has explicit_qrel_query_count {}, expected {}.", - path.display(), - manifest.explicit_qrel_query_count, - context.explicit_qrel_query_count - )); - } - - validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; - validate_quantitative_audit_controls(manifest, path)?; - - validate_quantitative_audit_artifacts(manifest, path) -} - -fn validate_quantitative_audit_query_ids( - manifest: &QuantitativeAuditManifest, - path: &Path, - source_jobs: &[RealWorldJob], -) -> Result<()> { - let expected = ranking_query_ids(source_jobs); - let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); - - if actual.len() != manifest.query_ids.len() { - return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", 
path.display())); - } - if actual != expected { - let missing = expected.difference(&actual).copied().collect::>(); - let extra = actual.difference(&expected).copied().collect::>(); - - return Err(eyre::eyre!( - "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", - path.display(), - missing, - extra - )); - } - - Ok(()) -} - -fn validate_quantitative_audit_controls( - manifest: &QuantitativeAuditManifest, - path: &Path, -) -> Result<()> { - let controls = manifest.controls.iter().map(String::as_str).collect::>(); - - if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { - return Err(eyre::eyre!( - "{} marks held_out=true without required control {}.", - path.display(), - REQUIRED_HELD_OUT_AUDIT_CONTROL - )); - } - if manifest.leakage_audited - && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) - || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) - { - return Err(eyre::eyre!( - "{} marks leakage_audited=true without required controls {} and {}.", - path.display(), - REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, - REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL - )); - } - if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() - { - return Err(eyre::eyre!( - "{} marks audit controls true but has an empty claim_boundary.", - path.display() - )); - } - - Ok(()) -} - -fn validate_quantitative_audit_artifacts( - manifest: &QuantitativeAuditManifest, - path: &Path, -) -> Result<()> { - if manifest.artifacts.is_empty() { - return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); - } - - for artifact in &manifest.artifacts { - if artifact.role.trim().is_empty() - || artifact.path.trim().is_empty() - || artifact.sha256.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative audit artifact.", - path.display() - )); - } - if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) - 
{ - return Err(eyre::eyre!( - "{} artifact {} has invalid sha256 digest {}.", - path.display(), - artifact.role, - artifact.sha256 - )); - } - - let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); - let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| { - eyre::eyre!( - "{} artifact {} could not be digested at {}: {err}", - path.display(), - artifact.role, - artifact_path.display() - ) - })?; - - if actual != artifact.sha256 { - return Err(eyre::eyre!( - "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", - path.display(), - artifact.role, - artifact_path.display(), - artifact.sha256, - actual - )); - } - } - - Ok(()) -} - -fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { - let raw = PathBuf::from(artifact_path); - - if raw.is_absolute() { - return raw; - } - - let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); - - if cwd_path.exists() { - return cwd_path; - } - - manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) -} - fn quantitative_metrics_not_encoded( imported_row_count: usize, imported_per_query_count: usize, @@ -918,85 +607,6 @@ fn quantitative_row_leaderboard_eligible( .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()) } -fn fixture_path_digest(path: &Path) -> Result { - let mut hasher = blake3::Hasher::new(); - - if path.is_file() { - hash_fixture_file( - path, - path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), - &mut hasher, - )?; - - return Ok(hasher.finalize().to_hex().to_string()); - } - - let paths = audit_fixture_paths(path)?; - - for fixture in paths { - let relative = fixture - .strip_prefix(path) - .map(|relative| relative.to_string_lossy().replace('\\', "/")) - .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); - - hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; - } - - 
Ok(hasher.finalize().to_hex().to_string()) -} - -fn audit_fixture_paths(path: &Path) -> Result> { - let mut paths = Vec::new(); - - collect_audit_fixture_paths(path, &mut paths)?; - - paths.sort(); - - Ok(paths) -} - -fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { - if path.is_file() { - paths.push(path.to_path_buf()); - - return Ok(()); - } - - for entry in fs::read_dir(path)? { - let entry_path = entry?.path(); - - if entry_path.is_dir() { - collect_audit_fixture_paths(entry_path.as_path(), paths)?; - } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { - paths.push(entry_path); - } - } - - Ok(()) -} - -fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { - hasher.update(logical_path.as_bytes()); - hasher.update(b"\0"); - hasher.update(&fs::read(path)?); - hasher.update(b"\0"); - - Ok(()) -} - -fn audit_artifact_display_path(path: &Path) -> String { - let display_path = if path.is_absolute() { - env::current_dir() - .ok() - .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) - .unwrap_or_else(|| path.to_path_buf()) - } else { - path.to_path_buf() - }; - - display_path.to_string_lossy().replace('\\', "/") -} - fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { source_jobs .iter() diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs new file mode 100644 index 00000000..dbdb861d --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -0,0 +1,404 @@ +use std::env; + +use crate::{ + BTreeSet, ExportQuantitativeAuditManifestArgs, Path, PathBuf, QuantitativeAuditArtifact, + QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs, +}; + +use super::{ + QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, + REQUIRED_HELD_OUT_AUDIT_CONTROL, 
REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + explicit_qrel_query_count, quantitative_corpus_id, ranking_query_count, ranking_query_ids, +}; + +pub(super) struct QuantitativeAuditContext<'a> { + pub(super) run_id: &'a str, + pub(super) corpus_id: &'a str, + pub(super) product: &'a str, + pub(super) adapter_id: &'a str, + pub(super) source_jobs: &'a [RealWorldJob], + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, +} + +pub(super) struct QuantitativeAuditEvidence { + pub(super) held_out: bool, + pub(super) leakage_audited: bool, + pub(super) audit_manifest_id: Option, +} + +pub(crate) fn quantitative_audit_manifest_from_jobs( + jobs: &[RealWorldJob], + args: &ExportQuantitativeAuditManifestArgs, +) -> Result { + let product = args.product.trim(); + let adapter_id = args.adapter_id.trim(); + + if product.is_empty() || adapter_id.is_empty() { + return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); + } + + let corpus_id = quantitative_corpus_id(jobs); + let ranking_query_count = ranking_query_count(jobs); + let explicit_qrel_query_count = explicit_qrel_query_count(jobs); + let manifest = QuantitativeAuditManifest { + schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), + run_id: args.run_id.clone(), + corpus_id, + product: product.to_string(), + adapter_id: adapter_id.to_string(), + held_out: args.held_out, + leakage_audited: args.leakage_audited, + sample_size: jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), + controls: args.controls.clone(), + artifacts: vec![QuantitativeAuditArtifact { + role: "product_runtime_fixtures".to_string(), + path: audit_artifact_display_path(args.fixtures.as_path()), + sha256: fixture_path_digest(args.fixtures.as_path())?, + }], + claim_boundary: 
args.claim_boundary.clone().unwrap_or_else(|| { + if args.held_out || args.leakage_audited { + concat!( + "Audit manifest supplied by operator; runner validates run/corpus/product/", + "adapter/count/query-id/artifact bindings before opening row gates." + ) + .to_string() + } else { + concat!( + "Diagnostic audit manifest binds the current product-runtime fixture set to ", + "query ids and counts, but it does not prove held-out or leakage-audited status." + ) + .to_string() + } + }), + }; + + validate_quantitative_audit_manifest( + &manifest, + args.fixtures.as_path(), + QuantitativeAuditContext { + run_id: args.run_id.as_str(), + corpus_id: manifest.corpus_id.as_str(), + product, + adapter_id, + source_jobs: jobs, + ranking_query_count: manifest.ranking_query_count, + explicit_qrel_query_count: manifest.explicit_qrel_query_count, + }, + )?; + + Ok(manifest) +} + +pub(super) fn quantitative_audit_evidence( + path: Option<&Path>, + context: QuantitativeAuditContext<'_>, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeAuditEvidence { + held_out: false, + leakage_audited: false, + audit_manifest_id: None, + }); + }; + let raw = fs::read_to_string(path)?; + let manifest = serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) + })?; + + validate_quantitative_audit_manifest(&manifest, path, context)?; + + Ok(QuantitativeAuditEvidence { + held_out: manifest.held_out, + leakage_audited: manifest.leakage_audited, + audit_manifest_id: Some(manifest.manifest_id), + }) +} + +fn validate_quantitative_audit_manifest( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + 
return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, expected {}.", + path.display(), + manifest.sample_size, + context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; + validate_quantitative_audit_controls(manifest, path)?; + + validate_quantitative_audit_artifacts(manifest, path) +} + +fn validate_quantitative_audit_query_ids( + manifest: &QuantitativeAuditManifest, + path: &Path, + source_jobs: &[RealWorldJob], +) -> Result<()> { + let expected = ranking_query_ids(source_jobs); + let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); + + if actual.len() != manifest.query_ids.len() { + return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", 
path.display())); + } + if actual != expected { + let missing = expected.difference(&actual).copied().collect::>(); + let extra = actual.difference(&expected).copied().collect::>(); + + return Err(eyre::eyre!( + "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", + path.display(), + missing, + extra + )); + } + + Ok(()) +} + +fn validate_quantitative_audit_controls( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + let controls = manifest.controls.iter().map(String::as_str).collect::>(); + + if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { + return Err(eyre::eyre!( + "{} marks held_out=true without required control {}.", + path.display(), + REQUIRED_HELD_OUT_AUDIT_CONTROL + )); + } + if manifest.leakage_audited + && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) + || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) + { + return Err(eyre::eyre!( + "{} marks leakage_audited=true without required controls {} and {}.", + path.display(), + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL + )); + } + if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() + { + return Err(eyre::eyre!( + "{} marks audit controls true but has an empty claim_boundary.", + path.display() + )); + } + + Ok(()) +} + +fn validate_quantitative_audit_artifacts( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if manifest.artifacts.is_empty() { + return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); + } + + for artifact in &manifest.artifacts { + if artifact.role.trim().is_empty() + || artifact.path.trim().is_empty() + || artifact.sha256.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative audit artifact.", + path.display() + )); + } + if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) + 
{ + return Err(eyre::eyre!( + "{} artifact {} has invalid sha256 digest {}.", + path.display(), + artifact.role, + artifact.sha256 + )); + } + + let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); + let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| { + eyre::eyre!( + "{} artifact {} could not be digested at {}: {err}", + path.display(), + artifact.role, + artifact_path.display() + ) + })?; + + if actual != artifact.sha256 { + return Err(eyre::eyre!( + "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", + path.display(), + artifact.role, + artifact_path.display(), + artifact.sha256, + actual + )); + } + } + + Ok(()) +} + +fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { + let raw = PathBuf::from(artifact_path); + + if raw.is_absolute() { + return raw; + } + + let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); + + if cwd_path.exists() { + return cwd_path; + } + + manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) +} + +fn fixture_path_digest(path: &Path) -> Result { + let mut hasher = blake3::Hasher::new(); + + if path.is_file() { + hash_fixture_file( + path, + path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), + &mut hasher, + )?; + + return Ok(hasher.finalize().to_hex().to_string()); + } + + let paths = audit_fixture_paths(path)?; + + for fixture in paths { + let relative = fixture + .strip_prefix(path) + .map(|relative| relative.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); + + hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; + } + + Ok(hasher.finalize().to_hex().to_string()) +} + +fn audit_fixture_paths(path: &Path) -> Result> { + let mut paths = Vec::new(); + + collect_audit_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn 
collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + if path.is_file() { + paths.push(path.to_path_buf()); + + return Ok(()); + } + + for entry in fs::read_dir(path)? { + let entry_path = entry?.path(); + + if entry_path.is_dir() { + collect_audit_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} + +fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { + hasher.update(logical_path.as_bytes()); + hasher.update(b"\0"); + hasher.update(&fs::read(path)?); + hasher.update(b"\0"); + + Ok(()) +} + +fn audit_artifact_display_path(path: &Path) -> String { + let display_path = if path.is_absolute() { + env::current_dir() + .ok() + .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) + .unwrap_or_else(|| path.to_path_buf()) + } else { + path.to_path_buf() + }; + + display_path.to_string_lossy().replace('\\', "/") +} From a083844af8fbbe1fef489cd9127d0ad09f2d8ac5 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:24:55 -0400 Subject: [PATCH 09/58] {"schema":"decodex/commit/1","summary":"Split quantitative metric helpers","authority":"manual"} --- .../real_world_job_benchmark/quantitative.rs | 499 +---------------- .../quantitative/audit_manifest.rs | 20 +- .../quantitative/metrics.rs | 503 ++++++++++++++++++ .../quantitative/product_manifest.rs | 3 +- 4 files changed, 534 insertions(+), 491 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index ec62228f..16365e66 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -1,19 +1,17 @@ mod audit_manifest; +mod metrics; mod 
product_manifest; -pub(super) use audit_manifest::quantitative_audit_manifest_from_jobs; -pub(super) use product_manifest::quantitative_product_manifest_from_report; - -use crate::{ - AdapterReport, BTreeMap, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls, - QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, QuantitativeConfidenceInterval, - QuantitativePerQueryRow, RealWorldJob, ReportSummary, Result, formatting, scoring, +pub(super) use self::{ + audit_manifest::quantitative_audit_manifest_from_jobs, + product_manifest::quantitative_product_manifest_from_report, }; -use audit_manifest::{ - QuantitativeAuditContext, QuantitativeAuditEvidence, quantitative_audit_evidence, +use self::audit_manifest::{QuantitativeAuditContext, QuantitativeAuditEvidence}; +use crate::{ + AdapterReport, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls, + QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result, }; -use product_manifest::quantitative_product_manifest; const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str = @@ -48,7 +46,7 @@ pub(super) fn quantitative_scoreboard_report( ) -> Result { let corpus_id = quantitative_corpus_id(input.source_jobs); let evidence_class = quantitative_evidence_class(input.adapter, input.jobs); - let per_query_rows = quantitative_per_query_rows( + let per_query_rows = metrics::quantitative_per_query_rows( input.source_jobs, input.jobs, corpus_id.as_str(), @@ -63,7 +61,7 @@ pub(super) fn quantitative_scoreboard_report( per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); let metric_comparable = ranking_query_count > 0; let result_state = quantitative_result_state(input.summary); - let audit_evidence = quantitative_audit_evidence( + let audit_evidence = audit_manifest::quantitative_audit_evidence( input.audit_manifest_path, QuantitativeAuditContext { run_id: input.run_id, @@ -100,24 
+98,26 @@ pub(super) fn quantitative_scoreboard_report( fixture_regression_only: evidence_class == "fixture_backed", sample_size: input.jobs.len(), ranking_query_count, - ranking_coverage_state: ranking_coverage_state( + ranking_coverage_state: metrics::ranking_coverage_state( input.summary, input.source_jobs.len(), ranking_query_count, ) .to_string(), - ranked_candidate_source: ranked_candidate_source(ranking_query_count).to_string(), - qrel_source: aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) + ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), + qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) .to_string(), explicit_qrel_query_count, - metrics: aggregate_metrics(per_query_rows.as_slice()), - metric_states: aggregate_metric_states(result_state, metric_comparable), - denominators: aggregate_denominators(per_query_rows.as_slice()), - confidence_intervals: aggregate_confidence_intervals(per_query_rows.as_slice()), + metrics: metrics::aggregate_metrics(per_query_rows.as_slice()), + metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), + denominators: metrics::aggregate_denominators(per_query_rows.as_slice()), + confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()), claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), }; - let product_manifest = - quantitative_product_manifest(input.product_manifest_path, corpus_id.as_str())?; + let product_manifest = product_manifest::quantitative_product_manifest( + input.product_manifest_path, + corpus_id.as_str(), + )?; let imported_row_count = product_manifest.rows.len(); let imported_per_query_count = product_manifest.per_query_rows.len(); let mut rows = vec![row]; @@ -179,363 +179,6 @@ fn quantitative_metrics_not_encoded( metrics } -fn quantitative_per_query_rows( - source_jobs: &[RealWorldJob], - jobs: &[JobReport], - corpus_id: &str, - 
evidence_class: &str, - adapter_id: &str, -) -> Vec { - source_jobs - .iter() - .zip(jobs.iter()) - .map(|(source_job, job)| { - quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) - }) - .collect() -} - -fn quantitative_per_query_row( - source_job: &RealWorldJob, - job: &JobReport, - corpus_id: &str, - evidence_class: &str, - adapter_id: &str, -) -> QuantitativePerQueryRow { - let relevance = relevance_grades(source_job, job); - let candidates = scoring::produced_evidence_order(source_job); - let positive_relevance_count = positive_qrel_count(&relevance); - let metrics = per_query_metrics(candidates.as_slice(), &relevance); - let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { - "not_encoded" - } else { - formatting::status_str(job.status) - }; - let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); - let denominators = per_query_denominators(candidates.len(), positive_relevance_count); - - QuantitativePerQueryRow { - job_id: job.job_id.clone(), - suite: job.suite_id.clone(), - evidence_class: evidence_class.to_string(), - source_manifest_corpus_id: Some(corpus_id.to_string()), - result_state: formatting::status_str(job.status).to_string(), - expected_relevant_count: positive_relevance_count, - candidate_count: candidates.len(), - qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(), - relevance_grade_sum: formatting::round3(relevance.values().sum::()), - product: "ELF".to_string(), - adapter_id: adapter_id.to_string(), - metrics, - metric_states, - denominators, - claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), - } -} - -fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap { - let explicit = source_job - .expected_answer - .relevance_judgments - .iter() - .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) - .collect::>(); - - if !explicit.is_empty() { - return explicit; - } - - 
job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() -} - -fn per_query_metrics( - candidates: &[String], - relevance: &BTreeMap, -) -> BTreeMap> { - let mut metrics = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - let relevant_at_k = relevant_at_k(candidates, relevance, *k); - - metrics - .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); - metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); - metrics.insert( - format!("success_at_{k}"), - Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), - ); - } - - metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); - metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); - metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); - - metrics -} - -fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { - candidates - .iter() - .take(k) - .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) - .count() -} - -fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } - - Some( - candidates - .iter() - .position(|candidate| { - relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) - }) - .map_or(0.0, |index| 1.0 / (index + 1) as f64), - ) -} - -fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } - - let dcg = candidates - .iter() - .take(k) - .enumerate() - .map(|(index, candidate)| { - relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) - / ((index + 2) as f64).log2() - }) - .sum::(); - let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); - - ideal.sort_by(|left, right| right.total_cmp(left)); - - let idcg = ideal - .iter() - .take(k) - 
.enumerate() - .map(|(index, grade)| grade / ((index + 2) as f64).log2()) - .sum::(); - - Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) -} - -fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { - let positive_count = positive_qrel_count(relevance); - - if positive_count == 0 { - return None; - } - - let mut hit_count = 0; - let mut precision_sum = 0.0; - let mut seen = BTreeSet::new(); - - for (index, candidate) in candidates.iter().enumerate() { - if !seen.insert(candidate.as_str()) { - continue; - } - if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { - hit_count += 1; - precision_sum += hit_count as f64 / (index + 1) as f64; - } - } - - Some(precision_sum / positive_count as f64) -} - -fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { - let mut sums = BTreeMap::::new(); - let mut metrics = quantitative_metric_names() - .into_iter() - .map(|metric| (metric, None)) - .collect::>(); - - for row in rows { - for (metric, value) in &row.metrics { - if let Some(value) = value { - let (sum, count) = sums.entry(metric.clone()).or_default(); - - *sum += *value; - *count += 1; - } - } - } - for (metric, (sum, count)) in sums { - metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); - } - - metrics -} - -fn aggregate_metric_states( - result_state: &str, - metric_comparable: bool, -) -> BTreeMap { - let state = if metric_comparable { result_state } else { "not_encoded" }; - let mut states = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - states.insert(format!("recall_at_{k}"), state.to_string()); - states.insert(format!("precision_at_{k}"), state.to_string()); - states.insert(format!("success_at_{k}"), state.to_string()); - } - for metric in ["mrr", "ndcg_at_5", "average_precision"] { - states.insert(metric.to_string(), state.to_string()); - } - - states -} - -fn quantitative_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { 
- metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - for metric in ["mrr", "ndcg_at_5", "average_precision"] { - metrics.push(metric.to_string()); - } - - metrics -} - -fn per_query_denominators( - candidate_count: usize, - expected_relevant_count: usize, -) -> BTreeMap { - let mut denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert(format!("recall_at_{k}"), expected_relevant_count); - denominators.insert(format!("precision_at_{k}"), *k); - denominators.insert(format!("success_at_{k}"), 1); - } - - denominators.insert("mrr".to_string(), expected_relevant_count); - denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); - denominators.insert("average_precision".to_string(), expected_relevant_count); - denominators.insert("candidate_count".to_string(), candidate_count); - - denominators -} - -fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { - let mut denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert( - format!("recall_at_{k}"), - sum_per_query_denominator(rows, &format!("recall_at_{k}")), - ); - denominators.insert( - format!("precision_at_{k}"), - sum_per_query_denominator(rows, &format!("precision_at_{k}")), - ); - denominators.insert( - format!("success_at_{k}"), - sum_per_query_denominator(rows, &format!("success_at_{k}")), - ); - } - - denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); - denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); - denominators.insert( - "average_precision".to_string(), - sum_per_query_denominator(rows, "average_precision"), - ); - - denominators -} - -fn aggregate_confidence_intervals( - rows: &[QuantitativePerQueryRow], -) -> BTreeMap { - let mut confidence_intervals = BTreeMap::new(); - - for metric in rate_metric_names() { - let (numerator, denominator) = 
aggregate_rate_numerator_denominator(rows, metric.as_str()); - - if denominator > 0 { - confidence_intervals.insert( - metric, - wilson_confidence_interval(numerator.min(denominator), denominator), - ); - } - } - - confidence_intervals -} - -fn rate_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { - metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - - metrics -} - -fn aggregate_rate_numerator_denominator( - rows: &[QuantitativePerQueryRow], - metric: &str, -) -> (usize, usize) { - let mut numerator = 0; - let mut denominator = 0; - - for row in rows { - let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { - continue; - }; - let Some(row_denominator) = row.denominators.get(metric).copied() else { - continue; - }; - - if row_denominator == 0 { - continue; - } - - denominator += row_denominator; - numerator += (value * row_denominator as f64).round() as usize; - } - - (numerator, denominator) -} - -fn wilson_confidence_interval( - numerator: usize, - denominator: usize, -) -> QuantitativeConfidenceInterval { - let n = denominator as f64; - let p = numerator as f64 / n; - let z2 = WILSON_95_Z * WILSON_95_Z; - let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); - let half_width = - WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); - - QuantitativeConfidenceInterval { - method: "wilson_score".to_string(), - confidence: 0.95, - lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), - upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), - numerator, - denominator, - } -} - -fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { - rows.iter().filter_map(|row| row.denominators.get(metric)).sum() -} - fn quantitative_corpus_id(source_jobs: &[RealWorldJob]) -> String { let ids = source_jobs.iter().map(|job| 
job.corpus.corpus_id.as_str()).collect::>(); @@ -606,103 +249,3 @@ fn quantitative_row_leaderboard_eligible( .as_deref() .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()) } - -fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { - source_jobs - .iter() - .filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)) - .map(|job| job.job_id.as_str()) - .collect() -} - -fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { - ranking_query_ids(source_jobs).len() -} - -fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { - source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() -} - -fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { - if !source_job.expected_answer.relevance_judgments.is_empty() { - return source_job - .expected_answer - .relevance_judgments - .iter() - .filter(|judgment| judgment.grade > 0.0) - .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) - .collect(); - } - - source_job - .required_evidence - .iter() - .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) - .map(|evidence| (evidence.evidence_id.clone(), 1.0)) - .collect() -} - -fn ranking_query_attempted(job: &RealWorldJob) -> bool { - if !scoring::produced_evidence_order(job).is_empty() { - return true; - } - - let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { - return false; - }; - - answer.trace_explainability.as_ref().is_some_and(|trace| { - trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") - }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) -} - -fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { - if !source_job.expected_answer.relevance_judgments.is_empty() { - "explicit_qrels" - } else if empty { - "not_encoded" - } else { - "expected_evidence_fallback" - } -} - -fn 
aggregate_qrel_source( - ranking_query_count: usize, - explicit_qrel_query_count: usize, -) -> &'static str { - if ranking_query_count == 0 { - "not_encoded" - } else if explicit_qrel_query_count == ranking_query_count { - "explicit_qrels" - } else if explicit_qrel_query_count == 0 { - "expected_evidence_fallback" - } else { - "mixed" - } -} - -fn ranking_coverage_state( - summary: &ReportSummary, - source_job_count: usize, - ranking_query_count: usize, -) -> &'static str { - if ranking_query_count == 0 { - "not_encoded" - } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { - "complete" - } else { - "partial_coverage" - } -} - -fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { - if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } -} - -fn positive_qrel_count(relevance: &BTreeMap) -> usize { - relevance.values().filter(|grade| **grade > 0.0).count() -} - -fn rate(numerator: usize, denominator: usize) -> Option { - (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs index dbdb861d..be8b9e50 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -3,12 +3,10 @@ use std::env; use crate::{ BTreeSet, ExportQuantitativeAuditManifestArgs, Path, PathBuf, QuantitativeAuditArtifact, QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs, -}; - -use super::{ - QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, - REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, - explicit_qrel_query_count, quantitative_corpus_id, ranking_query_count, ranking_query_ids, + quantitative::{ + QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, 
REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, + REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, metrics, + }, }; pub(super) struct QuantitativeAuditContext<'a> { @@ -38,9 +36,9 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); } - let corpus_id = quantitative_corpus_id(jobs); - let ranking_query_count = ranking_query_count(jobs); - let explicit_qrel_query_count = explicit_qrel_query_count(jobs); + let corpus_id = super::quantitative_corpus_id(jobs); + let ranking_query_count = metrics::ranking_query_count(jobs); + let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs); let manifest = QuantitativeAuditManifest { schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), manifest_id: args @@ -56,7 +54,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( sample_size: jobs.len(), ranking_query_count, explicit_qrel_query_count, - query_ids: ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), + query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), controls: args.controls.clone(), artifacts: vec![QuantitativeAuditArtifact { role: "product_runtime_fixtures".to_string(), @@ -199,7 +197,7 @@ fn validate_quantitative_audit_query_ids( path: &Path, source_jobs: &[RealWorldJob], ) -> Result<()> { - let expected = ranking_query_ids(source_jobs); + let expected = metrics::ranking_query_ids(source_jobs); let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); if actual.len() != manifest.query_ids.len() { diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs new file mode 100644 index 00000000..e5377d7b --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs @@ -0,0 +1,503 @@ +use crate::{ + BTreeMap, BTreeSet, JobReport, QuantitativeConfidenceInterval, 
QuantitativePerQueryRow, + RealWorldJob, ReportSummary, formatting, + quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY, WILSON_95_Z}, + scoring, +}; + +pub(super) fn quantitative_per_query_rows( + source_jobs: &[RealWorldJob], + jobs: &[JobReport], + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> Vec { + source_jobs + .iter() + .zip(jobs.iter()) + .map(|(source_job, job)| { + quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) + }) + .collect() +} + +pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { + aggregate_metrics_impl(rows) +} + +pub(super) fn aggregate_metric_states( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + aggregate_metric_states_impl(result_state, metric_comparable) +} + +pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { + aggregate_denominators_impl(rows) +} + +pub(super) fn aggregate_confidence_intervals( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + aggregate_confidence_intervals_impl(rows) +} + +pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { + ranking_query_ids_impl(source_jobs) +} + +pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { + ranking_query_ids(source_jobs).len() +} + +pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { + source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() +} + +pub(super) fn aggregate_qrel_source( + ranking_query_count: usize, + explicit_qrel_query_count: usize, +) -> &'static str { + aggregate_qrel_source_impl(ranking_query_count, explicit_qrel_query_count) +} + +pub(super) fn ranking_coverage_state( + summary: &ReportSummary, + source_job_count: usize, + ranking_query_count: usize, +) -> &'static str { + ranking_coverage_state_impl(summary, source_job_count, ranking_query_count) +} + +pub(super) fn 
ranked_candidate_source(ranking_query_count: usize) -> &'static str { + if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } +} + +fn quantitative_per_query_row( + source_job: &RealWorldJob, + job: &JobReport, + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let relevance = relevance_grades(source_job, job); + let candidates = scoring::produced_evidence_order(source_job); + let positive_relevance_count = positive_qrel_count(&relevance); + let metrics = per_query_metrics(candidates.as_slice(), &relevance); + let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { + "not_encoded" + } else { + formatting::status_str(job.status) + }; + let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); + let denominators = per_query_denominators(candidates.len(), positive_relevance_count); + + QuantitativePerQueryRow { + job_id: job.job_id.clone(), + suite: job.suite_id.clone(), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: formatting::status_str(job.status).to_string(), + expected_relevant_count: positive_relevance_count, + candidate_count: candidates.len(), + qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(), + relevance_grade_sum: formatting::round3(relevance.values().sum::()), + product: "ELF".to_string(), + adapter_id: adapter_id.to_string(), + metrics, + metric_states, + denominators, + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} + +fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap { + let explicit = source_job + .expected_answer + .relevance_judgments + .iter() + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect::>(); + + if !explicit.is_empty() { + return explicit; + } + + job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() +} + 
+fn per_query_metrics( + candidates: &[String], + relevance: &BTreeMap, +) -> BTreeMap> { + let mut metrics = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + let relevant_at_k = relevant_at_k(candidates, relevance, *k); + + metrics + .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); + metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); + metrics.insert( + format!("success_at_{k}"), + Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), + ); + } + + metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); + metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); + metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); + + metrics +} + +fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { + candidates + .iter() + .take(k) + .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) + .count() +} + +fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} + +fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as f64).log2()) + .sum::(); + + Some(if 
idcg > 0.0 { dcg / idcg } else { 0.0 }) +} + +fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { + let positive_count = positive_qrel_count(relevance); + + if positive_count == 0 { + return None; + } + + let mut hit_count = 0; + let mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / positive_count as f64) +} + +fn aggregate_metrics_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { + let mut sums = BTreeMap::::new(); + let mut metrics = quantitative_metric_names() + .into_iter() + .map(|metric| (metric, None)) + .collect::>(); + + for row in rows { + for (metric, value) in &row.metrics { + if let Some(value) = value { + let (sum, count) = sums.entry(metric.clone()).or_default(); + + *sum += *value; + *count += 1; + } + } + } + for (metric, (sum, count)) in sums { + metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); + } + + metrics +} + +fn aggregate_metric_states_impl( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + let state = if metric_comparable { result_state } else { "not_encoded" }; + let mut states = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + states.insert(format!("recall_at_{k}"), state.to_string()); + states.insert(format!("precision_at_{k}"), state.to_string()); + states.insert(format!("success_at_{k}"), state.to_string()); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + states.insert(metric.to_string(), state.to_string()); + } + + states +} + +fn quantitative_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + 
metrics.push(format!("success_at_{k}")); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + metrics.push(metric.to_string()); + } + + metrics +} + +fn per_query_denominators( + candidate_count: usize, + expected_relevant_count: usize, +) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert(format!("recall_at_{k}"), expected_relevant_count); + denominators.insert(format!("precision_at_{k}"), *k); + denominators.insert(format!("success_at_{k}"), 1); + } + + denominators.insert("mrr".to_string(), expected_relevant_count); + denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); + denominators.insert("average_precision".to_string(), expected_relevant_count); + denominators.insert("candidate_count".to_string(), candidate_count); + + denominators +} + +fn aggregate_denominators_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert( + format!("recall_at_{k}"), + sum_per_query_denominator(rows, &format!("recall_at_{k}")), + ); + denominators.insert( + format!("precision_at_{k}"), + sum_per_query_denominator(rows, &format!("precision_at_{k}")), + ); + denominators.insert( + format!("success_at_{k}"), + sum_per_query_denominator(rows, &format!("success_at_{k}")), + ); + } + + denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); + denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); + denominators.insert( + "average_precision".to_string(), + sum_per_query_denominator(rows, "average_precision"), + ); + + denominators +} + +fn aggregate_confidence_intervals_impl( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + let mut confidence_intervals = BTreeMap::new(); + + for metric in rate_metric_names() { + let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); + + if denominator > 
0 { + confidence_intervals.insert( + metric, + wilson_confidence_interval(numerator.min(denominator), denominator), + ); + } + } + + confidence_intervals +} + +fn rate_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + + metrics +} + +fn aggregate_rate_numerator_denominator( + rows: &[QuantitativePerQueryRow], + metric: &str, +) -> (usize, usize) { + let mut numerator = 0; + let mut denominator = 0; + + for row in rows { + let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { + continue; + }; + let Some(row_denominator) = row.denominators.get(metric).copied() else { + continue; + }; + + if row_denominator == 0 { + continue; + } + + denominator += row_denominator; + numerator += (value * row_denominator as f64).round() as usize; + } + + (numerator, denominator) +} + +fn wilson_confidence_interval( + numerator: usize, + denominator: usize, +) -> QuantitativeConfidenceInterval { + let n = denominator as f64; + let p = numerator as f64 / n; + let z2 = WILSON_95_Z * WILSON_95_Z; + let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); + let half_width = + WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); + + QuantitativeConfidenceInterval { + method: "wilson_score".to_string(), + confidence: 0.95, + lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), + upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), + numerator, + denominator, + } +} + +fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { + rows.iter().filter_map(|row| row.denominators.get(metric)).sum() +} + +fn ranking_query_ids_impl(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { + source_jobs + .iter() + .filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)) + .map(|job| job.job_id.as_str()) 
+ .collect() +} + +fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { + if !source_job.expected_answer.relevance_judgments.is_empty() { + return source_job + .expected_answer + .relevance_judgments + .iter() + .filter(|judgment| judgment.grade > 0.0) + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect(); + } + + source_job + .required_evidence + .iter() + .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) + .map(|evidence| (evidence.evidence_id.clone(), 1.0)) + .collect() +} + +fn ranking_query_attempted(job: &RealWorldJob) -> bool { + if !scoring::produced_evidence_order(job).is_empty() { + return true; + } + + let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { + return false; + }; + + answer.trace_explainability.as_ref().is_some_and(|trace| { + trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") + }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) +} + +fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { + if !source_job.expected_answer.relevance_judgments.is_empty() { + "explicit_qrels" + } else if empty { + "not_encoded" + } else { + "expected_evidence_fallback" + } +} + +fn aggregate_qrel_source_impl( + ranking_query_count: usize, + explicit_qrel_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if explicit_qrel_query_count == ranking_query_count { + "explicit_qrels" + } else if explicit_qrel_query_count == 0 { + "expected_evidence_fallback" + } else { + "mixed" + } +} + +fn ranking_coverage_state_impl( + summary: &ReportSummary, + source_job_count: usize, + ranking_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { + "complete" + } else { + "partial_coverage" + } +} + +fn 
positive_qrel_count(relevance: &BTreeMap) -> usize { + relevance.values().filter(|grade| **grade > 0.0).count() +} + +fn rate(numerator: usize, denominator: usize) -> Option { + (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs index ed3844d4..111459e9 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs @@ -1,10 +1,9 @@ use crate::{ BTreeSet, ExportQuantitativeProductManifestArgs, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, REPORT_SCHEMA, RealWorldReport, Result, eyre, fs, + quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}, }; -use super::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}; - pub(crate) fn quantitative_product_manifest_from_report( report: &RealWorldReport, args: &ExportQuantitativeProductManifestArgs, From dee9e0cbc342d8f3e899fa2a7c151da1a05d2094 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:39:57 -0400 Subject: [PATCH 10/58] {"schema":"decodex/commit/1","summary":"Split quantitative benchmark tests","authority":"manual"} --- .../real_world_job_benchmark/quantitative.rs | 478 +----------------- .../quantitative/audit_manifest.rs | 110 ++++ .../quantitative/contracts.rs | 127 +++++ .../quantitative/metrics.rs | 53 ++ .../quantitative/product_manifest.rs | 203 ++++++++ 5 files changed, 500 insertions(+), 471 deletions(-) create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs create mode 100644 
apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs index 249c48e2..9bcc07c8 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative.rs @@ -1,391 +1,15 @@ -use std::{ - env, fs, - path::Path, - process::{self, Command}, -}; +#[path = "quantitative/audit_manifest.rs"] mod audit_manifest; +#[path = "quantitative/contracts.rs"] mod contracts; +#[path = "quantitative/metrics.rs"] mod metrics; +#[path = "quantitative/product_manifest.rs"] mod product_manifest; -use color_eyre::{Result, eyre}; +use std::{path::Path, process::Command}; + +use color_eyre::Result; use serde_json::Value; use crate::support; -#[test] -fn adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - - assert_eq!( - report.pointer("/quantitative_scoreboard/schema").and_then(Value::as_str), - Some("elf.agent_memory_quantitative_benchmark/v1") - ); - assert_eq!( - report.pointer("/quantitative_scoreboard/generated_at").and_then(Value::as_str), - report.pointer("/generated_at").and_then(Value::as_str) - ); - assert_eq!( - report.pointer("/quantitative_scoreboard/k_values").and_then(Value::as_array), - Some(&vec![Value::from(1), Value::from(3), Value::from(5), Value::from(10),]) - ); - assert_eq!( - report - .pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed") - .and_then(Value::as_bool), - Some(false) - ); - assert_eq!( - report - .pointer("/quantitative_scoreboard/controls/current_query_count") - .and_then(Value::as_u64), - report.pointer("/summary/job_count").and_then(Value::as_u64) - ); - - assert_quantitative_row_contract(&report)?; - assert_quantitative_per_query_contract(&report)?; - - Ok(()) -} - -#[test] -fn 
explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> { - let source_path = - support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json"); - let mut job = serde_json::from_str::(&fs::read_to_string(source_path)?)?; - - support::set_json_pointer( - &mut job, - "/corpus/adapter_response/answer/evidence_ids", - serde_json::json!(["old-provider-note", "current-provider-report"]), - )?; - - job.pointer_mut("/expected_answer") - .and_then(Value::as_object_mut) - .ok_or_else(|| eyre::eyre!("missing expected_answer object"))? - .insert( - "relevance_judgments".to_string(), - serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]), - ); - - let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id())); - - fs::create_dir_all(&temp_dir)?; - fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?; - - let report = support::run_json_report_from(temp_dir)?; - let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?; - let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?; - - assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); - assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1)); - assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0)); - assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0)); - assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); - assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5)); - assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); - - let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?; - let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?; - - 
assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); - assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); - assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); - - Ok(()) -} - -#[test] -fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = - env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-product-manifest") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .arg("--product") - .arg("Synthetic Rival") - .arg("--adapter-id") - .arg("synthetic_rival") - .arg("--adapter-name") - .arg("Synthetic Rival adapter") - .output()?; - - assert!( - export.status.success(), - "product manifest export failed: {}", - String::from_utf8_lossy(&export.stderr) - ); - - let manifest = support::load_json(&manifest_path)?; - - assert_eq!( - manifest.pointer("/schema").and_then(Value::as_str), - Some("elf.agent_memory_quantitative_product_manifest/v1") - ); - assert_eq!( - manifest.pointer("/rows/0/product").and_then(Value::as_str), - Some("Synthetic Rival") - ); - assert_eq!( - manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str), - Some("synthetic_rival") - ); - - let imported = run_report_with_quantitative_manifest(&manifest_path)?; - let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?; - let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?; - - assert_eq!(rows.len(), 
2); - assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival")); - assert!(!support::array_contains_str( - &imported, - "/quantitative_scoreboard/metrics_not_encoded", - "external_product_manifest_import" - )?); - assert!( - support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any( - |row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival") - ) - ); - - Ok(()) -} - -#[test] -fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = env::temp_dir() - .join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("elf-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-product-manifest") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .output()?; - - assert!(!output.status.success()); - assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF")); - - Ok(()) -} - -#[test] -fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = env::temp_dir() - .join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-product-manifest") - .arg("--report") - 
.arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .arg("--product") - .arg("Synthetic Rival") - .arg("--adapter-id") - .arg("synthetic_rival") - .arg("--adapter-name") - .arg("Synthetic Rival adapter") - .output()?; - - assert!( - export.status.success(), - "product manifest export failed: {}", - String::from_utf8_lossy(&export.stderr) - ); - - let mut manifest = support::load_json(&manifest_path)?; - - support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?; - fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("run") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--quantitative-product-manifest") - .arg(&manifest_path) - .output()?; - - assert!(!output.status.success()); - assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus")); - - Ok(()) -} - -#[test] -fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = env::temp_dir() - .join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("missing-per-query-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-product-manifest") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .arg("--product") - .arg("Synthetic Rival") - .arg("--adapter-id") - .arg("synthetic_rival") - .arg("--adapter-name") - .arg("Synthetic Rival adapter") - .output()?; - - assert!( - export.status.success(), - "product manifest export failed: {}", - 
String::from_utf8_lossy(&export.stderr) - ); - - let mut manifest = support::load_json(&manifest_path)?; - - support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?; - fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("run") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--quantitative-product-manifest") - .arg(&manifest_path) - .output()?; - - assert!(!output.status.success()); - - let stderr = String::from_utf8_lossy(&output.stderr); - - assert!(stderr.contains("ranked queries but only 0")); - - Ok(()) -} - -#[test] -fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> { - let temp_dir = - env::temp_dir().join(format!("elf-quantitative-audit-manifest-test-{}", process::id())); - let manifest_path = temp_dir.join("audit-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-audit-manifest") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--out") - .arg(&manifest_path) - .arg("--run-id") - .arg("audit-import-test") - .arg("--held-out") - .arg("--leakage-audited") - .arg("--control") - .arg("query_ids_locked_before_product_runtime") - .arg("--control") - .arg("product_runtime_did_not_receive_expected_answers_or_qrels") - .arg("--control") - .arg("ranked_candidates_emitted_by_product_runtime") - .output()?; - - assert!( - export.status.success(), - "quantitative audit export failed: {}", - String::from_utf8_lossy(&export.stderr) - ); - - let manifest = support::load_json(&manifest_path)?; - - assert_eq!( - manifest.pointer("/schema").and_then(Value::as_str), - Some("elf.agent_memory_quantitative_audit_manifest/v1") - ); - assert_eq!(manifest.pointer("/held_out").and_then(Value::as_bool), Some(true)); - 
assert_eq!(manifest.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); - assert_eq!( - support::array_at(&manifest, "/query_ids")?.len() as u64, - manifest.pointer("/ranking_query_count").and_then(Value::as_u64).unwrap_or_default() - ); - - let imported = run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?; - let row = support::array_at(&imported, "/quantitative_scoreboard/rows")? - .first() - .ok_or_else(|| eyre::eyre!("missing quantitative row"))?; - - assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(true)); - assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); - assert_eq!( - row.pointer("/audit_manifest_id").and_then(Value::as_str), - Some("audit-import-test-quantitative-audit-manifest") - ); - assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); - - Ok(()) -} - -#[test] -fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> { - let temp_dir = - env::temp_dir().join(format!("elf-quantitative-audit-manifest-run-test-{}", process::id())); - let manifest_path = temp_dir.join("audit-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-audit-manifest") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--out") - .arg(&manifest_path) - .arg("--run-id") - .arg("audit-import-test") - .output()?; - - assert!( - export.status.success(), - "quantitative audit export failed: {}", - String::from_utf8_lossy(&export.stderr) - ); - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("run") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--run-id") - .arg("different-run") - .arg("--quantitative-audit-manifest") - .arg(&manifest_path) - .output()?; - - assert!(!output.status.success()); - 
assert!(String::from_utf8_lossy(&output.stderr).contains("expected different-run")); - - Ok(()) -} - fn run_report_with_quantitative_manifest(manifest_path: &Path) -> Result { let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("run") @@ -423,91 +47,3 @@ fn run_report_with_quantitative_audit(manifest_path: &Path, run_id: &str) -> Res Ok(serde_json::from_slice(&output.stdout)?) } - -fn assert_quantitative_row_contract(report: &Value) -> Result<()> { - let rows = support::array_at(report, "/quantitative_scoreboard/rows")?; - - assert_eq!(rows.len(), 1); - - let row = &rows[0]; - - assert_eq!(row.pointer("/product").and_then(Value::as_str), Some("ELF")); - assert_eq!(row.pointer("/adapter_id").and_then(Value::as_str), Some("fixture_smoke")); - assert_eq!(row.pointer("/suite").and_then(Value::as_str), Some("adversarial_quality")); - assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("pass")); - assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(true)); - assert_eq!(row.pointer("/metric_comparable").and_then(Value::as_bool), Some(true)); - assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); - assert_eq!(row.pointer("/fixture_regression_only").and_then(Value::as_bool), Some(true)); - assert_eq!(row.pointer("/ranking_coverage_state").and_then(Value::as_str), Some("complete")); - assert_eq!( - row.pointer("/ranked_candidate_source").and_then(Value::as_str), - Some("produced_evidence_order") - ); - assert_eq!( - row.pointer("/qrel_source").and_then(Value::as_str), - Some("expected_evidence_fallback") - ); - assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0)); - - for metric in [ - "recall_at_1", - "precision_at_1", - "success_at_1", - "recall_at_5", - "precision_at_5", - "success_at_5", - "mrr", - "ndcg_at_5", - "average_precision", - ] { - 
assert!(row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64).is_some()); - assert_eq!( - row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str), - Some("pass") - ); - assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some()); - } - for metric in ["recall_at_5", "precision_at_5", "success_at_5"] { - assert_eq!( - row.pointer(&format!("/confidence_intervals/{metric}/method")).and_then(Value::as_str), - Some("wilson_score") - ); - assert_eq!( - row.pointer(&format!("/confidence_intervals/{metric}/confidence")) - .and_then(Value::as_f64), - Some(0.95) - ); - assert!( - row.pointer(&format!("/confidence_intervals/{metric}/denominator")) - .and_then(Value::as_u64) - .is_some() - ); - } - - Ok(()) -} - -fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> { - let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?; - let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0); - - assert_eq!(rows.len() as u64, job_count); - - for row in rows { - assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); - assert_eq!( - row.pointer("/qrel_source").and_then(Value::as_str), - Some("expected_evidence_fallback") - ); - assert!(row.pointer("/candidate_count").and_then(Value::as_u64).is_some()); - assert!(row.pointer("/expected_relevant_count").and_then(Value::as_u64).is_some()); - assert!(row.pointer("/metrics/recall_at_5").is_some()); - assert!(row.pointer("/metrics/precision_at_5").is_some()); - assert!(row.pointer("/metrics/ndcg_at_5").is_some()); - assert!(row.pointer("/metrics/average_precision").is_some()); - } - - Ok(()) -} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs new file mode 100644 index 00000000..5d8777cd --- /dev/null +++ 
b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -0,0 +1,110 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn quantitative_audit_manifest_exports_and_opens_current_row_gates() -> Result<()> { + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-audit-manifest-test-{}", process::id())); + let manifest_path = temp_dir.join("audit-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-audit-manifest") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--out") + .arg(&manifest_path) + .arg("--run-id") + .arg("audit-import-test") + .arg("--held-out") + .arg("--leakage-audited") + .arg("--control") + .arg("query_ids_locked_before_product_runtime") + .arg("--control") + .arg("product_runtime_did_not_receive_expected_answers_or_qrels") + .arg("--control") + .arg("ranked_candidates_emitted_by_product_runtime") + .output()?; + + assert!( + export.status.success(), + "quantitative audit export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let manifest = support::load_json(&manifest_path)?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_audit_manifest/v1") + ); + assert_eq!(manifest.pointer("/held_out").and_then(Value::as_bool), Some(true)); + assert_eq!(manifest.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); + assert_eq!( + support::array_at(&manifest, "/query_ids")?.len() as u64, + manifest.pointer("/ranking_query_count").and_then(Value::as_u64).unwrap_or_default() + ); + + let imported = super::run_report_with_quantitative_audit(&manifest_path, "audit-import-test")?; + let row = support::array_at(&imported, "/quantitative_scoreboard/rows")? 
+ .first() + .ok_or_else(|| eyre::eyre!("missing quantitative row"))?; + + assert_eq!(row.pointer("/held_out").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/leakage_audited").and_then(Value::as_bool), Some(true)); + assert_eq!( + row.pointer("/audit_manifest_id").and_then(Value::as_str), + Some("audit-import-test-quantitative-audit-manifest") + ); + assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); + + Ok(()) +} + +#[test] +fn quantitative_audit_manifest_rejects_wrong_run_id_imports() -> Result<()> { + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-audit-manifest-run-test-{}", process::id())); + let manifest_path = temp_dir.join("audit-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-audit-manifest") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--out") + .arg(&manifest_path) + .arg("--run-id") + .arg("audit-import-test") + .output()?; + + assert!( + export.status.success(), + "quantitative audit export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--run-id") + .arg("different-run") + .arg("--quantitative-audit-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("expected different-run")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs new file mode 100644 index 00000000..fc158b77 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/contracts.rs @@ -0,0 +1,127 @@ +use color_eyre::Result; +use serde_json::Value; + +use 
crate::support; + +#[test] +fn adversarial_quality_report_exposes_quantitative_scoreboard() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + + assert_eq!( + report.pointer("/quantitative_scoreboard/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_benchmark/v1") + ); + assert_eq!( + report.pointer("/quantitative_scoreboard/generated_at").and_then(Value::as_str), + report.pointer("/generated_at").and_then(Value::as_str) + ); + assert_eq!( + report.pointer("/quantitative_scoreboard/k_values").and_then(Value::as_array), + Some(&vec![Value::from(1), Value::from(3), Value::from(5), Value::from(10),]) + ); + assert_eq!( + report + .pointer("/quantitative_scoreboard/controls/leaderboard_claim_allowed") + .and_then(Value::as_bool), + Some(false) + ); + assert_eq!( + report + .pointer("/quantitative_scoreboard/controls/current_query_count") + .and_then(Value::as_u64), + report.pointer("/summary/job_count").and_then(Value::as_u64) + ); + + assert_quantitative_row_contract(&report)?; + assert_quantitative_per_query_contract(&report)?; + + Ok(()) +} + +fn assert_quantitative_row_contract(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/quantitative_scoreboard/rows")?; + + assert_eq!(rows.len(), 1); + + let row = &rows[0]; + + assert_eq!(row.pointer("/product").and_then(Value::as_str), Some("ELF")); + assert_eq!(row.pointer("/adapter_id").and_then(Value::as_str), Some("fixture_smoke")); + assert_eq!(row.pointer("/suite").and_then(Value::as_str), Some("adversarial_quality")); + assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!(row.pointer("/result_state").and_then(Value::as_str), Some("pass")); + assert_eq!(row.pointer("/comparable").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/metric_comparable").and_then(Value::as_bool), Some(true)); + 
assert_eq!(row.pointer("/leaderboard_eligible").and_then(Value::as_bool), Some(false)); + assert_eq!(row.pointer("/fixture_regression_only").and_then(Value::as_bool), Some(true)); + assert_eq!(row.pointer("/ranking_coverage_state").and_then(Value::as_str), Some("complete")); + assert_eq!( + row.pointer("/ranked_candidate_source").and_then(Value::as_str), + Some("produced_evidence_order") + ); + assert_eq!( + row.pointer("/qrel_source").and_then(Value::as_str), + Some("expected_evidence_fallback") + ); + assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(0)); + + for metric in [ + "recall_at_1", + "precision_at_1", + "success_at_1", + "recall_at_5", + "precision_at_5", + "success_at_5", + "mrr", + "ndcg_at_5", + "average_precision", + ] { + assert!(row.pointer(&format!("/metrics/{metric}")).and_then(Value::as_f64).is_some()); + assert_eq!( + row.pointer(&format!("/metric_states/{metric}")).and_then(Value::as_str), + Some("pass") + ); + assert!(row.pointer(&format!("/denominators/{metric}")).and_then(Value::as_u64).is_some()); + } + for metric in ["recall_at_5", "precision_at_5", "success_at_5"] { + assert_eq!( + row.pointer(&format!("/confidence_intervals/{metric}/method")).and_then(Value::as_str), + Some("wilson_score") + ); + assert_eq!( + row.pointer(&format!("/confidence_intervals/{metric}/confidence")) + .and_then(Value::as_f64), + Some(0.95) + ); + assert!( + row.pointer(&format!("/confidence_intervals/{metric}/denominator")) + .and_then(Value::as_u64) + .is_some() + ); + } + + Ok(()) +} + +fn assert_quantitative_per_query_contract(report: &Value) -> Result<()> { + let rows = support::array_at(report, "/quantitative_scoreboard/per_query_rows")?; + let job_count = report.pointer("/summary/job_count").and_then(Value::as_u64).unwrap_or(0); + + assert_eq!(rows.len() as u64, job_count); + + for row in rows { + assert_eq!(row.pointer("/evidence_class").and_then(Value::as_str), Some("fixture_backed")); + assert_eq!( + 
row.pointer("/qrel_source").and_then(Value::as_str), + Some("expected_evidence_fallback") + ); + assert!(row.pointer("/candidate_count").and_then(Value::as_u64).is_some()); + assert!(row.pointer("/expected_relevant_count").and_then(Value::as_u64).is_some()); + assert!(row.pointer("/metrics/recall_at_5").is_some()); + assert!(row.pointer("/metrics/precision_at_5").is_some()); + assert!(row.pointer("/metrics/ndcg_at_5").is_some()); + assert!(row.pointer("/metrics/average_precision").is_some()); + } + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs new file mode 100644 index 00000000..3b9262a0 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/metrics.rs @@ -0,0 +1,53 @@ +use std::{env, fs, process}; + +use color_eyre::{Result, eyre}; +use serde_json::Value; + +use crate::support; + +#[test] +fn explicit_qrels_preserve_candidate_order_for_ranking_metrics() -> Result<()> { + let source_path = + support::adversarial_quality_fixture_dir().join("conflicting_source_authority.json"); + let mut job = serde_json::from_str::(&fs::read_to_string(source_path)?)?; + + support::set_json_pointer( + &mut job, + "/corpus/adapter_response/answer/evidence_ids", + serde_json::json!(["old-provider-note", "current-provider-report"]), + )?; + + job.pointer_mut("/expected_answer") + .and_then(Value::as_object_mut) + .ok_or_else(|| eyre::eyre!("missing expected_answer object"))? 
+ .insert( + "relevance_judgments".to_string(), + serde_json::json!([{ "evidence_id": "current-provider-report", "grade": 1.0 }]), + ); + + let temp_dir = env::temp_dir().join(format!("elf-explicit-qrel-order-test-{}", process::id())); + + fs::create_dir_all(&temp_dir)?; + fs::write(temp_dir.join("explicit_qrel_order.json"), serde_json::to_vec_pretty(&job)?)?; + + let report = support::run_json_report_from(temp_dir)?; + let rows = support::array_at(&report, "/quantitative_scoreboard/rows")?; + let row = rows.first().ok_or_else(|| eyre::eyre!("missing quantitative row"))?; + + assert_eq!(row.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); + assert_eq!(row.pointer("/explicit_qrel_query_count").and_then(Value::as_u64), Some(1)); + assert_eq!(row.pointer("/metrics/recall_at_1").and_then(Value::as_f64), Some(0.0)); + assert_eq!(row.pointer("/metrics/recall_at_3").and_then(Value::as_f64), Some(1.0)); + assert_eq!(row.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); + assert_eq!(row.pointer("/metrics/average_precision").and_then(Value::as_f64), Some(0.5)); + assert_eq!(row.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); + + let per_query_rows = support::array_at(&report, "/quantitative_scoreboard/per_query_rows")?; + let per_query = per_query_rows.first().ok_or_else(|| eyre::eyre!("missing per-query row"))?; + + assert_eq!(per_query.pointer("/qrel_source").and_then(Value::as_str), Some("explicit_qrels")); + assert_eq!(per_query.pointer("/metrics/mrr").and_then(Value::as_f64), Some(0.5)); + assert_eq!(per_query.pointer("/denominators/recall_at_5").and_then(Value::as_u64), Some(1)); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs new file mode 100644 index 00000000..c7b543c5 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs @@ -0,0 
+1,203 @@ +use std::{ + env, fs, + process::{self, Command}, +}; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = + env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + .arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let manifest = support::load_json(&manifest_path)?; + + assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_product_manifest/v1") + ); + assert_eq!( + manifest.pointer("/rows/0/product").and_then(Value::as_str), + Some("Synthetic Rival") + ); + assert_eq!( + manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str), + Some("synthetic_rival") + ); + + let imported = super::run_report_with_quantitative_manifest(&manifest_path)?; + let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?; + let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?; + + assert_eq!(rows.len(), 2); + assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival")); + assert!(!support::array_contains_str( + 
&imported, + "/quantitative_scoreboard/metrics_not_encoded", + "external_product_manifest_import" + )?); + assert!( + support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any( + |row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival") + ) + ); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = env::temp_dir() + .join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("elf-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF")); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = env::temp_dir() + .join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + 
.arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let mut manifest = support::load_json(&manifest_path)?; + + support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?; + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus")); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> { + let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; + let temp_dir = env::temp_dir() + .join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id())); + let report_path = temp_dir.join("report.json"); + let manifest_path = temp_dir.join("missing-per-query-product-manifest.json"); + + fs::create_dir_all(&temp_dir)?; + fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; + + let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&report_path) + .arg("--out") + .arg(&manifest_path) + .arg("--product") + .arg("Synthetic Rival") + .arg("--adapter-id") + .arg("synthetic_rival") + .arg("--adapter-name") + .arg("Synthetic Rival adapter") + .output()?; + + assert!( + export.status.success(), + "product manifest export failed: {}", + String::from_utf8_lossy(&export.stderr) + ); + + let mut manifest = support::load_json(&manifest_path)?; + + support::set_json_pointer(&mut manifest, 
"/per_query_rows", serde_json::json!([]))?; + fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&manifest_path) + .output()?; + + assert!(!output.status.success()); + + let stderr = String::from_utf8_lossy(&output.stderr); + + assert!(stderr.contains("ranked queries but only 0")); + + Ok(()) +} From f65b0e28c357e58f676e6b4b21b40b4bd48440a8 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:44:49 -0400 Subject: [PATCH 11/58] {"schema":"decodex/commit/1","summary":"Split quantitative metric submodules","authority":"manual"} --- .../quantitative/metrics.rs | 465 +----------------- .../quantitative/metrics/aggregate.rs | 172 +++++++ .../quantitative/metrics/per_query.rs | 212 ++++++++ .../quantitative/metrics/ranking.rs | 83 ++++ 4 files changed, 483 insertions(+), 449 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs index e5377d7b..779329f6 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs @@ -1,8 +1,10 @@ +mod aggregate; +mod per_query; +mod ranking; + use crate::{ BTreeMap, BTreeSet, JobReport, QuantitativeConfidenceInterval, QuantitativePerQueryRow, - RealWorldJob, ReportSummary, formatting, - quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY, WILSON_95_Z}, - scoring, + 
RealWorldJob, ReportSummary, }; pub(super) fn quantitative_per_query_rows( @@ -12,53 +14,47 @@ pub(super) fn quantitative_per_query_rows( evidence_class: &str, adapter_id: &str, ) -> Vec { - source_jobs - .iter() - .zip(jobs.iter()) - .map(|(source_job, job)| { - quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) - }) - .collect() + per_query::quantitative_per_query_rows(source_jobs, jobs, corpus_id, evidence_class, adapter_id) } pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { - aggregate_metrics_impl(rows) + aggregate::aggregate_metrics(rows) } pub(super) fn aggregate_metric_states( result_state: &str, metric_comparable: bool, ) -> BTreeMap { - aggregate_metric_states_impl(result_state, metric_comparable) + aggregate::aggregate_metric_states(result_state, metric_comparable) } pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { - aggregate_denominators_impl(rows) + aggregate::aggregate_denominators(rows) } pub(super) fn aggregate_confidence_intervals( rows: &[QuantitativePerQueryRow], ) -> BTreeMap { - aggregate_confidence_intervals_impl(rows) + aggregate::aggregate_confidence_intervals(rows) } pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { - ranking_query_ids_impl(source_jobs) + ranking::ranking_query_ids(source_jobs) } pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { - ranking_query_ids(source_jobs).len() + ranking::ranking_query_count(source_jobs) } pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { - source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() + ranking::explicit_qrel_query_count(source_jobs) } pub(super) fn aggregate_qrel_source( ranking_query_count: usize, explicit_qrel_query_count: usize, ) -> &'static str { - aggregate_qrel_source_impl(ranking_query_count, explicit_qrel_query_count) + 
ranking::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) } pub(super) fn ranking_coverage_state( @@ -66,438 +62,9 @@ pub(super) fn ranking_coverage_state( source_job_count: usize, ranking_query_count: usize, ) -> &'static str { - ranking_coverage_state_impl(summary, source_job_count, ranking_query_count) + ranking::ranking_coverage_state(summary, source_job_count, ranking_query_count) } pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { - if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } -} - -fn quantitative_per_query_row( - source_job: &RealWorldJob, - job: &JobReport, - corpus_id: &str, - evidence_class: &str, - adapter_id: &str, -) -> QuantitativePerQueryRow { - let relevance = relevance_grades(source_job, job); - let candidates = scoring::produced_evidence_order(source_job); - let positive_relevance_count = positive_qrel_count(&relevance); - let metrics = per_query_metrics(candidates.as_slice(), &relevance); - let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { - "not_encoded" - } else { - formatting::status_str(job.status) - }; - let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); - let denominators = per_query_denominators(candidates.len(), positive_relevance_count); - - QuantitativePerQueryRow { - job_id: job.job_id.clone(), - suite: job.suite_id.clone(), - evidence_class: evidence_class.to_string(), - source_manifest_corpus_id: Some(corpus_id.to_string()), - result_state: formatting::status_str(job.status).to_string(), - expected_relevant_count: positive_relevance_count, - candidate_count: candidates.len(), - qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(), - relevance_grade_sum: formatting::round3(relevance.values().sum::()), - product: "ELF".to_string(), - adapter_id: adapter_id.to_string(), - metrics, - metric_states, - denominators, - claim_boundary: 
QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), - } -} - -fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap { - let explicit = source_job - .expected_answer - .relevance_judgments - .iter() - .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) - .collect::>(); - - if !explicit.is_empty() { - return explicit; - } - - job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() -} - -fn per_query_metrics( - candidates: &[String], - relevance: &BTreeMap, -) -> BTreeMap> { - let mut metrics = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - let relevant_at_k = relevant_at_k(candidates, relevance, *k); - - metrics - .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); - metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); - metrics.insert( - format!("success_at_{k}"), - Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), - ); - } - - metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); - metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); - metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); - - metrics -} - -fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { - candidates - .iter() - .take(k) - .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) - .count() -} - -fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } - - Some( - candidates - .iter() - .position(|candidate| { - relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) - }) - .map_or(0.0, |index| 1.0 / (index + 1) as f64), - ) -} - -fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } - - let dcg = candidates - .iter() - .take(k) - 
.enumerate() - .map(|(index, candidate)| { - relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) - / ((index + 2) as f64).log2() - }) - .sum::(); - let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); - - ideal.sort_by(|left, right| right.total_cmp(left)); - - let idcg = ideal - .iter() - .take(k) - .enumerate() - .map(|(index, grade)| grade / ((index + 2) as f64).log2()) - .sum::(); - - Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) -} - -fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { - let positive_count = positive_qrel_count(relevance); - - if positive_count == 0 { - return None; - } - - let mut hit_count = 0; - let mut precision_sum = 0.0; - let mut seen = BTreeSet::new(); - - for (index, candidate) in candidates.iter().enumerate() { - if !seen.insert(candidate.as_str()) { - continue; - } - if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { - hit_count += 1; - precision_sum += hit_count as f64 / (index + 1) as f64; - } - } - - Some(precision_sum / positive_count as f64) -} - -fn aggregate_metrics_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { - let mut sums = BTreeMap::::new(); - let mut metrics = quantitative_metric_names() - .into_iter() - .map(|metric| (metric, None)) - .collect::>(); - - for row in rows { - for (metric, value) in &row.metrics { - if let Some(value) = value { - let (sum, count) = sums.entry(metric.clone()).or_default(); - - *sum += *value; - *count += 1; - } - } - } - for (metric, (sum, count)) in sums { - metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); - } - - metrics -} - -fn aggregate_metric_states_impl( - result_state: &str, - metric_comparable: bool, -) -> BTreeMap { - let state = if metric_comparable { result_state } else { "not_encoded" }; - let mut states = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - states.insert(format!("recall_at_{k}"), state.to_string()); - 
states.insert(format!("precision_at_{k}"), state.to_string()); - states.insert(format!("success_at_{k}"), state.to_string()); - } - for metric in ["mrr", "ndcg_at_5", "average_precision"] { - states.insert(metric.to_string(), state.to_string()); - } - - states -} - -fn quantitative_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { - metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - for metric in ["mrr", "ndcg_at_5", "average_precision"] { - metrics.push(metric.to_string()); - } - - metrics -} - -fn per_query_denominators( - candidate_count: usize, - expected_relevant_count: usize, -) -> BTreeMap { - let mut denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert(format!("recall_at_{k}"), expected_relevant_count); - denominators.insert(format!("precision_at_{k}"), *k); - denominators.insert(format!("success_at_{k}"), 1); - } - - denominators.insert("mrr".to_string(), expected_relevant_count); - denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); - denominators.insert("average_precision".to_string(), expected_relevant_count); - denominators.insert("candidate_count".to_string(), candidate_count); - - denominators -} - -fn aggregate_denominators_impl(rows: &[QuantitativePerQueryRow]) -> BTreeMap { - let mut denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert( - format!("recall_at_{k}"), - sum_per_query_denominator(rows, &format!("recall_at_{k}")), - ); - denominators.insert( - format!("precision_at_{k}"), - sum_per_query_denominator(rows, &format!("precision_at_{k}")), - ); - denominators.insert( - format!("success_at_{k}"), - sum_per_query_denominator(rows, &format!("success_at_{k}")), - ); - } - - denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); - denominators.insert("ndcg_at_5".to_string(), 
sum_per_query_denominator(rows, "ndcg_at_5")); - denominators.insert( - "average_precision".to_string(), - sum_per_query_denominator(rows, "average_precision"), - ); - - denominators -} - -fn aggregate_confidence_intervals_impl( - rows: &[QuantitativePerQueryRow], -) -> BTreeMap { - let mut confidence_intervals = BTreeMap::new(); - - for metric in rate_metric_names() { - let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); - - if denominator > 0 { - confidence_intervals.insert( - metric, - wilson_confidence_interval(numerator.min(denominator), denominator), - ); - } - } - - confidence_intervals -} - -fn rate_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { - metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - - metrics -} - -fn aggregate_rate_numerator_denominator( - rows: &[QuantitativePerQueryRow], - metric: &str, -) -> (usize, usize) { - let mut numerator = 0; - let mut denominator = 0; - - for row in rows { - let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { - continue; - }; - let Some(row_denominator) = row.denominators.get(metric).copied() else { - continue; - }; - - if row_denominator == 0 { - continue; - } - - denominator += row_denominator; - numerator += (value * row_denominator as f64).round() as usize; - } - - (numerator, denominator) -} - -fn wilson_confidence_interval( - numerator: usize, - denominator: usize, -) -> QuantitativeConfidenceInterval { - let n = denominator as f64; - let p = numerator as f64 / n; - let z2 = WILSON_95_Z * WILSON_95_Z; - let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); - let half_width = - WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); - - QuantitativeConfidenceInterval { - method: "wilson_score".to_string(), - confidence: 0.95, - lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), - 
upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), - numerator, - denominator, - } -} - -fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { - rows.iter().filter_map(|row| row.denominators.get(metric)).sum() -} - -fn ranking_query_ids_impl(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { - source_jobs - .iter() - .filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)) - .map(|job| job.job_id.as_str()) - .collect() -} - -fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { - if !source_job.expected_answer.relevance_judgments.is_empty() { - return source_job - .expected_answer - .relevance_judgments - .iter() - .filter(|judgment| judgment.grade > 0.0) - .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) - .collect(); - } - - source_job - .required_evidence - .iter() - .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) - .map(|evidence| (evidence.evidence_id.clone(), 1.0)) - .collect() -} - -fn ranking_query_attempted(job: &RealWorldJob) -> bool { - if !scoring::produced_evidence_order(job).is_empty() { - return true; - } - - let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { - return false; - }; - - answer.trace_explainability.as_ref().is_some_and(|trace| { - trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") - }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) -} - -fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { - if !source_job.expected_answer.relevance_judgments.is_empty() { - "explicit_qrels" - } else if empty { - "not_encoded" - } else { - "expected_evidence_fallback" - } -} - -fn aggregate_qrel_source_impl( - ranking_query_count: usize, - explicit_qrel_query_count: usize, -) -> &'static str { - if ranking_query_count == 0 { - "not_encoded" - } else if explicit_qrel_query_count == 
ranking_query_count { - "explicit_qrels" - } else if explicit_qrel_query_count == 0 { - "expected_evidence_fallback" - } else { - "mixed" - } -} - -fn ranking_coverage_state_impl( - summary: &ReportSummary, - source_job_count: usize, - ranking_query_count: usize, -) -> &'static str { - if ranking_query_count == 0 { - "not_encoded" - } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { - "complete" - } else { - "partial_coverage" - } -} - -fn positive_qrel_count(relevance: &BTreeMap) -> usize { - relevance.values().filter(|grade| **grade > 0.0).count() -} - -fn rate(numerator: usize, denominator: usize) -> Option { - (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) + ranking::ranked_candidate_source(ranking_query_count) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs new file mode 100644 index 00000000..cb2dd63d --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs @@ -0,0 +1,172 @@ +use crate::{ + BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting, + quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z}, +}; + +pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { + let mut sums = BTreeMap::::new(); + let mut metrics = quantitative_metric_names() + .into_iter() + .map(|metric| (metric, None)) + .collect::>(); + + for row in rows { + for (metric, value) in &row.metrics { + if let Some(value) = value { + let (sum, count) = sums.entry(metric.clone()).or_default(); + + *sum += *value; + *count += 1; + } + } + } + for (metric, (sum, count)) in sums { + metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); + } + + metrics +} + +pub(super) fn aggregate_metric_states( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + let state = if 
metric_comparable { result_state } else { "not_encoded" }; + let mut states = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + states.insert(format!("recall_at_{k}"), state.to_string()); + states.insert(format!("precision_at_{k}"), state.to_string()); + states.insert(format!("success_at_{k}"), state.to_string()); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + states.insert(metric.to_string(), state.to_string()); + } + + states +} + +pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert( + format!("recall_at_{k}"), + sum_per_query_denominator(rows, &format!("recall_at_{k}")), + ); + denominators.insert( + format!("precision_at_{k}"), + sum_per_query_denominator(rows, &format!("precision_at_{k}")), + ); + denominators.insert( + format!("success_at_{k}"), + sum_per_query_denominator(rows, &format!("success_at_{k}")), + ); + } + + denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); + denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); + denominators.insert( + "average_precision".to_string(), + sum_per_query_denominator(rows, "average_precision"), + ); + + denominators +} + +pub(super) fn aggregate_confidence_intervals( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + let mut confidence_intervals = BTreeMap::new(); + + for metric in rate_metric_names() { + let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); + + if denominator > 0 { + confidence_intervals.insert( + metric, + wilson_confidence_interval(numerator.min(denominator), denominator), + ); + } + } + + confidence_intervals +} + +fn quantitative_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + 
metrics.push(format!("success_at_{k}")); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + metrics.push(metric.to_string()); + } + + metrics +} + +fn rate_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + + metrics +} + +fn aggregate_rate_numerator_denominator( + rows: &[QuantitativePerQueryRow], + metric: &str, +) -> (usize, usize) { + let mut numerator = 0; + let mut denominator = 0; + + for row in rows { + let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { + continue; + }; + let Some(row_denominator) = row.denominators.get(metric).copied() else { + continue; + }; + + if row_denominator == 0 { + continue; + } + + denominator += row_denominator; + numerator += (value * row_denominator as f64).round() as usize; + } + + (numerator, denominator) +} + +fn wilson_confidence_interval( + numerator: usize, + denominator: usize, +) -> QuantitativeConfidenceInterval { + let n = denominator as f64; + let p = numerator as f64 / n; + let z2 = WILSON_95_Z * WILSON_95_Z; + let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); + let half_width = + WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); + + QuantitativeConfidenceInterval { + method: "wilson_score".to_string(), + confidence: 0.95, + lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), + upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), + numerator, + denominator, + } +} + +fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { + rows.iter().filter_map(|row| row.denominators.get(metric)).sum() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs new file mode 100644 index 00000000..db9e932c 
--- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs @@ -0,0 +1,212 @@ +use crate::{ + BTreeMap, BTreeSet, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, + quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY}, + scoring, +}; + +pub(super) fn quantitative_per_query_rows( + source_jobs: &[RealWorldJob], + jobs: &[JobReport], + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> Vec { + source_jobs + .iter() + .zip(jobs.iter()) + .map(|(source_job, job)| { + quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) + }) + .collect() +} + +fn quantitative_per_query_row( + source_job: &RealWorldJob, + job: &JobReport, + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let relevance = relevance_grades(source_job, job); + let candidates = scoring::produced_evidence_order(source_job); + let positive_relevance_count = positive_qrel_count(&relevance); + let metrics = per_query_metrics(candidates.as_slice(), &relevance); + let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { + "not_encoded" + } else { + formatting::status_str(job.status) + }; + let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); + let denominators = per_query_denominators(candidates.len(), positive_relevance_count); + + QuantitativePerQueryRow { + job_id: job.job_id.clone(), + suite: job.suite_id.clone(), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: formatting::status_str(job.status).to_string(), + expected_relevant_count: positive_relevance_count, + candidate_count: candidates.len(), + qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(), + relevance_grade_sum: formatting::round3(relevance.values().sum::()), + product: "ELF".to_string(), + adapter_id: adapter_id.to_string(), + 
metrics, + metric_states, + denominators, + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} + +fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap { + let explicit = source_job + .expected_answer + .relevance_judgments + .iter() + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect::>(); + + if !explicit.is_empty() { + return explicit; + } + + job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() +} + +fn per_query_metrics( + candidates: &[String], + relevance: &BTreeMap, +) -> BTreeMap> { + let mut metrics = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + let relevant_at_k = relevant_at_k(candidates, relevance, *k); + + metrics + .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); + metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); + metrics.insert( + format!("success_at_{k}"), + Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), + ); + } + + metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); + metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); + metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); + + metrics +} + +fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { + candidates + .iter() + .take(k) + .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) + .count() +} + +fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} + +fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { + if positive_qrel_count(relevance) == 0 { + 
return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as f64).log2()) + .sum::(); + + Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) +} + +fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { + let positive_count = positive_qrel_count(relevance); + + if positive_count == 0 { + return None; + } + + let mut hit_count = 0; + let mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / positive_count as f64) +} + +fn per_query_denominators( + candidate_count: usize, + expected_relevant_count: usize, +) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert(format!("recall_at_{k}"), expected_relevant_count); + denominators.insert(format!("precision_at_{k}"), *k); + denominators.insert(format!("success_at_{k}"), 1); + } + + denominators.insert("mrr".to_string(), expected_relevant_count); + denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); + denominators.insert("average_precision".to_string(), expected_relevant_count); + denominators.insert("candidate_count".to_string(), candidate_count); + + denominators +} + +fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { + if !source_job.expected_answer.relevance_judgments.is_empty() { + 
"explicit_qrels" + } else if empty { + "not_encoded" + } else { + "expected_evidence_fallback" + } +} + +fn positive_qrel_count(relevance: &BTreeMap) -> usize { + relevance.values().filter(|grade| **grade > 0.0).count() +} + +fn rate(numerator: usize, denominator: usize) -> Option { + (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs new file mode 100644 index 00000000..918a8613 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs @@ -0,0 +1,83 @@ +use crate::{BTreeMap, BTreeSet, RealWorldJob, ReportSummary, scoring}; + +pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { + source_jobs + .iter() + .filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)) + .map(|job| job.job_id.as_str()) + .collect() +} + +pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { + ranking_query_ids(source_jobs).len() +} + +pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { + source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() +} + +pub(super) fn aggregate_qrel_source( + ranking_query_count: usize, + explicit_qrel_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if explicit_qrel_query_count == ranking_query_count { + "explicit_qrels" + } else if explicit_qrel_query_count == 0 { + "expected_evidence_fallback" + } else { + "mixed" + } +} + +pub(super) fn ranking_coverage_state( + summary: &ReportSummary, + source_job_count: usize, + ranking_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { + "complete" + } else { + 
"partial_coverage" + } +} + +pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { + if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } +} + +fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { + if !source_job.expected_answer.relevance_judgments.is_empty() { + return source_job + .expected_answer + .relevance_judgments + .iter() + .filter(|judgment| judgment.grade > 0.0) + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect(); + } + + source_job + .required_evidence + .iter() + .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) + .map(|evidence| (evidence.evidence_id.clone(), 1.0)) + .collect() +} + +fn ranking_query_attempted(job: &RealWorldJob) -> bool { + if !scoring::produced_evidence_order(job).is_empty() { + return true; + } + + let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { + return false; + }; + + answer.trace_explainability.as_ref().is_some_and(|trace| { + trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") + }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) +} From ce6f82c58b8561f2d1b6bbb8ee24ffec2ec9df83 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:51:28 -0400 Subject: [PATCH 12/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit artifacts","authority":"manual"} --- .../quantitative/audit_manifest.rs | 158 +----------------- .../quantitative/audit_manifest/artifacts.rs | 151 +++++++++++++++++ 2 files changed, 156 insertions(+), 153 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs index be8b9e50..e927bbac 100644 --- 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -1,7 +1,7 @@ -use std::env; +mod artifacts; use crate::{ - BTreeSet, ExportQuantitativeAuditManifestArgs, Path, PathBuf, QuantitativeAuditArtifact, + BTreeSet, ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact, QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs, quantitative::{ QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, @@ -58,8 +58,8 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( controls: args.controls.clone(), artifacts: vec![QuantitativeAuditArtifact { role: "product_runtime_fixtures".to_string(), - path: audit_artifact_display_path(args.fixtures.as_path()), - sha256: fixture_path_digest(args.fixtures.as_path())?, + path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), + sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, }], claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| { if args.held_out || args.leakage_audited { @@ -189,7 +189,7 @@ fn validate_quantitative_audit_manifest( validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; validate_quantitative_audit_controls(manifest, path)?; - validate_quantitative_audit_artifacts(manifest, path) + artifacts::validate_quantitative_audit_artifacts(manifest, path) } fn validate_quantitative_audit_query_ids( @@ -252,151 +252,3 @@ fn validate_quantitative_audit_controls( Ok(()) } - -fn validate_quantitative_audit_artifacts( - manifest: &QuantitativeAuditManifest, - path: &Path, -) -> Result<()> { - if manifest.artifacts.is_empty() { - return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); - } - - for artifact in &manifest.artifacts { - if artifact.role.trim().is_empty() - || artifact.path.trim().is_empty() - || artifact.sha256.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an 
incomplete quantitative audit artifact.", - path.display() - )); - } - if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) - { - return Err(eyre::eyre!( - "{} artifact {} has invalid sha256 digest {}.", - path.display(), - artifact.role, - artifact.sha256 - )); - } - - let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); - let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| { - eyre::eyre!( - "{} artifact {} could not be digested at {}: {err}", - path.display(), - artifact.role, - artifact_path.display() - ) - })?; - - if actual != artifact.sha256 { - return Err(eyre::eyre!( - "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", - path.display(), - artifact.role, - artifact_path.display(), - artifact.sha256, - actual - )); - } - } - - Ok(()) -} - -fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { - let raw = PathBuf::from(artifact_path); - - if raw.is_absolute() { - return raw; - } - - let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); - - if cwd_path.exists() { - return cwd_path; - } - - manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) -} - -fn fixture_path_digest(path: &Path) -> Result { - let mut hasher = blake3::Hasher::new(); - - if path.is_file() { - hash_fixture_file( - path, - path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), - &mut hasher, - )?; - - return Ok(hasher.finalize().to_hex().to_string()); - } - - let paths = audit_fixture_paths(path)?; - - for fixture in paths { - let relative = fixture - .strip_prefix(path) - .map(|relative| relative.to_string_lossy().replace('\\', "/")) - .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); - - hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; - } - - Ok(hasher.finalize().to_hex().to_string()) -} - -fn audit_fixture_paths(path: 
&Path) -> Result> { - let mut paths = Vec::new(); - - collect_audit_fixture_paths(path, &mut paths)?; - - paths.sort(); - - Ok(paths) -} - -fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { - if path.is_file() { - paths.push(path.to_path_buf()); - - return Ok(()); - } - - for entry in fs::read_dir(path)? { - let entry_path = entry?.path(); - - if entry_path.is_dir() { - collect_audit_fixture_paths(entry_path.as_path(), paths)?; - } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { - paths.push(entry_path); - } - } - - Ok(()) -} - -fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { - hasher.update(logical_path.as_bytes()); - hasher.update(b"\0"); - hasher.update(&fs::read(path)?); - hasher.update(b"\0"); - - Ok(()) -} - -fn audit_artifact_display_path(path: &Path) -> String { - let display_path = if path.is_absolute() { - env::current_dir() - .ok() - .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) - .unwrap_or_else(|| path.to_path_buf()) - } else { - path.to_path_buf() - }; - - display_path.to_string_lossy().replace('\\', "/") -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs new file mode 100644 index 00000000..9e033400 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs @@ -0,0 +1,151 @@ +use std::env; + +use crate::{Path, PathBuf, QuantitativeAuditManifest, Result, eyre, fs}; + +pub(super) fn validate_quantitative_audit_artifacts( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if manifest.artifacts.is_empty() { + return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); + } + + for artifact in &manifest.artifacts { + if artifact.role.trim().is_empty() + || artifact.path.trim().is_empty() + || 
artifact.sha256.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative audit artifact.", + path.display() + )); + } + if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) + { + return Err(eyre::eyre!( + "{} artifact {} has invalid sha256 digest {}.", + path.display(), + artifact.role, + artifact.sha256 + )); + } + + let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); + let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| { + eyre::eyre!( + "{} artifact {} could not be digested at {}: {err}", + path.display(), + artifact.role, + artifact_path.display() + ) + })?; + + if actual != artifact.sha256 { + return Err(eyre::eyre!( + "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", + path.display(), + artifact.role, + artifact_path.display(), + artifact.sha256, + actual + )); + } + } + + Ok(()) +} + +pub(super) fn fixture_path_digest(path: &Path) -> Result { + let mut hasher = blake3::Hasher::new(); + + if path.is_file() { + hash_fixture_file( + path, + path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), + &mut hasher, + )?; + + return Ok(hasher.finalize().to_hex().to_string()); + } + + let paths = audit_fixture_paths(path)?; + + for fixture in paths { + let relative = fixture + .strip_prefix(path) + .map(|relative| relative.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); + + hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; + } + + Ok(hasher.finalize().to_hex().to_string()) +} + +pub(super) fn audit_artifact_display_path(path: &Path) -> String { + let display_path = if path.is_absolute() { + env::current_dir() + .ok() + .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) + .unwrap_or_else(|| path.to_path_buf()) + } else { + path.to_path_buf() + }; + + display_path.to_string_lossy().replace('\\', "/") +} + +fn 
resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { + let raw = PathBuf::from(artifact_path); + + if raw.is_absolute() { + return raw; + } + + let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); + + if cwd_path.exists() { + return cwd_path; + } + + manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) +} + +fn audit_fixture_paths(path: &Path) -> Result> { + let mut paths = Vec::new(); + + collect_audit_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + if path.is_file() { + paths.push(path.to_path_buf()); + + return Ok(()); + } + + for entry in fs::read_dir(path)? { + let entry_path = entry?.path(); + + if entry_path.is_dir() { + collect_audit_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} + +fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { + hasher.update(logical_path.as_bytes()); + hasher.update(b"\0"); + hasher.update(&fs::read(path)?); + hasher.update(b"\0"); + + Ok(()) +} From 06ec4c1ba920482488a6f65e1b730dfa0059c221 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 12:55:12 -0400 Subject: [PATCH 13/58] {"schema":"decodex/commit/1","summary":"Split quantitative product validation","authority":"manual"} --- .../quantitative/product_manifest.rs | 168 ++---------------- .../product_manifest/validation.rs | 157 ++++++++++++++++ 2 files changed, 167 insertions(+), 158 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs index 
111459e9..ad9a2dee 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs @@ -1,7 +1,8 @@ +mod validation; + use crate::{ - BTreeSet, ExportQuantitativeProductManifestArgs, Path, QuantitativeBenchmarkRow, - QuantitativeProductManifest, REPORT_SCHEMA, RealWorldReport, Result, eyre, fs, - quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}, + ExportQuantitativeProductManifestArgs, Path, QuantitativeProductManifest, REPORT_SCHEMA, + RealWorldReport, Result, eyre, fs, quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, }; pub(crate) fn quantitative_product_manifest_from_report( @@ -82,7 +83,11 @@ pub(crate) fn quantitative_product_manifest_from_report( per_query_rows, }; - validate_quantitative_product_manifest(&manifest, &args.report, manifest.corpus_id.as_str())?; + validation::validate_quantitative_product_manifest( + &manifest, + &args.report, + manifest.corpus_id.as_str(), + )?; Ok(manifest) } @@ -107,160 +112,7 @@ pub(super) fn quantitative_product_manifest( row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); } - validate_quantitative_product_manifest(&manifest, path, corpus_id)?; + validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?; Ok(manifest) } - -fn validate_quantitative_product_manifest( - manifest: &QuantitativeProductManifest, - path: &Path, - corpus_id: &str, -) -> Result<()> { - if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.", - path.display(), - manifest.schema - )); - } - if manifest.manifest_id.trim().is_empty() { - return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); - } - if manifest.corpus_id != corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected same-corpus {}.", - path.display(), - 
manifest.corpus_id, - corpus_id - )); - } - if manifest.rows.is_empty() { - return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); - } - - let row_keys = manifest - .rows - .iter() - .map(|row| (row.product.as_str(), row.adapter_id.as_str())) - .collect::>(); - - for row in &manifest.rows { - if row.product == "ELF" { - return Err(eyre::eyre!( - "{} quantitative product manifest must not inject ELF self rows.", - path.display() - )); - } - if row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.adapter_name.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative product row.", - path.display() - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - if row.leaderboard_eligible { - validate_leaderboard_eligible_product_row(path, row)?; - } - } - for row in &manifest.per_query_rows { - if row.job_id.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - || row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.qrel_source.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative per-query product row.", - path.display() - )); - } - if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { - return Err(eyre::eyre!( - "{} per-query row {}:{} has no matching product row.", - path.display(), - row.product, - row.adapter_id - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} per-query row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - } - for row in 
&manifest.rows { - if row.ranking_query_count == 0 { - continue; - } - - let per_query_count = manifest - .per_query_rows - .iter() - .filter(|per_query| { - per_query.product == row.product && per_query.adapter_id == row.adapter_id - }) - .count(); - - if per_query_count < row.ranking_query_count { - return Err(eyre::eyre!( - "{} row {}:{} declares {} ranked queries but only {} per-query rows.", - path.display(), - row.product, - row.adapter_id, - row.ranking_query_count, - per_query_count - )); - } - } - - Ok(()) -} - -fn validate_leaderboard_eligible_product_row( - path: &Path, - row: &QuantitativeBenchmarkRow, -) -> Result<()> { - let has_audit_manifest_id = row - .audit_manifest_id - .as_deref() - .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); - - if row.evidence_class != "live_real_world" - || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT - || row.ranking_query_count != row.sample_size - || row.explicit_qrel_query_count != row.ranking_query_count - || !row.held_out - || !row.leakage_audited - || !has_audit_manifest_id - { - return Err(eyre::eyre!( - "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", - path.display(), - row.product, - row.adapter_id - )); - } - - Ok(()) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs new file mode 100644 index 00000000..0ae5bf33 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs @@ -0,0 +1,157 @@ +use crate::{ + BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre, + quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}, +}; + +pub(super) fn validate_quantitative_product_manifest( + manifest: 
&QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.corpus_id != corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected same-corpus {}.", + path.display(), + manifest.corpus_id, + corpus_id + )); + } + if manifest.rows.is_empty() { + return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); + } + + let row_keys = manifest + .rows + .iter() + .map(|row| (row.product.as_str(), row.adapter_id.as_str())) + .collect::>(); + + for row in &manifest.rows { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative product row.", + path.display() + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + if row.leaderboard_eligible { + validate_leaderboard_eligible_product_row(path, row)?; + } + } + for row in &manifest.per_query_rows { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + 
"{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + } + for row in &manifest.rows { + if row.ranking_query_count == 0 { + continue; + } + + let per_query_count = manifest + .per_query_rows + .iter() + .filter(|per_query| { + per_query.product == row.product && per_query.adapter_id == row.adapter_id + }) + .count(); + + if per_query_count < row.ranking_query_count { + return Err(eyre::eyre!( + "{} row {}:{} declares {} ranked queries but only {} per-query rows.", + path.display(), + row.product, + row.adapter_id, + row.ranking_query_count, + per_query_count + )); + } + } + + Ok(()) +} + +fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} From c869f8bc33a58c5f99738d24b39588b73d33be03 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 
13:00:33 -0400 Subject: [PATCH 14/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit validation","authority":"manual"} --- .../quantitative/audit_manifest.rs | 145 +----------------- .../quantitative/audit_manifest/validation.rs | 142 +++++++++++++++++ 2 files changed, 147 insertions(+), 140 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs index e927bbac..d3e696a9 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -1,12 +1,10 @@ mod artifacts; +mod validation; use crate::{ - BTreeSet, ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact, + ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact, QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs, - quantitative::{ - QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, - REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, metrics, - }, + quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, metrics}, }; pub(super) struct QuantitativeAuditContext<'a> { @@ -78,7 +76,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( }), }; - validate_quantitative_audit_manifest( + validation::validate_quantitative_audit_manifest( &manifest, args.fixtures.as_path(), QuantitativeAuditContext { @@ -111,7 +109,7 @@ pub(super) fn quantitative_audit_evidence( eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) })?; - validate_quantitative_audit_manifest(&manifest, path, context)?; + validation::validate_quantitative_audit_manifest(&manifest, path, context)?; Ok(QuantitativeAuditEvidence { held_out: manifest.held_out, @@ -119,136 +117,3 @@ 
pub(super) fn quantitative_audit_evidence( audit_manifest_id: Some(manifest.manifest_id), }) } - -fn validate_quantitative_audit_manifest( - manifest: &QuantitativeAuditManifest, - path: &Path, - context: QuantitativeAuditContext<'_>, -) -> Result<()> { - if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", - path.display(), - manifest.schema - )); - } - if manifest.manifest_id.trim().is_empty() { - return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); - } - if manifest.run_id != context.run_id { - return Err(eyre::eyre!( - "{} has run_id {}, expected {}.", - path.display(), - manifest.run_id, - context.run_id - )); - } - if manifest.corpus_id != context.corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected {}.", - path.display(), - manifest.corpus_id, - context.corpus_id - )); - } - if manifest.product != context.product || manifest.adapter_id != context.adapter_id { - return Err(eyre::eyre!( - "{} has product {}:{} but current row is {}:{}.", - path.display(), - manifest.product, - manifest.adapter_id, - context.product, - context.adapter_id - )); - } - if manifest.sample_size != context.source_jobs.len() { - return Err(eyre::eyre!( - "{} has sample_size {}, expected {}.", - path.display(), - manifest.sample_size, - context.source_jobs.len() - )); - } - if manifest.ranking_query_count != context.ranking_query_count { - return Err(eyre::eyre!( - "{} has ranking_query_count {}, expected {}.", - path.display(), - manifest.ranking_query_count, - context.ranking_query_count - )); - } - if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { - return Err(eyre::eyre!( - "{} has explicit_qrel_query_count {}, expected {}.", - path.display(), - manifest.explicit_qrel_query_count, - context.explicit_qrel_query_count - )); - } - - validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; - 
validate_quantitative_audit_controls(manifest, path)?; - - artifacts::validate_quantitative_audit_artifacts(manifest, path) -} - -fn validate_quantitative_audit_query_ids( - manifest: &QuantitativeAuditManifest, - path: &Path, - source_jobs: &[RealWorldJob], -) -> Result<()> { - let expected = metrics::ranking_query_ids(source_jobs); - let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); - - if actual.len() != manifest.query_ids.len() { - return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); - } - if actual != expected { - let missing = expected.difference(&actual).copied().collect::>(); - let extra = actual.difference(&expected).copied().collect::>(); - - return Err(eyre::eyre!( - "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", - path.display(), - missing, - extra - )); - } - - Ok(()) -} - -fn validate_quantitative_audit_controls( - manifest: &QuantitativeAuditManifest, - path: &Path, -) -> Result<()> { - let controls = manifest.controls.iter().map(String::as_str).collect::>(); - - if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { - return Err(eyre::eyre!( - "{} marks held_out=true without required control {}.", - path.display(), - REQUIRED_HELD_OUT_AUDIT_CONTROL - )); - } - if manifest.leakage_audited - && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) - || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) - { - return Err(eyre::eyre!( - "{} marks leakage_audited=true without required controls {} and {}.", - path.display(), - REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, - REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL - )); - } - if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() - { - return Err(eyre::eyre!( - "{} marks audit controls true but has an empty claim_boundary.", - path.display() - )); - } - - Ok(()) -} diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs new file mode 100644 index 00000000..5aab2c4f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs @@ -0,0 +1,142 @@ +use crate::{ + BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre, + quantitative::{ + QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, + REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + audit_manifest::{QuantitativeAuditContext, artifacts}, + metrics, + }, +}; + +pub(super) fn validate_quantitative_audit_manifest( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, expected {}.", + path.display(), + manifest.sample_size, + 
context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; + validate_quantitative_audit_controls(manifest, path)?; + + artifacts::validate_quantitative_audit_artifacts(manifest, path) +} + +fn validate_quantitative_audit_query_ids( + manifest: &QuantitativeAuditManifest, + path: &Path, + source_jobs: &[RealWorldJob], +) -> Result<()> { + let expected = metrics::ranking_query_ids(source_jobs); + let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); + + if actual.len() != manifest.query_ids.len() { + return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); + } + if actual != expected { + let missing = expected.difference(&actual).copied().collect::>(); + let extra = actual.difference(&expected).copied().collect::>(); + + return Err(eyre::eyre!( + "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", + path.display(), + missing, + extra + )); + } + + Ok(()) +} + +fn validate_quantitative_audit_controls( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + let controls = manifest.controls.iter().map(String::as_str).collect::>(); + + if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { + return Err(eyre::eyre!( + "{} marks held_out=true without required control {}.", + path.display(), + REQUIRED_HELD_OUT_AUDIT_CONTROL + )); + } + if manifest.leakage_audited + && 
(!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) + || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) + { + return Err(eyre::eyre!( + "{} marks leakage_audited=true without required controls {} and {}.", + path.display(), + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL + )); + } + if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() + { + return Err(eyre::eyre!( + "{} marks audit controls true but has an empty claim_boundary.", + path.display() + )); + } + + Ok(()) +} From 660172501cca286a38c30b750502563bc23de2a3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:09:56 -0400 Subject: [PATCH 15/58] {"schema":"decodex/commit/1","summary":"Split quantitative report assembly","authority":"manual"} --- .../real_world_job_benchmark/quantitative.rs | 141 +---------------- .../quantitative/report.rs | 142 ++++++++++++++++++ 2 files changed, 146 insertions(+), 137 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs index 16365e66..4032c770 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative.rs @@ -1,17 +1,16 @@ mod audit_manifest; mod metrics; mod product_manifest; +mod report; pub(super) use self::{ audit_manifest::quantitative_audit_manifest_from_jobs, product_manifest::quantitative_product_manifest_from_report, + report::{QuantitativeReportInput, quantitative_scoreboard_report}, }; -use self::audit_manifest::{QuantitativeAuditContext, QuantitativeAuditEvidence}; -use crate::{ - AdapterReport, BTreeSet, JobReport, Path, QuantitativeBenchmarkControls, - QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result, -}; +use 
self::audit_manifest::QuantitativeAuditEvidence; +use crate::{AdapterReport, BTreeSet, JobReport, RealWorldJob, ReportSummary}; const QUANTITATIVE_SCOREBOARD_SCHEMA: &str = "elf.agent_memory_quantitative_benchmark/v1"; const QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA: &str = @@ -30,138 +29,6 @@ const QUANTITATIVE_ROW_CLAIM_BOUNDARY: &str = concat!( "Fixture-backed rows prove benchmark mechanics, not product-runtime or leaderboard claims." ); -pub(super) struct QuantitativeReportInput<'a> { - pub(super) run_id: &'a str, - pub(super) generated_at: &'a str, - pub(super) adapter: &'a AdapterReport, - pub(super) source_jobs: &'a [RealWorldJob], - pub(super) jobs: &'a [JobReport], - pub(super) summary: &'a ReportSummary, - pub(super) product_manifest_path: Option<&'a Path>, - pub(super) audit_manifest_path: Option<&'a Path>, -} - -pub(super) fn quantitative_scoreboard_report( - input: QuantitativeReportInput<'_>, -) -> Result { - let corpus_id = quantitative_corpus_id(input.source_jobs); - let evidence_class = quantitative_evidence_class(input.adapter, input.jobs); - let per_query_rows = metrics::quantitative_per_query_rows( - input.source_jobs, - input.jobs, - corpus_id.as_str(), - evidence_class, - input.adapter.adapter_id.as_str(), - ); - let ranking_query_count = per_query_rows - .iter() - .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) - .count(); - let explicit_qrel_query_count = - per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); - let metric_comparable = ranking_query_count > 0; - let result_state = quantitative_result_state(input.summary); - let audit_evidence = audit_manifest::quantitative_audit_evidence( - input.audit_manifest_path, - QuantitativeAuditContext { - run_id: input.run_id, - corpus_id: corpus_id.as_str(), - product: "ELF", - adapter_id: input.adapter.adapter_id.as_str(), - source_jobs: input.source_jobs, - ranking_query_count, - explicit_qrel_query_count, - }, - )?; - let leaderboard_eligible = 
quantitative_row_leaderboard_eligible( - evidence_class, - input.source_jobs.len(), - ranking_query_count, - explicit_qrel_query_count, - metric_comparable, - &audit_evidence, - ); - let row = QuantitativeBenchmarkRow { - product: "ELF".to_string(), - adapter_id: input.adapter.adapter_id.clone(), - adapter_name: input.adapter.name.clone(), - suite: quantitative_suite_id(input.jobs), - evidence_class: evidence_class.to_string(), - source_manifest_corpus_id: Some(corpus_id.clone()), - result_state: result_state.to_string(), - comparable: metric_comparable, - metric_comparable, - leaderboard_eligible, - held_out: audit_evidence.held_out, - leakage_audited: audit_evidence.leakage_audited, - audit_manifest_id: audit_evidence.audit_manifest_id, - fixture_regression_only: evidence_class == "fixture_backed", - sample_size: input.jobs.len(), - ranking_query_count, - ranking_coverage_state: metrics::ranking_coverage_state( - input.summary, - input.source_jobs.len(), - ranking_query_count, - ) - .to_string(), - ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), - qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) - .to_string(), - explicit_qrel_query_count, - metrics: metrics::aggregate_metrics(per_query_rows.as_slice()), - metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), - denominators: metrics::aggregate_denominators(per_query_rows.as_slice()), - confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()), - claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), - }; - let product_manifest = product_manifest::quantitative_product_manifest( - input.product_manifest_path, - corpus_id.as_str(), - )?; - let imported_row_count = product_manifest.rows.len(); - let imported_per_query_count = product_manifest.per_query_rows.len(); - let mut rows = vec![row]; - let mut merged_per_query_rows = per_query_rows; - - 
rows.extend(product_manifest.rows); - merged_per_query_rows.extend(product_manifest.per_query_rows); - - let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; - let controls = QuantitativeBenchmarkControls { - same_corpus_required: true, - same_task_required: true, - ranked_candidates_required_for_ranking_metrics: true, - explicit_relevance_judgments_required_for_leaderboard: true, - minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT, - current_query_count: input.source_jobs.len(), - current_ranking_query_count: ranking_query_count, - current_explicit_qrel_query_count: explicit_qrel_query_count, - leaderboard_claim_allowed, - leakage_control: - "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" - .to_string(), - }; - - Ok(QuantitativeBenchmarkReport { - schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(), - generated_at: input.generated_at.to_string(), - corpus_id, - k_values: QUANTITATIVE_K_VALUES.to_vec(), - rows, - per_query_rows: merged_per_query_rows, - metrics_not_encoded: quantitative_metrics_not_encoded( - imported_row_count, - imported_per_query_count, - ), - controls, - claim_boundary: concat!( - "Do not convert fixture mechanics, missing explicit qrels, ", - "or partial candidate coverage into product leaderboard claims." 
- ) - .to_string(), - }) -} - fn quantitative_metrics_not_encoded( imported_row_count: usize, imported_per_query_count: usize, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs new file mode 100644 index 00000000..bb3ab895 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs @@ -0,0 +1,142 @@ +use crate::{ + AdapterReport, JobReport, Path, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, + QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result, + quantitative::{ + self, MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY, + QUANTITATIVE_SCOREBOARD_SCHEMA, + audit_manifest::{self, QuantitativeAuditContext}, + metrics, product_manifest, + }, +}; + +pub(crate) struct QuantitativeReportInput<'a> { + pub(crate) run_id: &'a str, + pub(crate) generated_at: &'a str, + pub(crate) adapter: &'a AdapterReport, + pub(crate) source_jobs: &'a [RealWorldJob], + pub(crate) jobs: &'a [JobReport], + pub(crate) summary: &'a ReportSummary, + pub(crate) product_manifest_path: Option<&'a Path>, + pub(crate) audit_manifest_path: Option<&'a Path>, +} + +pub(crate) fn quantitative_scoreboard_report( + input: QuantitativeReportInput<'_>, +) -> Result { + let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs); + let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs); + let per_query_rows = metrics::quantitative_per_query_rows( + input.source_jobs, + input.jobs, + corpus_id.as_str(), + evidence_class, + input.adapter.adapter_id.as_str(), + ); + let ranking_query_count = per_query_rows + .iter() + .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) + .count(); + let explicit_qrel_query_count = + per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); + let metric_comparable = ranking_query_count > 0; + let 
result_state = quantitative::quantitative_result_state(input.summary); + let audit_evidence = audit_manifest::quantitative_audit_evidence( + input.audit_manifest_path, + QuantitativeAuditContext { + run_id: input.run_id, + corpus_id: corpus_id.as_str(), + product: "ELF", + adapter_id: input.adapter.adapter_id.as_str(), + source_jobs: input.source_jobs, + ranking_query_count, + explicit_qrel_query_count, + }, + )?; + let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible( + evidence_class, + input.source_jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + metric_comparable, + &audit_evidence, + ); + let row = QuantitativeBenchmarkRow { + product: "ELF".to_string(), + adapter_id: input.adapter.adapter_id.clone(), + adapter_name: input.adapter.name.clone(), + suite: quantitative::quantitative_suite_id(input.jobs), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.clone()), + result_state: result_state.to_string(), + comparable: metric_comparable, + metric_comparable, + leaderboard_eligible, + held_out: audit_evidence.held_out, + leakage_audited: audit_evidence.leakage_audited, + audit_manifest_id: audit_evidence.audit_manifest_id, + fixture_regression_only: evidence_class == "fixture_backed", + sample_size: input.jobs.len(), + ranking_query_count, + ranking_coverage_state: metrics::ranking_coverage_state( + input.summary, + input.source_jobs.len(), + ranking_query_count, + ) + .to_string(), + ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), + qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) + .to_string(), + explicit_qrel_query_count, + metrics: metrics::aggregate_metrics(per_query_rows.as_slice()), + metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), + denominators: metrics::aggregate_denominators(per_query_rows.as_slice()), + confidence_intervals: 
metrics::aggregate_confidence_intervals(per_query_rows.as_slice()), + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + }; + let product_manifest = product_manifest::quantitative_product_manifest( + input.product_manifest_path, + corpus_id.as_str(), + )?; + let imported_row_count = product_manifest.rows.len(); + let imported_per_query_count = product_manifest.per_query_rows.len(); + let mut rows = vec![row]; + let mut merged_per_query_rows = per_query_rows; + + rows.extend(product_manifest.rows); + merged_per_query_rows.extend(product_manifest.per_query_rows); + + let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; + let controls = QuantitativeBenchmarkControls { + same_corpus_required: true, + same_task_required: true, + ranked_candidates_required_for_ranking_metrics: true, + explicit_relevance_judgments_required_for_leaderboard: true, + minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT, + current_query_count: input.source_jobs.len(), + current_ranking_query_count: ranking_query_count, + current_explicit_qrel_query_count: explicit_qrel_query_count, + leaderboard_claim_allowed, + leakage_control: + "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" + .to_string(), + }; + + Ok(QuantitativeBenchmarkReport { + schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(), + generated_at: input.generated_at.to_string(), + corpus_id, + k_values: QUANTITATIVE_K_VALUES.to_vec(), + rows, + per_query_rows: merged_per_query_rows, + metrics_not_encoded: quantitative::quantitative_metrics_not_encoded( + imported_row_count, + imported_per_query_count, + ), + controls, + claim_boundary: concat!( + "Do not convert fixture mechanics, missing explicit qrels, ", + "or partial candidate coverage into product leaderboard claims." 
+ ) + .to_string(), + }) +} From 6261914c54be6a7d8ba07a72e4455faa43f1b9af Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:14:46 -0400 Subject: [PATCH 16/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query metrics","authority":"manual"} --- .../quantitative/metrics/per_query.rs | 145 ++---------------- .../metrics/per_query/query_metrics.rs | 129 ++++++++++++++++ 2 files changed, 139 insertions(+), 135 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs index db9e932c..fbbce5db 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs @@ -1,7 +1,8 @@ +mod query_metrics; + use crate::{ - BTreeMap, BTreeSet, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, - quantitative::{QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY}, - scoring, + BTreeMap, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, + quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, scoring, }; pub(super) fn quantitative_per_query_rows( @@ -29,15 +30,14 @@ fn quantitative_per_query_row( ) -> QuantitativePerQueryRow { let relevance = relevance_grades(source_job, job); let candidates = scoring::produced_evidence_order(source_job); - let positive_relevance_count = positive_qrel_count(&relevance); - let metrics = per_query_metrics(candidates.as_slice(), &relevance); + let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); + let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { "not_encoded" } else { formatting::status_str(job.status) }; let 
metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); - let denominators = per_query_denominators(candidates.len(), positive_relevance_count); QuantitativePerQueryRow { job_id: job.job_id.clone(), @@ -53,7 +53,10 @@ fn quantitative_per_query_row( adapter_id: adapter_id.to_string(), metrics, metric_states, - denominators, + denominators: query_metrics::per_query_denominators( + candidates.len(), + positive_relevance_count, + ), claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), } } @@ -73,126 +76,6 @@ fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap, -) -> BTreeMap> { - let mut metrics = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - let relevant_at_k = relevant_at_k(candidates, relevance, *k); - - metrics - .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); - metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); - metrics.insert( - format!("success_at_{k}"), - Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), - ); - } - - metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); - metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); - metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); - - metrics -} - -fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { - candidates - .iter() - .take(k) - .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) - .count() -} - -fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } - - Some( - candidates - .iter() - .position(|candidate| { - relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) - }) - .map_or(0.0, |index| 1.0 / (index + 1) as f64), - ) -} - -fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option 
{ - if positive_qrel_count(relevance) == 0 { - return None; - } - - let dcg = candidates - .iter() - .take(k) - .enumerate() - .map(|(index, candidate)| { - relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) - / ((index + 2) as f64).log2() - }) - .sum::(); - let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); - - ideal.sort_by(|left, right| right.total_cmp(left)); - - let idcg = ideal - .iter() - .take(k) - .enumerate() - .map(|(index, grade)| grade / ((index + 2) as f64).log2()) - .sum::(); - - Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) -} - -fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { - let positive_count = positive_qrel_count(relevance); - - if positive_count == 0 { - return None; - } - - let mut hit_count = 0; - let mut precision_sum = 0.0; - let mut seen = BTreeSet::new(); - - for (index, candidate) in candidates.iter().enumerate() { - if !seen.insert(candidate.as_str()) { - continue; - } - if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { - hit_count += 1; - precision_sum += hit_count as f64 / (index + 1) as f64; - } - } - - Some(precision_sum / positive_count as f64) -} - -fn per_query_denominators( - candidate_count: usize, - expected_relevant_count: usize, -) -> BTreeMap { - let mut denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert(format!("recall_at_{k}"), expected_relevant_count); - denominators.insert(format!("precision_at_{k}"), *k); - denominators.insert(format!("success_at_{k}"), 1); - } - - denominators.insert("mrr".to_string(), expected_relevant_count); - denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); - denominators.insert("average_precision".to_string(), expected_relevant_count); - denominators.insert("candidate_count".to_string(), candidate_count); - - denominators -} - fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { if 
!source_job.expected_answer.relevance_judgments.is_empty() { "explicit_qrels" @@ -202,11 +85,3 @@ fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { "expected_evidence_fallback" } } - -fn positive_qrel_count(relevance: &BTreeMap) -> usize { - relevance.values().filter(|grade| **grade > 0.0).count() -} - -fn rate(numerator: usize, denominator: usize) -> Option { - (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs new file mode 100644 index 00000000..01babc1d --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs @@ -0,0 +1,129 @@ +use crate::{BTreeMap, BTreeSet, formatting, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn per_query_metrics( + candidates: &[String], + relevance: &BTreeMap, +) -> BTreeMap> { + let mut metrics = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + let relevant_at_k = relevant_at_k(candidates, relevance, *k); + + metrics + .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); + metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); + metrics.insert( + format!("success_at_{k}"), + Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), + ); + } + + metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); + metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); + metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); + + metrics +} + +pub(super) fn positive_qrel_count(relevance: &BTreeMap) -> usize { + relevance.values().filter(|grade| **grade > 0.0).count() +} + +pub(super) fn per_query_denominators( + candidate_count: usize, + expected_relevant_count: usize, +) -> 
BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert(format!("recall_at_{k}"), expected_relevant_count); + denominators.insert(format!("precision_at_{k}"), *k); + denominators.insert(format!("success_at_{k}"), 1); + } + + denominators.insert("mrr".to_string(), expected_relevant_count); + denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); + denominators.insert("average_precision".to_string(), expected_relevant_count); + denominators.insert("candidate_count".to_string(), candidate_count); + + denominators +} + +fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { + candidates + .iter() + .take(k) + .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) + .count() +} + +fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} + +fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { + if positive_qrel_count(relevance) == 0 { + return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as f64).log2()) + .sum::(); + + Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) +} + +fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { + let positive_count = positive_qrel_count(relevance); + + if positive_count == 
0 { + return None; + } + + let mut hit_count = 0; + let mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / positive_count as f64) +} + +fn rate(numerator: usize, denominator: usize) -> Option { + (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) +} From 69af28a58114d96d5ae0adb6a147a30dcd0aaed1 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:18:20 -0400 Subject: [PATCH 17/58] {"schema":"decodex/commit/1","summary":"Split quantitative aggregate confidence","authority":"manual"} --- .../quantitative/metrics/aggregate.rs | 78 +----------------- .../metrics/aggregate/confidence.rs | 82 +++++++++++++++++++ 2 files changed, 86 insertions(+), 74 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs index cb2dd63d..4d737d85 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs @@ -1,6 +1,8 @@ +mod confidence; + use crate::{ BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting, - quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z}, + quantitative::QUANTITATIVE_K_VALUES, }; pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { @@ -77,20 +79,7 @@ pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeM pub(super) fn aggregate_confidence_intervals( rows: 
&[QuantitativePerQueryRow], ) -> BTreeMap { - let mut confidence_intervals = BTreeMap::new(); - - for metric in rate_metric_names() { - let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); - - if denominator > 0 { - confidence_intervals.insert( - metric, - wilson_confidence_interval(numerator.min(denominator), denominator), - ); - } - } - - confidence_intervals + confidence::aggregate_confidence_intervals(rows) } fn quantitative_metric_names() -> Vec { @@ -108,65 +97,6 @@ fn quantitative_metric_names() -> Vec { metrics } -fn rate_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { - metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - - metrics -} - -fn aggregate_rate_numerator_denominator( - rows: &[QuantitativePerQueryRow], - metric: &str, -) -> (usize, usize) { - let mut numerator = 0; - let mut denominator = 0; - - for row in rows { - let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { - continue; - }; - let Some(row_denominator) = row.denominators.get(metric).copied() else { - continue; - }; - - if row_denominator == 0 { - continue; - } - - denominator += row_denominator; - numerator += (value * row_denominator as f64).round() as usize; - } - - (numerator, denominator) -} - -fn wilson_confidence_interval( - numerator: usize, - denominator: usize, -) -> QuantitativeConfidenceInterval { - let n = denominator as f64; - let p = numerator as f64 / n; - let z2 = WILSON_95_Z * WILSON_95_Z; - let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); - let half_width = - WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); - - QuantitativeConfidenceInterval { - method: "wilson_score".to_string(), - confidence: 0.95, - lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), - upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), - numerator, - 
denominator, - } -} - fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { rows.iter().filter_map(|row| row.denominators.get(metric)).sum() } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs new file mode 100644 index 00000000..e1db5fb8 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs @@ -0,0 +1,82 @@ +use crate::{ + BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting, + quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z}, +}; + +pub(super) fn aggregate_confidence_intervals( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { + let mut confidence_intervals = BTreeMap::new(); + + for metric in rate_metric_names() { + let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); + + if denominator > 0 { + confidence_intervals.insert( + metric, + wilson_confidence_interval(numerator.min(denominator), denominator), + ); + } + } + + confidence_intervals +} + +fn rate_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + + metrics +} + +fn aggregate_rate_numerator_denominator( + rows: &[QuantitativePerQueryRow], + metric: &str, +) -> (usize, usize) { + let mut numerator = 0; + let mut denominator = 0; + + for row in rows { + let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { + continue; + }; + let Some(row_denominator) = row.denominators.get(metric).copied() else { + continue; + }; + + if row_denominator == 0 { + continue; + } + + denominator += row_denominator; + numerator += (value * row_denominator as f64).round() as usize; + } + + (numerator, denominator) +} + +fn 
wilson_confidence_interval( + numerator: usize, + denominator: usize, +) -> QuantitativeConfidenceInterval { + let n = denominator as f64; + let p = numerator as f64 / n; + let z2 = WILSON_95_Z * WILSON_95_Z; + let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); + let half_width = + WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); + + QuantitativeConfidenceInterval { + method: "wilson_score".to_string(), + confidence: 0.95, + lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), + upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), + numerator, + denominator, + } +} From 0d546f8eafed53f70eee70bfd7380de3d15a0e18 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:24:10 -0400 Subject: [PATCH 18/58] {"schema":"decodex/commit/1","summary":"Split quantitative product manifest tests","authority":"manual"} --- .../quantitative/product_manifest.rs | 185 +++--------------- .../quantitative/product_manifest/export.rs | 73 +++++++ .../product_manifest/validation.rs | 64 ++++++ 3 files changed, 162 insertions(+), 160 deletions(-) create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs create mode 100644 apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs index c7b543c5..054e70f3 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest.rs @@ -1,5 +1,9 @@ +#[path = "product_manifest/export.rs"] mod export; +#[path = "product_manifest/validation.rs"] mod validation; + use std::{ env, fs, + path::PathBuf, process::{self, Command}, }; @@ -8,114 +12,40 @@ use serde_json::Value; use crate::support; -#[test] -fn 
quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = - env::temp_dir().join(format!("elf-quantitative-product-manifest-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("synthetic-rival-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-product-manifest") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .arg("--product") - .arg("Synthetic Rival") - .arg("--adapter-id") - .arg("synthetic_rival") - .arg("--adapter-name") - .arg("Synthetic Rival adapter") - .output()?; - - assert!( - export.status.success(), - "product manifest export failed: {}", - String::from_utf8_lossy(&export.stderr) - ); - - let manifest = support::load_json(&manifest_path)?; - - assert_eq!( - manifest.pointer("/schema").and_then(Value::as_str), - Some("elf.agent_memory_quantitative_product_manifest/v1") - ); - assert_eq!( - manifest.pointer("/rows/0/product").and_then(Value::as_str), - Some("Synthetic Rival") - ); - assert_eq!( - manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str), - Some("synthetic_rival") - ); - - let imported = super::run_report_with_quantitative_manifest(&manifest_path)?; - let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?; - let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?; +struct ProductManifestPaths { + temp_dir: PathBuf, + report_path: PathBuf, + manifest_path: PathBuf, +} - assert_eq!(rows.len(), 2); - assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival")); - assert!(!support::array_contains_str( - &imported, - 
"/quantitative_scoreboard/metrics_not_encoded", - "external_product_manifest_import" - )?); - assert!( - support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any( - |row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival") - ) - ); +fn product_manifest_paths(temp_name: &str, manifest_file: &str) -> ProductManifestPaths { + let temp_dir = env::temp_dir().join(format!("{temp_name}-{}", process::id())); - Ok(()) + ProductManifestPaths { + report_path: temp_dir.join("report.json"), + manifest_path: temp_dir.join(manifest_file), + temp_dir, + } } -#[test] -fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> { +fn write_adversarial_report(paths: &ProductManifestPaths) -> Result<()> { let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = env::temp_dir() - .join(format!("elf-quantitative-product-manifest-elf-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("elf-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("export-quantitative-product-manifest") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .output()?; - assert!(!output.status.success()); - assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF")); + fs::create_dir_all(&paths.temp_dir)?; + fs::write(&paths.report_path, serde_json::to_vec_pretty(&report)?)?; Ok(()) } -#[test] -fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = env::temp_dir() - .join(format!("elf-quantitative-product-manifest-corpus-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - 
let manifest_path = temp_dir.join("wrong-corpus-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; +fn export_synthetic_rival_manifest(paths: &ProductManifestPaths) -> Result<()> { + write_adversarial_report(paths)?; let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) .arg("export-quantitative-product-manifest") .arg("--report") - .arg(&report_path) + .arg(&paths.report_path) .arg("--out") - .arg(&manifest_path) + .arg(&paths.manifest_path) .arg("--product") .arg("Synthetic Rival") .arg("--adapter-id") @@ -130,74 +60,9 @@ fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { String::from_utf8_lossy(&export.stderr) ); - let mut manifest = support::load_json(&manifest_path)?; - - support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?; - fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("run") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--quantitative-product-manifest") - .arg(&manifest_path) - .output()?; - - assert!(!output.status.success()); - assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus")); - Ok(()) } -#[test] -fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> { - let report = support::run_json_report_from(support::adversarial_quality_fixture_dir())?; - let temp_dir = env::temp_dir() - .join(format!("elf-quantitative-product-manifest-per-query-test-{}", process::id())); - let report_path = temp_dir.join("report.json"); - let manifest_path = temp_dir.join("missing-per-query-product-manifest.json"); - - fs::create_dir_all(&temp_dir)?; - fs::write(&report_path, serde_json::to_vec_pretty(&report)?)?; - - let export = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - 
.arg("export-quantitative-product-manifest") - .arg("--report") - .arg(&report_path) - .arg("--out") - .arg(&manifest_path) - .arg("--product") - .arg("Synthetic Rival") - .arg("--adapter-id") - .arg("synthetic_rival") - .arg("--adapter-name") - .arg("Synthetic Rival adapter") - .output()?; - - assert!( - export.status.success(), - "product manifest export failed: {}", - String::from_utf8_lossy(&export.stderr) - ); - - let mut manifest = support::load_json(&manifest_path)?; - - support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?; - fs::write(&manifest_path, serde_json::to_vec_pretty(&manifest)?)?; - - let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) - .arg("run") - .arg("--fixtures") - .arg(support::adversarial_quality_fixture_dir()) - .arg("--quantitative-product-manifest") - .arg(&manifest_path) - .output()?; - - assert!(!output.status.success()); - - let stderr = String::from_utf8_lossy(&output.stderr); - - assert!(stderr.contains("ranked queries but only 0")); - - Ok(()) +fn run_report_with_manifest(paths: &ProductManifestPaths) -> Result { + super::run_report_with_quantitative_manifest(&paths.manifest_path) } diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs new file mode 100644 index 00000000..d56f2bd7 --- /dev/null +++ b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/export.rs @@ -0,0 +1,73 @@ +use std::process::Command; + +use color_eyre::Result; +use serde_json::Value; + +use crate::support; + +#[test] +fn quantitative_product_manifest_exports_and_reimports_same_corpus_rows() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-test", + "synthetic-rival-product-manifest.json", + ); + + super::export_synthetic_rival_manifest(&paths)?; + + let manifest = support::load_json(&paths.manifest_path)?; + 
+ assert_eq!( + manifest.pointer("/schema").and_then(Value::as_str), + Some("elf.agent_memory_quantitative_product_manifest/v1") + ); + assert_eq!( + manifest.pointer("/rows/0/product").and_then(Value::as_str), + Some("Synthetic Rival") + ); + assert_eq!( + manifest.pointer("/per_query_rows/0/adapter_id").and_then(Value::as_str), + Some("synthetic_rival") + ); + + let imported = super::run_report_with_manifest(&paths)?; + let rows = support::array_at(&imported, "/quantitative_scoreboard/rows")?; + let rival = support::find_by_field(rows, "/adapter_id", "synthetic_rival")?; + + assert_eq!(rows.len(), 2); + assert_eq!(rival.pointer("/product").and_then(Value::as_str), Some("Synthetic Rival")); + assert!(!support::array_contains_str( + &imported, + "/quantitative_scoreboard/metrics_not_encoded", + "external_product_manifest_import" + )?); + assert!( + support::array_at(&imported, "/quantitative_scoreboard/per_query_rows")?.iter().any( + |row| row.pointer("/adapter_id").and_then(Value::as_str) == Some("synthetic_rival") + ) + ); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_export_rejects_elf_self_rows() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-elf-test", + "elf-product-manifest.json", + ); + + super::write_adversarial_report(&paths)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("export-quantitative-product-manifest") + .arg("--report") + .arg(&paths.report_path) + .arg("--out") + .arg(&paths.manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("exports product ELF")); + + Ok(()) +} diff --git a/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs new file mode 100644 index 00000000..e4e302b3 --- /dev/null +++ 
b/apps/elf-eval/tests/real_world_job_benchmark/quantitative/product_manifest/validation.rs @@ -0,0 +1,64 @@ +use std::{fs, process::Command}; + +use color_eyre::Result; + +use crate::support; + +#[test] +fn quantitative_product_manifest_rejects_cross_corpus_imports() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-corpus-test", + "wrong-corpus-product-manifest.json", + ); + + super::export_synthetic_rival_manifest(&paths)?; + + let mut manifest = support::load_json(&paths.manifest_path)?; + + support::set_json_pointer(&mut manifest, "/corpus_id", serde_json::json!("wrong-corpus"))?; + fs::write(&paths.manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&paths.manifest_path) + .output()?; + + assert!(!output.status.success()); + assert!(String::from_utf8_lossy(&output.stderr).contains("expected same-corpus")); + + Ok(()) +} + +#[test] +fn quantitative_product_manifest_rejects_ranked_rows_without_per_query_evidence() -> Result<()> { + let paths = super::product_manifest_paths( + "elf-quantitative-product-manifest-per-query-test", + "missing-per-query-product-manifest.json", + ); + + super::export_synthetic_rival_manifest(&paths)?; + + let mut manifest = support::load_json(&paths.manifest_path)?; + + support::set_json_pointer(&mut manifest, "/per_query_rows", serde_json::json!([]))?; + fs::write(&paths.manifest_path, serde_json::to_vec_pretty(&manifest)?)?; + + let output = Command::new(env!("CARGO_BIN_EXE_real_world_job_benchmark")) + .arg("run") + .arg("--fixtures") + .arg(support::adversarial_quality_fixture_dir()) + .arg("--quantitative-product-manifest") + .arg(&paths.manifest_path) + .output()?; + + assert!(!output.status.success()); + + let stderr = String::from_utf8_lossy(&output.stderr); + + 
assert!(stderr.contains("ranked queries but only 0")); + + Ok(()) +} From e19440ab2f38e4cbc319496a112db7bd25031440 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:28:02 -0400 Subject: [PATCH 19/58] {"schema":"decodex/commit/1","summary":"Split quantitative product row validation","authority":"manual"} --- .../product_manifest/validation.rs | 131 +-------------- .../product_manifest/validation/rows.rs | 152 ++++++++++++++++++ 2 files changed, 159 insertions(+), 124 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs index 0ae5bf33..fe86d636 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation.rs @@ -1,6 +1,8 @@ +mod rows; + use crate::{ - BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre, - quantitative::{MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA}, + Path, QuantitativeProductManifest, Result, eyre, + quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, }; pub(super) fn validate_quantitative_product_manifest( @@ -30,128 +32,9 @@ pub(super) fn validate_quantitative_product_manifest( return Err(eyre::eyre!("{} declares no quantitative product rows.", path.display())); } - let row_keys = manifest - .rows - .iter() - .map(|row| (row.product.as_str(), row.adapter_id.as_str())) - .collect::>(); - - for row in &manifest.rows { - if row.product == "ELF" { - return Err(eyre::eyre!( - "{} quantitative product manifest must not inject ELF self rows.", - path.display() - )); - } - if row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.adapter_name.trim().is_empty() 
- || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative product row.", - path.display() - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - if row.leaderboard_eligible { - validate_leaderboard_eligible_product_row(path, row)?; - } - } - for row in &manifest.per_query_rows { - if row.job_id.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - || row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.qrel_source.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative per-query product row.", - path.display() - )); - } - if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { - return Err(eyre::eyre!( - "{} per-query row {}:{} has no matching product row.", - path.display(), - row.product, - row.adapter_id - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} per-query row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - } - for row in &manifest.rows { - if row.ranking_query_count == 0 { - continue; - } - - let per_query_count = manifest - .per_query_rows - .iter() - .filter(|per_query| { - per_query.product == row.product && per_query.adapter_id == row.adapter_id - }) - .count(); - - if per_query_count < row.ranking_query_count { - return Err(eyre::eyre!( - "{} row {}:{} declares {} ranked queries but only {} per-query rows.", - path.display(), - row.product, - row.adapter_id, - row.ranking_query_count, - per_query_count - )); - } - } - - Ok(()) -} - -fn validate_leaderboard_eligible_product_row( - path: &Path, - row: 
&QuantitativeBenchmarkRow, -) -> Result<()> { - let has_audit_manifest_id = row - .audit_manifest_id - .as_deref() - .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); - - if row.evidence_class != "live_real_world" - || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT - || row.ranking_query_count != row.sample_size - || row.explicit_qrel_query_count != row.ranking_query_count - || !row.held_out - || !row.leakage_audited - || !has_audit_manifest_id - { - return Err(eyre::eyre!( - "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", - path.display(), - row.product, - row.adapter_id - )); - } + rows::validate_quantitative_product_rows(manifest, path, corpus_id)?; + rows::validate_quantitative_per_query_rows(manifest, path, corpus_id)?; + rows::validate_ranked_row_evidence(manifest, path)?; Ok(()) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs new file mode 100644 index 00000000..055234ed --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs @@ -0,0 +1,152 @@ +use crate::{ + BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre, + quantitative::MIN_LEADERBOARD_QUERY_COUNT, +}; + +pub(super) fn validate_quantitative_product_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + for row in &manifest.rows { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + 
|| row.result_state.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative product row.", + path.display() + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + if row.leaderboard_eligible { + validate_leaderboard_eligible_product_row(path, row)?; + } + } + + Ok(()) +} + +pub(super) fn validate_quantitative_per_query_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + let row_keys = manifest + .rows + .iter() + .map(|row| (row.product.as_str(), row.adapter_id.as_str())) + .collect::>(); + + for row in &manifest.per_query_rows { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + } + + Ok(()) +} + +pub(super) fn validate_ranked_row_evidence( + manifest: &QuantitativeProductManifest, + path: &Path, +) -> Result<()> { + for row in &manifest.rows { + if row.ranking_query_count == 0 { + continue; + } + + let per_query_count = manifest + .per_query_rows + .iter() + .filter(|per_query| { + per_query.product == row.product && per_query.adapter_id == row.adapter_id + }) + .count(); 
+ + if per_query_count < row.ranking_query_count { + return Err(eyre::eyre!( + "{} row {}:{} declares {} ranked queries but only {} per-query rows.", + path.display(), + row.product, + row.adapter_id, + row.ranking_query_count, + per_query_count + )); + } + } + + Ok(()) +} + +fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} From 974489b3f7eaa3457a5dafaa9f380abacc4ac5db Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:34:30 -0400 Subject: [PATCH 20/58] {"schema":"decodex/commit/1","summary":"Split quantitative product row checks","authority":"manual"} --- .../product_manifest/validation/rows.rs | 140 +----------------- .../validation/rows/per_query.rs | 48 ++++++ .../validation/rows/product.rs | 73 +++++++++ .../validation/rows/ranking.rs | 33 +++++ 4 files changed, 162 insertions(+), 132 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs index 055234ed..36009dfa 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows.rs @@ -1,47 +1,15 @@ -use crate::{ - BTreeSet, Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre, - quantitative::MIN_LEADERBOARD_QUERY_COUNT, -}; +mod per_query; +mod product; +mod ranking; + +use crate::{Path, QuantitativeProductManifest, Result}; pub(super) fn validate_quantitative_product_rows( manifest: &QuantitativeProductManifest, path: &Path, corpus_id: &str, ) -> Result<()> { - for row in &manifest.rows { - if row.product == "ELF" { - return Err(eyre::eyre!( - "{} quantitative product manifest must not inject ELF self rows.", - path.display() - )); - } - if row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.adapter_name.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative product row.", - path.display() - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - if row.leaderboard_eligible { - validate_leaderboard_eligible_product_row(path, row)?; - } - } - - Ok(()) + product::validate_quantitative_product_rows(manifest, path, corpus_id) } pub(super) fn validate_quantitative_per_query_rows( @@ -49,104 +17,12 @@ pub(super) fn validate_quantitative_per_query_rows( path: &Path, corpus_id: &str, ) -> Result<()> { - let row_keys = manifest - .rows - .iter() - .map(|row| 
(row.product.as_str(), row.adapter_id.as_str())) - .collect::>(); - - for row in &manifest.per_query_rows { - if row.job_id.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - || row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.qrel_source.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative per-query product row.", - path.display() - )); - } - if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { - return Err(eyre::eyre!( - "{} per-query row {}:{} has no matching product row.", - path.display(), - row.product, - row.adapter_id - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} per-query row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } - } - - Ok(()) + per_query::validate_quantitative_per_query_rows(manifest, path, corpus_id) } pub(super) fn validate_ranked_row_evidence( manifest: &QuantitativeProductManifest, path: &Path, ) -> Result<()> { - for row in &manifest.rows { - if row.ranking_query_count == 0 { - continue; - } - - let per_query_count = manifest - .per_query_rows - .iter() - .filter(|per_query| { - per_query.product == row.product && per_query.adapter_id == row.adapter_id - }) - .count(); - - if per_query_count < row.ranking_query_count { - return Err(eyre::eyre!( - "{} row {}:{} declares {} ranked queries but only {} per-query rows.", - path.display(), - row.product, - row.adapter_id, - row.ranking_query_count, - per_query_count - )); - } - } - - Ok(()) -} - -fn validate_leaderboard_eligible_product_row( - path: &Path, - row: &QuantitativeBenchmarkRow, -) -> Result<()> { - let has_audit_manifest_id = row - .audit_manifest_id - .as_deref() - .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); - - if row.evidence_class != "live_real_world" - || row.sample_size < 
MIN_LEADERBOARD_QUERY_COUNT - || row.ranking_query_count != row.sample_size - || row.explicit_qrel_query_count != row.ranking_query_count - || !row.held_out - || !row.leakage_audited - || !has_audit_manifest_id - { - return Err(eyre::eyre!( - "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", - path.display(), - row.product, - row.adapter_id - )); - } - - Ok(()) + ranking::validate_ranked_row_evidence(manifest, path) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs new file mode 100644 index 00000000..4e720a68 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs @@ -0,0 +1,48 @@ +use crate::{BTreeSet, Path, QuantitativeProductManifest, Result, eyre}; + +pub(super) fn validate_quantitative_per_query_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + let row_keys = manifest + .rows + .iter() + .map(|row| (row.product.as_str(), row.adapter_id.as_str())) + .collect::>(); + + for row in &manifest.per_query_rows { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + if row.source_manifest_corpus_id.as_deref() != 
Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs new file mode 100644 index 00000000..913b0628 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs @@ -0,0 +1,73 @@ +use crate::{ + Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre, + quantitative::MIN_LEADERBOARD_QUERY_COUNT, +}; + +pub(super) fn validate_quantitative_product_rows( + manifest: &QuantitativeProductManifest, + path: &Path, + corpus_id: &str, +) -> Result<()> { + for row in &manifest.rows { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative product row.", + path.display() + )); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + if row.leaderboard_eligible { + validate_leaderboard_eligible_product_row(path, row)?; + } + } + + Ok(()) +} + +fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if 
row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs new file mode 100644 index 00000000..8206e54b --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/ranking.rs @@ -0,0 +1,33 @@ +use crate::{Path, QuantitativeProductManifest, Result, eyre}; + +pub(super) fn validate_ranked_row_evidence( + manifest: &QuantitativeProductManifest, + path: &Path, +) -> Result<()> { + for row in &manifest.rows { + if row.ranking_query_count == 0 { + continue; + } + + let per_query_count = manifest + .per_query_rows + .iter() + .filter(|per_query| { + per_query.product == row.product && per_query.adapter_id == row.adapter_id + }) + .count(); + + if per_query_count < row.ranking_query_count { + return Err(eyre::eyre!( + "{} row {}:{} declares {} ranked queries but only {} per-query rows.", + path.display(), + row.product, + row.adapter_id, + row.ranking_query_count, + per_query_count + )); + } + } + + Ok(()) +} From e1fc0e4033222afe5f6bc8bde9ca9b6c59d6f0d3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:41:35 -0400 Subject: [PATCH 21/58] {"schema":"decodex/commit/1","summary":"Split quantitative report assembly","authority":"manual"} --- .../quantitative/report.rs | 111 +++--------------- 
.../quantitative/report/controls.rs | 26 ++++ .../quantitative/report/row.rs | 100 ++++++++++++++++ 3 files changed, 142 insertions(+), 95 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs index bb3ab895..331acc70 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs @@ -1,12 +1,10 @@ +mod controls; +mod row; + use crate::{ - AdapterReport, JobReport, Path, QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, - QuantitativeBenchmarkRow, RealWorldJob, ReportSummary, Result, - quantitative::{ - self, MIN_LEADERBOARD_QUERY_COUNT, QUANTITATIVE_K_VALUES, QUANTITATIVE_ROW_CLAIM_BOUNDARY, - QUANTITATIVE_SCOREBOARD_SCHEMA, - audit_manifest::{self, QuantitativeAuditContext}, - metrics, product_manifest, - }, + AdapterReport, JobReport, Path, QuantitativeBenchmarkReport, RealWorldJob, ReportSummary, + Result, + quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA, product_manifest}, }; pub(crate) struct QuantitativeReportInput<'a> { @@ -23,108 +21,31 @@ pub(crate) struct QuantitativeReportInput<'a> { pub(crate) fn quantitative_scoreboard_report( input: QuantitativeReportInput<'_>, ) -> Result { - let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs); - let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs); - let per_query_rows = metrics::quantitative_per_query_rows( - input.source_jobs, - input.jobs, - corpus_id.as_str(), - evidence_class, - input.adapter.adapter_id.as_str(), - ); - let ranking_query_count = per_query_rows - .iter() - .filter(|row| row.candidate_count > 0 && 
row.expected_relevant_count > 0) - .count(); - let explicit_qrel_query_count = - per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); - let metric_comparable = ranking_query_count > 0; - let result_state = quantitative::quantitative_result_state(input.summary); - let audit_evidence = audit_manifest::quantitative_audit_evidence( - input.audit_manifest_path, - QuantitativeAuditContext { - run_id: input.run_id, - corpus_id: corpus_id.as_str(), - product: "ELF", - adapter_id: input.adapter.adapter_id.as_str(), - source_jobs: input.source_jobs, - ranking_query_count, - explicit_qrel_query_count, - }, - )?; - let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible( - evidence_class, - input.source_jobs.len(), - ranking_query_count, - explicit_qrel_query_count, - metric_comparable, - &audit_evidence, - ); - let row = QuantitativeBenchmarkRow { - product: "ELF".to_string(), - adapter_id: input.adapter.adapter_id.clone(), - adapter_name: input.adapter.name.clone(), - suite: quantitative::quantitative_suite_id(input.jobs), - evidence_class: evidence_class.to_string(), - source_manifest_corpus_id: Some(corpus_id.clone()), - result_state: result_state.to_string(), - comparable: metric_comparable, - metric_comparable, - leaderboard_eligible, - held_out: audit_evidence.held_out, - leakage_audited: audit_evidence.leakage_audited, - audit_manifest_id: audit_evidence.audit_manifest_id, - fixture_regression_only: evidence_class == "fixture_backed", - sample_size: input.jobs.len(), - ranking_query_count, - ranking_coverage_state: metrics::ranking_coverage_state( - input.summary, - input.source_jobs.len(), - ranking_query_count, - ) - .to_string(), - ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), - qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) - .to_string(), - explicit_qrel_query_count, - metrics: 
metrics::aggregate_metrics(per_query_rows.as_slice()), - metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), - denominators: metrics::aggregate_denominators(per_query_rows.as_slice()), - confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()), - claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), - }; + let current_row = row::current_quantitative_row(&input)?; let product_manifest = product_manifest::quantitative_product_manifest( input.product_manifest_path, - corpus_id.as_str(), + current_row.corpus_id.as_str(), )?; let imported_row_count = product_manifest.rows.len(); let imported_per_query_count = product_manifest.per_query_rows.len(); - let mut rows = vec![row]; - let mut merged_per_query_rows = per_query_rows; + let mut rows = vec![current_row.row]; + let mut merged_per_query_rows = current_row.per_query_rows; rows.extend(product_manifest.rows); merged_per_query_rows.extend(product_manifest.per_query_rows); let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; - let controls = QuantitativeBenchmarkControls { - same_corpus_required: true, - same_task_required: true, - ranked_candidates_required_for_ranking_metrics: true, - explicit_relevance_judgments_required_for_leaderboard: true, - minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT, - current_query_count: input.source_jobs.len(), - current_ranking_query_count: ranking_query_count, - current_explicit_qrel_query_count: explicit_qrel_query_count, + let controls = controls::quantitative_benchmark_controls( + &input, + current_row.ranking_query_count, + current_row.explicit_qrel_query_count, leaderboard_claim_allowed, - leakage_control: - "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" - .to_string(), - }; + ); Ok(QuantitativeBenchmarkReport { schema: QUANTITATIVE_SCOREBOARD_SCHEMA.to_string(), generated_at: input.generated_at.to_string(), - corpus_id, + 
corpus_id: current_row.corpus_id, k_values: QUANTITATIVE_K_VALUES.to_vec(), rows, per_query_rows: merged_per_query_rows, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs new file mode 100644 index 00000000..78d4b723 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/controls.rs @@ -0,0 +1,26 @@ +use crate::{ + QuantitativeBenchmarkControls, + quantitative::{MIN_LEADERBOARD_QUERY_COUNT, report::QuantitativeReportInput}, +}; + +pub(super) fn quantitative_benchmark_controls( + input: &QuantitativeReportInput<'_>, + ranking_query_count: usize, + explicit_qrel_query_count: usize, + leaderboard_claim_allowed: bool, +) -> QuantitativeBenchmarkControls { + QuantitativeBenchmarkControls { + same_corpus_required: true, + same_task_required: true, + ranked_candidates_required_for_ranking_metrics: true, + explicit_relevance_judgments_required_for_leaderboard: true, + minimum_query_count_for_leaderboard: MIN_LEADERBOARD_QUERY_COUNT, + current_query_count: input.source_jobs.len(), + current_ranking_query_count: ranking_query_count, + current_explicit_qrel_query_count: explicit_qrel_query_count, + leaderboard_claim_allowed, + leakage_control: + "held_out_or_leakage_audited_runtime_rows_required_before_leaderboard_claims" + .to_string(), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs new file mode 100644 index 00000000..d3f8b232 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs @@ -0,0 +1,100 @@ +use crate::{ + QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, + quantitative::{ + self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, + audit_manifest::{self, QuantitativeAuditContext}, + metrics, + report::QuantitativeReportInput, + }, +}; + +pub(super) struct 
CurrentQuantitativeRow { + pub(super) corpus_id: String, + pub(super) row: QuantitativeBenchmarkRow, + pub(super) per_query_rows: Vec, + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, +} + +pub(super) fn current_quantitative_row( + input: &QuantitativeReportInput<'_>, +) -> Result { + let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs); + let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs); + let per_query_rows = metrics::quantitative_per_query_rows( + input.source_jobs, + input.jobs, + corpus_id.as_str(), + evidence_class, + input.adapter.adapter_id.as_str(), + ); + let ranking_query_count = per_query_rows + .iter() + .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) + .count(); + let explicit_qrel_query_count = + per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); + let metric_comparable = ranking_query_count > 0; + let result_state = quantitative::quantitative_result_state(input.summary); + let audit_evidence = audit_manifest::quantitative_audit_evidence( + input.audit_manifest_path, + QuantitativeAuditContext { + run_id: input.run_id, + corpus_id: corpus_id.as_str(), + product: "ELF", + adapter_id: input.adapter.adapter_id.as_str(), + source_jobs: input.source_jobs, + ranking_query_count, + explicit_qrel_query_count, + }, + )?; + let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible( + evidence_class, + input.source_jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + metric_comparable, + &audit_evidence, + ); + let row = QuantitativeBenchmarkRow { + product: "ELF".to_string(), + adapter_id: input.adapter.adapter_id.clone(), + adapter_name: input.adapter.name.clone(), + suite: quantitative::quantitative_suite_id(input.jobs), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.clone()), + result_state: result_state.to_string(), + 
comparable: metric_comparable, + metric_comparable, + leaderboard_eligible, + held_out: audit_evidence.held_out, + leakage_audited: audit_evidence.leakage_audited, + audit_manifest_id: audit_evidence.audit_manifest_id, + fixture_regression_only: evidence_class == "fixture_backed", + sample_size: input.jobs.len(), + ranking_query_count, + ranking_coverage_state: metrics::ranking_coverage_state( + input.summary, + input.source_jobs.len(), + ranking_query_count, + ) + .to_string(), + ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), + qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) + .to_string(), + explicit_qrel_query_count, + metrics: metrics::aggregate_metrics(per_query_rows.as_slice()), + metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), + denominators: metrics::aggregate_denominators(per_query_rows.as_slice()), + confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()), + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + }; + + Ok(CurrentQuantitativeRow { + corpus_id, + row, + per_query_rows, + ranking_query_count, + explicit_qrel_query_count, + }) +} From 00148a88ebb21832069b26258de32e86fe618c75 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:47:03 -0400 Subject: [PATCH 22/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit artifact helpers","authority":"manual"} --- .../quantitative/audit_manifest/artifacts.rs | 107 ++---------------- .../audit_manifest/artifacts/digest.rs | 67 +++++++++++ .../audit_manifest/artifacts/paths.rs | 35 ++++++ 3 files changed, 110 insertions(+), 99 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs index 9e033400..25a0bbb0 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs @@ -1,6 +1,9 @@ -use std::env; +mod digest; +mod paths; -use crate::{Path, PathBuf, QuantitativeAuditManifest, Result, eyre, fs}; +pub(super) use self::{digest::fixture_path_digest, paths::audit_artifact_display_path}; + +use crate::{Path, QuantitativeAuditManifest, Result, eyre}; pub(super) fn validate_quantitative_audit_artifacts( manifest: &QuantitativeAuditManifest, @@ -30,8 +33,9 @@ pub(super) fn validate_quantitative_audit_artifacts( )); } - let artifact_path = resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); - let actual = fixture_path_digest(artifact_path.as_path()).map_err(|err| { + let artifact_path = + paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); + let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| { eyre::eyre!( "{} artifact {} could not be digested at {}: {err}", path.display(), @@ -54,98 +58,3 @@ pub(super) fn validate_quantitative_audit_artifacts( Ok(()) } - -pub(super) fn fixture_path_digest(path: &Path) -> Result { - let mut hasher = blake3::Hasher::new(); - - if path.is_file() { - hash_fixture_file( - path, - path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), - &mut hasher, - )?; - - return Ok(hasher.finalize().to_hex().to_string()); - } - - let paths = audit_fixture_paths(path)?; - - for fixture in paths { - let relative = fixture - .strip_prefix(path) - .map(|relative| relative.to_string_lossy().replace('\\', "/")) - .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); - - hash_fixture_file(fixture.as_path(), relative.as_str(), &mut 
hasher)?; - } - - Ok(hasher.finalize().to_hex().to_string()) -} - -pub(super) fn audit_artifact_display_path(path: &Path) -> String { - let display_path = if path.is_absolute() { - env::current_dir() - .ok() - .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) - .unwrap_or_else(|| path.to_path_buf()) - } else { - path.to_path_buf() - }; - - display_path.to_string_lossy().replace('\\', "/") -} - -fn resolve_quantitative_audit_artifact_path(manifest_path: &Path, artifact_path: &str) -> PathBuf { - let raw = PathBuf::from(artifact_path); - - if raw.is_absolute() { - return raw; - } - - let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); - - if cwd_path.exists() { - return cwd_path; - } - - manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) -} - -fn audit_fixture_paths(path: &Path) -> Result> { - let mut paths = Vec::new(); - - collect_audit_fixture_paths(path, &mut paths)?; - - paths.sort(); - - Ok(paths) -} - -fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { - if path.is_file() { - paths.push(path.to_path_buf()); - - return Ok(()); - } - - for entry in fs::read_dir(path)? 
{ - let entry_path = entry?.path(); - - if entry_path.is_dir() { - collect_audit_fixture_paths(entry_path.as_path(), paths)?; - } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { - paths.push(entry_path); - } - } - - Ok(()) -} - -fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { - hasher.update(logical_path.as_bytes()); - hasher.update(b"\0"); - hasher.update(&fs::read(path)?); - hasher.update(b"\0"); - - Ok(()) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs new file mode 100644 index 00000000..bb75c802 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs @@ -0,0 +1,67 @@ +use crate::{Path, PathBuf, Result, fs}; + +pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) -> Result { + let mut hasher = blake3::Hasher::new(); + + if path.is_file() { + hash_fixture_file( + path, + path.file_name().and_then(|name| name.to_str()).unwrap_or("fixture"), + &mut hasher, + )?; + + return Ok(hasher.finalize().to_hex().to_string()); + } + + let paths = audit_fixture_paths(path)?; + + for fixture in paths { + let relative = fixture + .strip_prefix(path) + .map(|relative| relative.to_string_lossy().replace('\\', "/")) + .unwrap_or_else(|_| fixture.to_string_lossy().replace('\\', "/")); + + hash_fixture_file(fixture.as_path(), relative.as_str(), &mut hasher)?; + } + + Ok(hasher.finalize().to_hex().to_string()) +} + +fn audit_fixture_paths(path: &Path) -> Result> { + let mut paths = Vec::new(); + + collect_audit_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + if path.is_file() { + paths.push(path.to_path_buf()); + + return Ok(()); + } + + for entry in 
fs::read_dir(path)? { + let entry_path = entry?.path(); + + if entry_path.is_dir() { + collect_audit_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} + +fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { + hasher.update(logical_path.as_bytes()); + hasher.update(b"\0"); + hasher.update(&fs::read(path)?); + hasher.update(b"\0"); + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs new file mode 100644 index 00000000..3dd15d54 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/paths.rs @@ -0,0 +1,35 @@ +use std::env; + +use crate::{Path, PathBuf}; + +pub(in crate::quantitative::audit_manifest) fn audit_artifact_display_path(path: &Path) -> String { + let display_path = if path.is_absolute() { + env::current_dir() + .ok() + .and_then(|cwd| path.strip_prefix(cwd).ok().map(Path::to_path_buf)) + .unwrap_or_else(|| path.to_path_buf()) + } else { + path.to_path_buf() + }; + + display_path.to_string_lossy().replace('\\', "/") +} + +pub(super) fn resolve_quantitative_audit_artifact_path( + manifest_path: &Path, + artifact_path: &str, +) -> PathBuf { + let raw = PathBuf::from(artifact_path); + + if raw.is_absolute() { + return raw; + } + + let cwd_path = env::current_dir().map(|cwd| cwd.join(&raw)).unwrap_or_else(|_| raw.clone()); + + if cwd_path.exists() { + return cwd_path; + } + + manifest_path.parent().map(|parent| parent.join(&raw)).unwrap_or(cwd_path) +} From 48781e60d558e93d28b5249fd697fba91aff8181 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:53:08 -0400 Subject: [PATCH 23/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit validation 
checks","authority":"manual"} --- .../quantitative/audit_manifest/validation.rs | 140 ++---------------- .../audit_manifest/validation/controls.rs | 42 ++++++ .../audit_manifest/validation/identity.rs | 73 +++++++++ .../audit_manifest/validation/queries.rs | 29 ++++ 4 files changed, 153 insertions(+), 131 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs index 5aab2c4f..5a37d191 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation.rs @@ -1,11 +1,10 @@ +mod controls; +mod identity; +mod queries; + use crate::{ - BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre, - quantitative::{ - QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, - REQUIRED_HELD_OUT_AUDIT_CONTROL, REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, - audit_manifest::{QuantitativeAuditContext, artifacts}, - metrics, - }, + Path, QuantitativeAuditManifest, Result, + quantitative::audit_manifest::{QuantitativeAuditContext, artifacts}, }; pub(super) fn validate_quantitative_audit_manifest( @@ -13,130 +12,9 @@ pub(super) fn validate_quantitative_audit_manifest( path: &Path, context: QuantitativeAuditContext<'_>, ) -> Result<()> { - if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", - path.display(), - manifest.schema - )); - } - if 
manifest.manifest_id.trim().is_empty() { - return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); - } - if manifest.run_id != context.run_id { - return Err(eyre::eyre!( - "{} has run_id {}, expected {}.", - path.display(), - manifest.run_id, - context.run_id - )); - } - if manifest.corpus_id != context.corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected {}.", - path.display(), - manifest.corpus_id, - context.corpus_id - )); - } - if manifest.product != context.product || manifest.adapter_id != context.adapter_id { - return Err(eyre::eyre!( - "{} has product {}:{} but current row is {}:{}.", - path.display(), - manifest.product, - manifest.adapter_id, - context.product, - context.adapter_id - )); - } - if manifest.sample_size != context.source_jobs.len() { - return Err(eyre::eyre!( - "{} has sample_size {}, expected {}.", - path.display(), - manifest.sample_size, - context.source_jobs.len() - )); - } - if manifest.ranking_query_count != context.ranking_query_count { - return Err(eyre::eyre!( - "{} has ranking_query_count {}, expected {}.", - path.display(), - manifest.ranking_query_count, - context.ranking_query_count - )); - } - if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { - return Err(eyre::eyre!( - "{} has explicit_qrel_query_count {}, expected {}.", - path.display(), - manifest.explicit_qrel_query_count, - context.explicit_qrel_query_count - )); - } - - validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; - validate_quantitative_audit_controls(manifest, path)?; + identity::validate_quantitative_audit_identity(manifest, path, &context)?; + queries::validate_quantitative_audit_query_ids(manifest, path, context.source_jobs)?; + controls::validate_quantitative_audit_controls(manifest, path)?; artifacts::validate_quantitative_audit_artifacts(manifest, path) } - -fn validate_quantitative_audit_query_ids( - manifest: &QuantitativeAuditManifest, - path: &Path, - source_jobs: 
&[RealWorldJob], -) -> Result<()> { - let expected = metrics::ranking_query_ids(source_jobs); - let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); - - if actual.len() != manifest.query_ids.len() { - return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); - } - if actual != expected { - let missing = expected.difference(&actual).copied().collect::>(); - let extra = actual.difference(&expected).copied().collect::>(); - - return Err(eyre::eyre!( - "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", - path.display(), - missing, - extra - )); - } - - Ok(()) -} - -fn validate_quantitative_audit_controls( - manifest: &QuantitativeAuditManifest, - path: &Path, -) -> Result<()> { - let controls = manifest.controls.iter().map(String::as_str).collect::>(); - - if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { - return Err(eyre::eyre!( - "{} marks held_out=true without required control {}.", - path.display(), - REQUIRED_HELD_OUT_AUDIT_CONTROL - )); - } - if manifest.leakage_audited - && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) - || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) - { - return Err(eyre::eyre!( - "{} marks leakage_audited=true without required controls {} and {}.", - path.display(), - REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, - REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL - )); - } - if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() - { - return Err(eyre::eyre!( - "{} marks audit controls true but has an empty claim_boundary.", - path.display() - )); - } - - Ok(()) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs new file mode 100644 index 00000000..9b15c1ae --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/controls.rs @@ -0,0 +1,42 @@ +use crate::{ + BTreeSet, Path, QuantitativeAuditManifest, Result, eyre, + quantitative::{ + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL, REQUIRED_HELD_OUT_AUDIT_CONTROL, + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + }, +}; + +pub(super) fn validate_quantitative_audit_controls( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + let controls = manifest.controls.iter().map(String::as_str).collect::>(); + + if manifest.held_out && !controls.contains(REQUIRED_HELD_OUT_AUDIT_CONTROL) { + return Err(eyre::eyre!( + "{} marks held_out=true without required control {}.", + path.display(), + REQUIRED_HELD_OUT_AUDIT_CONTROL + )); + } + if manifest.leakage_audited + && (!controls.contains(REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL) + || !controls.contains(REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL)) + { + return Err(eyre::eyre!( + "{} marks leakage_audited=true without required controls {} and {}.", + path.display(), + REQUIRED_QREL_LEAKAGE_AUDIT_CONTROL, + REQUIRED_CANDIDATE_LEAKAGE_AUDIT_CONTROL + )); + } + if (manifest.held_out || manifest.leakage_audited) && manifest.claim_boundary.trim().is_empty() + { + return Err(eyre::eyre!( + "{} marks audit controls true but has an empty claim_boundary.", + path.display() + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs new file mode 100644 index 00000000..461e9eb6 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs @@ -0,0 +1,73 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, + quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, audit_manifest::QuantitativeAuditContext}, +}; + +pub(super) fn validate_quantitative_audit_identity( + manifest: 
&QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, expected {}.", + path.display(), + manifest.sample_size, + context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs new file mode 100644 
index 00000000..9910b436 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/queries.rs @@ -0,0 +1,29 @@ +use crate::{ + BTreeSet, Path, QuantitativeAuditManifest, RealWorldJob, Result, eyre, quantitative::metrics, +}; + +pub(super) fn validate_quantitative_audit_query_ids( + manifest: &QuantitativeAuditManifest, + path: &Path, + source_jobs: &[RealWorldJob], +) -> Result<()> { + let expected = metrics::ranking_query_ids(source_jobs); + let actual = manifest.query_ids.iter().map(String::as_str).collect::>(); + + if actual.len() != manifest.query_ids.len() { + return Err(eyre::eyre!("{} has duplicate quantitative audit query_ids.", path.display())); + } + if actual != expected { + let missing = expected.difference(&actual).copied().collect::>(); + let extra = actual.difference(&expected).copied().collect::>(); + + return Err(eyre::eyre!( + "{} audit query_ids do not match current ranked-query set; missing: {:?}, extra: {:?}.", + path.display(), + missing, + extra + )); + } + + Ok(()) +} From c478442703fcab68edb096b8952f64eda8afeb30 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 13:59:56 -0400 Subject: [PATCH 24/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query metric formulas","authority":"manual"} --- .../metrics/per_query/query_metrics.rs | 128 +++--------------- .../per_query/query_metrics/denominators.rs | 21 +++ .../per_query/query_metrics/ranking.rs | 78 +++++++++++ .../per_query/query_metrics/relevance.rs | 23 ++++ 4 files changed, 139 insertions(+), 111 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs index 01babc1d..6685aa6e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics.rs @@ -1,4 +1,10 @@ -use crate::{BTreeMap, BTreeSet, formatting, quantitative::QUANTITATIVE_K_VALUES}; +mod denominators; +mod ranking; +mod relevance; + +pub(super) use self::{denominators::per_query_denominators, relevance::positive_qrel_count}; + +use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES}; pub(super) fn per_query_metrics( candidates: &[String], @@ -7,123 +13,23 @@ pub(super) fn per_query_metrics( let mut metrics = BTreeMap::new(); for k in QUANTITATIVE_K_VALUES { - let relevant_at_k = relevant_at_k(candidates, relevance, *k); + let relevant_at_k = relevance::relevant_at_k(candidates, relevance, *k); - metrics - .insert(format!("recall_at_{k}"), rate(relevant_at_k, positive_qrel_count(relevance))); - metrics.insert(format!("precision_at_{k}"), rate(relevant_at_k, *k)); + metrics.insert( + format!("recall_at_{k}"), + relevance::rate(relevant_at_k, positive_qrel_count(relevance)), + ); + metrics.insert(format!("precision_at_{k}"), relevance::rate(relevant_at_k, *k)); metrics.insert( format!("success_at_{k}"), Some(f64::from(relevant_at_k > 0 && positive_qrel_count(relevance) > 0)), ); } - metrics.insert("mrr".to_string(), reciprocal_rank(candidates, relevance)); - metrics.insert("ndcg_at_5".to_string(), ndcg_at_k(candidates, relevance, 5)); - metrics.insert("average_precision".to_string(), average_precision(candidates, relevance)); - + metrics.insert("mrr".to_string(), ranking::reciprocal_rank(candidates, relevance)); + metrics.insert("ndcg_at_5".to_string(), ranking::ndcg_at_k(candidates, relevance, 5)); metrics -} - -pub(super) 
fn positive_qrel_count(relevance: &BTreeMap) -> usize { - relevance.values().filter(|grade| **grade > 0.0).count() -} - -pub(super) fn per_query_denominators( - candidate_count: usize, - expected_relevant_count: usize, -) -> BTreeMap { - let mut denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert(format!("recall_at_{k}"), expected_relevant_count); - denominators.insert(format!("precision_at_{k}"), *k); - denominators.insert(format!("success_at_{k}"), 1); - } - - denominators.insert("mrr".to_string(), expected_relevant_count); - denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); - denominators.insert("average_precision".to_string(), expected_relevant_count); - denominators.insert("candidate_count".to_string(), candidate_count); - - denominators -} - -fn relevant_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> usize { - candidates - .iter() - .take(k) - .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) - .count() -} - -fn reciprocal_rank(candidates: &[String], relevance: &BTreeMap) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } + .insert("average_precision".to_string(), ranking::average_precision(candidates, relevance)); - Some( - candidates - .iter() - .position(|candidate| { - relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) - }) - .map_or(0.0, |index| 1.0 / (index + 1) as f64), - ) -} - -fn ndcg_at_k(candidates: &[String], relevance: &BTreeMap, k: usize) -> Option { - if positive_qrel_count(relevance) == 0 { - return None; - } - - let dcg = candidates - .iter() - .take(k) - .enumerate() - .map(|(index, candidate)| { - relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) - / ((index + 2) as f64).log2() - }) - .sum::(); - let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); - - ideal.sort_by(|left, right| right.total_cmp(left)); - - let idcg = ideal - 
.iter() - .take(k) - .enumerate() - .map(|(index, grade)| grade / ((index + 2) as f64).log2()) - .sum::(); - - Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) -} - -fn average_precision(candidates: &[String], relevance: &BTreeMap) -> Option { - let positive_count = positive_qrel_count(relevance); - - if positive_count == 0 { - return None; - } - - let mut hit_count = 0; - let mut precision_sum = 0.0; - let mut seen = BTreeSet::new(); - - for (index, candidate) in candidates.iter().enumerate() { - if !seen.insert(candidate.as_str()) { - continue; - } - if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { - hit_count += 1; - precision_sum += hit_count as f64 / (index + 1) as f64; - } - } - - Some(precision_sum / positive_count as f64) -} - -fn rate(numerator: usize, denominator: usize) -> Option { - (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) + metrics } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs new file mode 100644 index 00000000..7ef22bc8 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/denominators.rs @@ -0,0 +1,21 @@ +use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES}; + +pub(in crate::quantitative::metrics::per_query) fn per_query_denominators( + candidate_count: usize, + expected_relevant_count: usize, +) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert(format!("recall_at_{k}"), expected_relevant_count); + denominators.insert(format!("precision_at_{k}"), *k); + denominators.insert(format!("success_at_{k}"), 1); + } + + denominators.insert("mrr".to_string(), expected_relevant_count); + denominators.insert("ndcg_at_5".to_string(), expected_relevant_count.min(5)); + 
denominators.insert("average_precision".to_string(), expected_relevant_count); + denominators.insert("candidate_count".to_string(), candidate_count); + + denominators +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs new file mode 100644 index 00000000..515bfaed --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs @@ -0,0 +1,78 @@ +use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn reciprocal_rank( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + if query_metrics::positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} + +pub(super) fn ndcg_at_k( + candidates: &[String], + relevance: &BTreeMap, + k: usize, +) -> Option { + if query_metrics::positive_qrel_count(relevance) == 0 { + return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as f64).log2()) + .sum::(); + + Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) +} + +pub(super) fn average_precision( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + let positive_count = query_metrics::positive_qrel_count(relevance); + + if positive_count == 0 { + return None; + } + + let mut hit_count = 0; + let 
mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / positive_count as f64) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs new file mode 100644 index 00000000..a3644eb1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/relevance.rs @@ -0,0 +1,23 @@ +use crate::{BTreeMap, formatting}; + +pub(in crate::quantitative::metrics::per_query) fn positive_qrel_count( + relevance: &BTreeMap, +) -> usize { + relevance.values().filter(|grade| **grade > 0.0).count() +} + +pub(super) fn relevant_at_k( + candidates: &[String], + relevance: &BTreeMap, + k: usize, +) -> usize { + candidates + .iter() + .take(k) + .filter(|candidate| relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0)) + .count() +} + +pub(super) fn rate(numerator: usize, denominator: usize) -> Option { + (denominator > 0).then(|| formatting::round3(numerator as f64 / denominator as f64)) +} From 5dd6220d289ae7db5a2ca55e87c68403bb0a5b75 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:05:30 -0400 Subject: [PATCH 25/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit manifest flow","authority":"manual"} --- .../quantitative/audit_manifest.rs | 100 ++---------------- .../quantitative/audit_manifest/evidence.rs | 31 ++++++ .../quantitative/audit_manifest/export.rs | 83 +++++++++++++++ 3 files changed, 120 insertions(+), 94 deletions(-) create mode 100644 
apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs index d3e696a9..01f7e463 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest.rs @@ -1,11 +1,11 @@ mod artifacts; +mod evidence; +mod export; mod validation; -use crate::{ - ExportQuantitativeAuditManifestArgs, Path, QuantitativeAuditArtifact, - QuantitativeAuditManifest, RealWorldJob, Result, eyre, fs, - quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, metrics}, -}; +pub(crate) use self::export::quantitative_audit_manifest_from_jobs; + +use crate::{Path, RealWorldJob, Result}; pub(super) struct QuantitativeAuditContext<'a> { pub(super) run_id: &'a str, @@ -23,97 +23,9 @@ pub(super) struct QuantitativeAuditEvidence { pub(super) audit_manifest_id: Option, } -pub(crate) fn quantitative_audit_manifest_from_jobs( - jobs: &[RealWorldJob], - args: &ExportQuantitativeAuditManifestArgs, -) -> Result { - let product = args.product.trim(); - let adapter_id = args.adapter_id.trim(); - - if product.is_empty() || adapter_id.is_empty() { - return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); - } - - let corpus_id = super::quantitative_corpus_id(jobs); - let ranking_query_count = metrics::ranking_query_count(jobs); - let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs); - let manifest = QuantitativeAuditManifest { - schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), - manifest_id: args - .manifest_id - .clone() - .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), - run_id: args.run_id.clone(), - corpus_id, - product: 
product.to_string(), - adapter_id: adapter_id.to_string(), - held_out: args.held_out, - leakage_audited: args.leakage_audited, - sample_size: jobs.len(), - ranking_query_count, - explicit_qrel_query_count, - query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), - controls: args.controls.clone(), - artifacts: vec![QuantitativeAuditArtifact { - role: "product_runtime_fixtures".to_string(), - path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), - sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, - }], - claim_boundary: args.claim_boundary.clone().unwrap_or_else(|| { - if args.held_out || args.leakage_audited { - concat!( - "Audit manifest supplied by operator; runner validates run/corpus/product/", - "adapter/count/query-id/artifact bindings before opening row gates." - ) - .to_string() - } else { - concat!( - "Diagnostic audit manifest binds the current product-runtime fixture set to ", - "query ids and counts, but it does not prove held-out or leakage-audited status." 
- ) - .to_string() - } - }), - }; - - validation::validate_quantitative_audit_manifest( - &manifest, - args.fixtures.as_path(), - QuantitativeAuditContext { - run_id: args.run_id.as_str(), - corpus_id: manifest.corpus_id.as_str(), - product, - adapter_id, - source_jobs: jobs, - ranking_query_count: manifest.ranking_query_count, - explicit_qrel_query_count: manifest.explicit_qrel_query_count, - }, - )?; - - Ok(manifest) -} - pub(super) fn quantitative_audit_evidence( path: Option<&Path>, context: QuantitativeAuditContext<'_>, ) -> Result { - let Some(path) = path else { - return Ok(QuantitativeAuditEvidence { - held_out: false, - leakage_audited: false, - audit_manifest_id: None, - }); - }; - let raw = fs::read_to_string(path)?; - let manifest = serde_json::from_str::(&raw).map_err(|err| { - eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) - })?; - - validation::validate_quantitative_audit_manifest(&manifest, path, context)?; - - Ok(QuantitativeAuditEvidence { - held_out: manifest.held_out, - leakage_audited: manifest.leakage_audited, - audit_manifest_id: Some(manifest.manifest_id), - }) + evidence::quantitative_audit_evidence(path, context) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs new file mode 100644 index 00000000..f9b2e0d4 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/evidence.rs @@ -0,0 +1,31 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, fs, + quantitative::audit_manifest::{ + QuantitativeAuditContext, QuantitativeAuditEvidence, validation, + }, +}; + +pub(super) fn quantitative_audit_evidence( + path: Option<&Path>, + context: QuantitativeAuditContext<'_>, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeAuditEvidence { + held_out: false, + leakage_audited: false, + audit_manifest_id: 
None, + }); + }; + let raw = fs::read_to_string(path)?; + let manifest = serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative audit manifest {}: {err}", path.display()) + })?; + + validation::validate_quantitative_audit_manifest(&manifest, path, context)?; + + Ok(QuantitativeAuditEvidence { + held_out: manifest.held_out, + leakage_audited: manifest.leakage_audited, + audit_manifest_id: Some(manifest.manifest_id), + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs new file mode 100644 index 00000000..e99d5a9c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs @@ -0,0 +1,83 @@ +use crate::{ + ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest, + RealWorldJob, Result, eyre, + quantitative::{ + self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, + audit_manifest::{QuantitativeAuditContext, artifacts, validation}, + metrics, + }, +}; + +pub(crate) fn quantitative_audit_manifest_from_jobs( + jobs: &[RealWorldJob], + args: &ExportQuantitativeAuditManifestArgs, +) -> Result { + let product = args.product.trim(); + let adapter_id = args.adapter_id.trim(); + + if product.is_empty() || adapter_id.is_empty() { + return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); + } + + let corpus_id = quantitative::quantitative_corpus_id(jobs); + let ranking_query_count = metrics::ranking_query_count(jobs); + let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs); + let manifest = QuantitativeAuditManifest { + schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), + run_id: args.run_id.clone(), + corpus_id, + product: product.to_string(), + adapter_id: 
adapter_id.to_string(), + held_out: args.held_out, + leakage_audited: args.leakage_audited, + sample_size: jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), + controls: args.controls.clone(), + artifacts: vec![QuantitativeAuditArtifact { + role: "product_runtime_fixtures".to_string(), + path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), + sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, + }], + claim_boundary: quantitative_audit_claim_boundary(args), + }; + + validation::validate_quantitative_audit_manifest( + &manifest, + args.fixtures.as_path(), + QuantitativeAuditContext { + run_id: args.run_id.as_str(), + corpus_id: manifest.corpus_id.as_str(), + product, + adapter_id, + source_jobs: jobs, + ranking_query_count: manifest.ranking_query_count, + explicit_qrel_query_count: manifest.explicit_qrel_query_count, + }, + )?; + + Ok(manifest) +} + +fn quantitative_audit_claim_boundary(args: &ExportQuantitativeAuditManifestArgs) -> String { + args.claim_boundary.clone().unwrap_or_else(|| { + if args.held_out || args.leakage_audited { + concat!( + "Audit manifest supplied by operator; runner validates run/corpus/product/", + "adapter/count/query-id/artifact bindings before opening row gates." + ) + .to_string() + } else { + concat!( + "Diagnostic audit manifest binds the current product-runtime fixture set to ", + "query ids and counts, but it does not prove held-out or leakage-audited status." 
+ ) + .to_string() + } + }) +} From 7d8c5efd18218953bf0318a26dcafc9b227c04fa Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:13:50 -0400 Subject: [PATCH 26/58] {"schema":"decodex/commit/1","summary":"Split quantitative product manifest flow","authority":"manual"} --- .../quantitative/product_manifest.rs | 114 +----------------- .../quantitative/product_manifest/export.rs | 61 ++++++++++ .../product_manifest/export/identity.rs | 23 ++++ .../product_manifest/export/rows.rs | 55 +++++++++ .../quantitative/product_manifest/import.rs | 32 +++++ 5 files changed, 176 insertions(+), 109 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs index ad9a2dee..4cd8b6c0 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest.rs @@ -1,118 +1,14 @@ +mod export; +mod import; mod validation; -use crate::{ - ExportQuantitativeProductManifestArgs, Path, QuantitativeProductManifest, REPORT_SCHEMA, - RealWorldReport, Result, eyre, fs, quantitative::QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, -}; +pub(crate) use self::export::quantitative_product_manifest_from_report; -pub(crate) fn quantitative_product_manifest_from_report( - report: &RealWorldReport, - args: &ExportQuantitativeProductManifestArgs, -) -> Result { - if report.schema != REPORT_SCHEMA { - return Err(eyre::eyre!( - "{} has 
schema {}, expected {REPORT_SCHEMA}.", - args.report.display(), - report.schema - )); - } - - let source_row = - report.quantitative_scoreboard.rows.first().ok_or_else(|| { - eyre::eyre!("{} has no quantitative product row.", args.report.display()) - })?; - let source_product = source_row.product.as_str(); - let source_adapter_id = source_row.adapter_id.as_str(); - let product = args.product.as_deref().unwrap_or(source_product).trim(); - let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); - let adapter_name = - args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); - - if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() { - return Err(eyre::eyre!( - "{} cannot export an incomplete quantitative product identity.", - args.report.display() - )); - } - if product == "ELF" { - return Err(eyre::eyre!( - "{} exports product ELF; use --product for external product manifest exports.", - args.report.display() - )); - } - - let mut row = source_row.clone(); - - row.product = product.to_string(); - row.adapter_id = adapter_id.to_string(); - row.adapter_name = adapter_name.to_string(); - row.claim_boundary = concat!( - "Exported from a generated real_world_job_report quantitative row; ", - "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." - ) - .to_string(); - - let mut per_query_rows = Vec::new(); - - for row in &report.quantitative_scoreboard.per_query_rows { - if row.product != source_product || row.adapter_id != source_adapter_id { - continue; - } - - let mut row = row.clone(); - - row.product = product.to_string(); - row.adapter_id = adapter_id.to_string(); - row.claim_boundary = concat!( - "Exported from generated report per-query quantitative evidence; ", - "import does not relax paired-significance or leaderboard gates." 
- ) - .to_string(); - - per_query_rows.push(row); - } - - let manifest = QuantitativeProductManifest { - schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), - manifest_id: args - .manifest_id - .clone() - .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), - corpus_id: report.quantitative_scoreboard.corpus_id.clone(), - rows: vec![row], - per_query_rows, - }; - - validation::validate_quantitative_product_manifest( - &manifest, - &args.report, - manifest.corpus_id.as_str(), - )?; - - Ok(manifest) -} +use crate::{Path, QuantitativeProductManifest, Result}; pub(super) fn quantitative_product_manifest( path: Option<&Path>, corpus_id: &str, ) -> Result { - let Some(path) = path else { - return Ok(QuantitativeProductManifest::default()); - }; - let raw = fs::read_to_string(path)?; - let mut manifest = - serde_json::from_str::(&raw).map_err(|err| { - eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display()) - })?; - - for row in &mut manifest.rows { - row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); - } - for row in &mut manifest.per_query_rows { - row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); - } - - validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?; - - Ok(manifest) + import::quantitative_product_manifest(path, corpus_id) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs new file mode 100644 index 00000000..ac105d5a --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs @@ -0,0 +1,61 @@ +mod identity; +mod rows; + +use crate::{ + ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, REPORT_SCHEMA, + RealWorldReport, Result, eyre, + quantitative::{QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, 
product_manifest::validation}, +}; + +pub(crate) fn quantitative_product_manifest_from_report( + report: &RealWorldReport, + args: &ExportQuantitativeProductManifestArgs, +) -> Result { + if report.schema != REPORT_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {REPORT_SCHEMA}.", + args.report.display(), + report.schema + )); + } + + let source_row = + report.quantitative_scoreboard.rows.first().ok_or_else(|| { + eyre::eyre!("{} has no quantitative product row.", args.report.display()) + })?; + let source_product = source_row.product.as_str(); + let source_adapter_id = source_row.adapter_id.as_str(); + let product = args.product.as_deref().unwrap_or(source_product).trim(); + let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); + let adapter_name = + args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); + + identity::validate_export_identity(args, product, adapter_id, adapter_name)?; + + let row = rows::exported_product_row(source_row, product, adapter_id, adapter_name); + let per_query_rows = rows::exported_per_query_rows( + report, + source_product, + source_adapter_id, + product, + adapter_id, + ); + let manifest = QuantitativeProductManifest { + schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), + corpus_id: report.quantitative_scoreboard.corpus_id.clone(), + rows: vec![row], + per_query_rows, + }; + + validation::validate_quantitative_product_manifest( + &manifest, + &args.report, + manifest.corpus_id.as_str(), + )?; + + Ok(manifest) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs new file mode 100644 index 00000000..4f1f6453 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/identity.rs @@ -0,0 +1,23 @@ +use crate::{ExportQuantitativeProductManifestArgs, Result, eyre}; + +pub(super) fn validate_export_identity( + args: &ExportQuantitativeProductManifestArgs, + product: &str, + adapter_id: &str, + adapter_name: &str, +) -> Result<()> { + if product.is_empty() || adapter_id.is_empty() || adapter_name.is_empty() { + return Err(eyre::eyre!( + "{} cannot export an incomplete quantitative product identity.", + args.report.display() + )); + } + if product == "ELF" { + return Err(eyre::eyre!( + "{} exports product ELF; use --product for external product manifest exports.", + args.report.display() + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs new file mode 100644 index 00000000..2e1923db --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs @@ -0,0 +1,55 @@ +use crate::{QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldReport}; + +pub(super) fn exported_product_row( + source_row: &QuantitativeBenchmarkRow, + product: &str, + adapter_id: &str, + adapter_name: &str, +) -> QuantitativeBenchmarkRow { + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.adapter_name = adapter_name.to_string(); + row.claim_boundary = concat!( + "Exported from a generated real_world_job_report quantitative row; ", + "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." 
+ ) + .to_string(); + + row +} + +pub(super) fn exported_per_query_rows( + report: &RealWorldReport, + source_product: &str, + source_adapter_id: &str, + product: &str, + adapter_id: &str, +) -> Vec { + report + .quantitative_scoreboard + .per_query_rows + .iter() + .filter(|row| row.product == source_product && row.adapter_id == source_adapter_id) + .map(|row| exported_per_query_row(row, product, adapter_id)) + .collect() +} + +fn exported_per_query_row( + source_row: &QuantitativePerQueryRow, + product: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.claim_boundary = concat!( + "Exported from generated report per-query quantitative evidence; ", + "import does not relax paired-significance or leaderboard gates." + ) + .to_string(); + + row +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs new file mode 100644 index 00000000..12df9a92 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/import.rs @@ -0,0 +1,32 @@ +use crate::{ + Path, QuantitativeProductManifest, Result, eyre, fs, quantitative::product_manifest::validation, +}; + +pub(super) fn quantitative_product_manifest( + path: Option<&Path>, + corpus_id: &str, +) -> Result { + let Some(path) = path else { + return Ok(QuantitativeProductManifest::default()); + }; + let raw = fs::read_to_string(path)?; + let mut manifest = + serde_json::from_str::(&raw).map_err(|err| { + eyre::eyre!("Failed to parse quantitative product manifest {}: {err}", path.display()) + })?; + + populate_source_manifest_corpus_ids(&mut manifest); + + validation::validate_quantitative_product_manifest(&manifest, path, corpus_id)?; + + Ok(manifest) +} + +fn populate_source_manifest_corpus_ids(manifest: &mut 
QuantitativeProductManifest) { + for row in &mut manifest.rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } + for row in &mut manifest.per_query_rows { + row.source_manifest_corpus_id.get_or_insert_with(|| manifest.corpus_id.clone()); + } +} From 44cea2fe5c93b23d99d5975a31370e7541ea862b Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:21:10 -0400 Subject: [PATCH 27/58] {"schema":"decodex/commit/1","summary":"Split quantitative aggregate denominators","authority":"manual"} --- .../quantitative/metrics/aggregate.rs | 49 ++----------------- .../metrics/aggregate/denominators.rs | 33 +++++++++++++ .../quantitative/metrics/aggregate/names.rs | 16 ++++++ 3 files changed, 53 insertions(+), 45 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs index 4d737d85..b61ee782 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs @@ -1,4 +1,6 @@ mod confidence; +mod denominators; +mod names; use crate::{ BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting, @@ -7,7 +9,7 @@ use crate::{ pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { let mut sums = BTreeMap::::new(); - let mut metrics = quantitative_metric_names() + let mut metrics = names::quantitative_metric_names() .into_iter() .map(|metric| (metric, None)) .collect::>(); @@ -49,31 +51,7 @@ pub(super) fn aggregate_metric_states( } pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { - let mut 
denominators = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - denominators.insert( - format!("recall_at_{k}"), - sum_per_query_denominator(rows, &format!("recall_at_{k}")), - ); - denominators.insert( - format!("precision_at_{k}"), - sum_per_query_denominator(rows, &format!("precision_at_{k}")), - ); - denominators.insert( - format!("success_at_{k}"), - sum_per_query_denominator(rows, &format!("success_at_{k}")), - ); - } - - denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); - denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); - denominators.insert( - "average_precision".to_string(), - sum_per_query_denominator(rows, "average_precision"), - ); - - denominators + denominators::aggregate_denominators(rows) } pub(super) fn aggregate_confidence_intervals( @@ -81,22 +59,3 @@ pub(super) fn aggregate_confidence_intervals( ) -> BTreeMap { confidence::aggregate_confidence_intervals(rows) } - -fn quantitative_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { - metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - for metric in ["mrr", "ndcg_at_5", "average_precision"] { - metrics.push(metric.to_string()); - } - - metrics -} - -fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { - rows.iter().filter_map(|row| row.denominators.get(metric)).sum() -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs new file mode 100644 index 00000000..3ddd044f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/denominators.rs @@ -0,0 +1,33 @@ +use crate::{BTreeMap, QuantitativePerQueryRow, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn 
aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { + let mut denominators = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + denominators.insert( + format!("recall_at_{k}"), + sum_per_query_denominator(rows, &format!("recall_at_{k}")), + ); + denominators.insert( + format!("precision_at_{k}"), + sum_per_query_denominator(rows, &format!("precision_at_{k}")), + ); + denominators.insert( + format!("success_at_{k}"), + sum_per_query_denominator(rows, &format!("success_at_{k}")), + ); + } + + denominators.insert("mrr".to_string(), sum_per_query_denominator(rows, "mrr")); + denominators.insert("ndcg_at_5".to_string(), sum_per_query_denominator(rows, "ndcg_at_5")); + denominators.insert( + "average_precision".to_string(), + sum_per_query_denominator(rows, "average_precision"), + ); + + denominators +} + +fn sum_per_query_denominator(rows: &[QuantitativePerQueryRow], metric: &str) -> usize { + rows.iter().filter_map(|row| row.denominators.get(metric)).sum() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs new file mode 100644 index 00000000..90055feb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/names.rs @@ -0,0 +1,16 @@ +use crate::quantitative::QUANTITATIVE_K_VALUES; + +pub(super) fn quantitative_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + metrics.push(metric.to_string()); + } + + metrics +} From b61fbd5eb6f72b74fe65b4b02c79cec458b40219 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:24:44 -0400 Subject: [PATCH 28/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query 
evidence mapping","authority":"manual"} --- .../quantitative/metrics/per_query.rs | 32 +++---------------- .../metrics/per_query/evidence.rs | 29 +++++++++++++++++ 2 files changed, 33 insertions(+), 28 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs index fbbce5db..cb184dc9 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs @@ -1,7 +1,8 @@ +mod evidence; mod query_metrics; use crate::{ - BTreeMap, JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, + JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, scoring, }; @@ -28,7 +29,7 @@ fn quantitative_per_query_row( evidence_class: &str, adapter_id: &str, ) -> QuantitativePerQueryRow { - let relevance = relevance_grades(source_job, job); + let relevance = evidence::relevance_grades(source_job, job); let candidates = scoring::produced_evidence_order(source_job); let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); @@ -47,7 +48,7 @@ fn quantitative_per_query_row( result_state: formatting::status_str(job.status).to_string(), expected_relevant_count: positive_relevance_count, candidate_count: candidates.len(), - qrel_source: qrel_source(source_job, relevance.is_empty()).to_string(), + qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(), relevance_grade_sum: formatting::round3(relevance.values().sum::()), product: "ELF".to_string(), adapter_id: adapter_id.to_string(), @@ -60,28 +61,3 @@ fn quantitative_per_query_row( claim_boundary: 
QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), } } - -fn relevance_grades(source_job: &RealWorldJob, job: &JobReport) -> BTreeMap { - let explicit = source_job - .expected_answer - .relevance_judgments - .iter() - .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) - .collect::>(); - - if !explicit.is_empty() { - return explicit; - } - - job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() -} - -fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { - if !source_job.expected_answer.relevance_judgments.is_empty() { - "explicit_qrels" - } else if empty { - "not_encoded" - } else { - "expected_evidence_fallback" - } -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs new file mode 100644 index 00000000..1a13fac2 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/evidence.rs @@ -0,0 +1,29 @@ +use crate::{BTreeMap, JobReport, RealWorldJob}; + +pub(super) fn relevance_grades( + source_job: &RealWorldJob, + job: &JobReport, +) -> BTreeMap { + let explicit = source_job + .expected_answer + .relevance_judgments + .iter() + .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect::>(); + + if !explicit.is_empty() { + return explicit; + } + + job.expected_evidence.iter().map(|evidence| (evidence.evidence_id.clone(), 1.0)).collect() +} + +pub(super) fn qrel_source(source_job: &RealWorldJob, empty: bool) -> &'static str { + if !source_job.expected_answer.relevance_judgments.is_empty() { + "explicit_qrels" + } else if empty { + "not_encoded" + } else { + "expected_evidence_fallback" + } +} From 91c4600a0c8ab6ae86f3aea3a8d3ef2952fbce5f Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:28:28 -0400 Subject: [PATCH 29/58] {"schema":"decodex/commit/1","summary":"Split quantitative report 
row counts","authority":"manual"} --- .../quantitative/report/row.rs | 11 +++++----- .../quantitative/report/row/query_counts.rs | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 6 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs index d3f8b232..71c66266 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs @@ -1,3 +1,5 @@ +mod query_counts; + use crate::{ QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, quantitative::{ @@ -28,12 +30,9 @@ pub(super) fn current_quantitative_row( evidence_class, input.adapter.adapter_id.as_str(), ); - let ranking_query_count = per_query_rows - .iter() - .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) - .count(); - let explicit_qrel_query_count = - per_query_rows.iter().filter(|row| row.qrel_source == "explicit_qrels").count(); + let query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice()); + let ranking_query_count = query_counts.ranking_query_count; + let explicit_qrel_query_count = query_counts.explicit_qrel_query_count; let metric_comparable = ranking_query_count > 0; let result_state = quantitative::quantitative_result_state(input.summary); let audit_evidence = audit_manifest::quantitative_audit_evidence( diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs new file mode 100644 index 00000000..12632f0a --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/query_counts.rs @@ -0,0 +1,21 @@ +use crate::QuantitativePerQueryRow; + +pub(super) struct 
QuantitativeQueryCounts { + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, +} + +pub(super) fn quantitative_query_counts( + per_query_rows: &[QuantitativePerQueryRow], +) -> QuantitativeQueryCounts { + QuantitativeQueryCounts { + ranking_query_count: per_query_rows + .iter() + .filter(|row| row.candidate_count > 0 && row.expected_relevant_count > 0) + .count(), + explicit_qrel_query_count: per_query_rows + .iter() + .filter(|row| row.qrel_source == "explicit_qrels") + .count(), + } +} From 353c953f78fea4d0eac7f46b16e9f7178bdc24ab Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:32:14 -0400 Subject: [PATCH 30/58] {"schema":"decodex/commit/1","summary":"Split quantitative ranking query detection","authority":"manual"} --- .../quantitative/metrics/ranking.rs | 39 ++----------------- .../quantitative/metrics/ranking/queries.rs | 38 ++++++++++++++++++ 2 files changed, 42 insertions(+), 35 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs index 918a8613..340a7115 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs @@ -1,9 +1,11 @@ -use crate::{BTreeMap, BTreeSet, RealWorldJob, ReportSummary, scoring}; +mod queries; + +use crate::{BTreeSet, RealWorldJob, ReportSummary}; pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { source_jobs .iter() - .filter(|job| !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job)) + .filter(|job| queries::is_ranking_query(job)) .map(|job| job.job_id.as_str()) .collect() } @@ -48,36 +50,3 @@ pub(super) fn ranking_coverage_state( pub(super) fn 
ranked_candidate_source(ranking_query_count: usize) -> &'static str { if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } } - -fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { - if !source_job.expected_answer.relevance_judgments.is_empty() { - return source_job - .expected_answer - .relevance_judgments - .iter() - .filter(|judgment| judgment.grade > 0.0) - .map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) - .collect(); - } - - source_job - .required_evidence - .iter() - .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) - .map(|evidence| (evidence.evidence_id.clone(), 1.0)) - .collect() -} - -fn ranking_query_attempted(job: &RealWorldJob) -> bool { - if !scoring::produced_evidence_order(job).is_empty() { - return true; - } - - let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { - return false; - }; - - answer.trace_explainability.as_ref().is_some_and(|trace| { - trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") - }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs new file mode 100644 index 00000000..8ada5678 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/queries.rs @@ -0,0 +1,38 @@ +use crate::{BTreeMap, RealWorldJob, scoring}; + +pub(super) fn is_ranking_query(job: &RealWorldJob) -> bool { + !ranking_relevance_grades(job).is_empty() && ranking_query_attempted(job) +} + +fn ranking_relevance_grades(source_job: &RealWorldJob) -> BTreeMap { + if !source_job.expected_answer.relevance_judgments.is_empty() { + return source_job + .expected_answer + .relevance_judgments + .iter() + .filter(|judgment| judgment.grade > 0.0) + 
.map(|judgment| (judgment.evidence_id.clone(), judgment.grade)) + .collect(); + } + + source_job + .required_evidence + .iter() + .filter(|evidence| matches!(evidence.requirement.as_str(), "cite" | "use" | "explain")) + .map(|evidence| (evidence.evidence_id.clone(), 1.0)) + .collect() +} + +fn ranking_query_attempted(job: &RealWorldJob) -> bool { + if !scoring::produced_evidence_order(job).is_empty() { + return true; + } + + let Some(answer) = job.corpus.adapter_response.as_ref().map(|response| &response.answer) else { + return false; + }; + + answer.trace_explainability.as_ref().is_some_and(|trace| { + trace.stages.iter().any(|stage| stage.stage_name == "live_adapter.retrieve") + }) && answer.latency_ms.is_some_and(|latency| latency.is_finite() && latency > 0.0) +} From 9a640111b7e4a40cb80d2fd1a932c0b9ae54e7e3 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:35:28 -0400 Subject: [PATCH 31/58] {"schema":"decodex/commit/1","summary":"Split quantitative confidence interval helpers","authority":"manual"} --- .../metrics/aggregate/confidence.rs | 74 ++----------------- .../metrics/aggregate/confidence/rates.rs | 39 ++++++++++ .../metrics/aggregate/confidence/wilson.rs | 22 ++++++ 3 files changed, 69 insertions(+), 66 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs index e1db5fb8..2a454bdc 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence.rs @@ -1,82 +1,24 @@ -use crate::{ - BTreeMap, 
QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting, - quantitative::{QUANTITATIVE_K_VALUES, WILSON_95_Z}, -}; +mod rates; +mod wilson; + +use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow}; pub(super) fn aggregate_confidence_intervals( rows: &[QuantitativePerQueryRow], ) -> BTreeMap { let mut confidence_intervals = BTreeMap::new(); - for metric in rate_metric_names() { - let (numerator, denominator) = aggregate_rate_numerator_denominator(rows, metric.as_str()); + for metric in rates::rate_metric_names() { + let (numerator, denominator) = + rates::aggregate_rate_numerator_denominator(rows, metric.as_str()); if denominator > 0 { confidence_intervals.insert( metric, - wilson_confidence_interval(numerator.min(denominator), denominator), + wilson::wilson_confidence_interval(numerator.min(denominator), denominator), ); } } confidence_intervals } - -fn rate_metric_names() -> Vec { - let mut metrics = Vec::new(); - - for k in QUANTITATIVE_K_VALUES { - metrics.push(format!("recall_at_{k}")); - metrics.push(format!("precision_at_{k}")); - metrics.push(format!("success_at_{k}")); - } - - metrics -} - -fn aggregate_rate_numerator_denominator( - rows: &[QuantitativePerQueryRow], - metric: &str, -) -> (usize, usize) { - let mut numerator = 0; - let mut denominator = 0; - - for row in rows { - let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { - continue; - }; - let Some(row_denominator) = row.denominators.get(metric).copied() else { - continue; - }; - - if row_denominator == 0 { - continue; - } - - denominator += row_denominator; - numerator += (value * row_denominator as f64).round() as usize; - } - - (numerator, denominator) -} - -fn wilson_confidence_interval( - numerator: usize, - denominator: usize, -) -> QuantitativeConfidenceInterval { - let n = denominator as f64; - let p = numerator as f64 / n; - let z2 = WILSON_95_Z * WILSON_95_Z; - let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); - let half_width = 
- WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); - - QuantitativeConfidenceInterval { - method: "wilson_score".to_string(), - confidence: 0.95, - lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), - upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), - numerator, - denominator, - } -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs new file mode 100644 index 00000000..4cfb3b7f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/rates.rs @@ -0,0 +1,39 @@ +use crate::{QuantitativePerQueryRow, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn rate_metric_names() -> Vec { + let mut metrics = Vec::new(); + + for k in QUANTITATIVE_K_VALUES { + metrics.push(format!("recall_at_{k}")); + metrics.push(format!("precision_at_{k}")); + metrics.push(format!("success_at_{k}")); + } + + metrics +} + +pub(super) fn aggregate_rate_numerator_denominator( + rows: &[QuantitativePerQueryRow], + metric: &str, +) -> (usize, usize) { + let mut numerator = 0; + let mut denominator = 0; + + for row in rows { + let Some(value) = row.metrics.get(metric).and_then(|value| *value) else { + continue; + }; + let Some(row_denominator) = row.denominators.get(metric).copied() else { + continue; + }; + + if row_denominator == 0 { + continue; + } + + denominator += row_denominator; + numerator += (value * row_denominator as f64).round() as usize; + } + + (numerator, denominator) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs new file mode 100644 index 00000000..99c3029d --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/confidence/wilson.rs @@ -0,0 +1,22 @@ +use crate::{QuantitativeConfidenceInterval, formatting, quantitative::WILSON_95_Z}; + +pub(super) fn wilson_confidence_interval( + numerator: usize, + denominator: usize, +) -> QuantitativeConfidenceInterval { + let n = denominator as f64; + let p = numerator as f64 / n; + let z2 = WILSON_95_Z * WILSON_95_Z; + let center = (p + z2 / (2.0 * n)) / (1.0 + z2 / n); + let half_width = + WILSON_95_Z * ((p * (1.0 - p) / n + z2 / (4.0 * n * n)).sqrt()) / (1.0 + z2 / n); + + QuantitativeConfidenceInterval { + method: "wilson_score".to_string(), + confidence: 0.95, + lower: formatting::round3((center - half_width).clamp(0.0, 1.0)), + upper: formatting::round3((center + half_width).clamp(0.0, 1.0)), + numerator, + denominator, + } +} From 52c4ae07992adc65e50d5f063b331cbb71ba1330 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:38:16 -0400 Subject: [PATCH 32/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit export helpers","authority":"manual"} --- .../quantitative/audit_manifest/export.rs | 29 ++++--------------- .../audit_manifest/export/claim_boundary.rs | 21 ++++++++++++++ .../audit_manifest/export/identity.rs | 9 ++++++ 3 files changed, 36 insertions(+), 23 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs index e99d5a9c..795960e0 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs @@ -1,6 +1,9 @@ +mod 
claim_boundary; +mod identity; + use crate::{ ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest, - RealWorldJob, Result, eyre, + RealWorldJob, Result, quantitative::{ self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, audit_manifest::{QuantitativeAuditContext, artifacts, validation}, @@ -15,9 +18,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( let product = args.product.trim(); let adapter_id = args.adapter_id.trim(); - if product.is_empty() || adapter_id.is_empty() { - return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); - } + identity::validate_audit_export_identity(product, adapter_id)?; let corpus_id = quantitative::quantitative_corpus_id(jobs); let ranking_query_count = metrics::ranking_query_count(jobs); @@ -44,7 +45,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, }], - claim_boundary: quantitative_audit_claim_boundary(args), + claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args), }; validation::validate_quantitative_audit_manifest( @@ -63,21 +64,3 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( Ok(manifest) } - -fn quantitative_audit_claim_boundary(args: &ExportQuantitativeAuditManifestArgs) -> String { - args.claim_boundary.clone().unwrap_or_else(|| { - if args.held_out || args.leakage_audited { - concat!( - "Audit manifest supplied by operator; runner validates run/corpus/product/", - "adapter/count/query-id/artifact bindings before opening row gates." - ) - .to_string() - } else { - concat!( - "Diagnostic audit manifest binds the current product-runtime fixture set to ", - "query ids and counts, but it does not prove held-out or leakage-audited status." 
- ) - .to_string() - } - }) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs new file mode 100644 index 00000000..3d572c61 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/claim_boundary.rs @@ -0,0 +1,21 @@ +use crate::ExportQuantitativeAuditManifestArgs; + +pub(super) fn quantitative_audit_claim_boundary( + args: &ExportQuantitativeAuditManifestArgs, +) -> String { + args.claim_boundary.clone().unwrap_or_else(|| { + if args.held_out || args.leakage_audited { + concat!( + "Audit manifest supplied by operator; runner validates run/corpus/product/", + "adapter/count/query-id/artifact bindings before opening row gates." + ) + .to_string() + } else { + concat!( + "Diagnostic audit manifest binds the current product-runtime fixture set to ", + "query ids and counts, but it does not prove held-out or leakage-audited status." 
+ ) + .to_string() + } + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs new file mode 100644 index 00000000..872da0e6 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/identity.rs @@ -0,0 +1,9 @@ +use crate::{Result, eyre}; + +pub(super) fn validate_audit_export_identity(product: &str, adapter_id: &str) -> Result<()> { + if product.is_empty() || adapter_id.is_empty() { + return Err(eyre::eyre!("quantitative audit export requires product and adapter_id.")); + } + + Ok(()) +} From f8959df71695a9bd15d9bd4199746393554851a8 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:47:00 -0400 Subject: [PATCH 33/58] {"schema":"decodex/commit/1","summary":"Split quantitative report row assembly","authority":"manual"} --- .../quantitative/report/row.rs | 46 ++++-------- .../quantitative/report/row/benchmark_row.rs | 71 +++++++++++++++++++ 2 files changed, 84 insertions(+), 33 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs index 71c66266..868863fe 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs @@ -1,12 +1,13 @@ +mod benchmark_row; mod query_counts; use crate::{ QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, quantitative::{ - self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, + self, audit_manifest::{self, QuantitativeAuditContext}, metrics, - report::QuantitativeReportInput, + report::{QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput}, }, }; @@ -55,39 +56,18 @@ pub(super) fn 
current_quantitative_row( metric_comparable, &audit_evidence, ); - let row = QuantitativeBenchmarkRow { - product: "ELF".to_string(), - adapter_id: input.adapter.adapter_id.clone(), - adapter_name: input.adapter.name.clone(), - suite: quantitative::quantitative_suite_id(input.jobs), - evidence_class: evidence_class.to_string(), - source_manifest_corpus_id: Some(corpus_id.clone()), - result_state: result_state.to_string(), - comparable: metric_comparable, - metric_comparable, - leaderboard_eligible, - held_out: audit_evidence.held_out, - leakage_audited: audit_evidence.leakage_audited, - audit_manifest_id: audit_evidence.audit_manifest_id, - fixture_regression_only: evidence_class == "fixture_backed", - sample_size: input.jobs.len(), + let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput { + input, + corpus_id: corpus_id.as_str(), + evidence_class, + per_query_rows: per_query_rows.as_slice(), ranking_query_count, - ranking_coverage_state: metrics::ranking_coverage_state( - input.summary, - input.source_jobs.len(), - ranking_query_count, - ) - .to_string(), - ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), - qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) - .to_string(), explicit_qrel_query_count, - metrics: metrics::aggregate_metrics(per_query_rows.as_slice()), - metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), - denominators: metrics::aggregate_denominators(per_query_rows.as_slice()), - confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows.as_slice()), - claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), - }; + metric_comparable, + result_state, + audit_evidence, + leaderboard_eligible, + }); Ok(CurrentQuantitativeRow { corpus_id, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs new file mode 100644 index 00000000..53198ae6 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs @@ -0,0 +1,71 @@ +use crate::{ + QuantitativeBenchmarkRow, QuantitativePerQueryRow, + quantitative::{ + self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, audit_manifest::QuantitativeAuditEvidence, metrics, + report::QuantitativeReportInput, + }, +}; + +pub(super) struct QuantitativeBenchmarkRowInput<'a, 'b> { + pub(super) input: &'a QuantitativeReportInput<'b>, + pub(super) corpus_id: &'a str, + pub(super) evidence_class: &'a str, + pub(super) per_query_rows: &'a [QuantitativePerQueryRow], + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, + pub(super) metric_comparable: bool, + pub(super) result_state: &'a str, + pub(super) audit_evidence: QuantitativeAuditEvidence, + pub(super) leaderboard_eligible: bool, +} + +pub(super) fn quantitative_benchmark_row( + row_input: QuantitativeBenchmarkRowInput<'_, '_>, +) -> QuantitativeBenchmarkRow { + let QuantitativeBenchmarkRowInput { + input, + corpus_id, + evidence_class, + per_query_rows, + ranking_query_count, + explicit_qrel_query_count, + metric_comparable, + result_state, + audit_evidence, + leaderboard_eligible, + } = row_input; + + QuantitativeBenchmarkRow { + product: "ELF".to_string(), + adapter_id: input.adapter.adapter_id.clone(), + adapter_name: input.adapter.name.clone(), + suite: quantitative::quantitative_suite_id(input.jobs), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: result_state.to_string(), + comparable: metric_comparable, + metric_comparable, + leaderboard_eligible, + held_out: audit_evidence.held_out, + leakage_audited: audit_evidence.leakage_audited, + audit_manifest_id: audit_evidence.audit_manifest_id, + fixture_regression_only: evidence_class == 
"fixture_backed", + sample_size: input.jobs.len(), + ranking_query_count, + ranking_coverage_state: metrics::ranking_coverage_state( + input.summary, + input.source_jobs.len(), + ranking_query_count, + ) + .to_string(), + ranked_candidate_source: metrics::ranked_candidate_source(ranking_query_count).to_string(), + qrel_source: metrics::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) + .to_string(), + explicit_qrel_query_count, + metrics: metrics::aggregate_metrics(per_query_rows), + metric_states: metrics::aggregate_metric_states(result_state, metric_comparable), + denominators: metrics::aggregate_denominators(per_query_rows), + confidence_intervals: metrics::aggregate_confidence_intervals(per_query_rows), + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} From c01c488f46655082c4708967fb89ce6e0127f009 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:50:18 -0400 Subject: [PATCH 34/58] {"schema":"decodex/commit/1","summary":"Split quantitative average precision metric","authority":"manual"} --- .../per_query/query_metrics/ranking.rs | 26 +++-------------- .../ranking/average_precision.rs | 28 +++++++++++++++++++ 2 files changed, 32 insertions(+), 22 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs index 515bfaed..5abea808 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs @@ -1,4 +1,6 @@ -use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics}; +mod average_precision; + +use 
crate::{BTreeMap, quantitative::metrics::per_query::query_metrics}; pub(super) fn reciprocal_rank( candidates: &[String], @@ -54,25 +56,5 @@ pub(super) fn average_precision( candidates: &[String], relevance: &BTreeMap, ) -> Option { - let positive_count = query_metrics::positive_qrel_count(relevance); - - if positive_count == 0 { - return None; - } - - let mut hit_count = 0; - let mut precision_sum = 0.0; - let mut seen = BTreeSet::new(); - - for (index, candidate) in candidates.iter().enumerate() { - if !seen.insert(candidate.as_str()) { - continue; - } - if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { - hit_count += 1; - precision_sum += hit_count as f64 / (index + 1) as f64; - } - } - - Some(precision_sum / positive_count as f64) + average_precision::average_precision(candidates, relevance) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs new file mode 100644 index 00000000..13c196ca --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/average_precision.rs @@ -0,0 +1,28 @@ +use crate::{BTreeMap, BTreeSet, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn average_precision( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + let positive_count = query_metrics::positive_qrel_count(relevance); + + if positive_count == 0 { + return None; + } + + let mut hit_count = 0; + let mut precision_sum = 0.0; + let mut seen = BTreeSet::new(); + + for (index, candidate) in candidates.iter().enumerate() { + if !seen.insert(candidate.as_str()) { + continue; + } + if relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) { + hit_count += 1; + precision_sum += hit_count as f64 / (index + 1) as f64; + } + } + + Some(precision_sum / 
positive_count as f64) +} From cc23d1aa3c78c377dadf9995e5b1d15fafc70f31 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 14:56:23 -0400 Subject: [PATCH 35/58] {"schema":"decodex/commit/1","summary":"Split quantitative report audit gates","authority":"manual"} --- .../quantitative/report/row.rs | 29 ++++-------- .../quantitative/report/row/audit_gates.rs | 45 +++++++++++++++++++ 2 files changed, 53 insertions(+), 21 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs index 868863fe..8599700a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs @@ -1,12 +1,11 @@ +mod audit_gates; mod benchmark_row; mod query_counts; use crate::{ QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, quantitative::{ - self, - audit_manifest::{self, QuantitativeAuditContext}, - metrics, + self, metrics, report::{QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput}, }, }; @@ -36,26 +35,14 @@ pub(super) fn current_quantitative_row( let explicit_qrel_query_count = query_counts.explicit_qrel_query_count; let metric_comparable = ranking_query_count > 0; let result_state = quantitative::quantitative_result_state(input.summary); - let audit_evidence = audit_manifest::quantitative_audit_evidence( - input.audit_manifest_path, - QuantitativeAuditContext { - run_id: input.run_id, - corpus_id: corpus_id.as_str(), - product: "ELF", - adapter_id: input.adapter.adapter_id.as_str(), - source_jobs: input.source_jobs, - ranking_query_count, - explicit_qrel_query_count, - }, - )?; - let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible( + let audit_gates = audit_gates::quantitative_audit_gates( + 
input, + corpus_id.as_str(), evidence_class, - input.source_jobs.len(), ranking_query_count, explicit_qrel_query_count, metric_comparable, - &audit_evidence, - ); + )?; let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput { input, corpus_id: corpus_id.as_str(), @@ -65,8 +52,8 @@ pub(super) fn current_quantitative_row( explicit_qrel_query_count, metric_comparable, result_state, - audit_evidence, - leaderboard_eligible, + audit_evidence: audit_gates.audit_evidence, + leaderboard_eligible: audit_gates.leaderboard_eligible, }); Ok(CurrentQuantitativeRow { diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs new file mode 100644 index 00000000..31d2ddee --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/audit_gates.rs @@ -0,0 +1,45 @@ +use crate::{ + Result, + quantitative::{ + self, + audit_manifest::{self, QuantitativeAuditContext, QuantitativeAuditEvidence}, + report::QuantitativeReportInput, + }, +}; + +pub(super) struct QuantitativeAuditGates { + pub(super) audit_evidence: QuantitativeAuditEvidence, + pub(super) leaderboard_eligible: bool, +} + +pub(super) fn quantitative_audit_gates( + input: &QuantitativeReportInput<'_>, + corpus_id: &str, + evidence_class: &str, + ranking_query_count: usize, + explicit_qrel_query_count: usize, + metric_comparable: bool, +) -> Result { + let audit_evidence = audit_manifest::quantitative_audit_evidence( + input.audit_manifest_path, + QuantitativeAuditContext { + run_id: input.run_id, + corpus_id, + product: "ELF", + adapter_id: input.adapter.adapter_id.as_str(), + source_jobs: input.source_jobs, + ranking_query_count, + explicit_qrel_query_count, + }, + )?; + let leaderboard_eligible = quantitative::quantitative_row_leaderboard_eligible( + evidence_class, + input.source_jobs.len(), + ranking_query_count, + 
explicit_qrel_query_count, + metric_comparable, + &audit_evidence, + ); + + Ok(QuantitativeAuditGates { audit_evidence, leaderboard_eligible }) +} From dad67b4b40c29f6eefcb8b4d0d50054a86140421 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:01:19 -0400 Subject: [PATCH 36/58] {"schema":"decodex/commit/1","summary":"Split quantitative report data types","authority":"manual"} --- .../quantitative_reports.rs | 140 ++---------------- .../quantitative_reports/audit.rs | 29 ++++ .../quantitative_reports/benchmark.rs | 89 +++++++++++ .../quantitative_reports/product.rs | 12 ++ 4 files changed, 142 insertions(+), 128 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs index ded35360..a3bff704 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports.rs @@ -1,128 +1,12 @@ -use crate::{BTreeMap, Deserialize, Serialize}; - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct QuantitativeBenchmarkReport { - pub(super) schema: String, - pub(super) generated_at: String, - pub(super) corpus_id: String, - pub(super) k_values: Vec, - pub(super) rows: Vec, - #[serde(default)] - pub(super) per_query_rows: Vec, - #[serde(default)] - pub(super) metrics_not_encoded: Vec, - pub(super) controls: QuantitativeBenchmarkControls, - pub(super) claim_boundary: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct QuantitativeBenchmarkRow { - pub(super) product: String, - pub(super) adapter_id: String, - 
pub(super) adapter_name: String, - pub(super) suite: String, - pub(super) evidence_class: String, - pub(super) source_manifest_corpus_id: Option, - pub(super) result_state: String, - pub(super) comparable: bool, - pub(super) metric_comparable: bool, - pub(super) leaderboard_eligible: bool, - pub(super) held_out: bool, - pub(super) leakage_audited: bool, - pub(super) audit_manifest_id: Option, - pub(super) fixture_regression_only: bool, - pub(super) sample_size: usize, - pub(super) ranking_query_count: usize, - pub(super) ranking_coverage_state: String, - pub(super) ranked_candidate_source: String, - pub(super) qrel_source: String, - pub(super) explicit_qrel_query_count: usize, - pub(super) metrics: BTreeMap>, - pub(super) metric_states: BTreeMap, - pub(super) denominators: BTreeMap, - #[serde(default)] - pub(super) confidence_intervals: BTreeMap, - pub(super) claim_boundary: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct QuantitativePerQueryRow { - pub(super) job_id: String, - pub(super) suite: String, - pub(super) evidence_class: String, - pub(super) source_manifest_corpus_id: Option, - pub(super) result_state: String, - pub(super) expected_relevant_count: usize, - pub(super) candidate_count: usize, - pub(super) qrel_source: String, - pub(super) relevance_grade_sum: f64, - pub(super) product: String, - pub(super) adapter_id: String, - pub(super) metrics: BTreeMap>, - pub(super) metric_states: BTreeMap, - pub(super) denominators: BTreeMap, - pub(super) claim_boundary: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct QuantitativeBenchmarkControls { - pub(super) same_corpus_required: bool, - pub(super) same_task_required: bool, - pub(super) ranked_candidates_required_for_ranking_metrics: bool, - pub(super) explicit_relevance_judgments_required_for_leaderboard: bool, - pub(super) minimum_query_count_for_leaderboard: usize, - pub(super) current_query_count: usize, - pub(super) 
current_ranking_query_count: usize, - pub(super) current_explicit_qrel_query_count: usize, - pub(super) leaderboard_claim_allowed: bool, - pub(super) leakage_control: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct QuantitativeConfidenceInterval { - pub(super) method: String, - pub(super) confidence: f64, - pub(super) lower: f64, - pub(super) upper: f64, - pub(super) numerator: usize, - pub(super) denominator: usize, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(super) struct QuantitativeProductManifest { - pub(super) schema: String, - pub(super) manifest_id: String, - pub(super) corpus_id: String, - #[serde(default)] - pub(super) rows: Vec, - #[serde(default)] - pub(super) per_query_rows: Vec, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct QuantitativeAuditManifest { - pub(super) schema: String, - pub(super) manifest_id: String, - pub(super) run_id: String, - pub(super) corpus_id: String, - pub(super) product: String, - pub(super) adapter_id: String, - pub(super) held_out: bool, - pub(super) leakage_audited: bool, - pub(super) sample_size: usize, - pub(super) ranking_query_count: usize, - pub(super) explicit_qrel_query_count: usize, - pub(super) query_ids: Vec, - #[serde(default)] - pub(super) controls: Vec, - #[serde(default)] - pub(super) artifacts: Vec, - pub(super) claim_boundary: String, -} - -#[derive(Clone, Debug, Deserialize, Serialize)] -pub(super) struct QuantitativeAuditArtifact { - pub(super) role: String, - pub(super) path: String, - pub(super) sha256: String, -} +mod audit; +mod benchmark; +mod product; + +pub(crate) use self::{ + audit::{QuantitativeAuditArtifact, QuantitativeAuditManifest}, + benchmark::{ + QuantitativeBenchmarkControls, QuantitativeBenchmarkReport, QuantitativeBenchmarkRow, + QuantitativeConfidenceInterval, QuantitativePerQueryRow, + }, + product::QuantitativeProductManifest, +}; diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs new file mode 100644 index 00000000..4b2ce584 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/audit.rs @@ -0,0 +1,29 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct QuantitativeAuditManifest { + pub(crate) schema: String, + pub(crate) manifest_id: String, + pub(crate) run_id: String, + pub(crate) corpus_id: String, + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) held_out: bool, + pub(crate) leakage_audited: bool, + pub(crate) sample_size: usize, + pub(crate) ranking_query_count: usize, + pub(crate) explicit_qrel_query_count: usize, + pub(crate) query_ids: Vec, + #[serde(default)] + pub(crate) controls: Vec, + #[serde(default)] + pub(crate) artifacts: Vec, + pub(crate) claim_boundary: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +pub(crate) struct QuantitativeAuditArtifact { + pub(crate) role: String, + pub(crate) path: String, + pub(crate) sha256: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs new file mode 100644 index 00000000..7dfc1c88 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs @@ -0,0 +1,89 @@ +use crate::{BTreeMap, Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkReport { + pub(crate) schema: String, + pub(crate) generated_at: String, + pub(crate) corpus_id: String, + pub(crate) k_values: Vec, + pub(crate) rows: Vec, + #[serde(default)] + pub(crate) per_query_rows: Vec, + #[serde(default)] + pub(crate) metrics_not_encoded: Vec, + pub(crate) controls: QuantitativeBenchmarkControls, + 
pub(crate) claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkRow { + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) adapter_name: String, + pub(crate) suite: String, + pub(crate) evidence_class: String, + pub(crate) source_manifest_corpus_id: Option, + pub(crate) result_state: String, + pub(crate) comparable: bool, + pub(crate) metric_comparable: bool, + pub(crate) leaderboard_eligible: bool, + pub(crate) held_out: bool, + pub(crate) leakage_audited: bool, + pub(crate) audit_manifest_id: Option, + pub(crate) fixture_regression_only: bool, + pub(crate) sample_size: usize, + pub(crate) ranking_query_count: usize, + pub(crate) ranking_coverage_state: String, + pub(crate) ranked_candidate_source: String, + pub(crate) qrel_source: String, + pub(crate) explicit_qrel_query_count: usize, + pub(crate) metrics: BTreeMap>, + pub(crate) metric_states: BTreeMap, + pub(crate) denominators: BTreeMap, + #[serde(default)] + pub(crate) confidence_intervals: BTreeMap, + pub(crate) claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativePerQueryRow { + pub(crate) job_id: String, + pub(crate) suite: String, + pub(crate) evidence_class: String, + pub(crate) source_manifest_corpus_id: Option, + pub(crate) result_state: String, + pub(crate) expected_relevant_count: usize, + pub(crate) candidate_count: usize, + pub(crate) qrel_source: String, + pub(crate) relevance_grade_sum: f64, + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) metrics: BTreeMap>, + pub(crate) metric_states: BTreeMap, + pub(crate) denominators: BTreeMap, + pub(crate) claim_boundary: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkControls { + pub(crate) same_corpus_required: bool, + pub(crate) same_task_required: bool, + pub(crate) 
ranked_candidates_required_for_ranking_metrics: bool, + pub(crate) explicit_relevance_judgments_required_for_leaderboard: bool, + pub(crate) minimum_query_count_for_leaderboard: usize, + pub(crate) current_query_count: usize, + pub(crate) current_ranking_query_count: usize, + pub(crate) current_explicit_qrel_query_count: usize, + pub(crate) leaderboard_claim_allowed: bool, + pub(crate) leakage_control: String, +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeConfidenceInterval { + pub(crate) method: String, + pub(crate) confidence: f64, + pub(crate) lower: f64, + pub(crate) upper: f64, + pub(crate) numerator: usize, + pub(crate) denominator: usize, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs new file mode 100644 index 00000000..efc5c357 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/product.rs @@ -0,0 +1,12 @@ +use crate::{Deserialize, QuantitativeBenchmarkRow, QuantitativePerQueryRow, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeProductManifest { + pub(crate) schema: String, + pub(crate) manifest_id: String, + pub(crate) corpus_id: String, + #[serde(default)] + pub(crate) rows: Vec, + #[serde(default)] + pub(crate) per_query_rows: Vec, +} From e36caa61c671200deaefb65056f1d5691a03d239 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:05:29 -0400 Subject: [PATCH 37/58] {"schema":"decodex/commit/1","summary":"Split quantitative benchmark report types","authority":"manual"} --- .../quantitative_reports/benchmark.rs | 100 ++---------------- .../benchmark/confidence.rs | 11 ++ .../benchmark/controls.rs | 15 +++ .../benchmark/per_query.rs | 20 ++++ .../quantitative_reports/benchmark/report.rs | 19 ++++ .../quantitative_reports/benchmark/row.rs | 31 ++++++ 6 files changed, 107 
insertions(+), 89 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs index 7dfc1c88..50d36ff1 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark.rs @@ -1,89 +1,11 @@ -use crate::{BTreeMap, Deserialize, Serialize}; - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(crate) struct QuantitativeBenchmarkReport { - pub(crate) schema: String, - pub(crate) generated_at: String, - pub(crate) corpus_id: String, - pub(crate) k_values: Vec, - pub(crate) rows: Vec, - #[serde(default)] - pub(crate) per_query_rows: Vec, - #[serde(default)] - pub(crate) metrics_not_encoded: Vec, - pub(crate) controls: QuantitativeBenchmarkControls, - pub(crate) claim_boundary: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(crate) struct QuantitativeBenchmarkRow { - pub(crate) product: String, - pub(crate) adapter_id: String, - pub(crate) adapter_name: String, - pub(crate) suite: String, - pub(crate) evidence_class: String, - pub(crate) source_manifest_corpus_id: Option, - pub(crate) result_state: String, - pub(crate) comparable: bool, - pub(crate) metric_comparable: bool, - pub(crate) leaderboard_eligible: bool, - pub(crate) held_out: bool, - pub(crate) 
leakage_audited: bool, - pub(crate) audit_manifest_id: Option, - pub(crate) fixture_regression_only: bool, - pub(crate) sample_size: usize, - pub(crate) ranking_query_count: usize, - pub(crate) ranking_coverage_state: String, - pub(crate) ranked_candidate_source: String, - pub(crate) qrel_source: String, - pub(crate) explicit_qrel_query_count: usize, - pub(crate) metrics: BTreeMap>, - pub(crate) metric_states: BTreeMap, - pub(crate) denominators: BTreeMap, - #[serde(default)] - pub(crate) confidence_intervals: BTreeMap, - pub(crate) claim_boundary: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(crate) struct QuantitativePerQueryRow { - pub(crate) job_id: String, - pub(crate) suite: String, - pub(crate) evidence_class: String, - pub(crate) source_manifest_corpus_id: Option, - pub(crate) result_state: String, - pub(crate) expected_relevant_count: usize, - pub(crate) candidate_count: usize, - pub(crate) qrel_source: String, - pub(crate) relevance_grade_sum: f64, - pub(crate) product: String, - pub(crate) adapter_id: String, - pub(crate) metrics: BTreeMap>, - pub(crate) metric_states: BTreeMap, - pub(crate) denominators: BTreeMap, - pub(crate) claim_boundary: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(crate) struct QuantitativeBenchmarkControls { - pub(crate) same_corpus_required: bool, - pub(crate) same_task_required: bool, - pub(crate) ranked_candidates_required_for_ranking_metrics: bool, - pub(crate) explicit_relevance_judgments_required_for_leaderboard: bool, - pub(crate) minimum_query_count_for_leaderboard: usize, - pub(crate) current_query_count: usize, - pub(crate) current_ranking_query_count: usize, - pub(crate) current_explicit_qrel_query_count: usize, - pub(crate) leaderboard_claim_allowed: bool, - pub(crate) leakage_control: String, -} - -#[derive(Clone, Debug, Default, Deserialize, Serialize)] -pub(crate) struct QuantitativeConfidenceInterval { - pub(crate) method: String, - pub(crate) 
confidence: f64, - pub(crate) lower: f64, - pub(crate) upper: f64, - pub(crate) numerator: usize, - pub(crate) denominator: usize, -} +mod confidence; +mod controls; +mod per_query; +mod report; +mod row; + +pub(crate) use self::{ + confidence::QuantitativeConfidenceInterval, controls::QuantitativeBenchmarkControls, + per_query::QuantitativePerQueryRow, report::QuantitativeBenchmarkReport, + row::QuantitativeBenchmarkRow, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs new file mode 100644 index 00000000..7a3da458 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/confidence.rs @@ -0,0 +1,11 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeConfidenceInterval { + pub(crate) method: String, + pub(crate) confidence: f64, + pub(crate) lower: f64, + pub(crate) upper: f64, + pub(crate) numerator: usize, + pub(crate) denominator: usize, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs new file mode 100644 index 00000000..1e8ea05f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/controls.rs @@ -0,0 +1,15 @@ +use crate::{Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkControls { + pub(crate) same_corpus_required: bool, + pub(crate) same_task_required: bool, + pub(crate) ranked_candidates_required_for_ranking_metrics: bool, + pub(crate) explicit_relevance_judgments_required_for_leaderboard: bool, + pub(crate) minimum_query_count_for_leaderboard: usize, + pub(crate) current_query_count: usize, + pub(crate) 
current_ranking_query_count: usize, + pub(crate) current_explicit_qrel_query_count: usize, + pub(crate) leaderboard_claim_allowed: bool, + pub(crate) leakage_control: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs new file mode 100644 index 00000000..35ce6d6f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/per_query.rs @@ -0,0 +1,20 @@ +use crate::{BTreeMap, Deserialize, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativePerQueryRow { + pub(crate) job_id: String, + pub(crate) suite: String, + pub(crate) evidence_class: String, + pub(crate) source_manifest_corpus_id: Option, + pub(crate) result_state: String, + pub(crate) expected_relevant_count: usize, + pub(crate) candidate_count: usize, + pub(crate) qrel_source: String, + pub(crate) relevance_grade_sum: f64, + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) metrics: BTreeMap>, + pub(crate) metric_states: BTreeMap, + pub(crate) denominators: BTreeMap, + pub(crate) claim_boundary: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs new file mode 100644 index 00000000..1a57e138 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/report.rs @@ -0,0 +1,19 @@ +use crate::{ + Deserialize, QuantitativeBenchmarkControls, QuantitativeBenchmarkRow, QuantitativePerQueryRow, + Serialize, +}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkReport { + pub(crate) schema: String, + pub(crate) generated_at: String, + pub(crate) corpus_id: String, + pub(crate) k_values: Vec, + pub(crate) rows: Vec, + 
#[serde(default)] + pub(crate) per_query_rows: Vec, + #[serde(default)] + pub(crate) metrics_not_encoded: Vec, + pub(crate) controls: QuantitativeBenchmarkControls, + pub(crate) claim_boundary: String, +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs new file mode 100644 index 00000000..cdef9042 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative_reports/benchmark/row.rs @@ -0,0 +1,31 @@ +use crate::{BTreeMap, Deserialize, QuantitativeConfidenceInterval, Serialize}; + +#[derive(Clone, Debug, Default, Deserialize, Serialize)] +pub(crate) struct QuantitativeBenchmarkRow { + pub(crate) product: String, + pub(crate) adapter_id: String, + pub(crate) adapter_name: String, + pub(crate) suite: String, + pub(crate) evidence_class: String, + pub(crate) source_manifest_corpus_id: Option, + pub(crate) result_state: String, + pub(crate) comparable: bool, + pub(crate) metric_comparable: bool, + pub(crate) leaderboard_eligible: bool, + pub(crate) held_out: bool, + pub(crate) leakage_audited: bool, + pub(crate) audit_manifest_id: Option, + pub(crate) fixture_regression_only: bool, + pub(crate) sample_size: usize, + pub(crate) ranking_query_count: usize, + pub(crate) ranking_coverage_state: String, + pub(crate) ranked_candidate_source: String, + pub(crate) qrel_source: String, + pub(crate) explicit_qrel_query_count: usize, + pub(crate) metrics: BTreeMap>, + pub(crate) metric_states: BTreeMap, + pub(crate) denominators: BTreeMap, + #[serde(default)] + pub(crate) confidence_intervals: BTreeMap, + pub(crate) claim_boundary: String, +} From b3c5a5013875016094ab06f4a47af4f15c768ac8 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:09:42 -0400 Subject: [PATCH 38/58] {"schema":"decodex/commit/1","summary":"Split quantitative product row validation","authority":"manual"} --- 
.../validation/rows/product.rs | 67 ++----------------- .../validation/rows/product/identity.rs | 34 ++++++++++ .../validation/rows/product/leaderboard.rs | 31 +++++++++ 3 files changed, 72 insertions(+), 60 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs index 913b0628..ac009d59 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product.rs @@ -1,7 +1,7 @@ -use crate::{ - Path, QuantitativeBenchmarkRow, QuantitativeProductManifest, Result, eyre, - quantitative::MIN_LEADERBOARD_QUERY_COUNT, -}; +mod identity; +mod leaderboard; + +use crate::{Path, QuantitativeProductManifest, Result}; pub(super) fn validate_quantitative_product_rows( manifest: &QuantitativeProductManifest, @@ -9,65 +9,12 @@ pub(super) fn validate_quantitative_product_rows( corpus_id: &str, ) -> Result<()> { for row in &manifest.rows { - if row.product == "ELF" { - return Err(eyre::eyre!( - "{} quantitative product manifest must not inject ELF self rows.", - path.display() - )); - } - if row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.adapter_name.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative product row.", - path.display() - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return 
Err(eyre::eyre!( - "{} row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } + identity::validate_product_row_identity(path, row, corpus_id)?; + if row.leaderboard_eligible { - validate_leaderboard_eligible_product_row(path, row)?; + leaderboard::validate_leaderboard_eligible_product_row(path, row)?; } } Ok(()) } - -fn validate_leaderboard_eligible_product_row( - path: &Path, - row: &QuantitativeBenchmarkRow, -) -> Result<()> { - let has_audit_manifest_id = row - .audit_manifest_id - .as_deref() - .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); - - if row.evidence_class != "live_real_world" - || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT - || row.ranking_query_count != row.sample_size - || row.explicit_qrel_query_count != row.ranking_query_count - || !row.held_out - || !row.leakage_audited - || !has_audit_manifest_id - { - return Err(eyre::eyre!( - "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", - path.display(), - row.product, - row.adapter_id - )); - } - - Ok(()) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs new file mode 100644 index 00000000..5dd82465 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/identity.rs @@ -0,0 +1,34 @@ +use crate::{Path, QuantitativeBenchmarkRow, Result, eyre}; + +pub(super) fn validate_product_row_identity( + path: &Path, + row: &QuantitativeBenchmarkRow, + corpus_id: &str, +) -> Result<()> { + if row.product == "ELF" { + return Err(eyre::eyre!( + "{} quantitative product manifest must not inject ELF self rows.", + path.display() + )); + } + if 
row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.adapter_name.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + { + return Err(eyre::eyre!("{} has an incomplete quantitative product row.", path.display())); + } + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs new file mode 100644 index 00000000..e5f76ae2 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/product/leaderboard.rs @@ -0,0 +1,31 @@ +use crate::{ + Path, QuantitativeBenchmarkRow, Result, eyre, quantitative::MIN_LEADERBOARD_QUERY_COUNT, +}; + +pub(super) fn validate_leaderboard_eligible_product_row( + path: &Path, + row: &QuantitativeBenchmarkRow, +) -> Result<()> { + let has_audit_manifest_id = row + .audit_manifest_id + .as_deref() + .is_some_and(|audit_manifest_id| !audit_manifest_id.trim().is_empty()); + + if row.evidence_class != "live_real_world" + || row.sample_size < MIN_LEADERBOARD_QUERY_COUNT + || row.ranking_query_count != row.sample_size + || row.explicit_qrel_query_count != row.ranking_query_count + || !row.held_out + || !row.leakage_audited + || !has_audit_manifest_id + { + return Err(eyre::eyre!( + "{} row {}:{} is marked leaderboard_eligible without the required live/product-runtime, query-count, explicit-qrel, held-out, leakage-audit, and audit-manifest controls.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} From fa6ebe9932c114ea66f23f5eea92afe10c5db12c Mon Sep 17 00:00:00 
2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:14:38 -0400 Subject: [PATCH 39/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit identity validation","authority":"manual"} --- .../audit_manifest/validation/identity.rs | 69 ++----------------- .../validation/identity/context.rs | 63 +++++++++++++++++ .../validation/identity/schema.rs | 21 ++++++ 3 files changed, 90 insertions(+), 63 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs index 461e9eb6..6444cdea 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity.rs @@ -1,6 +1,8 @@ +mod context; +mod schema; + use crate::{ - Path, QuantitativeAuditManifest, Result, eyre, - quantitative::{QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, audit_manifest::QuantitativeAuditContext}, + Path, QuantitativeAuditManifest, Result, quantitative::audit_manifest::QuantitativeAuditContext, }; pub(super) fn validate_quantitative_audit_identity( @@ -8,66 +10,7 @@ pub(super) fn validate_quantitative_audit_identity( path: &Path, context: &QuantitativeAuditContext<'_>, ) -> Result<()> { - if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { - return Err(eyre::eyre!( - "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", - path.display(), - manifest.schema - )); - } - if manifest.manifest_id.trim().is_empty() { - return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); - } - if manifest.run_id != context.run_id { - return 
Err(eyre::eyre!( - "{} has run_id {}, expected {}.", - path.display(), - manifest.run_id, - context.run_id - )); - } - if manifest.corpus_id != context.corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected {}.", - path.display(), - manifest.corpus_id, - context.corpus_id - )); - } - if manifest.product != context.product || manifest.adapter_id != context.adapter_id { - return Err(eyre::eyre!( - "{} has product {}:{} but current row is {}:{}.", - path.display(), - manifest.product, - manifest.adapter_id, - context.product, - context.adapter_id - )); - } - if manifest.sample_size != context.source_jobs.len() { - return Err(eyre::eyre!( - "{} has sample_size {}, expected {}.", - path.display(), - manifest.sample_size, - context.source_jobs.len() - )); - } - if manifest.ranking_query_count != context.ranking_query_count { - return Err(eyre::eyre!( - "{} has ranking_query_count {}, expected {}.", - path.display(), - manifest.ranking_query_count, - context.ranking_query_count - )); - } - if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { - return Err(eyre::eyre!( - "{} has explicit_qrel_query_count {}, expected {}.", - path.display(), - manifest.explicit_qrel_query_count, - context.explicit_qrel_query_count - )); - } + schema::validate_quantitative_audit_schema(manifest, path)?; - Ok(()) + context::validate_quantitative_audit_context(manifest, path, context) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs new file mode 100644 index 00000000..d11c8636 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs @@ -0,0 +1,63 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, + quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn 
validate_quantitative_audit_context( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, expected {}.", + path.display(), + manifest.sample_size, + context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs new file mode 100644 index 00000000..f288eeba --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/schema.rs @@ -0,0 +1,21 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, 
quantitative::QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, +}; + +pub(super) fn validate_quantitative_audit_schema( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if manifest.schema != QUANTITATIVE_AUDIT_MANIFEST_SCHEMA { + return Err(eyre::eyre!( + "{} has schema {}, expected {QUANTITATIVE_AUDIT_MANIFEST_SCHEMA}.", + path.display(), + manifest.schema + )); + } + if manifest.manifest_id.trim().is_empty() { + return Err(eyre::eyre!("{} has an empty manifest_id.", path.display())); + } + + Ok(()) +} From e1dfc21ef5494b1e9a8b5bc7d40ca59e4b335206 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:19:32 -0400 Subject: [PATCH 40/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit digest paths","authority":"manual"} --- .../audit_manifest/artifacts/digest.rs | 36 +++---------------- .../audit_manifest/artifacts/digest/paths.rs | 31 ++++++++++++++++ 2 files changed, 35 insertions(+), 32 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs index bb75c802..d87860d9 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest.rs @@ -1,4 +1,6 @@ -use crate::{Path, PathBuf, Result, fs}; +mod paths; + +use crate::{Path, Result, fs}; pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) -> Result { let mut hasher = blake3::Hasher::new(); @@ -13,7 +15,7 @@ pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) return Ok(hasher.finalize().to_hex().to_string()); } - let paths = audit_fixture_paths(path)?; + let paths = paths::audit_fixture_paths(path)?; 
for fixture in paths { let relative = fixture @@ -27,36 +29,6 @@ pub(in crate::quantitative::audit_manifest) fn fixture_path_digest(path: &Path) Ok(hasher.finalize().to_hex().to_string()) } -fn audit_fixture_paths(path: &Path) -> Result> { - let mut paths = Vec::new(); - - collect_audit_fixture_paths(path, &mut paths)?; - - paths.sort(); - - Ok(paths) -} - -fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { - if path.is_file() { - paths.push(path.to_path_buf()); - - return Ok(()); - } - - for entry in fs::read_dir(path)? { - let entry_path = entry?.path(); - - if entry_path.is_dir() { - collect_audit_fixture_paths(entry_path.as_path(), paths)?; - } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { - paths.push(entry_path); - } - } - - Ok(()) -} - fn hash_fixture_file(path: &Path, logical_path: &str, hasher: &mut blake3::Hasher) -> Result<()> { hasher.update(logical_path.as_bytes()); hasher.update(b"\0"); diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs new file mode 100644 index 00000000..a7ba276c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/digest/paths.rs @@ -0,0 +1,31 @@ +use crate::{Path, PathBuf, Result, fs}; + +pub(super) fn audit_fixture_paths(path: &Path) -> Result> { + let mut paths = Vec::new(); + + collect_audit_fixture_paths(path, &mut paths)?; + + paths.sort(); + + Ok(paths) +} + +fn collect_audit_fixture_paths(path: &Path, paths: &mut Vec) -> Result<()> { + if path.is_file() { + paths.push(path.to_path_buf()); + + return Ok(()); + } + + for entry in fs::read_dir(path)? 
{ + let entry_path = entry?.path(); + + if entry_path.is_dir() { + collect_audit_fixture_paths(entry_path.as_path(), paths)?; + } else if entry_path.extension().and_then(|ext| ext.to_str()) == Some("json") { + paths.push(entry_path); + } + } + + Ok(()) +} From 788ab64d3db9a123d72fb51b7635596c513707ae Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:25:15 -0400 Subject: [PATCH 41/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit manifest assembly","authority":"manual"} --- .../quantitative/audit_manifest/export.rs | 38 ++-------------- .../audit_manifest/export/manifest.rs | 45 +++++++++++++++++++ 2 files changed, 49 insertions(+), 34 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs index 795960e0..6b23ccfa 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export.rs @@ -1,14 +1,10 @@ mod claim_boundary; mod identity; +mod manifest; use crate::{ - ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest, - RealWorldJob, Result, - quantitative::{ - self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, - audit_manifest::{QuantitativeAuditContext, artifacts, validation}, - metrics, - }, + ExportQuantitativeAuditManifestArgs, QuantitativeAuditManifest, RealWorldJob, Result, + quantitative::audit_manifest::{QuantitativeAuditContext, validation}, }; pub(crate) fn quantitative_audit_manifest_from_jobs( @@ -20,33 +16,7 @@ pub(crate) fn quantitative_audit_manifest_from_jobs( identity::validate_audit_export_identity(product, adapter_id)?; - let corpus_id = quantitative::quantitative_corpus_id(jobs); - let ranking_query_count = 
metrics::ranking_query_count(jobs); - let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs); - let manifest = QuantitativeAuditManifest { - schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), - manifest_id: args - .manifest_id - .clone() - .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), - run_id: args.run_id.clone(), - corpus_id, - product: product.to_string(), - adapter_id: adapter_id.to_string(), - held_out: args.held_out, - leakage_audited: args.leakage_audited, - sample_size: jobs.len(), - ranking_query_count, - explicit_qrel_query_count, - query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), - controls: args.controls.clone(), - artifacts: vec![QuantitativeAuditArtifact { - role: "product_runtime_fixtures".to_string(), - path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), - sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, - }], - claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args), - }; + let manifest = manifest::quantitative_audit_manifest(jobs, args, product, adapter_id)?; validation::validate_quantitative_audit_manifest( &manifest, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs new file mode 100644 index 00000000..dad5a99e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/export/manifest.rs @@ -0,0 +1,45 @@ +use crate::{ + ExportQuantitativeAuditManifestArgs, QuantitativeAuditArtifact, QuantitativeAuditManifest, + RealWorldJob, Result, + quantitative::{ + self, QUANTITATIVE_AUDIT_MANIFEST_SCHEMA, + audit_manifest::{artifacts, export::claim_boundary}, + metrics, + }, +}; + +pub(super) fn quantitative_audit_manifest( + jobs: &[RealWorldJob], + args: &ExportQuantitativeAuditManifestArgs, + product: &str, + adapter_id: &str, 
+) -> Result { + let corpus_id = quantitative::quantitative_corpus_id(jobs); + let ranking_query_count = metrics::ranking_query_count(jobs); + let explicit_qrel_query_count = metrics::explicit_qrel_query_count(jobs); + + Ok(QuantitativeAuditManifest { + schema: QUANTITATIVE_AUDIT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-audit-manifest", args.run_id)), + run_id: args.run_id.clone(), + corpus_id, + product: product.to_string(), + adapter_id: adapter_id.to_string(), + held_out: args.held_out, + leakage_audited: args.leakage_audited, + sample_size: jobs.len(), + ranking_query_count, + explicit_qrel_query_count, + query_ids: metrics::ranking_query_ids(jobs).into_iter().map(str::to_string).collect(), + controls: args.controls.clone(), + artifacts: vec![QuantitativeAuditArtifact { + role: "product_runtime_fixtures".to_string(), + path: artifacts::audit_artifact_display_path(args.fixtures.as_path()), + sha256: artifacts::fixture_path_digest(args.fixtures.as_path())?, + }], + claim_boundary: claim_boundary::quantitative_audit_claim_boundary(args), + }) +} From b54faa73ec205b795be3e7c106207f3c604856ef Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:28:23 -0400 Subject: [PATCH 42/58] {"schema":"decodex/commit/1","summary":"Split quantitative benchmark row input","authority":"manual"} --- .../quantitative/report/row/benchmark_row.rs | 24 +++++-------------- .../report/row/benchmark_row/input.rs | 17 +++++++++++++ 2 files changed, 23 insertions(+), 18 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs index 53198ae6..4b8b2e31 100644 --- 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row.rs @@ -1,24 +1,12 @@ +mod input; + +pub(super) use self::input::QuantitativeBenchmarkRowInput; + use crate::{ - QuantitativeBenchmarkRow, QuantitativePerQueryRow, - quantitative::{ - self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, audit_manifest::QuantitativeAuditEvidence, metrics, - report::QuantitativeReportInput, - }, + QuantitativeBenchmarkRow, + quantitative::{self, QUANTITATIVE_ROW_CLAIM_BOUNDARY, metrics}, }; -pub(super) struct QuantitativeBenchmarkRowInput<'a, 'b> { - pub(super) input: &'a QuantitativeReportInput<'b>, - pub(super) corpus_id: &'a str, - pub(super) evidence_class: &'a str, - pub(super) per_query_rows: &'a [QuantitativePerQueryRow], - pub(super) ranking_query_count: usize, - pub(super) explicit_qrel_query_count: usize, - pub(super) metric_comparable: bool, - pub(super) result_state: &'a str, - pub(super) audit_evidence: QuantitativeAuditEvidence, - pub(super) leaderboard_eligible: bool, -} - pub(super) fn quantitative_benchmark_row( row_input: QuantitativeBenchmarkRowInput<'_, '_>, ) -> QuantitativeBenchmarkRow { diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs new file mode 100644 index 00000000..a8e3f96a --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/benchmark_row/input.rs @@ -0,0 +1,17 @@ +use crate::{ + QuantitativePerQueryRow, + quantitative::{audit_manifest::QuantitativeAuditEvidence, report::QuantitativeReportInput}, +}; + +pub(in crate::quantitative::report::row) struct QuantitativeBenchmarkRowInput<'a, 'b> { + pub(in crate::quantitative::report::row) input: &'a QuantitativeReportInput<'b>, + pub(in crate::quantitative::report::row) corpus_id: &'a str, + pub(in 
crate::quantitative::report::row) evidence_class: &'a str, + pub(in crate::quantitative::report::row) per_query_rows: &'a [QuantitativePerQueryRow], + pub(in crate::quantitative::report::row) ranking_query_count: usize, + pub(in crate::quantitative::report::row) explicit_qrel_query_count: usize, + pub(in crate::quantitative::report::row) metric_comparable: bool, + pub(in crate::quantitative::report::row) result_state: &'a str, + pub(in crate::quantitative::report::row) audit_evidence: QuantitativeAuditEvidence, + pub(in crate::quantitative::report::row) leaderboard_eligible: bool, +} From 49325f60648101e030f8becee65a410e1a9afa8f Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:33:24 -0400 Subject: [PATCH 43/58] {"schema":"decodex/commit/1","summary":"Split quantitative product export manifest","authority":"manual"} --- .../quantitative/product_manifest/export.rs | 37 ++------------- .../product_manifest/export/manifest.rs | 46 +++++++++++++++++++ .../product_manifest/export/source.rs | 37 +++++++++++++++ 3 files changed, 87 insertions(+), 33 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs index ac105d5a..d72509f8 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export.rs @@ -1,10 +1,11 @@ mod identity; +mod manifest; mod rows; +mod source; use crate::{ ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, REPORT_SCHEMA, - RealWorldReport, Result, eyre, - quantitative::{QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, 
product_manifest::validation}, + RealWorldReport, Result, eyre, quantitative::product_manifest::validation, }; pub(crate) fn quantitative_product_manifest_from_report( @@ -19,37 +20,7 @@ pub(crate) fn quantitative_product_manifest_from_report( )); } - let source_row = - report.quantitative_scoreboard.rows.first().ok_or_else(|| { - eyre::eyre!("{} has no quantitative product row.", args.report.display()) - })?; - let source_product = source_row.product.as_str(); - let source_adapter_id = source_row.adapter_id.as_str(); - let product = args.product.as_deref().unwrap_or(source_product).trim(); - let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); - let adapter_name = - args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); - - identity::validate_export_identity(args, product, adapter_id, adapter_name)?; - - let row = rows::exported_product_row(source_row, product, adapter_id, adapter_name); - let per_query_rows = rows::exported_per_query_rows( - report, - source_product, - source_adapter_id, - product, - adapter_id, - ); - let manifest = QuantitativeProductManifest { - schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), - manifest_id: args - .manifest_id - .clone() - .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), - corpus_id: report.quantitative_scoreboard.corpus_id.clone(), - rows: vec![row], - per_query_rows, - }; + let manifest = manifest::quantitative_product_manifest(report, args)?; validation::validate_quantitative_product_manifest( &manifest, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs new file mode 100644 index 00000000..592cb19f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/manifest.rs @@ -0,0 +1,46 @@ +use crate::{ + 
ExportQuantitativeProductManifestArgs, QuantitativeProductManifest, RealWorldReport, Result, + quantitative::{ + QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA, + product_manifest::export::{identity, rows, source}, + }, +}; + +pub(super) fn quantitative_product_manifest( + report: &RealWorldReport, + args: &ExportQuantitativeProductManifestArgs, +) -> Result { + let source = source::product_export_identity(report, args)?; + + identity::validate_export_identity( + args, + source.product, + source.adapter_id, + source.adapter_name, + )?; + + let row = rows::exported_product_row( + source.row, + source.product, + source.adapter_id, + source.adapter_name, + ); + let per_query_rows = rows::exported_per_query_rows( + report, + source.source_product, + source.source_adapter_id, + source.product, + source.adapter_id, + ); + + Ok(QuantitativeProductManifest { + schema: QUANTITATIVE_PRODUCT_MANIFEST_SCHEMA.to_string(), + manifest_id: args + .manifest_id + .clone() + .unwrap_or_else(|| format!("{}-quantitative-product-manifest", report.run_id)), + corpus_id: report.quantitative_scoreboard.corpus_id.clone(), + rows: vec![row], + per_query_rows, + }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs new file mode 100644 index 00000000..6a3b7ed9 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/source.rs @@ -0,0 +1,37 @@ +use crate::{ + ExportQuantitativeProductManifestArgs, QuantitativeBenchmarkRow, RealWorldReport, Result, eyre, +}; + +pub(super) struct ProductExportIdentity<'report> { + pub(super) row: &'report QuantitativeBenchmarkRow, + pub(super) source_product: &'report str, + pub(super) source_adapter_id: &'report str, + pub(super) product: &'report str, + pub(super) adapter_id: &'report str, + pub(super) adapter_name: &'report str, +} + +pub(super) fn 
product_export_identity<'report>( + report: &'report RealWorldReport, + args: &'report ExportQuantitativeProductManifestArgs, +) -> Result> { + let source_row = + report.quantitative_scoreboard.rows.first().ok_or_else(|| { + eyre::eyre!("{} has no quantitative product row.", args.report.display()) + })?; + let source_product = source_row.product.as_str(); + let source_adapter_id = source_row.adapter_id.as_str(); + let product = args.product.as_deref().unwrap_or(source_product).trim(); + let adapter_id = args.adapter_id.as_deref().unwrap_or(source_adapter_id).trim(); + let adapter_name = + args.adapter_name.as_deref().unwrap_or(source_row.adapter_name.as_str()).trim(); + + Ok(ProductExportIdentity { + row: source_row, + source_product, + source_adapter_id, + product, + adapter_id, + adapter_name, + }) +} From 449927f2fa427d3a1764ce49e6b8278fb695358f Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:38:41 -0400 Subject: [PATCH 44/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit artifact validation","authority":"manual"} --- .../quantitative/audit_manifest/artifacts.rs | 62 ++----------------- .../audit_manifest/artifacts/validation.rs | 20 ++++++ .../artifacts/validation/digest.rs | 33 ++++++++++ .../artifacts/validation/fields.rs | 26 ++++++++ 4 files changed, 84 insertions(+), 57 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs index 25a0bbb0..855af455 100644 --- 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts.rs @@ -1,60 +1,8 @@ mod digest; mod paths; +mod validation; -pub(super) use self::{digest::fixture_path_digest, paths::audit_artifact_display_path}; - -use crate::{Path, QuantitativeAuditManifest, Result, eyre}; - -pub(super) fn validate_quantitative_audit_artifacts( - manifest: &QuantitativeAuditManifest, - path: &Path, -) -> Result<()> { - if manifest.artifacts.is_empty() { - return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); - } - - for artifact in &manifest.artifacts { - if artifact.role.trim().is_empty() - || artifact.path.trim().is_empty() - || artifact.sha256.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative audit artifact.", - path.display() - )); - } - if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) - { - return Err(eyre::eyre!( - "{} artifact {} has invalid sha256 digest {}.", - path.display(), - artifact.role, - artifact.sha256 - )); - } - - let artifact_path = - paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); - let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| { - eyre::eyre!( - "{} artifact {} could not be digested at {}: {err}", - path.display(), - artifact.role, - artifact_path.display() - ) - })?; - - if actual != artifact.sha256 { - return Err(eyre::eyre!( - "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", - path.display(), - artifact.role, - artifact_path.display(), - artifact.sha256, - actual - )); - } - } - - Ok(()) -} +pub(super) use self::{ + digest::fixture_path_digest, paths::audit_artifact_display_path, + validation::validate_quantitative_audit_artifacts, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs new file mode 100644 index 00000000..21c5e7bb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation.rs @@ -0,0 +1,20 @@ +mod digest; +mod fields; + +use crate::{Path, QuantitativeAuditManifest, Result, eyre}; + +pub(in crate::quantitative::audit_manifest) fn validate_quantitative_audit_artifacts( + manifest: &QuantitativeAuditManifest, + path: &Path, +) -> Result<()> { + if manifest.artifacts.is_empty() { + return Err(eyre::eyre!("{} has no quantitative audit artifacts.", path.display())); + } + + for artifact in &manifest.artifacts { + fields::validate_audit_artifact_fields(path, artifact)?; + digest::validate_audit_artifact_digest(path, artifact)?; + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs new file mode 100644 index 00000000..e6af0f61 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/digest.rs @@ -0,0 +1,33 @@ +use crate::{ + Path, QuantitativeAuditArtifact, Result, eyre, + quantitative::audit_manifest::artifacts::{digest, paths}, +}; + +pub(super) fn validate_audit_artifact_digest( + path: &Path, + artifact: &QuantitativeAuditArtifact, +) -> Result<()> { + let artifact_path = + paths::resolve_quantitative_audit_artifact_path(path, artifact.path.as_str()); + let actual = digest::fixture_path_digest(artifact_path.as_path()).map_err(|err| { + eyre::eyre!( + "{} artifact {} could not be digested at {}: {err}", + path.display(), + artifact.role, + artifact_path.display() + ) + })?; + + if actual != artifact.sha256 { + return Err(eyre::eyre!( + "{} artifact {} sha256 mismatch for {}: manifest {}, actual {}.", + path.display(), + artifact.role, 
+ artifact_path.display(), + artifact.sha256, + actual + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs new file mode 100644 index 00000000..af6c149c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/artifacts/validation/fields.rs @@ -0,0 +1,26 @@ +use crate::{Path, QuantitativeAuditArtifact, Result, eyre}; + +pub(super) fn validate_audit_artifact_fields( + path: &Path, + artifact: &QuantitativeAuditArtifact, +) -> Result<()> { + if artifact.role.trim().is_empty() + || artifact.path.trim().is_empty() + || artifact.sha256.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative audit artifact.", + path.display() + )); + } + if artifact.sha256.len() != 64 || !artifact.sha256.chars().all(|ch| ch.is_ascii_hexdigit()) { + return Err(eyre::eyre!( + "{} artifact {} has invalid sha256 digest {}.", + path.display(), + artifact.role, + artifact.sha256 + )); + } + + Ok(()) +} From 779dbed7360c27fa4ecc101896e38c284a24126a Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:41:42 -0400 Subject: [PATCH 45/58] {"schema":"decodex/commit/1","summary":"Split quantitative ranking metrics","authority":"manual"} --- .../per_query/query_metrics/ranking.rs | 43 +++---------------- .../per_query/query_metrics/ranking/ndcg.rs | 33 ++++++++++++++ .../query_metrics/ranking/reciprocal_rank.rs | 19 ++++++++ 3 files changed, 57 insertions(+), 38 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs index 5abea808..e9d7dbf7 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking.rs @@ -1,23 +1,14 @@ mod average_precision; +mod ndcg; +mod reciprocal_rank; -use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics}; +use crate::BTreeMap; pub(super) fn reciprocal_rank( candidates: &[String], relevance: &BTreeMap, ) -> Option { - if query_metrics::positive_qrel_count(relevance) == 0 { - return None; - } - - Some( - candidates - .iter() - .position(|candidate| { - relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) - }) - .map_or(0.0, |index| 1.0 / (index + 1) as f64), - ) + reciprocal_rank::reciprocal_rank(candidates, relevance) } pub(super) fn ndcg_at_k( @@ -25,31 +16,7 @@ pub(super) fn ndcg_at_k( relevance: &BTreeMap, k: usize, ) -> Option { - if query_metrics::positive_qrel_count(relevance) == 0 { - return None; - } - - let dcg = candidates - .iter() - .take(k) - .enumerate() - .map(|(index, candidate)| { - relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) - / ((index + 2) as f64).log2() - }) - .sum::(); - let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); - - ideal.sort_by(|left, right| right.total_cmp(left)); - - let idcg = ideal - .iter() - .take(k) - .enumerate() - .map(|(index, grade)| grade / ((index + 2) as f64).log2()) - .sum::(); - - Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) + ndcg::ndcg_at_k(candidates, relevance, k) } pub(super) fn average_precision( diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs new file mode 100644 index 00000000..540d2f66 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/ndcg.rs @@ -0,0 +1,33 @@ +use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn ndcg_at_k( + candidates: &[String], + relevance: &BTreeMap, + k: usize, +) -> Option { + if query_metrics::positive_qrel_count(relevance) == 0 { + return None; + } + + let dcg = candidates + .iter() + .take(k) + .enumerate() + .map(|(index, candidate)| { + relevance.get(candidate.as_str()).copied().unwrap_or(0.0).max(0.0) + / ((index + 2) as f64).log2() + }) + .sum::(); + let mut ideal = relevance.values().copied().filter(|grade| *grade > 0.0).collect::>(); + + ideal.sort_by(|left, right| right.total_cmp(left)); + + let idcg = ideal + .iter() + .take(k) + .enumerate() + .map(|(index, grade)| grade / ((index + 2) as f64).log2()) + .sum::(); + + Some(if idcg > 0.0 { dcg / idcg } else { 0.0 }) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs new file mode 100644 index 00000000..99956367 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/query_metrics/ranking/reciprocal_rank.rs @@ -0,0 +1,19 @@ +use crate::{BTreeMap, quantitative::metrics::per_query::query_metrics}; + +pub(super) fn reciprocal_rank( + candidates: &[String], + relevance: &BTreeMap, +) -> Option { + if query_metrics::positive_qrel_count(relevance) == 0 { + return None; + } + + Some( + candidates + .iter() + .position(|candidate| { + relevance.get(candidate.as_str()).is_some_and(|grade| *grade > 0.0) + }) + .map_or(0.0, |index| 1.0 / (index + 1) as f64), + ) +} From 
119287f70cfa6fd42a7e71ae0f1707ef557bdb68 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:46:04 -0400 Subject: [PATCH 46/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query row assembly","authority":"manual"} --- .../quantitative/metrics/per_query.rs | 48 ++----------------- .../quantitative/metrics/per_query/row.rs | 48 +++++++++++++++++++ 2 files changed, 51 insertions(+), 45 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs index cb184dc9..2f8de046 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs @@ -1,10 +1,8 @@ mod evidence; mod query_metrics; +mod row; -use crate::{ - JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, - quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, scoring, -}; +use crate::{JobReport, QuantitativePerQueryRow, RealWorldJob}; pub(super) fn quantitative_per_query_rows( source_jobs: &[RealWorldJob], @@ -17,47 +15,7 @@ pub(super) fn quantitative_per_query_rows( .iter() .zip(jobs.iter()) .map(|(source_job, job)| { - quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) + row::quantitative_per_query_row(source_job, job, corpus_id, evidence_class, adapter_id) }) .collect() } - -fn quantitative_per_query_row( - source_job: &RealWorldJob, - job: &JobReport, - corpus_id: &str, - evidence_class: &str, - adapter_id: &str, -) -> QuantitativePerQueryRow { - let relevance = evidence::relevance_grades(source_job, job); - let candidates = scoring::produced_evidence_order(source_job); - let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); - let metrics = 
query_metrics::per_query_metrics(candidates.as_slice(), &relevance); - let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { - "not_encoded" - } else { - formatting::status_str(job.status) - }; - let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); - - QuantitativePerQueryRow { - job_id: job.job_id.clone(), - suite: job.suite_id.clone(), - evidence_class: evidence_class.to_string(), - source_manifest_corpus_id: Some(corpus_id.to_string()), - result_state: formatting::status_str(job.status).to_string(), - expected_relevant_count: positive_relevance_count, - candidate_count: candidates.len(), - qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(), - relevance_grade_sum: formatting::round3(relevance.values().sum::()), - product: "ELF".to_string(), - adapter_id: adapter_id.to_string(), - metrics, - metric_states, - denominators: query_metrics::per_query_denominators( - candidates.len(), - positive_relevance_count, - ), - claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), - } -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs new file mode 100644 index 00000000..2a892850 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs @@ -0,0 +1,48 @@ +use crate::{ + JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, + quantitative::{ + QUANTITATIVE_ROW_CLAIM_BOUNDARY, + metrics::per_query::{evidence, query_metrics}, + }, + scoring, +}; + +pub(super) fn quantitative_per_query_row( + source_job: &RealWorldJob, + job: &JobReport, + corpus_id: &str, + evidence_class: &str, + adapter_id: &str, +) -> QuantitativePerQueryRow { + let relevance = evidence::relevance_grades(source_job, job); + let candidates = scoring::produced_evidence_order(source_job); + let positive_relevance_count = 
query_metrics::positive_qrel_count(&relevance); + let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); + let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { + "not_encoded" + } else { + formatting::status_str(job.status) + }; + let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); + + QuantitativePerQueryRow { + job_id: job.job_id.clone(), + suite: job.suite_id.clone(), + evidence_class: evidence_class.to_string(), + source_manifest_corpus_id: Some(corpus_id.to_string()), + result_state: formatting::status_str(job.status).to_string(), + expected_relevant_count: positive_relevance_count, + candidate_count: candidates.len(), + qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(), + relevance_grade_sum: formatting::round3(relevance.values().sum::()), + product: "ELF".to_string(), + adapter_id: adapter_id.to_string(), + metrics, + metric_states, + denominators: query_metrics::per_query_denominators( + candidates.len(), + positive_relevance_count, + ), + claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), + } +} From a2282cac363d29b7a1cbb373d161389bcdb57693 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:48:48 -0400 Subject: [PATCH 47/58] {"schema":"decodex/commit/1","summary":"Split quantitative aggregate metric assembly","authority":"manual"} --- .../quantitative/metrics/aggregate.rs | 43 +++---------------- .../quantitative/metrics/aggregate/metrics.rs | 27 ++++++++++++ .../quantitative/metrics/aggregate/states.rs | 20 +++++++++ 3 files changed, 52 insertions(+), 38 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs index b61ee782..9e899d64 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs @@ -1,53 +1,20 @@ mod confidence; mod denominators; +mod metrics; mod names; +mod states; -use crate::{ - BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow, formatting, - quantitative::QUANTITATIVE_K_VALUES, -}; +use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow}; pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { - let mut sums = BTreeMap::::new(); - let mut metrics = names::quantitative_metric_names() - .into_iter() - .map(|metric| (metric, None)) - .collect::>(); - - for row in rows { - for (metric, value) in &row.metrics { - if let Some(value) = value { - let (sum, count) = sums.entry(metric.clone()).or_default(); - - *sum += *value; - *count += 1; - } - } - } - for (metric, (sum, count)) in sums { - metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); - } - - metrics + metrics::aggregate_metrics(rows) } pub(super) fn aggregate_metric_states( result_state: &str, metric_comparable: bool, ) -> BTreeMap { - let state = if metric_comparable { result_state } else { "not_encoded" }; - let mut states = BTreeMap::new(); - - for k in QUANTITATIVE_K_VALUES { - states.insert(format!("recall_at_{k}"), state.to_string()); - states.insert(format!("precision_at_{k}"), state.to_string()); - states.insert(format!("success_at_{k}"), state.to_string()); - } - for metric in ["mrr", "ndcg_at_5", "average_precision"] { - states.insert(metric.to_string(), state.to_string()); - } - - states + states::aggregate_metric_states(result_state, metric_comparable) } pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { diff --git 
a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs new file mode 100644 index 00000000..db17c0c1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/metrics.rs @@ -0,0 +1,27 @@ +use crate::{ + BTreeMap, QuantitativePerQueryRow, formatting, quantitative::metrics::aggregate::names, +}; + +pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { + let mut sums = BTreeMap::::new(); + let mut metrics = names::quantitative_metric_names() + .into_iter() + .map(|metric| (metric, None)) + .collect::>(); + + for row in rows { + for (metric, value) in &row.metrics { + if let Some(value) = value { + let (sum, count) = sums.entry(metric.clone()).or_default(); + + *sum += *value; + *count += 1; + } + } + } + for (metric, (sum, count)) in sums { + metrics.insert(metric, (count > 0).then(|| formatting::round3(sum / count as f64))); + } + + metrics +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs new file mode 100644 index 00000000..c9f631bb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate/states.rs @@ -0,0 +1,20 @@ +use crate::{BTreeMap, quantitative::QUANTITATIVE_K_VALUES}; + +pub(super) fn aggregate_metric_states( + result_state: &str, + metric_comparable: bool, +) -> BTreeMap { + let state = if metric_comparable { result_state } else { "not_encoded" }; + let mut states = BTreeMap::new(); + + for k in QUANTITATIVE_K_VALUES { + states.insert(format!("recall_at_{k}"), state.to_string()); + states.insert(format!("precision_at_{k}"), state.to_string()); + states.insert(format!("success_at_{k}"), state.to_string()); + } + for metric in ["mrr", "ndcg_at_5", "average_precision"] { + 
states.insert(metric.to_string(), state.to_string()); + } + + states +} From 4e6170e3c387c0ece0d7c57397bca832ab3ef7ea Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:51:19 -0400 Subject: [PATCH 48/58] {"schema":"decodex/commit/1","summary":"Split quantitative report input type","authority":"manual"} --- .../quantitative/report.rs | 17 ++++------------- .../quantitative/report/input.rs | 12 ++++++++++++ 2 files changed, 16 insertions(+), 13 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs index 331acc70..3922622a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs @@ -1,23 +1,14 @@ mod controls; +mod input; mod row; +pub(crate) use self::input::QuantitativeReportInput; + use crate::{ - AdapterReport, JobReport, Path, QuantitativeBenchmarkReport, RealWorldJob, ReportSummary, - Result, + QuantitativeBenchmarkReport, Result, quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA, product_manifest}, }; -pub(crate) struct QuantitativeReportInput<'a> { - pub(crate) run_id: &'a str, - pub(crate) generated_at: &'a str, - pub(crate) adapter: &'a AdapterReport, - pub(crate) source_jobs: &'a [RealWorldJob], - pub(crate) jobs: &'a [JobReport], - pub(crate) summary: &'a ReportSummary, - pub(crate) product_manifest_path: Option<&'a Path>, - pub(crate) audit_manifest_path: Option<&'a Path>, -} - pub(crate) fn quantitative_scoreboard_report( input: QuantitativeReportInput<'_>, ) -> Result { diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs new file mode 100644 index 00000000..c4412050 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/input.rs @@ -0,0 +1,12 @@ +use crate::{AdapterReport, JobReport, Path, RealWorldJob, ReportSummary}; + +pub(crate) struct QuantitativeReportInput<'a> { + pub(crate) run_id: &'a str, + pub(crate) generated_at: &'a str, + pub(crate) adapter: &'a AdapterReport, + pub(crate) source_jobs: &'a [RealWorldJob], + pub(crate) jobs: &'a [JobReport], + pub(crate) summary: &'a ReportSummary, + pub(crate) product_manifest_path: Option<&'a Path>, + pub(crate) audit_manifest_path: Option<&'a Path>, +} From c7aebf9f6e4438e0b9046b33537fc2470f464f7e Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:55:10 -0400 Subject: [PATCH 49/58] {"schema":"decodex/commit/1","summary":"Split quantitative audit context validation","authority":"manual"} --- .../validation/identity/context.rs | 58 ++----------------- .../validation/identity/context/counts.rs | 37 ++++++++++++ .../validation/identity/context/fields.rs | 39 +++++++++++++ 3 files changed, 82 insertions(+), 52 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs index d11c8636..1d6be494 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context.rs @@ -1,6 +1,8 @@ +mod counts; +mod fields; + use crate::{ - Path, QuantitativeAuditManifest, Result, eyre, - quantitative::audit_manifest::QuantitativeAuditContext, + Path, 
QuantitativeAuditManifest, Result, quantitative::audit_manifest::QuantitativeAuditContext, }; pub(super) fn validate_quantitative_audit_context( @@ -8,56 +10,8 @@ pub(super) fn validate_quantitative_audit_context( path: &Path, context: &QuantitativeAuditContext<'_>, ) -> Result<()> { - if manifest.run_id != context.run_id { - return Err(eyre::eyre!( - "{} has run_id {}, expected {}.", - path.display(), - manifest.run_id, - context.run_id - )); - } - if manifest.corpus_id != context.corpus_id { - return Err(eyre::eyre!( - "{} has corpus_id {}, expected {}.", - path.display(), - manifest.corpus_id, - context.corpus_id - )); - } - if manifest.product != context.product || manifest.adapter_id != context.adapter_id { - return Err(eyre::eyre!( - "{} has product {}:{} but current row is {}:{}.", - path.display(), - manifest.product, - manifest.adapter_id, - context.product, - context.adapter_id - )); - } - if manifest.sample_size != context.source_jobs.len() { - return Err(eyre::eyre!( - "{} has sample_size {}, expected {}.", - path.display(), - manifest.sample_size, - context.source_jobs.len() - )); - } - if manifest.ranking_query_count != context.ranking_query_count { - return Err(eyre::eyre!( - "{} has ranking_query_count {}, expected {}.", - path.display(), - manifest.ranking_query_count, - context.ranking_query_count - )); - } - if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { - return Err(eyre::eyre!( - "{} has explicit_qrel_query_count {}, expected {}.", - path.display(), - manifest.explicit_qrel_query_count, - context.explicit_qrel_query_count - )); - } + fields::validate_quantitative_audit_context_fields(manifest, path, context)?; + counts::validate_quantitative_audit_context_counts(manifest, path, context)?; Ok(()) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs new file mode 100644 index 00000000..a9e61f1f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/counts.rs @@ -0,0 +1,37 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, + quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn validate_quantitative_audit_context_counts( + manifest: &QuantitativeAuditManifest, + path: &Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.sample_size != context.source_jobs.len() { + return Err(eyre::eyre!( + "{} has sample_size {}, expected {}.", + path.display(), + manifest.sample_size, + context.source_jobs.len() + )); + } + if manifest.ranking_query_count != context.ranking_query_count { + return Err(eyre::eyre!( + "{} has ranking_query_count {}, expected {}.", + path.display(), + manifest.ranking_query_count, + context.ranking_query_count + )); + } + if manifest.explicit_qrel_query_count != context.explicit_qrel_query_count { + return Err(eyre::eyre!( + "{} has explicit_qrel_query_count {}, expected {}.", + path.display(), + manifest.explicit_qrel_query_count, + context.explicit_qrel_query_count + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs new file mode 100644 index 00000000..1b39ccad --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/audit_manifest/validation/identity/context/fields.rs @@ -0,0 +1,39 @@ +use crate::{ + Path, QuantitativeAuditManifest, Result, eyre, + quantitative::audit_manifest::QuantitativeAuditContext, +}; + +pub(super) fn validate_quantitative_audit_context_fields( + manifest: &QuantitativeAuditManifest, + path: 
&Path, + context: &QuantitativeAuditContext<'_>, +) -> Result<()> { + if manifest.run_id != context.run_id { + return Err(eyre::eyre!( + "{} has run_id {}, expected {}.", + path.display(), + manifest.run_id, + context.run_id + )); + } + if manifest.corpus_id != context.corpus_id { + return Err(eyre::eyre!( + "{} has corpus_id {}, expected {}.", + path.display(), + manifest.corpus_id, + context.corpus_id + )); + } + if manifest.product != context.product || manifest.adapter_id != context.adapter_id { + return Err(eyre::eyre!( + "{} has product {}:{} but current row is {}:{}.", + path.display(), + manifest.product, + manifest.adapter_id, + context.product, + context.adapter_id + )); + } + + Ok(()) +} From a7a45db230703a2b2f8a4fe0fc1b98a86ec1e15a Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 15:58:24 -0400 Subject: [PATCH 50/58] {"schema":"decodex/commit/1","summary":"Split quantitative row basis assembly","authority":"manual"} --- .../quantitative/report/row.rs | 53 +++++++------------ .../quantitative/report/row/basis.rs | 41 ++++++++++++++ 2 files changed, 61 insertions(+), 33 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs index 8599700a..ee420902 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row.rs @@ -1,12 +1,12 @@ mod audit_gates; +mod basis; mod benchmark_row; mod query_counts; use crate::{ QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, - quantitative::{ - self, metrics, - report::{QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput}, + quantitative::report::{ + QuantitativeReportInput, row::benchmark_row::QuantitativeBenchmarkRowInput, }, }; @@ -21,46 +21,33 @@ 
pub(super) struct CurrentQuantitativeRow { pub(super) fn current_quantitative_row( input: &QuantitativeReportInput<'_>, ) -> Result { - let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs); - let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs); - let per_query_rows = metrics::quantitative_per_query_rows( - input.source_jobs, - input.jobs, - corpus_id.as_str(), - evidence_class, - input.adapter.adapter_id.as_str(), - ); - let query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice()); - let ranking_query_count = query_counts.ranking_query_count; - let explicit_qrel_query_count = query_counts.explicit_qrel_query_count; - let metric_comparable = ranking_query_count > 0; - let result_state = quantitative::quantitative_result_state(input.summary); + let basis = basis::quantitative_row_basis(input); let audit_gates = audit_gates::quantitative_audit_gates( input, - corpus_id.as_str(), - evidence_class, - ranking_query_count, - explicit_qrel_query_count, - metric_comparable, + basis.corpus_id.as_str(), + basis.evidence_class, + basis.ranking_query_count, + basis.explicit_qrel_query_count, + basis.metric_comparable, )?; let row = benchmark_row::quantitative_benchmark_row(QuantitativeBenchmarkRowInput { input, - corpus_id: corpus_id.as_str(), - evidence_class, - per_query_rows: per_query_rows.as_slice(), - ranking_query_count, - explicit_qrel_query_count, - metric_comparable, - result_state, + corpus_id: basis.corpus_id.as_str(), + evidence_class: basis.evidence_class, + per_query_rows: basis.per_query_rows.as_slice(), + ranking_query_count: basis.ranking_query_count, + explicit_qrel_query_count: basis.explicit_qrel_query_count, + metric_comparable: basis.metric_comparable, + result_state: basis.result_state, audit_evidence: audit_gates.audit_evidence, leaderboard_eligible: audit_gates.leaderboard_eligible, }); Ok(CurrentQuantitativeRow { - corpus_id, + corpus_id: basis.corpus_id, row, - 
per_query_rows, - ranking_query_count, - explicit_qrel_query_count, + per_query_rows: basis.per_query_rows, + ranking_query_count: basis.ranking_query_count, + explicit_qrel_query_count: basis.explicit_qrel_query_count, }) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs new file mode 100644 index 00000000..0f1a7e47 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/row/basis.rs @@ -0,0 +1,41 @@ +use crate::{ + QuantitativePerQueryRow, + quantitative::{ + self, metrics, + report::{QuantitativeReportInput, row::query_counts}, + }, +}; + +pub(super) struct QuantitativeRowBasis { + pub(super) corpus_id: String, + pub(super) evidence_class: &'static str, + pub(super) per_query_rows: Vec, + pub(super) ranking_query_count: usize, + pub(super) explicit_qrel_query_count: usize, + pub(super) metric_comparable: bool, + pub(super) result_state: &'static str, +} + +pub(super) fn quantitative_row_basis(input: &QuantitativeReportInput<'_>) -> QuantitativeRowBasis { + let corpus_id = quantitative::quantitative_corpus_id(input.source_jobs); + let evidence_class = quantitative::quantitative_evidence_class(input.adapter, input.jobs); + let per_query_rows = metrics::quantitative_per_query_rows( + input.source_jobs, + input.jobs, + corpus_id.as_str(), + evidence_class, + input.adapter.adapter_id.as_str(), + ); + let query_counts = query_counts::quantitative_query_counts(per_query_rows.as_slice()); + let ranking_query_count = query_counts.ranking_query_count; + + QuantitativeRowBasis { + corpus_id, + evidence_class, + per_query_rows, + ranking_query_count, + explicit_qrel_query_count: query_counts.explicit_qrel_query_count, + metric_comparable: ranking_query_count > 0, + result_state: quantitative::quantitative_result_state(input.summary), + } +} From e941669c207697ab26a2cb0fc1dfb99dc26285f8 Mon Sep 17 00:00:00 2001 From: 
Yvette Carlisle Date: Wed, 1 Jul 2026 16:04:20 -0400 Subject: [PATCH 51/58] {"schema":"decodex/commit/1","summary":"Split quantitative product export rows","authority":"manual"} --- .../product_manifest/export/rows.rs | 57 +------------------ .../product_manifest/export/rows/per_query.rs | 35 ++++++++++++ .../product_manifest/export/rows/product.rs | 21 +++++++ 3 files changed, 59 insertions(+), 54 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs index 2e1923db..e29f4f74 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows.rs @@ -1,55 +1,4 @@ -use crate::{QuantitativeBenchmarkRow, QuantitativePerQueryRow, RealWorldReport}; +mod per_query; +mod product; -pub(super) fn exported_product_row( - source_row: &QuantitativeBenchmarkRow, - product: &str, - adapter_id: &str, - adapter_name: &str, -) -> QuantitativeBenchmarkRow { - let mut row = source_row.clone(); - - row.product = product.to_string(); - row.adapter_id = adapter_id.to_string(); - row.adapter_name = adapter_name.to_string(); - row.claim_boundary = concat!( - "Exported from a generated real_world_job_report quantitative row; ", - "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." 
- ) - .to_string(); - - row -} - -pub(super) fn exported_per_query_rows( - report: &RealWorldReport, - source_product: &str, - source_adapter_id: &str, - product: &str, - adapter_id: &str, -) -> Vec { - report - .quantitative_scoreboard - .per_query_rows - .iter() - .filter(|row| row.product == source_product && row.adapter_id == source_adapter_id) - .map(|row| exported_per_query_row(row, product, adapter_id)) - .collect() -} - -fn exported_per_query_row( - source_row: &QuantitativePerQueryRow, - product: &str, - adapter_id: &str, -) -> QuantitativePerQueryRow { - let mut row = source_row.clone(); - - row.product = product.to_string(); - row.adapter_id = adapter_id.to_string(); - row.claim_boundary = concat!( - "Exported from generated report per-query quantitative evidence; ", - "import does not relax paired-significance or leaderboard gates." - ) - .to_string(); - - row -} +pub(super) use self::{per_query::exported_per_query_rows, product::exported_product_row}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs new file mode 100644 index 00000000..fcc61d9e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/per_query.rs @@ -0,0 +1,35 @@ +use crate::{QuantitativePerQueryRow, RealWorldReport}; + +pub(in crate::quantitative::product_manifest::export) fn exported_per_query_rows( + report: &RealWorldReport, + source_product: &str, + source_adapter_id: &str, + product: &str, + adapter_id: &str, +) -> Vec { + report + .quantitative_scoreboard + .per_query_rows + .iter() + .filter(|row| row.product == source_product && row.adapter_id == source_adapter_id) + .map(|row| exported_per_query_row(row, product, adapter_id)) + .collect() +} + +fn exported_per_query_row( + source_row: &QuantitativePerQueryRow, + product: &str, + adapter_id: &str, +) -> 
QuantitativePerQueryRow { + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.claim_boundary = concat!( + "Exported from generated report per-query quantitative evidence; ", + "import does not relax paired-significance or leaderboard gates." + ) + .to_string(); + + row +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs new file mode 100644 index 00000000..2551c2ff --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/export/rows/product.rs @@ -0,0 +1,21 @@ +use crate::QuantitativeBenchmarkRow; + +pub(in crate::quantitative::product_manifest::export) fn exported_product_row( + source_row: &QuantitativeBenchmarkRow, + product: &str, + adapter_id: &str, + adapter_name: &str, +) -> QuantitativeBenchmarkRow { + let mut row = source_row.clone(); + + row.product = product.to_string(); + row.adapter_id = adapter_id.to_string(); + row.adapter_name = adapter_name.to_string(); + row.claim_boundary = concat!( + "Exported from a generated real_world_job_report quantitative row; ", + "import remains subject to same-corpus, per-query, explicit-qrel, and leaderboard gates." 
+ ) + .to_string(); + + row +} From 52764643af7ff6ad4c9bac1db31c12b8cbe85306 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:07:29 -0400 Subject: [PATCH 52/58] {"schema":"decodex/commit/1","summary":"Split quantitative report imports","authority":"manual"} --- .../quantitative/report.rs | 15 +++++------ .../quantitative/report/imported.rs | 27 +++++++++++++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs index 3922622a..08b4b84a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report.rs @@ -1,4 +1,5 @@ mod controls; +mod imported; mod input; mod row; @@ -6,24 +7,22 @@ pub(crate) use self::input::QuantitativeReportInput; use crate::{ QuantitativeBenchmarkReport, Result, - quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA, product_manifest}, + quantitative::{self, QUANTITATIVE_K_VALUES, QUANTITATIVE_SCOREBOARD_SCHEMA}, }; pub(crate) fn quantitative_scoreboard_report( input: QuantitativeReportInput<'_>, ) -> Result { let current_row = row::current_quantitative_row(&input)?; - let product_manifest = product_manifest::quantitative_product_manifest( + let imported_rows = imported::imported_quantitative_rows( input.product_manifest_path, current_row.corpus_id.as_str(), )?; - let imported_row_count = product_manifest.rows.len(); - let imported_per_query_count = product_manifest.per_query_rows.len(); let mut rows = vec![current_row.row]; let mut merged_per_query_rows = current_row.per_query_rows; - rows.extend(product_manifest.rows); - merged_per_query_rows.extend(product_manifest.per_query_rows); + rows.extend(imported_rows.rows); + 
merged_per_query_rows.extend(imported_rows.per_query_rows); let leaderboard_claim_allowed = rows.iter().filter(|row| row.leaderboard_eligible).count() >= 2; let controls = controls::quantitative_benchmark_controls( @@ -41,8 +40,8 @@ pub(crate) fn quantitative_scoreboard_report( rows, per_query_rows: merged_per_query_rows, metrics_not_encoded: quantitative::quantitative_metrics_not_encoded( - imported_row_count, - imported_per_query_count, + imported_rows.row_count, + imported_rows.per_query_count, ), controls, claim_boundary: concat!( diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs new file mode 100644 index 00000000..2b2a2515 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/report/imported.rs @@ -0,0 +1,27 @@ +use crate::{ + Path, QuantitativeBenchmarkRow, QuantitativePerQueryRow, Result, quantitative::product_manifest, +}; + +pub(super) struct ImportedQuantitativeRows { + pub(super) rows: Vec, + pub(super) per_query_rows: Vec, + pub(super) row_count: usize, + pub(super) per_query_count: usize, +} + +pub(super) fn imported_quantitative_rows( + product_manifest_path: Option<&Path>, + corpus_id: &str, +) -> Result { + let product_manifest = + product_manifest::quantitative_product_manifest(product_manifest_path, corpus_id)?; + let row_count = product_manifest.rows.len(); + let per_query_count = product_manifest.per_query_rows.len(); + + Ok(ImportedQuantitativeRows { + rows: product_manifest.rows, + per_query_rows: product_manifest.per_query_rows, + row_count, + per_query_count, + }) +} From 80680ac36cda4a1d22efce08c87e18202ce406e2 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:10:34 -0400 Subject: [PATCH 53/58] {"schema":"decodex/commit/1","summary":"Slim quantitative metrics facade","authority":"manual"} --- .../quantitative/metrics.rs | 75 +++---------------- 
.../quantitative/metrics/aggregate.rs | 12 ++- .../quantitative/metrics/per_query.rs | 2 +- .../quantitative/metrics/ranking.rs | 12 +-- 4 files changed, 25 insertions(+), 76 deletions(-) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs index 779329f6..6ee91f58 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics.rs @@ -2,69 +2,14 @@ mod aggregate; mod per_query; mod ranking; -use crate::{ - BTreeMap, BTreeSet, JobReport, QuantitativeConfidenceInterval, QuantitativePerQueryRow, - RealWorldJob, ReportSummary, +pub(super) use self::{ + aggregate::{ + aggregate_confidence_intervals, aggregate_denominators, aggregate_metric_states, + aggregate_metrics, + }, + per_query::quantitative_per_query_rows, + ranking::{ + aggregate_qrel_source, explicit_qrel_query_count, ranked_candidate_source, + ranking_coverage_state, ranking_query_count, ranking_query_ids, + }, }; - -pub(super) fn quantitative_per_query_rows( - source_jobs: &[RealWorldJob], - jobs: &[JobReport], - corpus_id: &str, - evidence_class: &str, - adapter_id: &str, -) -> Vec { - per_query::quantitative_per_query_rows(source_jobs, jobs, corpus_id, evidence_class, adapter_id) -} - -pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { - aggregate::aggregate_metrics(rows) -} - -pub(super) fn aggregate_metric_states( - result_state: &str, - metric_comparable: bool, -) -> BTreeMap { - aggregate::aggregate_metric_states(result_state, metric_comparable) -} - -pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { - aggregate::aggregate_denominators(rows) -} - -pub(super) fn aggregate_confidence_intervals( - rows: &[QuantitativePerQueryRow], -) -> BTreeMap { - aggregate::aggregate_confidence_intervals(rows) -} - -pub(super) fn ranking_query_ids(source_jobs: 
&[RealWorldJob]) -> BTreeSet<&str> { - ranking::ranking_query_ids(source_jobs) -} - -pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { - ranking::ranking_query_count(source_jobs) -} - -pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { - ranking::explicit_qrel_query_count(source_jobs) -} - -pub(super) fn aggregate_qrel_source( - ranking_query_count: usize, - explicit_qrel_query_count: usize, -) -> &'static str { - ranking::aggregate_qrel_source(ranking_query_count, explicit_qrel_query_count) -} - -pub(super) fn ranking_coverage_state( - summary: &ReportSummary, - source_job_count: usize, - ranking_query_count: usize, -) -> &'static str { - ranking::ranking_coverage_state(summary, source_job_count, ranking_query_count) -} - -pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { - ranking::ranked_candidate_source(ranking_query_count) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs index 9e899d64..992201a6 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/aggregate.rs @@ -6,22 +6,26 @@ mod states; use crate::{BTreeMap, QuantitativeConfidenceInterval, QuantitativePerQueryRow}; -pub(super) fn aggregate_metrics(rows: &[QuantitativePerQueryRow]) -> BTreeMap> { +pub(in crate::quantitative) fn aggregate_metrics( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap> { metrics::aggregate_metrics(rows) } -pub(super) fn aggregate_metric_states( +pub(in crate::quantitative) fn aggregate_metric_states( result_state: &str, metric_comparable: bool, ) -> BTreeMap { states::aggregate_metric_states(result_state, metric_comparable) } -pub(super) fn aggregate_denominators(rows: &[QuantitativePerQueryRow]) -> BTreeMap { +pub(in crate::quantitative) fn 
aggregate_denominators( + rows: &[QuantitativePerQueryRow], +) -> BTreeMap { denominators::aggregate_denominators(rows) } -pub(super) fn aggregate_confidence_intervals( +pub(in crate::quantitative) fn aggregate_confidence_intervals( rows: &[QuantitativePerQueryRow], ) -> BTreeMap { confidence::aggregate_confidence_intervals(rows) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs index 2f8de046..1c1bf433 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query.rs @@ -4,7 +4,7 @@ mod row; use crate::{JobReport, QuantitativePerQueryRow, RealWorldJob}; -pub(super) fn quantitative_per_query_rows( +pub(in crate::quantitative) fn quantitative_per_query_rows( source_jobs: &[RealWorldJob], jobs: &[JobReport], corpus_id: &str, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs index 340a7115..ab045e46 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs @@ -2,7 +2,7 @@ mod queries; use crate::{BTreeSet, RealWorldJob, ReportSummary}; -pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { +pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { source_jobs .iter() .filter(|job| queries::is_ranking_query(job)) @@ -10,15 +10,15 @@ pub(super) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> .collect() } -pub(super) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { +pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { ranking_query_ids(source_jobs).len() 
} -pub(super) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { +pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() } -pub(super) fn aggregate_qrel_source( +pub(in crate::quantitative) fn aggregate_qrel_source( ranking_query_count: usize, explicit_qrel_query_count: usize, ) -> &'static str { @@ -33,7 +33,7 @@ pub(super) fn aggregate_qrel_source( } } -pub(super) fn ranking_coverage_state( +pub(in crate::quantitative) fn ranking_coverage_state( summary: &ReportSummary, source_job_count: usize, ranking_query_count: usize, @@ -47,6 +47,6 @@ pub(super) fn ranking_coverage_state( } } -pub(super) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { +pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } } From cf61ab3107ca5b9be15b74b576f73dfbcc2238aa Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:15:16 -0400 Subject: [PATCH 54/58] {"schema":"decodex/commit/1","summary":"Split quantitative ranking helpers","authority":"manual"} --- .../quantitative/metrics/ranking.rs | 58 +++---------------- .../quantitative/metrics/ranking/counts.rs | 17 ++++++ .../quantitative/metrics/ranking/coverage.rs | 19 ++++++ .../quantitative/metrics/ranking/qrels.rs | 14 +++++ 4 files changed, 58 insertions(+), 50 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs index ab045e46..6805ca30 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking.rs @@ -1,52 +1,10 @@ +mod counts; +mod coverage; +mod qrels; mod queries; -use crate::{BTreeSet, RealWorldJob, ReportSummary}; - -pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { - source_jobs - .iter() - .filter(|job| queries::is_ranking_query(job)) - .map(|job| job.job_id.as_str()) - .collect() -} - -pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { - ranking_query_ids(source_jobs).len() -} - -pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { - source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() -} - -pub(in crate::quantitative) fn aggregate_qrel_source( - ranking_query_count: usize, - explicit_qrel_query_count: usize, -) -> &'static str { - if ranking_query_count == 0 { - "not_encoded" - } else if explicit_qrel_query_count == ranking_query_count { - "explicit_qrels" - } else if explicit_qrel_query_count == 0 { - "expected_evidence_fallback" - } else { - "mixed" - } -} - -pub(in crate::quantitative) fn ranking_coverage_state( - summary: &ReportSummary, - source_job_count: usize, - ranking_query_count: usize, -) -> &'static str { - if ranking_query_count == 0 { - "not_encoded" - } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { - "complete" - } else { - "partial_coverage" - } -} - -pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { - if ranking_query_count == 0 { "not_encoded" } else { "produced_evidence_order" } -} +pub(in crate::quantitative) use self::{ + counts::{explicit_qrel_query_count, ranking_query_count, ranking_query_ids}, 
+ coverage::{ranked_candidate_source, ranking_coverage_state}, + qrels::aggregate_qrel_source, +}; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs new file mode 100644 index 00000000..c8dd4408 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/counts.rs @@ -0,0 +1,17 @@ +use crate::{BTreeSet, RealWorldJob, quantitative::metrics::ranking::queries}; + +pub(in crate::quantitative) fn ranking_query_ids(source_jobs: &[RealWorldJob]) -> BTreeSet<&str> { + source_jobs + .iter() + .filter(|job| queries::is_ranking_query(job)) + .map(|job| job.job_id.as_str()) + .collect() +} + +pub(in crate::quantitative) fn ranking_query_count(source_jobs: &[RealWorldJob]) -> usize { + ranking_query_ids(source_jobs).len() +} + +pub(in crate::quantitative) fn explicit_qrel_query_count(source_jobs: &[RealWorldJob]) -> usize { + source_jobs.iter().filter(|job| !job.expected_answer.relevance_judgments.is_empty()).count() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs new file mode 100644 index 00000000..eb419d40 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/coverage.rs @@ -0,0 +1,19 @@ +use crate::ReportSummary; + +pub(in crate::quantitative) fn ranking_coverage_state( + summary: &ReportSummary, + source_job_count: usize, + ranking_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if ranking_query_count == source_job_count && summary.not_encoded == 0 { + "complete" + } else { + "partial_coverage" + } +} + +pub(in crate::quantitative) fn ranked_candidate_source(ranking_query_count: usize) -> &'static str { + if ranking_query_count == 0 { "not_encoded" } else { 
"produced_evidence_order" } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs new file mode 100644 index 00000000..9b5c3daa --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/ranking/qrels.rs @@ -0,0 +1,14 @@ +pub(in crate::quantitative) fn aggregate_qrel_source( + ranking_query_count: usize, + explicit_qrel_query_count: usize, +) -> &'static str { + if ranking_query_count == 0 { + "not_encoded" + } else if explicit_qrel_query_count == ranking_query_count { + "explicit_qrels" + } else if explicit_qrel_query_count == 0 { + "expected_evidence_fallback" + } else { + "mixed" + } +} From c919308265f79a648740ddf3c80b4d2f5d769442 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:20:02 -0400 Subject: [PATCH 55/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query validation identity","authority":"manual"} --- .../validation/rows/per_query.rs | 35 ++-------- .../validation/rows/per_query/identity.rs | 66 +++++++++++++++++++ 2 files changed, 70 insertions(+), 31 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs index 4e720a68..12dc5508 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query.rs @@ -1,4 +1,6 @@ -use crate::{BTreeSet, Path, QuantitativeProductManifest, Result, eyre}; +mod identity; + +use crate::{BTreeSet, Path, QuantitativeProductManifest, Result}; pub(super) fn 
validate_quantitative_per_query_rows( manifest: &QuantitativeProductManifest, @@ -12,36 +14,7 @@ pub(super) fn validate_quantitative_per_query_rows( .collect::>(); for row in &manifest.per_query_rows { - if row.job_id.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - || row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.qrel_source.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative per-query product row.", - path.display() - )); - } - if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { - return Err(eyre::eyre!( - "{} per-query row {}:{} has no matching product row.", - path.display(), - row.product, - row.adapter_id - )); - } - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} per-query row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } + identity::validate_per_query_row_identity(path, row, &row_keys, corpus_id)?; } Ok(()) diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs new file mode 100644 index 00000000..509f471e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs @@ -0,0 +1,66 @@ +use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_per_query_row_identity( + path: &Path, + row: &QuantitativePerQueryRow, + row_keys: &BTreeSet<(&str, &str)>, + corpus_id: &str, +) -> Result<()> { + validate_complete_per_query_row(path, row)?; + validate_matching_product_row(path, row, row_keys)?; + + validate_same_corpus_per_query_row(path, row, corpus_id) +} + +fn 
validate_complete_per_query_row(path: &Path, row: &QuantitativePerQueryRow) -> Result<()> { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + + Ok(()) +} + +fn validate_matching_product_row( + path: &Path, + row: &QuantitativePerQueryRow, + row_keys: &BTreeSet<(&str, &str)>, +) -> Result<()> { + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} + +fn validate_same_corpus_per_query_row( + path: &Path, + row: &QuantitativePerQueryRow, + corpus_id: &str, +) -> Result<()> { + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + + Ok(()) +} From ff093e59ca54b98177f3c01873e4511c3d1bd721 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:22:48 -0400 Subject: [PATCH 56/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query identity checks","authority":"manual"} --- .../validation/rows/per_query/identity.rs | 65 +++---------------- .../rows/per_query/identity/corpus.rs | 19 ++++++ .../rows/per_query/identity/fields.rs | 22 +++++++ .../rows/per_query/identity/product.rs | 18 +++++ 4 files changed, 67 insertions(+), 57 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs create mode 100644 
apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs index 509f471e..737e869e 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity.rs @@ -1,66 +1,17 @@ -use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre}; +mod corpus; +mod fields; +mod product; -pub(super) fn validate_per_query_row_identity( - path: &Path, - row: &QuantitativePerQueryRow, - row_keys: &BTreeSet<(&str, &str)>, - corpus_id: &str, -) -> Result<()> { - validate_complete_per_query_row(path, row)?; - validate_matching_product_row(path, row, row_keys)?; - - validate_same_corpus_per_query_row(path, row, corpus_id) -} - -fn validate_complete_per_query_row(path: &Path, row: &QuantitativePerQueryRow) -> Result<()> { - if row.job_id.trim().is_empty() - || row.suite.trim().is_empty() - || row.evidence_class.trim().is_empty() - || row.result_state.trim().is_empty() - || row.product.trim().is_empty() - || row.adapter_id.trim().is_empty() - || row.qrel_source.trim().is_empty() - { - return Err(eyre::eyre!( - "{} has an incomplete quantitative per-query product row.", - path.display() - )); - } +use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result}; - Ok(()) -} - -fn validate_matching_product_row( +pub(super) fn validate_per_query_row_identity( path: &Path, row: &QuantitativePerQueryRow, row_keys: &BTreeSet<(&str, &str)>, -) -> Result<()> { - if 
!row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { - return Err(eyre::eyre!( - "{} per-query row {}:{} has no matching product row.", - path.display(), - row.product, - row.adapter_id - )); - } - - Ok(()) -} - -fn validate_same_corpus_per_query_row( - path: &Path, - row: &QuantitativePerQueryRow, corpus_id: &str, ) -> Result<()> { - if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { - return Err(eyre::eyre!( - "{} per-query row {}:{} is not same-corpus {}.", - path.display(), - row.product, - row.adapter_id, - corpus_id - )); - } + fields::validate_complete_per_query_row(path, row)?; + product::validate_matching_product_row(path, row, row_keys)?; - Ok(()) + corpus::validate_same_corpus_per_query_row(path, row, corpus_id) } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs new file mode 100644 index 00000000..45d0c11c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/corpus.rs @@ -0,0 +1,19 @@ +use crate::{Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_same_corpus_per_query_row( + path: &Path, + row: &QuantitativePerQueryRow, + corpus_id: &str, +) -> Result<()> { + if row.source_manifest_corpus_id.as_deref() != Some(corpus_id) { + return Err(eyre::eyre!( + "{} per-query row {}:{} is not same-corpus {}.", + path.display(), + row.product, + row.adapter_id, + corpus_id + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs new file mode 100644 index 00000000..049614f1 --- /dev/null +++ 
b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/fields.rs @@ -0,0 +1,22 @@ +use crate::{Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_complete_per_query_row( + path: &Path, + row: &QuantitativePerQueryRow, +) -> Result<()> { + if row.job_id.trim().is_empty() + || row.suite.trim().is_empty() + || row.evidence_class.trim().is_empty() + || row.result_state.trim().is_empty() + || row.product.trim().is_empty() + || row.adapter_id.trim().is_empty() + || row.qrel_source.trim().is_empty() + { + return Err(eyre::eyre!( + "{} has an incomplete quantitative per-query product row.", + path.display() + )); + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs new file mode 100644 index 00000000..dfed81b1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/product_manifest/validation/rows/per_query/identity/product.rs @@ -0,0 +1,18 @@ +use crate::{BTreeSet, Path, QuantitativePerQueryRow, Result, eyre}; + +pub(super) fn validate_matching_product_row( + path: &Path, + row: &QuantitativePerQueryRow, + row_keys: &BTreeSet<(&str, &str)>, +) -> Result<()> { + if !row_keys.contains(&(row.product.as_str(), row.adapter_id.as_str())) { + return Err(eyre::eyre!( + "{} per-query row {}:{} has no matching product row.", + path.display(), + row.product, + row.adapter_id + )); + } + + Ok(()) +} From 5fe18db2423263f3338adcd501d916e83e906990 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:26:11 -0400 Subject: [PATCH 57/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query row basis","authority":"manual"} --- .../quantitative/metrics/per_query/row.rs | 36 ++++------- .../metrics/per_query/row/basis.rs | 60 
+++++++++++++++++++ 2 files changed, 71 insertions(+), 25 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs index 2a892850..7378fd72 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row.rs @@ -1,10 +1,8 @@ +mod basis; + use crate::{ JobReport, QuantitativePerQueryRow, RealWorldJob, formatting, - quantitative::{ - QUANTITATIVE_ROW_CLAIM_BOUNDARY, - metrics::per_query::{evidence, query_metrics}, - }, - scoring, + quantitative::QUANTITATIVE_ROW_CLAIM_BOUNDARY, }; pub(super) fn quantitative_per_query_row( @@ -14,16 +12,7 @@ pub(super) fn quantitative_per_query_row( evidence_class: &str, adapter_id: &str, ) -> QuantitativePerQueryRow { - let relevance = evidence::relevance_grades(source_job, job); - let candidates = scoring::produced_evidence_order(source_job); - let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); - let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); - let metric_state = if positive_relevance_count == 0 || candidates.is_empty() { - "not_encoded" - } else { - formatting::status_str(job.status) - }; - let metric_states = metrics.keys().map(|key| (key.clone(), metric_state.to_string())).collect(); + let basis = basis::quantitative_per_query_row_basis(source_job, job); QuantitativePerQueryRow { job_id: job.job_id.clone(), @@ -31,18 +20,15 @@ pub(super) fn quantitative_per_query_row( evidence_class: evidence_class.to_string(), source_manifest_corpus_id: Some(corpus_id.to_string()), result_state: formatting::status_str(job.status).to_string(), - expected_relevant_count: positive_relevance_count, - candidate_count: 
candidates.len(), - qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(), - relevance_grade_sum: formatting::round3(relevance.values().sum::()), + expected_relevant_count: basis.positive_relevance_count, + candidate_count: basis.candidate_count, + qrel_source: basis.qrel_source, + relevance_grade_sum: basis.relevance_grade_sum, product: "ELF".to_string(), adapter_id: adapter_id.to_string(), - metrics, - metric_states, - denominators: query_metrics::per_query_denominators( - candidates.len(), - positive_relevance_count, - ), + metrics: basis.metrics, + metric_states: basis.metric_states, + denominators: basis.denominators, claim_boundary: QUANTITATIVE_ROW_CLAIM_BOUNDARY.to_string(), } } diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs new file mode 100644 index 00000000..34db9c8b --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs @@ -0,0 +1,60 @@ +use crate::{ + BTreeMap, JobReport, RealWorldJob, formatting, + quantitative::metrics::per_query::{evidence, query_metrics}, + scoring, +}; + +pub(super) struct QuantitativePerQueryRowBasis { + pub(super) positive_relevance_count: usize, + pub(super) candidate_count: usize, + pub(super) qrel_source: String, + pub(super) relevance_grade_sum: f64, + pub(super) metrics: BTreeMap>, + pub(super) metric_states: BTreeMap, + pub(super) denominators: BTreeMap, +} + +pub(super) fn quantitative_per_query_row_basis( + source_job: &RealWorldJob, + job: &JobReport, +) -> QuantitativePerQueryRowBasis { + let relevance = evidence::relevance_grades(source_job, job); + let candidates = scoring::produced_evidence_order(source_job); + let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); + let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); + let 
candidate_count = candidates.len(); + let metric_states = per_query_metric_states( + metrics.keys(), + positive_relevance_count, + candidate_count, + formatting::status_str(job.status), + ); + + QuantitativePerQueryRowBasis { + positive_relevance_count, + candidate_count, + qrel_source: evidence::qrel_source(source_job, relevance.is_empty()).to_string(), + relevance_grade_sum: formatting::round3(relevance.values().sum::()), + metrics, + metric_states, + denominators: query_metrics::per_query_denominators( + candidate_count, + positive_relevance_count, + ), + } +} + +fn per_query_metric_states<'a>( + metric_names: impl Iterator, + positive_relevance_count: usize, + candidate_count: usize, + result_state: &str, +) -> BTreeMap { + let metric_state = if positive_relevance_count == 0 || candidate_count == 0 { + "not_encoded" + } else { + result_state + }; + + metric_names.map(|key| (key.clone(), metric_state.to_string())).collect() +} From 3626f64f2a9bb8d0976ce9c45d5d96ca93166885 Mon Sep 17 00:00:00 2001 From: Yvette Carlisle Date: Wed, 1 Jul 2026 16:28:27 -0400 Subject: [PATCH 58/58] {"schema":"decodex/commit/1","summary":"Split quantitative per-query metric states","authority":"manual"} --- .../metrics/per_query/row/basis.rs | 19 +++---------------- .../metrics/per_query/row/basis/states.rs | 16 ++++++++++++++++ 2 files changed, 19 insertions(+), 16 deletions(-) create mode 100644 apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs index 34db9c8b..42ed6323 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis.rs @@ -1,3 +1,5 @@ +mod states; + use crate::{ BTreeMap, JobReport, 
RealWorldJob, formatting, quantitative::metrics::per_query::{evidence, query_metrics}, @@ -23,7 +25,7 @@ pub(super) fn quantitative_per_query_row_basis( let positive_relevance_count = query_metrics::positive_qrel_count(&relevance); let metrics = query_metrics::per_query_metrics(candidates.as_slice(), &relevance); let candidate_count = candidates.len(); - let metric_states = per_query_metric_states( + let metric_states = states::per_query_metric_states( metrics.keys(), positive_relevance_count, candidate_count, @@ -43,18 +45,3 @@ pub(super) fn quantitative_per_query_row_basis( ), } } - -fn per_query_metric_states<'a>( - metric_names: impl Iterator, - positive_relevance_count: usize, - candidate_count: usize, - result_state: &str, -) -> BTreeMap { - let metric_state = if positive_relevance_count == 0 || candidate_count == 0 { - "not_encoded" - } else { - result_state - }; - - metric_names.map(|key| (key.clone(), metric_state.to_string())).collect() -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs new file mode 100644 index 00000000..7c987253 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/quantitative/metrics/per_query/row/basis/states.rs @@ -0,0 +1,16 @@ +use crate::BTreeMap; + +pub(super) fn per_query_metric_states<'a>( + metric_names: impl Iterator, + positive_relevance_count: usize, + candidate_count: usize, + result_state: &str, +) -> BTreeMap { + let metric_state = if positive_relevance_count == 0 || candidate_count == 0 { + "not_encoded" + } else { + result_state + }; + + metric_names.map(|key| (key.clone(), metric_state.to_string())).collect() +}