diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/artifacts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/artifacts.rs index 0670e688..c50ccb95 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/artifacts.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/artifacts.rs @@ -1,12 +1,12 @@ -#[path = "artifacts/answer.rs"] mod answer; -#[path = "artifacts/consolidation.rs"] mod consolidation; -#[path = "artifacts/cost.rs"] mod cost; -#[path = "artifacts/knowledge.rs"] mod knowledge; -#[path = "artifacts/memory.rs"] mod memory; -#[path = "artifacts/proactive.rs"] mod proactive; -#[path = "artifacts/recovery.rs"] mod recovery; -#[path = "artifacts/scheduled.rs"] mod scheduled; -#[path = "artifacts/work.rs"] mod work; +mod answer; +mod consolidation; +mod cost; +mod knowledge; +mod memory; +mod proactive; +mod recovery; +mod scheduled; +mod work; pub(super) use self::{ answer::{ProducedAnswer, ProducedClaim}, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_detail_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_detail_reports.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_detail_reports.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_detail_reports.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_manifest_reports.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_manifest_reports.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_manifest_reports.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_misc_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_misc_reports.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_misc_reports.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_misc_reports.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_summary_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_summary_reports.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_summary_reports.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/external_adapter_reports/external_adapter_summary_reports.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters.rs index 99ca6823..37735d1a 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters.rs @@ -1,7 +1,7 @@ -#[path = "external_adapters/manifest.rs"] mod manifest; -#[path = "external_adapters/outcome.rs"] mod outcome; -#[path = "external_adapters/summary.rs"] mod summary; -#[path = "external_adapters/validation.rs"] mod validation; +mod manifest; +mod outcome; +mod summary; +mod validation; pub(super) use outcome::scenario_comparison_outcome; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation.rs index b0b59883..166603d1 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation.rs @@ -1,11 +1,14 @@ +mod basics; +mod metadata; +mod scenarios; + use std::{collections::BTreeSet, path::Path}; use color_eyre::{Result, eyre}; use crate::{ - AdapterCoverageStatus, AdapterScenarioJudgment, EXTERNAL_ADAPTER_MANIFEST_SCHEMA, - ElfScenarioPosition, ExternalAdapterManifest, ExternalAdapterReport, ExternalDockerIsolation, - SUITES, ScenarioComparisonOutcome, external_adapters::outcome, formatting, + EXTERNAL_ADAPTER_MANIFEST_SCHEMA, ExternalAdapterManifest, ExternalAdapterReport, + ExternalDockerIsolation, }; pub(super) fn validate_external_adapter_manifest( @@ -100,12 +103,12 @@ fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Re )); } - validate_adapter_execution(path, adapter)?; - validate_adapter_capabilities(path, adapter)?; - validate_adapter_suites(path, adapter)?; - validate_adapter_scenarios(path, adapter)?; - validate_adapter_evidence(path, adapter)?; - validate_adapter_execution_metadata(path, adapter)?; + basics::validate_adapter_execution(path, adapter)?; + basics::validate_adapter_capabilities(path, adapter)?; + basics::validate_adapter_suites(path, adapter)?; + scenarios::validate_adapter_scenarios(path, adapter)?; + basics::validate_adapter_evidence(path, adapter)?; + metadata::validate_adapter_execution_metadata(path, adapter)?; if let Some(follow_up) = &adapter.follow_up && (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty()) @@ -119,225 +122,3 @@ fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Re Ok(()) } - -fn validate_adapter_execution(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { - for evidence in [&adapter.setup, &adapter.run, &adapter.result] { - if evidence.evidence.trim().is_empty() - || evidence.command.as_deref().is_some_and(str::is_empty) - || evidence.artifact.as_deref().is_some_and(str::is_empty) - { - return Err(eyre::eyre!( - "{} adapter {} has incomplete setup/run/result evidence.", - path.display(), - adapter.adapter_id - )); - } - } - - Ok(()) -} - -fn validate_adapter_capabilities(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { - for capability in &adapter.capabilities { - if capability.capability.trim().is_empty() || capability.evidence.trim().is_empty() { - return Err(eyre::eyre!( - "{} adapter {} has incomplete capability coverage.", - path.display(), - adapter.adapter_id - )); - } - } - - Ok(()) -} - -fn validate_adapter_suites(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { - for suite in &adapter.suites { - if !SUITES.contains(&suite.suite_id.as_str()) { - return Err(eyre::eyre!( - "{} adapter {} references unknown suite {}.", - path.display(), - adapter.adapter_id, - suite.suite_id - )); - } - if suite.evidence.trim().is_empty() { - return Err(eyre::eyre!( - "{} adapter {} has suite {} without evidence.", - path.display(), - adapter.adapter_id, - suite.suite_id - )); - } - } - - Ok(()) -} - -fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { - for scenario in &adapter.scenarios { - if scenario.scenario_id.trim().is_empty() - || scenario.evidence.trim().is_empty() - || scenario.command.as_deref().is_some_and(str::is_empty) - || scenario.artifact.as_deref().is_some_and(str::is_empty) - { - return Err(eyre::eyre!( - "{} adapter {} has incomplete scenario judgment.", - path.display(), - adapter.adapter_id - )); - } - - if let Some(suite_id) = &scenario.suite_id - && !SUITES.contains(&suite_id.as_str()) - { - return Err(eyre::eyre!( - "{} adapter {} scenario {} references unknown suite {}.", - path.display(), - adapter.adapter_id, - scenario.scenario_id, - suite_id - )); - } - - let outcome = outcome::scenario_comparison_outcome(scenario); - - if blocked_status_missing_blocked_outcome(scenario.status, scenario.comparison_outcome) { - return Err(eyre::eyre!( - "{} adapter {} scenario {} uses blocked status without blocked comparison outcome.", - path.display(), - adapter.adapter_id, - scenario.scenario_id - )); - } - if unmeasured_status_has_measured_outcome(scenario.status, outcome) { - return Err(eyre::eyre!( - "{} adapter {} scenario {} uses {} status with {} outcome.", - path.display(), - adapter.adapter_id, - scenario.scenario_id, - formatting::adapter_status_str(scenario.status), - formatting::scenario_comparison_outcome_str(outcome) - )); - } - if unmeasured_status_has_measured_position(scenario.status, scenario.elf_position) { - return Err(eyre::eyre!( - "{} adapter {} scenario {} uses {} status with {} position.", - path.display(), - adapter.adapter_id, - scenario.scenario_id, - formatting::adapter_status_str(scenario.status), - formatting::scenario_position_str(scenario.elf_position) - )); - } - if explicit_outcome_conflicts_with_position(scenario) { - return Err(eyre::eyre!( - "{} adapter {} scenario {} uses {} position with {} outcome.", - path.display(), - adapter.adapter_id, - scenario.scenario_id, - formatting::scenario_position_str(scenario.elf_position), - formatting::scenario_comparison_outcome_str(outcome) - )); - } - } - - Ok(()) -} - -fn blocked_status_missing_blocked_outcome( - status: AdapterCoverageStatus, - outcome: Option, -) -> bool { - status == AdapterCoverageStatus::Blocked && outcome != Some(ScenarioComparisonOutcome::Blocked) -} - -fn unmeasured_status_has_measured_outcome( - status: AdapterCoverageStatus, - outcome: ScenarioComparisonOutcome, -) -> bool { - matches!( - status, - AdapterCoverageStatus::Blocked - | AdapterCoverageStatus::Incomplete - | AdapterCoverageStatus::NotEncoded - | AdapterCoverageStatus::Unsupported - ) && matches!( - outcome, - ScenarioComparisonOutcome::Win - | ScenarioComparisonOutcome::Tie - | ScenarioComparisonOutcome::Loss - ) -} - -fn unmeasured_status_has_measured_position( - status: AdapterCoverageStatus, - position: ElfScenarioPosition, -) -> bool { - matches!( - status, - AdapterCoverageStatus::Blocked - | AdapterCoverageStatus::Incomplete - | AdapterCoverageStatus::NotEncoded - | AdapterCoverageStatus::Unsupported - ) && matches!( - position, - ElfScenarioPosition::Wins | ElfScenarioPosition::Ties | ElfScenarioPosition::Loses - ) -} - -fn explicit_outcome_conflicts_with_position(scenario: &AdapterScenarioJudgment) -> bool { - let Some(outcome) = scenario.comparison_outcome else { - return false; - }; - - !outcome::position_supports_outcome(scenario.elf_position, outcome) -} - -fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { - for evidence in &adapter.evidence { - if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { - return Err(eyre::eyre!( - "{} adapter {} has incomplete evidence pointers.", - path.display(), - adapter.adapter_id - )); - } - } - - Ok(()) -} - -fn validate_adapter_execution_metadata(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> { - let Some(metadata) = &adapter.execution_metadata else { - return Ok(()); - }; - - if metadata.setup_path.trim().is_empty() - || metadata.runtime_boundary.trim().is_empty() - || metadata.resource_expectation.trim().is_empty() - || metadata.retry_guidance.iter().any(|guidance| guidance.trim().is_empty()) - || metadata.sources.is_empty() - { - return Err(eyre::eyre!( - "{} adapter {} has incomplete execution metadata.", - path.display(), - adapter.adapter_id - )); - } - - for source in &metadata.sources { - if source.label.trim().is_empty() - || source.url.trim().is_empty() - || source.evidence.trim().is_empty() - { - return Err(eyre::eyre!( - "{} adapter {} has incomplete source metadata.", - path.display(), - adapter.adapter_id - )); - } - } - - Ok(()) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/basics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/basics.rs new file mode 100644 index 00000000..eba451be --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/basics.rs @@ -0,0 +1,85 @@ +use std::path::Path; + +use color_eyre::{Result, eyre}; + +use crate::{ExternalAdapterReport, SUITES}; + +pub(in crate::external_adapters::validation) fn validate_adapter_execution( + path: &Path, + adapter: &ExternalAdapterReport, +) -> Result<()> { + for evidence in [&adapter.setup, &adapter.run, &adapter.result] { + if evidence.evidence.trim().is_empty() + || evidence.command.as_deref().is_some_and(str::is_empty) + || evidence.artifact.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete setup/run/result evidence.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +pub(in crate::external_adapters::validation) fn validate_adapter_capabilities( + path: &Path, + adapter: &ExternalAdapterReport, +) -> Result<()> { + for capability in &adapter.capabilities { + if capability.capability.trim().is_empty() || capability.evidence.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has incomplete capability coverage.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} + +pub(in crate::external_adapters::validation) fn validate_adapter_suites( + path: &Path, + adapter: &ExternalAdapterReport, +) -> Result<()> { + for suite in &adapter.suites { + if !SUITES.contains(&suite.suite_id.as_str()) { + return Err(eyre::eyre!( + "{} adapter {} references unknown suite {}.", + path.display(), + adapter.adapter_id, + suite.suite_id + )); + } + if suite.evidence.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has suite {} without evidence.", + path.display(), + adapter.adapter_id, + suite.suite_id + )); + } + } + + Ok(()) +} + +pub(in crate::external_adapters::validation) fn validate_adapter_evidence( + path: &Path, + adapter: &ExternalAdapterReport, +) -> Result<()> { + for evidence in &adapter.evidence { + if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() { + return Err(eyre::eyre!( + "{} adapter {} has incomplete evidence pointers.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/metadata.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/metadata.rs new file mode 100644 index 00000000..5a16a6dd --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/metadata.rs @@ -0,0 +1,42 @@ +use std::path::Path; + +use color_eyre::{Result, eyre}; + +use crate::ExternalAdapterReport; + +pub(in crate::external_adapters::validation) fn validate_adapter_execution_metadata( + path: &Path, + adapter: &ExternalAdapterReport, +) -> Result<()> { + let Some(metadata) = &adapter.execution_metadata else { + return Ok(()); + }; + + if metadata.setup_path.trim().is_empty() + || metadata.runtime_boundary.trim().is_empty() + || metadata.resource_expectation.trim().is_empty() + || metadata.retry_guidance.iter().any(|guidance| guidance.trim().is_empty()) + || metadata.sources.is_empty() + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete execution metadata.", + path.display(), + adapter.adapter_id + )); + } + + for source in &metadata.sources { + if source.label.trim().is_empty() + || source.url.trim().is_empty() + || source.evidence.trim().is_empty() + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete source metadata.", + path.display(), + adapter.adapter_id + )); + } + } + + Ok(()) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/scenarios.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/scenarios.rs new file mode 100644 index 00000000..f22346e9 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/external_adapters/validation/scenarios.rs @@ -0,0 +1,131 @@ +use std::path::Path; + +use color_eyre::{Result, eyre}; + +use crate::{ + AdapterCoverageStatus, AdapterScenarioJudgment, ElfScenarioPosition, ExternalAdapterReport, + SUITES, ScenarioComparisonOutcome, external_adapters::outcome, formatting, +}; + +pub(in crate::external_adapters::validation) fn validate_adapter_scenarios( + path: &Path, + adapter: &ExternalAdapterReport, +) -> Result<()> { + for scenario in &adapter.scenarios { + if scenario.scenario_id.trim().is_empty() + || scenario.evidence.trim().is_empty() + || scenario.command.as_deref().is_some_and(str::is_empty) + || scenario.artifact.as_deref().is_some_and(str::is_empty) + { + return Err(eyre::eyre!( + "{} adapter {} has incomplete scenario judgment.", + path.display(), + adapter.adapter_id + )); + } + + if let Some(suite_id) = &scenario.suite_id + && !SUITES.contains(&suite_id.as_str()) + { + return Err(eyre::eyre!( + "{} adapter {} scenario {} references unknown suite {}.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + suite_id + )); + } + + let outcome = outcome::scenario_comparison_outcome(scenario); + + if blocked_status_missing_blocked_outcome(scenario.status, scenario.comparison_outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses blocked status without blocked comparison outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id + )); + } + if unmeasured_status_has_measured_outcome(scenario.status, outcome) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + formatting::adapter_status_str(scenario.status), + formatting::scenario_comparison_outcome_str(outcome) + )); + } + if unmeasured_status_has_measured_position(scenario.status, scenario.elf_position) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} status with {} position.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + formatting::adapter_status_str(scenario.status), + formatting::scenario_position_str(scenario.elf_position) + )); + } + if explicit_outcome_conflicts_with_position(scenario) { + return Err(eyre::eyre!( + "{} adapter {} scenario {} uses {} position with {} outcome.", + path.display(), + adapter.adapter_id, + scenario.scenario_id, + formatting::scenario_position_str(scenario.elf_position), + formatting::scenario_comparison_outcome_str(outcome) + )); + } + } + + Ok(()) +} + +fn blocked_status_missing_blocked_outcome( + status: AdapterCoverageStatus, + outcome: Option, +) -> bool { + status == AdapterCoverageStatus::Blocked && outcome != Some(ScenarioComparisonOutcome::Blocked) +} + +fn unmeasured_status_has_measured_outcome( + status: AdapterCoverageStatus, + outcome: ScenarioComparisonOutcome, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + outcome, + ScenarioComparisonOutcome::Win + | ScenarioComparisonOutcome::Tie + | ScenarioComparisonOutcome::Loss + ) +} + +fn unmeasured_status_has_measured_position( + status: AdapterCoverageStatus, + position: ElfScenarioPosition, +) -> bool { + matches!( + status, + AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::NotEncoded + | AdapterCoverageStatus::Unsupported + ) && matches!( + position, + ElfScenarioPosition::Wins | ElfScenarioPosition::Ties | ElfScenarioPosition::Loses + ) +} + +fn explicit_outcome_conflicts_with_position(scenario: &AdapterScenarioJudgment) -> bool { + let Some(outcome) = scenario.comparison_outcome else { + return false; + }; + + !outcome::position_supports_outcome(scenario.elf_position, outcome) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/feature_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/feature_metrics.rs index 5c47c8b4..372a5c84 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/feature_metrics.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/feature_metrics.rs @@ -1,9 +1,9 @@ -#[path = "feature_metrics/common.rs"] mod common; -#[path = "feature_metrics/knowledge.rs"] mod knowledge; -#[path = "feature_metrics/memory_summary.rs"] mod memory_summary; -#[path = "feature_metrics/proactive.rs"] mod proactive; -#[path = "feature_metrics/scheduled.rs"] mod scheduled; -#[path = "feature_metrics/work_continuity.rs"] mod work_continuity; +mod common; +mod knowledge; +mod memory_summary; +mod proactive; +mod scheduled; +mod work_continuity; use crate::{ BTreeSet, DerivedPageArtifact, DerivedPageRebuild, DerivedPageSection, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/consolidation_reports.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/consolidation_reports.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/consolidation_reports.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/consolidation_reports.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_core.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/job_report_core.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_core.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_domain_metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_domain_metrics.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/job_report_domain_metrics.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_domain_metrics.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_evolution.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_evolution.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/job_report_evolution.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_evolution.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_misc.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_misc.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/job_report_misc.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_misc.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_scoring.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/job_report_scoring.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/job_reports/job_report_scoring.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs similarity index 80% rename from apps/elf-eval/src/bin/real_world_job_benchmark.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/main.rs index 47f2b2e8..9815886f 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/main.rs @@ -2,28 +2,28 @@ //! Offline runner and publisher for real-world job benchmark fixtures. -#[path = "real_world_job_benchmark/artifacts.rs"] mod artifacts; -#[path = "real_world_job_benchmark/cli.rs"] mod cli; -#[path = "real_world_job_benchmark/commands.rs"] mod commands; -#[path = "real_world_job_benchmark/diagnostic_reports.rs"] mod diagnostic_reports; -#[path = "real_world_job_benchmark/enums.rs"] mod enums; -#[path = "real_world_job_benchmark/external_adapter_reports.rs"] mod external_adapter_reports; -#[path = "real_world_job_benchmark/external_adapters.rs"] mod external_adapters; -#[path = "real_world_job_benchmark/feature_metrics.rs"] mod feature_metrics; -#[path = "real_world_job_benchmark/fixtures.rs"] mod fixtures; -#[path = "real_world_job_benchmark/formatting.rs"] mod formatting; -#[path = "real_world_job_benchmark/job_reports.rs"] mod job_reports; -#[path = "real_world_job_benchmark/markdown.rs"] mod markdown; -#[path = "real_world_job_benchmark/operational.rs"] mod operational; -#[path = "real_world_job_benchmark/operational_reports.rs"] mod operational_reports; -#[path = "real_world_job_benchmark/recovery.rs"] mod recovery; -#[path = "real_world_job_benchmark/report_root.rs"] mod report_root; -#[path = "real_world_job_benchmark/scoreboard.rs"] mod scoreboard; -#[path = "real_world_job_benchmark/scoreboard_reports.rs"] mod scoreboard_reports; -#[path = "real_world_job_benchmark/scoring.rs"] mod scoring; -#[path = "real_world_job_benchmark/summary.rs"] mod summary; -#[path = "real_world_job_benchmark/summary_reports.rs"] mod summary_reports; -#[path = "real_world_job_benchmark/validation.rs"] mod validation; +mod artifacts; +mod cli; +mod commands; +mod diagnostic_reports; +mod enums; +mod external_adapter_reports; +mod external_adapters; +mod feature_metrics; +mod fixtures; +mod formatting; +mod job_reports; +mod markdown; +mod operational; +mod operational_reports; +mod recovery; +mod report_root; +mod scoreboard; +mod scoreboard_reports; +mod scoring; +mod summary; +mod summary_reports; +mod validation; use std::{ collections::{BTreeMap, BTreeSet}, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs index 306738b9..36f9dba6 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown.rs @@ -1,13 +1,13 @@ -#[path = "markdown/adapters.rs"] mod adapters; -#[path = "markdown/common.rs"] mod common; -#[path = "markdown/domain_metrics.rs"] mod domain_metrics; -#[path = "markdown/evolution.rs"] mod evolution; -#[path = "markdown/followups.rs"] mod followups; -#[path = "markdown/header.rs"] mod header; -#[path = "markdown/jobs.rs"] mod jobs; -#[path = "markdown/operational.rs"] mod operational; -#[path = "markdown/scoreboard.rs"] mod scoreboard; -#[path = "markdown/trace.rs"] mod trace; +mod adapters; +mod common; +mod domain_metrics; +mod evolution; +mod followups; +mod header; +mod jobs; +mod operational; +mod scoreboard; +mod trace; use std::path::Path; diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters.rs index 58937dc9..9288db77 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters.rs @@ -1,8 +1,8 @@ -use crate::markdown::{ - self, AdapterScenarioJudgment, AdapterSource, AdapterStatusCounts, AdapterSuiteCoverage, - DEFAULT_ADAPTER_BEHAVIOR, ExternalAdapterReport, RealWorldReport, ScenarioOutcomeCounts, - ScenarioPositionCounts, -}; +mod cells; +mod counts; +mod details; + +use crate::markdown::{self, DEFAULT_ADAPTER_BEHAVIOR, RealWorldReport}; pub(super) fn render_markdown_capture_integration(out: &mut String, report: &RealWorldReport) { out.push_str("## Capture And Integration Coverage\n\n"); @@ -85,29 +85,29 @@ pub(super) fn render_markdown_external_adapters(out: &mut String, report: &RealW )); out.push_str(&format!( "- Overall statuses: `{}`\n", - adapter_status_counts_display(&summary.overall_status_counts) + counts::adapter_status_counts_display(&summary.overall_status_counts) )); out.push_str(&format!( "- Capability coverage statuses: `{}`\n", - adapter_status_counts_display(&summary.capability_status_counts) + counts::adapter_status_counts_display(&summary.capability_status_counts) )); out.push_str(&format!( "- Real-world suite statuses: `{}`\n", - adapter_status_counts_display(&summary.suite_status_counts) + counts::adapter_status_counts_display(&summary.suite_status_counts) )); - if has_adapter_scenarios(report.external_adapters.adapters.as_slice()) { + if details::has_adapter_scenarios(report.external_adapters.adapters.as_slice()) { out.push_str(&format!( "- Scenario coverage statuses: `{}`\n", - adapter_status_counts_display(&summary.scenario_status_counts) + counts::adapter_status_counts_display(&summary.scenario_status_counts) )); out.push_str(&format!( "- ELF scenario positions: `{}`\n", - scenario_position_counts_display(&summary.scenario_position_counts) + counts::scenario_position_counts_display(&summary.scenario_position_counts) )); out.push_str(&format!( "- Scenario comparison outcomes: `{}`\n", - scenario_outcome_counts_display(&summary.scenario_outcome_counts) + counts::scenario_outcome_counts_display(&summary.scenario_outcome_counts) )); } @@ -126,8 +126,8 @@ pub(super) fn render_markdown_external_adapters(out: &mut String, report: &RealW markdown::adapter_status_str(adapter.run.status), markdown::adapter_status_str(adapter.result.status), adapter.docker_default, - adapter_suite_cell(adapter.suites.as_slice()), - adapter_evidence_cell(adapter) + cells::adapter_suite_cell(adapter.suites.as_slice()), + cells::adapter_evidence_cell(adapter) )); } @@ -147,193 +147,11 @@ pub(super) fn render_markdown_external_adapters(out: &mut String, report: &RealW } } - render_markdown_adapter_scenarios(out, report.external_adapters.adapters.as_slice()); - render_markdown_adapter_execution_metadata(out, report.external_adapters.adapters.as_slice()); + details::render_markdown_adapter_scenarios(out, report.external_adapters.adapters.as_slice()); + details::render_markdown_adapter_execution_metadata( + out, + report.external_adapters.adapters.as_slice(), + ); out.push('\n'); } - -fn render_markdown_adapter_scenarios(out: &mut String, adapters: &[ExternalAdapterReport]) { - if !has_adapter_scenarios(adapters) { - return; - } - - out.push_str("\n### Adapter Scenario Judgments\n\n"); - out.push_str("| Adapter | Scenario | Suite | Status | Outcome | Evidence |\n"); - out.push_str("| --- | --- | --- | --- | --- | --- |\n"); - - for adapter in adapters { - for scenario in &adapter.scenarios { - out.push_str(&format!( - "| `{}` | `{}` | {} | `{}` | `{}` | {} |\n", - markdown::md_inline(adapter.adapter_id.as_str()), - markdown::md_inline(scenario.scenario_id.as_str()), - scenario - .suite_id - .as_deref() - .map(|suite| format!("`{}`", markdown::md_inline(suite))) - .unwrap_or_else(|| "`none`".to_string()), - markdown::adapter_status_str(scenario.status), - markdown::scenario_comparison_outcome_str(markdown::scenario_comparison_outcome( - scenario - )), - adapter_scenario_evidence_cell(scenario) - )); - } - } -} - -fn render_markdown_adapter_execution_metadata( - out: &mut String, - adapters: &[ExternalAdapterReport], -) { - let mut wrote_header = false; - - for adapter in adapters { - let Some(metadata) = &adapter.execution_metadata else { - continue; - }; - - if !wrote_header { - out.push_str("\n### Adapter Execution Metadata\n\n"); - out.push_str("| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth |\n"); - out.push_str("| --- | --- | --- | --- | --- | --- | --- |\n"); - - wrote_header = true; - } - - out.push_str(&format!( - "| `{}` | {} | {} | {} | {} | {} | {} |\n", - markdown::md_inline(adapter.adapter_id.as_str()), - adapter_sources_cell(metadata.sources.as_slice()), - markdown::md_cell(metadata.setup_path.as_str()), - markdown::md_cell(metadata.runtime_boundary.as_str()), - markdown::md_cell(metadata.resource_expectation.as_str()), - markdown::md_list(metadata.retry_guidance.as_slice()), - markdown::md_cell(metadata.research_depth.as_deref().unwrap_or("not recorded")) - )); - } -} - -fn has_adapter_scenarios(adapters: &[ExternalAdapterReport]) -> bool { - adapters.iter().any(|adapter| !adapter.scenarios.is_empty()) -} - -fn adapter_status_counts_display(counts: &AdapterStatusCounts) -> String { - [ - ("real", counts.real), - ("mocked", counts.mocked), - ("unsupported", counts.unsupported), - ("blocked", counts.blocked), - ("incomplete", counts.incomplete), - ("wrong_result", counts.wrong_result), - ("lifecycle_fail", counts.lifecycle_fail), - ("pass", counts.pass), - ("not_encoded", counts.not_encoded), - ] - .into_iter() - .filter(|(_, count)| *count > 0) - .map(|(status, count)| format!("{status}={count}")) - .collect::>() - .join(", ") -} - -fn scenario_position_counts_display(counts: &ScenarioPositionCounts) -> String { - [ - ("wins", counts.wins), - ("ties", counts.ties), - ("loses", counts.loses), - ("untested", counts.untested), - ] - .into_iter() - .filter(|(_, count)| *count > 0) - .map(|(position, count)| format!("{position}={count}")) - .collect::>() - .join(", ") -} - -fn scenario_outcome_counts_display(counts: &ScenarioOutcomeCounts) -> String { - [ - ("win", counts.win), - ("tie", counts.tie), - ("loss", counts.loss), - ("not_tested", counts.not_tested), - ("blocked", counts.blocked), - ("non_goal", counts.non_goal), - ] - .into_iter() - .filter(|(_, count)| *count > 0) - .map(|(outcome, count)| format!("{outcome}={count}")) - .collect::>() - .join(", ") -} - -fn adapter_suite_cell(suites: &[AdapterSuiteCoverage]) -> String { - if suites.is_empty() { - return "`none`".to_string(); - } - - suites - .iter() - .map(|suite| { - format!( - "`{}`: `{}`", - markdown::md_inline(suite.suite_id.as_str()), - markdown::adapter_status_str(suite.status) - ) - }) - .collect::>() - .join("
") -} - -fn adapter_evidence_cell(adapter: &ExternalAdapterReport) -> String { - let setup = adapter - .setup - .command - .as_deref() - .or(adapter.setup.artifact.as_deref()) - .unwrap_or(adapter.setup.evidence.as_str()); - let result = adapter - .result - .artifact - .as_deref() - .or(adapter.result.command.as_deref()) - .unwrap_or(adapter.result.evidence.as_str()); - - format!("setup: `{}`
result: `{}`", markdown::md_inline(setup), markdown::md_inline(result)) -} - -fn adapter_scenario_evidence_cell(scenario: &AdapterScenarioJudgment) -> String { - let evidence = markdown::md_cell(scenario.evidence.as_str()); - let command = scenario - .command - .as_deref() - .map(|command| format!("
command: `{}`", markdown::md_inline(command))) - .unwrap_or_default(); - let artifact = scenario - .artifact - .as_deref() - .map(|artifact| format!("
artifact: `{}`", markdown::md_inline(artifact))) - .unwrap_or_default(); - - format!("{evidence}{command}{artifact}") -} - -fn adapter_sources_cell(sources: &[AdapterSource]) -> String { - if sources.is_empty() { - return "`none`".to_string(); - } - - sources - .iter() - .map(|source| { - format!( - "[{}]({}): {}", - markdown::md_cell(source.label.as_str()), - markdown::md_url(source.url.as_str()), - markdown::md_cell(source.evidence.as_str()) - ) - }) - .collect::>() - .join("
") -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/cells.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/cells.rs new file mode 100644 index 00000000..647d3876 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/cells.rs @@ -0,0 +1,77 @@ +use crate::markdown::{ + self, AdapterScenarioJudgment, AdapterSource, AdapterSuiteCoverage, ExternalAdapterReport, +}; + +pub(in crate::markdown::adapters) fn adapter_suite_cell(suites: &[AdapterSuiteCoverage]) -> String { + if suites.is_empty() { + return "`none`".to_string(); + } + + suites + .iter() + .map(|suite| { + format!( + "`{}`: `{}`", + markdown::md_inline(suite.suite_id.as_str()), + markdown::adapter_status_str(suite.status) + ) + }) + .collect::>() + .join("
") +} + +pub(in crate::markdown::adapters) fn adapter_evidence_cell( + adapter: &ExternalAdapterReport, +) -> String { + let setup = adapter + .setup + .command + .as_deref() + .or(adapter.setup.artifact.as_deref()) + .unwrap_or(adapter.setup.evidence.as_str()); + let result = adapter + .result + .artifact + .as_deref() + .or(adapter.result.command.as_deref()) + .unwrap_or(adapter.result.evidence.as_str()); + + format!("setup: `{}`
result: `{}`", markdown::md_inline(setup), markdown::md_inline(result)) +} + +pub(in crate::markdown::adapters) fn adapter_scenario_evidence_cell( + scenario: &AdapterScenarioJudgment, +) -> String { + let evidence = markdown::md_cell(scenario.evidence.as_str()); + let command = scenario + .command + .as_deref() + .map(|command| format!("
command: `{}`", markdown::md_inline(command))) + .unwrap_or_default(); + let artifact = scenario + .artifact + .as_deref() + .map(|artifact| format!("
artifact: `{}`", markdown::md_inline(artifact))) + .unwrap_or_default(); + + format!("{evidence}{command}{artifact}") +} + +pub(in crate::markdown::adapters) fn adapter_sources_cell(sources: &[AdapterSource]) -> String { + if sources.is_empty() { + return "`none`".to_string(); + } + + sources + .iter() + .map(|source| { + format!( + "[{}]({}): {}", + markdown::md_cell(source.label.as_str()), + markdown::md_url(source.url.as_str()), + markdown::md_cell(source.evidence.as_str()) + ) + }) + .collect::>() + .join("
") +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/counts.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/counts.rs new file mode 100644 index 00000000..a9cc2b3d --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/counts.rs @@ -0,0 +1,56 @@ +use crate::markdown::{AdapterStatusCounts, ScenarioOutcomeCounts, ScenarioPositionCounts}; + +pub(in crate::markdown::adapters) fn adapter_status_counts_display( + counts: &AdapterStatusCounts, +) -> String { + [ + ("real", counts.real), + ("mocked", counts.mocked), + ("unsupported", counts.unsupported), + ("blocked", counts.blocked), + ("incomplete", counts.incomplete), + ("wrong_result", counts.wrong_result), + ("lifecycle_fail", counts.lifecycle_fail), + ("pass", counts.pass), + ("not_encoded", counts.not_encoded), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(status, count)| format!("{status}={count}")) + .collect::>() + .join(", ") +} + +pub(in crate::markdown::adapters) fn scenario_position_counts_display( + counts: &ScenarioPositionCounts, +) -> String { + [ + ("wins", counts.wins), + ("ties", counts.ties), + ("loses", counts.loses), + ("untested", counts.untested), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(position, count)| format!("{position}={count}")) + .collect::>() + .join(", ") +} + +pub(in crate::markdown::adapters) fn scenario_outcome_counts_display( + counts: &ScenarioOutcomeCounts, +) -> String { + [ + ("win", counts.win), + ("tie", counts.tie), + ("loss", counts.loss), + ("not_tested", counts.not_tested), + ("blocked", counts.blocked), + ("non_goal", counts.non_goal), + ] + .into_iter() + .filter(|(_, count)| *count > 0) + .map(|(outcome, count)| format!("{outcome}={count}")) + .collect::>() + .join(", ") +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/details.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/details.rs new file mode 100644 index 00000000..41824508 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/adapters/details.rs @@ -0,0 +1,72 @@ +use crate::markdown::{self, ExternalAdapterReport, adapters::cells}; + +pub(in crate::markdown::adapters) fn has_adapter_scenarios( + adapters: &[ExternalAdapterReport], +) -> bool { + adapters.iter().any(|adapter| !adapter.scenarios.is_empty()) +} + +pub(in crate::markdown::adapters) fn render_markdown_adapter_scenarios( + out: &mut String, + adapters: &[ExternalAdapterReport], +) { + if !has_adapter_scenarios(adapters) { + return; + } + + out.push_str("\n### Adapter Scenario Judgments\n\n"); + out.push_str("| Adapter | Scenario | Suite | Status | Outcome | Evidence |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- |\n"); + + for adapter in adapters { + for scenario in &adapter.scenarios { + out.push_str(&format!( + "| `{}` | `{}` | {} | `{}` | `{}` | {} |\n", + markdown::md_inline(adapter.adapter_id.as_str()), + markdown::md_inline(scenario.scenario_id.as_str()), + scenario + .suite_id + .as_deref() + .map(|suite| format!("`{}`", markdown::md_inline(suite))) + .unwrap_or_else(|| "`none`".to_string()), + markdown::adapter_status_str(scenario.status), + markdown::scenario_comparison_outcome_str(markdown::scenario_comparison_outcome( + scenario + )), + cells::adapter_scenario_evidence_cell(scenario) + )); + } + } +} + +pub(in crate::markdown::adapters) fn render_markdown_adapter_execution_metadata( + out: &mut String, + adapters: &[ExternalAdapterReport], +) { + let mut wrote_header = false; + + for adapter in adapters { + let Some(metadata) = &adapter.execution_metadata else { + continue; + }; + + if !wrote_header { + out.push_str("\n### Adapter Execution Metadata\n\n"); + out.push_str("| Adapter | Sources | Setup Path | Runtime Boundary | Resource Expectation | Retry Guidance | Research Depth |\n"); + out.push_str("| --- | --- | --- | --- | --- | --- | --- |\n"); + + wrote_header = true; + } + + out.push_str(&format!( + "| `{}` | {} | {} | {} | {} | {} | {} |\n", + markdown::md_inline(adapter.adapter_id.as_str()), + cells::adapter_sources_cell(metadata.sources.as_slice()), + markdown::md_cell(metadata.setup_path.as_str()), + markdown::md_cell(metadata.runtime_boundary.as_str()), + markdown::md_cell(metadata.resource_expectation.as_str()), + markdown::md_list(metadata.retry_guidance.as_slice()), + markdown::md_cell(metadata.research_depth.as_deref().unwrap_or("not recorded")) + )); + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header.rs index 44fbb9b5..b1730185 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header.rs @@ -1,7 +1,7 @@ -use crate::markdown::{ - self, KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, RealWorldReport, - ReportSummary, ScheduledMemorySummaryReport, WorkContinuitySummaryReport, -}; +mod optional; +mod quality; + +use crate::markdown::{self, RealWorldReport}; pub(super) fn render_markdown_header( out: &mut String, @@ -72,7 +72,7 @@ pub(super) fn render_markdown_header( report.summary.history_readback_encoded_count )); - render_markdown_quality_summary(out, report); + quality::render_markdown_quality_summary(out, report); out.push_str(&format!("- Mean score: `{:.3}`\n", report.summary.mean_score)); out.push_str(&format!( @@ -94,213 +94,10 @@ pub(super) fn render_markdown_header( )); out.push_str(&format!("- Operator UX gaps: `{}`\n", report.summary.operator_ux_gap_count)); - render_markdown_optional_summary_metrics(out, &report.summary); + optional::render_markdown_optional_summary_metrics(out, &report.summary); out.push_str(&format!( "- Private corpus redaction: `{}`\n\n", markdown::md_inline(report.private_corpus_redaction.policy.as_str()) )); } - -fn render_markdown_optional_summary_metrics(out: &mut String, summary: &ReportSummary) { - if let Some(knowledge) = &summary.knowledge { - render_markdown_knowledge_summary_metrics(out, knowledge); - } - if let Some(memory_summary) = &summary.memory_summary { - render_markdown_memory_summary_metrics(out, memory_summary); - } - if let Some(proactive) = &summary.proactive_brief { - render_markdown_proactive_summary_metrics(out, proactive); - } - if let Some(scheduled) = &summary.scheduled_memory { - render_markdown_scheduled_summary_metrics(out, scheduled); - } - if let Some(work_continuity) = &summary.work_continuity { - render_markdown_work_continuity_summary_metrics(out, work_continuity); - } -} - -fn render_markdown_knowledge_summary_metrics(out: &mut String, knowledge: &KnowledgeSummary) { - out.push_str(&format!("- Knowledge citation coverage: `{:.3}`\n", knowledge.citation_coverage)); - out.push_str(&format!("- Stale claim detection: `{:.3}`\n", knowledge.stale_claim_detection)); - out.push_str(&format!("- Rebuild determinism: `{:.3}`\n", knowledge.rebuild_determinism)); - out.push_str(&format!( - "- Backlinks: `{}` total, `{:.3}` page coverage\n", - knowledge.backlink_count, knowledge.backlink_coverage - )); - out.push_str(&format!("- Version diff coverage: `{:.3}`\n", knowledge.version_diff_coverage)); - out.push_str(&format!("- Page usefulness: `{:.3}`\n", knowledge.page_usefulness)); - out.push_str(&format!( - "- Unsupported summary count: `{}`\n", - knowledge.unsupported_summary_count - )); -} - -fn render_markdown_memory_summary_metrics(out: &mut String, memory_summary: &MemorySummaryReport) { - out.push_str(&format!( - "- Memory summary entries: `{}` across `{}` artifact(s)\n", - memory_summary.entry_count, memory_summary.summary_count - )); - out.push_str(&format!( - "- Memory summary source-ref coverage: `{}/{}` (`{:.3}`)\n", - memory_summary.source_ref_entry_count, - memory_summary.source_ref_required_count, - memory_summary.source_ref_coverage - )); - out.push_str(&format!( - "- Memory summary invalid top-of-mind count: `{}`\n", - memory_summary.invalid_top_of_mind_count - )); - out.push_str(&format!( - "- Memory summary unsupported derived entries: `{}`\n", - memory_summary.unsupported_derived_entry_count - )); - out.push_str(&format!( - "- Memory summary unsupported current entries: `{}`\n", - memory_summary.unsupported_current_entry_count - )); -} - -fn render_markdown_proactive_summary_metrics( - out: &mut String, - proactive: &ProactiveBriefSummaryReport, -) { - out.push_str(&format!( - "- Proactive brief suggestions: `{}` across `{}` artifact(s)\n", - proactive.suggestion_count, proactive.brief_count - )); - out.push_str(&format!( - "- Proactive evidence-ref coverage: `{}/{}` (`{:.3}`)\n", - proactive.evidence_ref_suggestion_count, - proactive.evidence_ref_required_count, - proactive.evidence_ref_coverage - )); - out.push_str(&format!( - "- Proactive freshness/action rationale coverage: `{:.3}` / `{:.3}`\n", - proactive.freshness_coverage, proactive.action_rationale_coverage - )); - out.push_str(&format!( - "- Proactive stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", - proactive.invalid_current_suggestion_count, proactive.tombstone_violation_count - )); - out.push_str(&format!( - "- Proactive rejected/deferred suggestions: `{}` rejected, `{}` deferred\n", - proactive.rejected_count, proactive.deferred_count - )); -} - -fn render_markdown_scheduled_summary_metrics( - out: &mut String, - scheduled: &ScheduledMemorySummaryReport, -) { - out.push_str(&format!( - "- Scheduled memory outputs: `{}` across `{}` task run(s)\n", - scheduled.output_count, scheduled.task_run_count - )); - out.push_str(&format!( - "- Scheduled memory evidence-ref coverage: `{}/{}` (`{:.3}`)\n", - scheduled.evidence_ref_output_count, - scheduled.evidence_ref_required_count, - scheduled.evidence_ref_coverage - )); - out.push_str(&format!( - "- Scheduled memory freshness/action/trace coverage: `{:.3}` / `{:.3}` / `{:.3}`\n", - scheduled.freshness_coverage, scheduled.action_rationale_coverage, scheduled.trace_coverage - )); - out.push_str(&format!( - "- Scheduled memory stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", - scheduled.invalid_current_output_count, scheduled.tombstone_violation_count - )); - out.push_str(&format!( - "- Scheduled memory source mutations: `{}`\n", - scheduled.source_mutation_count - )); -} - -fn render_markdown_work_continuity_summary_metrics( - out: &mut String, - work_continuity: &WorkContinuitySummaryReport, -) { - out.push_str(&format!( - "- Work continuity readbacks: `{}` entries across `{}` artifact(s)\n", - work_continuity.entry_count, work_continuity.readback_count - )); - out.push_str(&format!( - "- Work continuity reset/resume and rationale recall: `{:.3}` / `{:.3}`\n", - work_continuity.reset_resume_success_rate, work_continuity.decision_rationale_recall_rate - )); - out.push_str(&format!( - "- Work continuity rejected-option suppression and explicit next-step precision: `{:.3}` / `{:.3}`\n", - work_continuity.rejected_option_suppression_rate, - work_continuity.explicit_next_step_precision - )); - out.push_str(&format!( - "- Work continuity inferred-step labeling and handoff source-ref coverage: `{:.3}` / `{:.3}`\n", - work_continuity.inferred_next_step_labeling_rate, - work_continuity.handoff_source_ref_coverage - )); - out.push_str(&format!( - "- Work continuity redaction and janitor false-promotion rates: `{:.3}` / `{:.3}`\n", - work_continuity.redaction_rate, work_continuity.janitor_false_promotion_rate - )); - out.push_str(&format!( - "- Work continuity hard-fail markers: `{}` sensitive persistence, `{}` rejected resurrection, `{}` inferred instructions, `{}` journal-only authority claim(s)\n", - work_continuity.sensitive_marker_persistence_count, - work_continuity.rejected_option_resurrection_count, - work_continuity.inferred_step_instruction_count, - work_continuity.journal_only_authority_claim_count - )); -} - -fn render_markdown_quality_summary(out: &mut String, report: &RealWorldReport) { - out.push_str(&format!( - "- Evidence coverage: `{}/{}` (`{:.3}`)\n", - report.summary.evidence_covered_count, - report.summary.evidence_required_count, - report.summary.evidence_coverage - )); - out.push_str(&format!( - "- Source-ref coverage: `{}/{}` (`{:.3}`)\n", - report.summary.source_ref_covered_count, - report.summary.source_ref_required_count, - report.summary.source_ref_coverage - )); - out.push_str(&format!( - "- Quote coverage: `{}/{}` (`{:.3}`)\n", - report.summary.quote_covered_count, - report.summary.quote_required_count, - report.summary.quote_coverage - )); - out.push_str(&format!("- Stale retrieval count: `{}`\n", report.summary.stale_retrieval_count)); - out.push_str(&format!( - "- Scope correctness: `{}/{}` (`{:.3}`), violations `{}`\n", - report.summary.scope_correct_count, - report.summary.scope_check_count, - report.summary.scope_correctness, - report.summary.scope_violation_count - )); - out.push_str(&format!("- Redaction leak count: `{}`\n", report.summary.redaction_leak_count)); - out.push_str(&format!( - "- Qdrant rebuild cases: `{}` encoded, `{}` pass\n", - report.summary.qdrant_rebuild_case_count, report.summary.qdrant_rebuild_pass_count - )); - out.push_str(&format!( - "- Expected evidence recall: `{:.3}` ({}/{})\n", - report.summary.expected_evidence_recall, - report.summary.expected_evidence_matched, - report.summary.expected_evidence_total - )); - out.push_str(&format!( - "- Irrelevant context ratio: `{:.3}` ({} irrelevant)\n", - report.summary.irrelevant_context_ratio, report.summary.irrelevant_context_count - )); - out.push_str(&format!( - "- Trace explainability: `{}` job(s), `{}` wrong-result stage attribution(s)\n", - report.summary.trace_explainability_count, - report.summary.wrong_result_stage_attribution_count - )); - out.push_str(&format!( - "- Consolidation source mutation count: `{}`\n", - report.summary.consolidation.source_mutation_count - )); -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header/optional.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header/optional.rs new file mode 100644 index 00000000..653727c1 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header/optional.rs @@ -0,0 +1,157 @@ +use crate::markdown::{ + KnowledgeSummary, MemorySummaryReport, ProactiveBriefSummaryReport, ReportSummary, + ScheduledMemorySummaryReport, WorkContinuitySummaryReport, +}; + +pub(in crate::markdown::header) fn render_markdown_optional_summary_metrics( + out: &mut String, + summary: &ReportSummary, +) { + if let Some(knowledge) = &summary.knowledge { + render_markdown_knowledge_summary_metrics(out, knowledge); + } + if let Some(memory_summary) = &summary.memory_summary { + render_markdown_memory_summary_metrics(out, memory_summary); + } + if let Some(proactive) = &summary.proactive_brief { + render_markdown_proactive_summary_metrics(out, proactive); + } + if let Some(scheduled) = &summary.scheduled_memory { + render_markdown_scheduled_summary_metrics(out, scheduled); + } + if let Some(work_continuity) = &summary.work_continuity { + render_markdown_work_continuity_summary_metrics(out, work_continuity); + } +} + +fn render_markdown_knowledge_summary_metrics(out: &mut String, knowledge: &KnowledgeSummary) { + out.push_str(&format!("- Knowledge citation coverage: `{:.3}`\n", knowledge.citation_coverage)); + out.push_str(&format!("- Stale claim detection: `{:.3}`\n", knowledge.stale_claim_detection)); + out.push_str(&format!("- Rebuild determinism: `{:.3}`\n", knowledge.rebuild_determinism)); + out.push_str(&format!( + "- Backlinks: `{}` total, `{:.3}` page coverage\n", + knowledge.backlink_count, knowledge.backlink_coverage + )); + out.push_str(&format!("- Version diff coverage: `{:.3}`\n", knowledge.version_diff_coverage)); + out.push_str(&format!("- Page usefulness: `{:.3}`\n", knowledge.page_usefulness)); + out.push_str(&format!( + "- Unsupported summary count: `{}`\n", + knowledge.unsupported_summary_count + )); +} + +fn render_markdown_memory_summary_metrics(out: &mut String, memory_summary: &MemorySummaryReport) { + out.push_str(&format!( + "- Memory summary entries: `{}` across `{}` artifact(s)\n", + memory_summary.entry_count, memory_summary.summary_count + )); + out.push_str(&format!( + "- Memory summary source-ref coverage: `{}/{}` (`{:.3}`)\n", + memory_summary.source_ref_entry_count, + memory_summary.source_ref_required_count, + memory_summary.source_ref_coverage + )); + out.push_str(&format!( + "- Memory summary invalid top-of-mind count: `{}`\n", + memory_summary.invalid_top_of_mind_count + )); + out.push_str(&format!( + "- Memory summary unsupported derived entries: `{}`\n", + memory_summary.unsupported_derived_entry_count + )); + out.push_str(&format!( + "- Memory summary unsupported current entries: `{}`\n", + memory_summary.unsupported_current_entry_count + )); +} + +fn render_markdown_proactive_summary_metrics( + out: &mut String, + proactive: &ProactiveBriefSummaryReport, +) { + out.push_str(&format!( + "- Proactive brief suggestions: `{}` across `{}` artifact(s)\n", + proactive.suggestion_count, proactive.brief_count + )); + out.push_str(&format!( + "- Proactive evidence-ref coverage: `{}/{}` (`{:.3}`)\n", + proactive.evidence_ref_suggestion_count, + proactive.evidence_ref_required_count, + proactive.evidence_ref_coverage + )); + out.push_str(&format!( + "- Proactive freshness/action rationale coverage: `{:.3}` / `{:.3}`\n", + proactive.freshness_coverage, proactive.action_rationale_coverage + )); + out.push_str(&format!( + "- Proactive stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", + proactive.invalid_current_suggestion_count, proactive.tombstone_violation_count + )); + out.push_str(&format!( + "- Proactive rejected/deferred suggestions: `{}` rejected, `{}` deferred\n", + proactive.rejected_count, proactive.deferred_count + )); +} + +fn render_markdown_scheduled_summary_metrics( + out: &mut String, + scheduled: &ScheduledMemorySummaryReport, +) { + out.push_str(&format!( + "- Scheduled memory outputs: `{}` across `{}` task run(s)\n", + scheduled.output_count, scheduled.task_run_count + )); + out.push_str(&format!( + "- Scheduled memory evidence-ref coverage: `{}/{}` (`{:.3}`)\n", + scheduled.evidence_ref_output_count, + scheduled.evidence_ref_required_count, + scheduled.evidence_ref_coverage + )); + out.push_str(&format!( + "- Scheduled memory freshness/action/trace coverage: `{:.3}` / `{:.3}` / `{:.3}`\n", + scheduled.freshness_coverage, scheduled.action_rationale_coverage, scheduled.trace_coverage + )); + out.push_str(&format!( + "- Scheduled memory stale/currentness violations: `{}` invalid current, `{}` tombstone violation(s)\n", + scheduled.invalid_current_output_count, scheduled.tombstone_violation_count + )); + out.push_str(&format!( + "- Scheduled memory source mutations: `{}`\n", + scheduled.source_mutation_count + )); +} + +fn render_markdown_work_continuity_summary_metrics( + out: &mut String, + work_continuity: &WorkContinuitySummaryReport, +) { + out.push_str(&format!( + "- Work continuity readbacks: `{}` entries across `{}` artifact(s)\n", + work_continuity.entry_count, work_continuity.readback_count + )); + out.push_str(&format!( + "- Work continuity reset/resume and rationale recall: `{:.3}` / `{:.3}`\n", + work_continuity.reset_resume_success_rate, work_continuity.decision_rationale_recall_rate + )); + out.push_str(&format!( + "- Work continuity rejected-option suppression and explicit next-step precision: `{:.3}` / `{:.3}`\n", + work_continuity.rejected_option_suppression_rate, + work_continuity.explicit_next_step_precision + )); + out.push_str(&format!( + "- Work continuity inferred-step labeling and handoff source-ref coverage: `{:.3}` / `{:.3}`\n", + work_continuity.inferred_next_step_labeling_rate, + work_continuity.handoff_source_ref_coverage + )); + out.push_str(&format!( + "- Work continuity redaction and janitor false-promotion rates: `{:.3}` / `{:.3}`\n", + work_continuity.redaction_rate, work_continuity.janitor_false_promotion_rate + )); + out.push_str(&format!( + "- Work continuity hard-fail markers: `{}` sensitive persistence, `{}` rejected resurrection, `{}` inferred instructions, `{}` journal-only authority claim(s)\n", + work_continuity.sensitive_marker_persistence_count, + work_continuity.rejected_option_resurrection_count, + work_continuity.inferred_step_instruction_count, + work_continuity.journal_only_authority_claim_count + )); +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header/quality.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header/quality.rs new file mode 100644 index 00000000..c8f20037 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/markdown/header/quality.rs @@ -0,0 +1,57 @@ +use crate::markdown::RealWorldReport; + +pub(in crate::markdown::header) fn render_markdown_quality_summary( + out: &mut String, + report: &RealWorldReport, +) { + out.push_str(&format!( + "- Evidence coverage: `{}/{}` (`{:.3}`)\n", + report.summary.evidence_covered_count, + report.summary.evidence_required_count, + report.summary.evidence_coverage + )); + out.push_str(&format!( + "- Source-ref coverage: `{}/{}` (`{:.3}`)\n", + report.summary.source_ref_covered_count, + report.summary.source_ref_required_count, + report.summary.source_ref_coverage + )); + out.push_str(&format!( + "- Quote coverage: `{}/{}` (`{:.3}`)\n", + report.summary.quote_covered_count, + report.summary.quote_required_count, + report.summary.quote_coverage + )); + out.push_str(&format!("- Stale retrieval count: `{}`\n", report.summary.stale_retrieval_count)); + out.push_str(&format!( + "- Scope correctness: `{}/{}` (`{:.3}`), violations `{}`\n", + report.summary.scope_correct_count, + report.summary.scope_check_count, + report.summary.scope_correctness, + report.summary.scope_violation_count + )); + out.push_str(&format!("- Redaction leak count: `{}`\n", report.summary.redaction_leak_count)); + out.push_str(&format!( + "- Qdrant rebuild cases: `{}` encoded, `{}` pass\n", + report.summary.qdrant_rebuild_case_count, report.summary.qdrant_rebuild_pass_count + )); + out.push_str(&format!( + "- Expected evidence recall: `{:.3}` ({}/{})\n", + report.summary.expected_evidence_recall, + report.summary.expected_evidence_matched, + report.summary.expected_evidence_total + )); + out.push_str(&format!( + "- Irrelevant context ratio: `{:.3}` ({} irrelevant)\n", + report.summary.irrelevant_context_ratio, report.summary.irrelevant_context_count + )); + out.push_str(&format!( + "- Trace explainability: `{}` job(s), `{}` wrong-result stage attribution(s)\n", + report.summary.trace_explainability_count, + report.summary.wrong_result_stage_attribution_count + )); + out.push_str(&format!( + "- Consolidation source mutation count: `{}`\n", + report.summary.consolidation.source_mutation_count + )); +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/operational.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/operational.rs index ba979884..b5a29cbc 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/operational.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/operational.rs @@ -1,11 +1,11 @@ +mod recovery_report; +mod resources; +mod tags; +mod tiers; +mod timings; + use crate::{ - BTreeSet, CorpusProfile, JobReport, OPERATIONAL_EVIDENCE_SCHEMA, - OperationalAuthorityRecoveryReport, OperationalColdStartRestoreRebuild, OperationalCostSummary, - OperationalEvidenceReport, OperationalEvidenceTierReport, OperationalLatencyReport, - OperationalResourceSummary, RealWorldJob, TypedStatus, - formatting::round3, - recovery::{self}, - summary::{self}, + CorpusProfile, JobReport, OPERATIONAL_EVIDENCE_SCHEMA, OperationalEvidenceReport, RealWorldJob, }; const OPERATIONAL_EVIDENCE_TIERS: &[&str] = @@ -18,7 +18,7 @@ pub(super) fn operational_evidence_report( let paired = jobs.iter().zip(reports.iter()).collect::>(); let tiers = OPERATIONAL_EVIDENCE_TIERS .iter() - .map(|tier| operational_evidence_tier_report(tier, paired.as_slice())) + .map(|tier| tiers::operational_evidence_tier_report(tier, paired.as_slice())) .collect::>(); let private_tier = tiers.iter().find(|tier| tier.tier == "private_corpus"); let provider_tier = tiers.iter().find(|tier| tier.tier == "provider_backed"); @@ -27,17 +27,19 @@ pub(super) fn operational_evidence_report( let provider_backed_pass_claim_allowed = provider_tier.is_some_and(|tier| tier.pass_claim_allowed); let missing_private_provider_inputs_are_typed_blockers = private_tier - .is_some_and(operational_tier_has_typed_blocker) - && provider_tier.is_some_and(operational_tier_has_typed_blocker); + .is_some_and(tiers::operational_tier_has_typed_blocker) + && provider_tier.is_some_and(tiers::operational_tier_has_typed_blocker); OperationalEvidenceReport { schema: OPERATIONAL_EVIDENCE_SCHEMA.to_string(), tiers, - latency: operational_latency_report(reports), - cost: operational_cost_summary(reports), - resource: operational_resource_summary(paired.as_slice()), - cold_start_restore_rebuild: operational_cold_start_restore_rebuild(paired.as_slice()), - authority_recovery: operational_authority_recovery(reports), + latency: timings::operational_latency_report(reports), + cost: timings::operational_cost_summary(reports), + resource: resources::operational_resource_summary(paired.as_slice()), + cold_start_restore_rebuild: resources::operational_cold_start_restore_rebuild( + paired.as_slice(), + ), + authority_recovery: recovery_report::operational_authority_recovery(reports), missing_private_provider_inputs_are_typed_blockers, private_corpus_pass_claim_allowed, provider_backed_pass_claim_allowed, @@ -46,265 +48,15 @@ pub(super) fn operational_evidence_report( } pub(super) fn operational_evidence_tier(job: &RealWorldJob) -> &'static str { - if job_has_tag(job, "provider_backed") { + if tags::job_has_tag(job, "provider_backed") { "provider_backed" - } else if job_has_tag(job, "private_corpus") + } else if tags::job_has_tag(job, "private_corpus") || matches!(job.corpus.profile, CorpusProfile::PrivateSanitized) { "private_corpus" - } else if job_has_tag(job, "public_proxy") { + } else if tags::job_has_tag(job, "public_proxy") { "public_proxy" } else { "local_fixture" } } - -fn operational_evidence_tier_report( - tier: &str, - paired: &[(&RealWorldJob, &JobReport)], -) -> OperationalEvidenceTierReport { - let tier_jobs = paired - .iter() - .filter(|(job, _)| operational_evidence_tier(job) == tier) - .copied() - .collect::>(); - let reports = tier_jobs.iter().map(|(_, report)| *report).collect::>(); - let status = if reports.is_empty() { - TypedStatus::NotEncoded - } else { - summary::aggregate_status(reports.as_slice()) - }; - let job_count = reports.len(); - let pass = reports.iter().filter(|report| report.status == TypedStatus::Pass).count(); - let wrong_result = - reports.iter().filter(|report| report.status == TypedStatus::WrongResult).count(); - let lifecycle_fail = - reports.iter().filter(|report| report.status == TypedStatus::LifecycleFail).count(); - let incomplete = - reports.iter().filter(|report| report.status == TypedStatus::Incomplete).count(); - let blocked = reports.iter().filter(|report| report.status == TypedStatus::Blocked).count(); - let not_encoded = usize::from(reports.is_empty()) - + reports.iter().filter(|report| report.status == TypedStatus::NotEncoded).count(); - let unsupported_claim = - reports.iter().filter(|report| report.status == TypedStatus::UnsupportedClaim).count(); - - OperationalEvidenceTierReport { - tier: tier.to_string(), - status, - job_count, - pass, - wrong_result, - lifecycle_fail, - incomplete, - blocked, - not_encoded, - unsupported_claim, - mean_latency_ms: summary::mean_latency_for_reports(reports.as_slice()), - total_cost: summary::total_cost_for_reports(reports.as_slice()), - resource_evidence_count: tier_jobs - .iter() - .filter(|(job, _)| job_has_tag(job, "resource_envelope")) - .count(), - cold_start_evidence_count: tier_jobs - .iter() - .filter(|(job, _)| job_has_tag(job, "cold_start")) - .count(), - restore_evidence_count: tier_jobs - .iter() - .filter(|(job, _)| job_has_tag(job, "restore")) - .count(), - qdrant_rebuild_evidence_count: tier_jobs - .iter() - .filter(|(job, report)| { - job_has_tag(job, "qdrant_rebuild") || report.qdrant_rebuild_case - }) - .count(), - pass_claim_allowed: job_count > 0 && status == TypedStatus::Pass, - blocker_reasons: reports - .iter() - .filter(|report| report.status != TypedStatus::Pass) - .map(|report| report.reason.clone()) - .collect(), - job_ids: reports.iter().map(|report| report.job_id.clone()).collect(), - } -} - -fn operational_tier_has_typed_blocker(tier: &OperationalEvidenceTierReport) -> bool { - tier.blocked + tier.incomplete + tier.not_encoded > 0 && !tier.pass_claim_allowed -} - -fn operational_latency_report(reports: &[JobReport]) -> OperationalLatencyReport { - let latencies = reports.iter().filter_map(|report| report.latency_ms).collect::>(); - - OperationalLatencyReport { - measured_job_count: latencies.len(), - missing_latency_job_count: reports.len().saturating_sub(latencies.len()), - mean_ms: summary::mean_latency_for_values(latencies.as_slice()), - max_ms: latencies.iter().copied().reduce(f64::max).map(round3), - } -} - -fn operational_cost_summary(reports: &[JobReport]) -> OperationalCostSummary { - let costs = reports.iter().filter_map(|report| report.cost.as_ref()).collect::>(); - let zero_cost_job_count = - costs.iter().filter(|cost| cost.amount.is_some_and(|amount| amount == 0.0)).count(); - - OperationalCostSummary { - jobs_with_cost_report: costs.len(), - missing_cost_job_count: reports.len().saturating_sub(costs.len()), - zero_cost_job_count, - total: summary::total_cost(reports), - claim_boundary: "Fixture and local-provider zero-cost reports are execution-accounting evidence only; they do not prove hosted provider spend.".to_string(), - } -} - -fn operational_resource_summary( - paired: &[(&RealWorldJob, &JobReport)], -) -> OperationalResourceSummary { - let resource_jobs = - paired.iter().filter(|(job, _)| job_has_tag(job, "resource_envelope")).collect::>(); - let latency_resource_dimension_job_count = paired - .iter() - .filter(|(_, report)| { - report.dimension_scores.iter().any(|score| score.dimension == "latency_resource") - }) - .count(); - - OperationalResourceSummary { - resource_envelope_job_count: resource_jobs.len(), - resource_envelope_pass_count: resource_jobs - .iter() - .filter(|(_, report)| report.status == TypedStatus::Pass) - .count(), - latency_resource_dimension_job_count, - job_ids: resource_jobs.iter().map(|(_, report)| report.job_id.clone()).collect(), - } -} - -fn operational_cold_start_restore_rebuild( - paired: &[(&RealWorldJob, &JobReport)], -) -> OperationalColdStartRestoreRebuild { - let cold_start_jobs = - paired.iter().filter(|(job, _)| job_has_tag(job, "cold_start")).collect::>(); - let restore_jobs = - paired.iter().filter(|(job, _)| job_has_tag(job, "restore")).collect::>(); - let qdrant_rebuild_jobs = paired - .iter() - .filter(|(job, report)| job_has_tag(job, "qdrant_rebuild") || report.qdrant_rebuild_case) - .collect::>(); - let mut job_ids = cold_start_jobs - .iter() - .chain(restore_jobs.iter()) - .chain(qdrant_rebuild_jobs.iter()) - .map(|(_, report)| report.job_id.clone()) - .collect::>() - .into_iter() - .collect::>(); - - job_ids.sort(); - OperationalColdStartRestoreRebuild { - cold_start_job_count: cold_start_jobs.len(), - cold_start_pass_count: cold_start_jobs - .iter() - .filter(|(_, report)| report.status == TypedStatus::Pass) - .count(), - restore_job_count: restore_jobs.len(), - restore_pass_count: restore_jobs - .iter() - .filter(|(_, report)| report.status == TypedStatus::Pass) - .count(), - qdrant_rebuild_job_count: qdrant_rebuild_jobs.len(), - qdrant_rebuild_pass_count: qdrant_rebuild_jobs - .iter() - .filter(|(_, report)| report.status == TypedStatus::Pass) - .count(), - job_ids, - } -} - -fn operational_authority_recovery(reports: &[JobReport]) -> OperationalAuthorityRecoveryReport { - let recovery_jobs = - reports.iter().filter(|report| !report.recovery_drills.is_empty()).collect::>(); - let drills = - recovery_jobs.iter().flat_map(|report| report.recovery_drills.iter()).collect::>(); - let authority_counts = - drills.iter().flat_map(|drill| drill.authority_record_counts.iter()).collect::>(); - let mut job_ids = recovery_jobs - .iter() - .map(|report| report.job_id.clone()) - .collect::>() - .into_iter() - .collect::>(); - - job_ids.sort(); - OperationalAuthorityRecoveryReport { - drill_count: drills.len(), - drill_pass_count: recovery_jobs - .iter() - .filter(|report| report.status == TypedStatus::Pass) - .flat_map(|report| report.recovery_drills.iter()) - .filter(|drill| recovery::recovery_drill_succeeded(drill)) - .count(), - topology_reported_count: drills - .iter() - .filter(|drill| !drill.topology.authority_store.trim().is_empty()) - .count(), - failure_injection_count: drills.iter().map(|drill| drill.failure_injections.len()).sum(), - degraded_read_labeled_count: drills - .iter() - .filter(|drill| !drill.degraded_read.unavailable_labels.is_empty()) - .count(), - source_of_truth_visible_count: drills - .iter() - .filter(|drill| drill.degraded_read.source_of_truth_visible) - .count(), - backup_pitr_restored_count: drills - .iter() - .filter(|drill| drill.backup_pitr.restored) - .count(), - rpo_target_count: drills.len(), - rpo_met_count: drills - .iter() - .filter(|drill| recovery::recovery_measurement_met(&drill.rpo)) - .count(), - rto_target_count: drills.len(), - rto_met_count: drills - .iter() - .filter(|drill| recovery::recovery_measurement_met(&drill.rto)) - .count(), - authority_plane_count: authority_counts.len(), - record_count_preserved_count: authority_counts - .iter() - .filter(|count| recovery::authority_record_count_balanced(count)) - .count(), - source_ref_preserved_count: authority_counts - .iter() - .filter(|count| count.source_refs_preserved) - .count(), - lifecycle_history_preserved_count: authority_counts - .iter() - .filter(|count| count.lifecycle_history_preserved) - .count(), - idempotent_outbox_replay_count: drills - .iter() - .filter(|drill| recovery::recovery_outbox_replay_succeeded(&drill.outbox_replay)) - .count(), - qdrant_rebuild_complete_count: drills - .iter() - .filter(|drill| recovery::recovery_qdrant_rebuild_succeeded(&drill.qdrant_rebuild)) - .count(), - migration_repair_count: drills - .iter() - .filter(|drill| recovery::recovery_migration_repair_succeeded(&drill.migration_repair)) - .count(), - dead_letter_handled_count: drills - .iter() - .filter(|drill| recovery::recovery_dead_letter_succeeded(&drill.dead_letter)) - .count(), - job_ids, - } -} - -fn job_has_tag(job: &RealWorldJob, tag: &str) -> bool { - job.tags.iter().any(|candidate| candidate == tag) -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/operational/recovery_report.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/recovery_report.rs new file mode 100644 index 00000000..c9a0c57f --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/recovery_report.rs @@ -0,0 +1,89 @@ +use crate::{ + BTreeSet, JobReport, OperationalAuthorityRecoveryReport, TypedStatus, + recovery::{self}, +}; + +pub(in crate::operational) fn operational_authority_recovery( + reports: &[JobReport], +) -> OperationalAuthorityRecoveryReport { + let recovery_jobs = + reports.iter().filter(|report| !report.recovery_drills.is_empty()).collect::>(); + let drills = + recovery_jobs.iter().flat_map(|report| report.recovery_drills.iter()).collect::>(); + let authority_counts = + drills.iter().flat_map(|drill| drill.authority_record_counts.iter()).collect::>(); + let mut job_ids = recovery_jobs + .iter() + .map(|report| report.job_id.clone()) + .collect::>() + .into_iter() + .collect::>(); + + job_ids.sort(); + OperationalAuthorityRecoveryReport { + drill_count: drills.len(), + drill_pass_count: recovery_jobs + .iter() + .filter(|report| report.status == TypedStatus::Pass) + .flat_map(|report| report.recovery_drills.iter()) + .filter(|drill| recovery::recovery_drill_succeeded(drill)) + .count(), + topology_reported_count: drills + .iter() + .filter(|drill| !drill.topology.authority_store.trim().is_empty()) + .count(), + failure_injection_count: drills.iter().map(|drill| drill.failure_injections.len()).sum(), + degraded_read_labeled_count: drills + .iter() + .filter(|drill| !drill.degraded_read.unavailable_labels.is_empty()) + .count(), + source_of_truth_visible_count: drills + .iter() + .filter(|drill| drill.degraded_read.source_of_truth_visible) + .count(), + backup_pitr_restored_count: drills + .iter() + .filter(|drill| drill.backup_pitr.restored) + .count(), + rpo_target_count: drills.len(), + rpo_met_count: drills + .iter() + .filter(|drill| recovery::recovery_measurement_met(&drill.rpo)) + .count(), + rto_target_count: drills.len(), + rto_met_count: drills + .iter() + .filter(|drill| recovery::recovery_measurement_met(&drill.rto)) + .count(), + authority_plane_count: authority_counts.len(), + record_count_preserved_count: authority_counts + .iter() + .filter(|count| recovery::authority_record_count_balanced(count)) + .count(), + source_ref_preserved_count: authority_counts + .iter() + .filter(|count| count.source_refs_preserved) + .count(), + lifecycle_history_preserved_count: authority_counts + .iter() + .filter(|count| count.lifecycle_history_preserved) + .count(), + idempotent_outbox_replay_count: drills + .iter() + .filter(|drill| recovery::recovery_outbox_replay_succeeded(&drill.outbox_replay)) + .count(), + qdrant_rebuild_complete_count: drills + .iter() + .filter(|drill| recovery::recovery_qdrant_rebuild_succeeded(&drill.qdrant_rebuild)) + .count(), + migration_repair_count: drills + .iter() + .filter(|drill| recovery::recovery_migration_repair_succeeded(&drill.migration_repair)) + .count(), + dead_letter_handled_count: drills + .iter() + .filter(|drill| recovery::recovery_dead_letter_succeeded(&drill.dead_letter)) + .count(), + job_ids, + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/operational/resources.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/resources.rs new file mode 100644 index 00000000..123d066c --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/resources.rs @@ -0,0 +1,72 @@ +use crate::{ + BTreeSet, JobReport, OperationalColdStartRestoreRebuild, OperationalResourceSummary, + RealWorldJob, TypedStatus, operational::tags, +}; + +pub(in crate::operational) fn operational_resource_summary( + paired: &[(&RealWorldJob, &JobReport)], +) -> OperationalResourceSummary { + let resource_jobs = paired + .iter() + .filter(|(job, _)| tags::job_has_tag(job, "resource_envelope")) + .collect::>(); + let latency_resource_dimension_job_count = paired + .iter() + .filter(|(_, report)| { + report.dimension_scores.iter().any(|score| score.dimension == "latency_resource") + }) + .count(); + + OperationalResourceSummary { + resource_envelope_job_count: resource_jobs.len(), + resource_envelope_pass_count: resource_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + latency_resource_dimension_job_count, + job_ids: resource_jobs.iter().map(|(_, report)| report.job_id.clone()).collect(), + } +} + +pub(in crate::operational) fn operational_cold_start_restore_rebuild( + paired: &[(&RealWorldJob, &JobReport)], +) -> OperationalColdStartRestoreRebuild { + let cold_start_jobs = + paired.iter().filter(|(job, _)| tags::job_has_tag(job, "cold_start")).collect::>(); + let restore_jobs = + paired.iter().filter(|(job, _)| tags::job_has_tag(job, "restore")).collect::>(); + let qdrant_rebuild_jobs = paired + .iter() + .filter(|(job, report)| { + tags::job_has_tag(job, "qdrant_rebuild") || report.qdrant_rebuild_case + }) + .collect::>(); + let mut job_ids = cold_start_jobs + .iter() + .chain(restore_jobs.iter()) + .chain(qdrant_rebuild_jobs.iter()) + .map(|(_, report)| report.job_id.clone()) + .collect::>() + .into_iter() + .collect::>(); + + job_ids.sort(); + OperationalColdStartRestoreRebuild { + cold_start_job_count: cold_start_jobs.len(), + cold_start_pass_count: cold_start_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + restore_job_count: restore_jobs.len(), + restore_pass_count: restore_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + qdrant_rebuild_job_count: qdrant_rebuild_jobs.len(), + qdrant_rebuild_pass_count: qdrant_rebuild_jobs + .iter() + .filter(|(_, report)| report.status == TypedStatus::Pass) + .count(), + job_ids, + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/operational/tags.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/tags.rs new file mode 100644 index 00000000..adf6f5ab --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/tags.rs @@ -0,0 +1,5 @@ +use crate::RealWorldJob; + +pub(in crate::operational) fn job_has_tag(job: &RealWorldJob, tag: &str) -> bool { + job.tags.iter().any(|candidate| candidate == tag) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/operational/tiers.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/tiers.rs new file mode 100644 index 00000000..5c5a7895 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/tiers.rs @@ -0,0 +1,81 @@ +use crate::{ + JobReport, OperationalEvidenceTierReport, RealWorldJob, TypedStatus, + operational::{self, tags}, + summary, +}; + +pub(in crate::operational) fn operational_evidence_tier_report( + tier: &str, + paired: &[(&RealWorldJob, &JobReport)], +) -> OperationalEvidenceTierReport { + let tier_jobs = paired + .iter() + .filter(|(job, _)| operational::operational_evidence_tier(job) == tier) + .copied() + .collect::>(); + let reports = tier_jobs.iter().map(|(_, report)| *report).collect::>(); + let status = if reports.is_empty() { + TypedStatus::NotEncoded + } else { + summary::aggregate_status(reports.as_slice()) + }; + let job_count = reports.len(); + let pass = reports.iter().filter(|report| report.status == TypedStatus::Pass).count(); + let wrong_result = + reports.iter().filter(|report| report.status == TypedStatus::WrongResult).count(); + let lifecycle_fail = + reports.iter().filter(|report| report.status == TypedStatus::LifecycleFail).count(); + let incomplete = + reports.iter().filter(|report| report.status == TypedStatus::Incomplete).count(); + let blocked = reports.iter().filter(|report| report.status == TypedStatus::Blocked).count(); + let not_encoded = usize::from(reports.is_empty()) + + reports.iter().filter(|report| report.status == TypedStatus::NotEncoded).count(); + let unsupported_claim = + reports.iter().filter(|report| report.status == TypedStatus::UnsupportedClaim).count(); + + OperationalEvidenceTierReport { + tier: tier.to_string(), + status, + job_count, + pass, + wrong_result, + lifecycle_fail, + incomplete, + blocked, + not_encoded, + unsupported_claim, + mean_latency_ms: summary::mean_latency_for_reports(reports.as_slice()), + total_cost: summary::total_cost_for_reports(reports.as_slice()), + resource_evidence_count: tier_jobs + .iter() + .filter(|(job, _)| tags::job_has_tag(job, "resource_envelope")) + .count(), + cold_start_evidence_count: tier_jobs + .iter() + .filter(|(job, _)| tags::job_has_tag(job, "cold_start")) + .count(), + restore_evidence_count: tier_jobs + .iter() + .filter(|(job, _)| tags::job_has_tag(job, "restore")) + .count(), + qdrant_rebuild_evidence_count: tier_jobs + .iter() + .filter(|(job, report)| { + tags::job_has_tag(job, "qdrant_rebuild") || report.qdrant_rebuild_case + }) + .count(), + pass_claim_allowed: job_count > 0 && status == TypedStatus::Pass, + blocker_reasons: reports + .iter() + .filter(|report| report.status != TypedStatus::Pass) + .map(|report| report.reason.clone()) + .collect(), + job_ids: reports.iter().map(|report| report.job_id.clone()).collect(), + } +} + +pub(in crate::operational) fn operational_tier_has_typed_blocker( + tier: &OperationalEvidenceTierReport, +) -> bool { + tier.blocked + tier.incomplete + tier.not_encoded > 0 && !tier.pass_claim_allowed +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/operational/timings.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/timings.rs new file mode 100644 index 00000000..9b0c19ee --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/operational/timings.rs @@ -0,0 +1,32 @@ +use crate::{ + JobReport, OperationalCostSummary, OperationalLatencyReport, formatting::round3, summary, +}; + +pub(in crate::operational) fn operational_latency_report( + reports: &[JobReport], +) -> OperationalLatencyReport { + let latencies = reports.iter().filter_map(|report| report.latency_ms).collect::>(); + + OperationalLatencyReport { + measured_job_count: latencies.len(), + missing_latency_job_count: reports.len().saturating_sub(latencies.len()), + mean_ms: summary::mean_latency_for_values(latencies.as_slice()), + max_ms: latencies.iter().copied().reduce(f64::max).map(round3), + } +} + +pub(in crate::operational) fn operational_cost_summary( + reports: &[JobReport], +) -> OperationalCostSummary { + let costs = reports.iter().filter_map(|report| report.cost.as_ref()).collect::>(); + let zero_cost_job_count = + costs.iter().filter(|cost| cost.amount.is_some_and(|amount| amount == 0.0)).count(); + + OperationalCostSummary { + jobs_with_cost_report: costs.len(), + missing_cost_job_count: reports.len().saturating_sub(costs.len()), + zero_cost_job_count, + total: summary::total_cost(reports), + claim_boundary: "Fixture and local-provider zero-cost reports are execution-accounting evidence only; they do not prove hosted provider spend.".to_string(), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard.rs index ae49e592..70c0b038 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard.rs @@ -1,6 +1,6 @@ -#[path = "scoreboard/common.rs"] mod common; -#[path = "scoreboard/elf.rs"] mod elf; -#[path = "scoreboard/external.rs"] mod external; +mod common; +mod elf; +mod external; use crate::{ AdapterCoverageStatus, AdapterStatusCounts, BTreeMap, BTreeSet, ExternalAdapterReport, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf.rs index 8883926d..ba8d7423 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf.rs @@ -1,8 +1,11 @@ +mod lifecycle; +mod metrics; +mod narrative; +mod operations; +mod retrieval; + use crate::scoreboard::{ - self, BTreeSet, JobReport, RealWorldJob, ReportSummary, SCOREBOARD_RETRIEVAL_K, - ScoreboardAnswerSafetyMetrics, ScoreboardCoverageMetrics, ScoreboardLifecycleMetrics, - ScoreboardMetrics, ScoreboardOperationalMetrics, ScoreboardRankedMetrics, - ScoreboardRetrievalMetrics, ScoreboardRow, TypedStatus, common, + JobReport, RealWorldJob, ReportSummary, ScoreboardRow, TypedStatus, common, }; pub(super) fn elf_scoreboard_row( @@ -13,7 +16,7 @@ pub(super) fn elf_scoreboard_row( let source_id_mapped = summary.source_ref_required_count > 0 && summary.source_ref_coverage >= 1.0; let result_state = common::aggregate_job_report_state(job_reports); - let metrics = scoreboard_metrics_for_reports(raw_jobs, job_reports, summary); + let metrics = metrics::scoreboard_metrics_for_reports(raw_jobs, job_reports, summary); let typed_non_pass_count = job_reports.iter().filter(|job| job.status != TypedStatus::Pass).count(); let mut row = ScoreboardRow { @@ -30,12 +33,12 @@ pub(super) fn elf_scoreboard_row( product_runtime: false, container_digest_identified: false, metrics, - strengths: elf_scoreboard_strengths(summary), + strengths: narrative::elf_scoreboard_strengths(summary), weaknesses: Vec::new(), next_evidence: Vec::new(), source_provenance: vec![ "apps/elf-eval/fixtures/real_world_memory/".to_string(), - "apps/elf-eval/src/bin/real_world_job_benchmark.rs".to_string(), + "apps/elf-eval/src/bin/real_world_job_benchmark/main.rs".to_string(), ], }; @@ -48,268 +51,3 @@ pub(super) fn elf_scoreboard_row( row } - -fn scoreboard_metrics_for_reports( - raw_jobs: &[RealWorldJob], - job_reports: &[JobReport], - summary: &ReportSummary, -) -> ScoreboardMetrics { - ScoreboardMetrics { - retrieval: scoreboard_retrieval_metrics(job_reports, summary), - lifecycle: scoreboard_lifecycle_metrics(raw_jobs, job_reports), - answer_safety: scoreboard_answer_safety_metrics(summary), - operations: scoreboard_operational_metrics(raw_jobs, job_reports, summary), - coverage: ScoreboardCoverageMetrics { - job_count: summary.job_count, - encoded_suite_count: summary.encoded_suite_count, - pass_count: summary.pass, - typed_non_pass_count: job_reports - .iter() - .filter(|job| job.status != TypedStatus::Pass) - .count(), - source_ref_coverage: Some(summary.source_ref_coverage), - evidence_coverage: Some(summary.evidence_coverage), - evidence_class: "fixture_backed".to_string(), - }, - } -} - -fn scoreboard_retrieval_metrics( - job_reports: &[JobReport], - summary: &ReportSummary, -) -> ScoreboardRetrievalMetrics { - let produced_evidence_total = - job_reports.iter().map(|job| job.retrieval_quality.produced_evidence_total).sum(); - let mut relevant_at_k = 0; - let mut precision_denominator_at_k = 0; - let mut reciprocal_rank_sum = 0.0; - let mut ndcg_sum = 0.0; - let mut ranked_job_count = 0; - - for job in job_reports { - let expected = job - .expected_evidence - .iter() - .map(|evidence| evidence.evidence_id.as_str()) - .collect::>(); - let ranked = scoreboard_ranked_metrics_for_job(job, &expected); - - relevant_at_k += ranked.relevant_at_k; - precision_denominator_at_k += ranked.precision_denominator_at_k; - reciprocal_rank_sum += ranked.reciprocal_rank; - ndcg_sum += ranked.ndcg; - ranked_job_count += 1; - } - - ScoreboardRetrievalMetrics { - k: SCOREBOARD_RETRIEVAL_K, - metric_basis: "produced_evidence_order".to_string(), - recall_at_k: Some(scoreboard::ratio_or( - relevant_at_k, - summary.expected_evidence_total, - 1.0, - )), - precision_at_k: Some(scoreboard::ratio_or(relevant_at_k, precision_denominator_at_k, 1.0)), - mrr: Some(common::scoreboard_mean_metric(reciprocal_rank_sum, ranked_job_count)), - ndcg: Some(common::scoreboard_mean_metric(ndcg_sum, ranked_job_count)), - expected_evidence_recall: Some(summary.expected_evidence_recall), - citation_source_ref_coverage: Some(summary.source_ref_coverage), - expected_evidence_matched: summary.expected_evidence_matched, - expected_evidence_total: summary.expected_evidence_total, - produced_evidence_total, - } -} - -fn scoreboard_ranked_metrics_for_job( - job: &JobReport, - expected: &BTreeSet<&str>, -) -> ScoreboardRankedMetrics { - let precision_denominator_at_k = SCOREBOARD_RETRIEVAL_K; - let relevant_at_k = job - .produced_evidence - .iter() - .take(SCOREBOARD_RETRIEVAL_K) - .filter(|evidence_id| expected.contains(evidence_id.as_str())) - .count(); - let reciprocal_rank = job - .produced_evidence - .iter() - .position(|evidence_id| expected.contains(evidence_id.as_str())) - .map_or_else(|| f64::from(expected.is_empty()), |index| 1.0 / (index + 1) as f64); - let ndcg = scoreboard_ndcg(job.produced_evidence.as_slice(), expected); - - ScoreboardRankedMetrics { relevant_at_k, precision_denominator_at_k, reciprocal_rank, ndcg } -} - -fn scoreboard_ndcg(produced_evidence: &[String], expected: &BTreeSet<&str>) -> f64 { - if expected.is_empty() { - return 1.0; - } - - let dcg = produced_evidence - .iter() - .take(SCOREBOARD_RETRIEVAL_K) - .enumerate() - .filter(|(_, evidence_id)| expected.contains(evidence_id.as_str())) - .map(|(index, _)| 1.0 / ((index + 2) as f64).log2()) - .sum::(); - let ideal_hits = expected.len().min(SCOREBOARD_RETRIEVAL_K); - let idcg = (0..ideal_hits).map(|index| 1.0 / ((index + 2) as f64).log2()).sum::(); - - if idcg > 0.0 { dcg / idcg } else { 0.0 } -} - -fn scoreboard_lifecycle_metrics( - raw_jobs: &[RealWorldJob], - job_reports: &[JobReport], -) -> ScoreboardLifecycleMetrics { - let stale_check_count: usize = raw_jobs - .iter() - .map(|job| { - job.negative_traps - .iter() - .filter(|trap| trap.failure_if_used && trap.trap_type == "stale_fact") - .count() - }) - .sum(); - let stale_failure_count = job_reports - .iter() - .map(|job| job.stale_answer_count + job.stale_retrieval_count) - .sum::(); - let update_check_count = - scoreboard_lifecycle_check_count(raw_jobs, common::scoreboard_is_update_job); - let update_correct_count = - scoreboard_lifecycle_correct_count(raw_jobs, job_reports, common::scoreboard_is_update_job); - let delete_check_count = - scoreboard_lifecycle_check_count(raw_jobs, common::scoreboard_is_delete_job); - let delete_correct_count = - scoreboard_lifecycle_correct_count(raw_jobs, job_reports, common::scoreboard_is_delete_job); - let rollback_history_check_count = - scoreboard_lifecycle_check_count(raw_jobs, common::scoreboard_is_rollback_history_job); - let rollback_history_readback_count = raw_jobs - .iter() - .zip(job_reports.iter()) - .filter(|(job, report)| { - common::scoreboard_is_rollback_history_job(job) && report.status == TypedStatus::Pass - }) - .count(); - - ScoreboardLifecycleMetrics { - stale_suppression: Some(scoreboard::ratio_or( - stale_check_count.saturating_sub(stale_failure_count), - stale_check_count, - 1.0, - )), - stale_suppressed_count: stale_check_count.saturating_sub(stale_failure_count), - stale_check_count, - update_correctness: Some(scoreboard::ratio_or( - update_correct_count, - update_check_count, - 1.0, - )), - update_correct_count, - update_check_count, - delete_correctness: Some(scoreboard::ratio_or( - delete_correct_count, - delete_check_count, - 1.0, - )), - delete_correct_count, - delete_check_count, - rollback_history_readback_rate: Some(scoreboard::ratio_or( - rollback_history_readback_count, - rollback_history_check_count, - 1.0, - )), - rollback_history_readback_count, - rollback_history_check_count, - } -} - -fn scoreboard_lifecycle_check_count( - jobs: &[RealWorldJob], - predicate: fn(&RealWorldJob) -> bool, -) -> usize { - jobs.iter().filter(|job| predicate(job)).count() -} - -fn scoreboard_lifecycle_correct_count( - raw_jobs: &[RealWorldJob], - job_reports: &[JobReport], - predicate: fn(&RealWorldJob) -> bool, -) -> usize { - raw_jobs - .iter() - .zip(job_reports.iter()) - .filter(|(job, report)| predicate(job) && report.status == TypedStatus::Pass) - .count() -} - -fn scoreboard_answer_safety_metrics(summary: &ReportSummary) -> ScoreboardAnswerSafetyMetrics { - ScoreboardAnswerSafetyMetrics { - unsupported_claim_rate: Some(scoreboard::ratio( - summary.unsupported_claim_count, - summary.job_count, - )), - unsupported_claim_count: summary.unsupported_claim_count, - stale_answer_rate: Some(scoreboard::ratio(summary.stale_answer_count, summary.job_count)), - stale_answer_count: summary.stale_answer_count, - hallucinated_evidence_rate: Some(summary.irrelevant_context_ratio), - redaction_leak_count: summary.redaction_leak_count, - irrelevant_context_ratio: Some(summary.irrelevant_context_ratio), - } -} - -fn scoreboard_operational_metrics( - raw_jobs: &[RealWorldJob], - job_reports: &[JobReport], - summary: &ReportSummary, -) -> ScoreboardOperationalMetrics { - let resource_envelope_job_count = raw_jobs - .iter() - .filter(|job| common::scoreboard_has_any_tag(job, &["resource_envelope"])) - .count(); - let resource_envelope_pass_count = raw_jobs - .iter() - .zip(job_reports.iter()) - .filter(|(job, report)| { - common::scoreboard_has_any_tag(job, &["resource_envelope"]) - && report.status == TypedStatus::Pass - }) - .count(); - - ScoreboardOperationalMetrics { - mean_latency_ms: summary.mean_latency_ms, - total_cost: summary.total_cost.clone(), - resource_envelope_status: if resource_envelope_job_count == resource_envelope_pass_count { - "pass".to_string() - } else { - "typed_non_pass_present".to_string() - }, - resource_envelope_job_count, - resource_envelope_pass_count, - } -} - -fn elf_scoreboard_strengths(summary: &ReportSummary) -> Vec { - let mut strengths = Vec::new(); - - if summary.expected_evidence_recall >= 1.0 { - strengths.push("Expected evidence recall is complete for encoded jobs.".to_string()); - } - if summary.source_ref_coverage >= 1.0 { - strengths - .push("Source-ref coverage is complete for encoded required evidence.".to_string()); - } - if summary.stale_answer_count == 0 && summary.stale_retrieval_count == 0 { - strengths.push("Encoded stale-answer and stale-retrieval counters are zero.".to_string()); - } - if summary.redaction_leak_count == 0 { - strengths.push("Encoded redaction leak count is zero.".to_string()); - } - if summary.work_continuity.is_some() { - strengths.push("Work Continuity readback metrics are encoded in the report.".to_string()); - } - - strengths -} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/lifecycle.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/lifecycle.rs new file mode 100644 index 00000000..02675033 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/lifecycle.rs @@ -0,0 +1,89 @@ +use crate::scoreboard::{ + self, JobReport, RealWorldJob, ScoreboardLifecycleMetrics, TypedStatus, common, +}; + +pub(in crate::scoreboard::elf) fn scoreboard_lifecycle_metrics( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], +) -> ScoreboardLifecycleMetrics { + let stale_check_count: usize = raw_jobs + .iter() + .map(|job| { + job.negative_traps + .iter() + .filter(|trap| trap.failure_if_used && trap.trap_type == "stale_fact") + .count() + }) + .sum(); + let stale_failure_count = job_reports + .iter() + .map(|job| job.stale_answer_count + job.stale_retrieval_count) + .sum::(); + let update_check_count = + scoreboard_lifecycle_check_count(raw_jobs, common::scoreboard_is_update_job); + let update_correct_count = + scoreboard_lifecycle_correct_count(raw_jobs, job_reports, common::scoreboard_is_update_job); + let delete_check_count = + scoreboard_lifecycle_check_count(raw_jobs, common::scoreboard_is_delete_job); + let delete_correct_count = + scoreboard_lifecycle_correct_count(raw_jobs, job_reports, common::scoreboard_is_delete_job); + let rollback_history_check_count = + scoreboard_lifecycle_check_count(raw_jobs, common::scoreboard_is_rollback_history_job); + let rollback_history_readback_count = raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, report)| { + common::scoreboard_is_rollback_history_job(job) && report.status == TypedStatus::Pass + }) + .count(); + + ScoreboardLifecycleMetrics { + stale_suppression: Some(scoreboard::ratio_or( + stale_check_count.saturating_sub(stale_failure_count), + stale_check_count, + 1.0, + )), + stale_suppressed_count: stale_check_count.saturating_sub(stale_failure_count), + stale_check_count, + update_correctness: Some(scoreboard::ratio_or( + update_correct_count, + update_check_count, + 1.0, + )), + update_correct_count, + update_check_count, + delete_correctness: Some(scoreboard::ratio_or( + delete_correct_count, + delete_check_count, + 1.0, + )), + delete_correct_count, + delete_check_count, + rollback_history_readback_rate: Some(scoreboard::ratio_or( + rollback_history_readback_count, + rollback_history_check_count, + 1.0, + )), + rollback_history_readback_count, + rollback_history_check_count, + } +} + +fn scoreboard_lifecycle_check_count( + jobs: &[RealWorldJob], + predicate: fn(&RealWorldJob) -> bool, +) -> usize { + jobs.iter().filter(|job| predicate(job)).count() +} + +fn scoreboard_lifecycle_correct_count( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + predicate: fn(&RealWorldJob) -> bool, +) -> usize { + raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, report)| predicate(job) && report.status == TypedStatus::Pass) + .count() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/metrics.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/metrics.rs new file mode 100644 index 00000000..3c6fec5e --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/metrics.rs @@ -0,0 +1,45 @@ +use crate::scoreboard::{ + self, JobReport, RealWorldJob, ReportSummary, ScoreboardAnswerSafetyMetrics, + ScoreboardCoverageMetrics, ScoreboardMetrics, TypedStatus, + elf::{lifecycle, operations, retrieval}, +}; + +pub(in crate::scoreboard::elf) fn scoreboard_metrics_for_reports( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardMetrics { + ScoreboardMetrics { + retrieval: retrieval::scoreboard_retrieval_metrics(job_reports, summary), + lifecycle: lifecycle::scoreboard_lifecycle_metrics(raw_jobs, job_reports), + answer_safety: scoreboard_answer_safety_metrics(summary), + operations: operations::scoreboard_operational_metrics(raw_jobs, job_reports, summary), + coverage: ScoreboardCoverageMetrics { + job_count: summary.job_count, + encoded_suite_count: summary.encoded_suite_count, + pass_count: summary.pass, + typed_non_pass_count: job_reports + .iter() + .filter(|job| job.status != TypedStatus::Pass) + .count(), + source_ref_coverage: Some(summary.source_ref_coverage), + evidence_coverage: Some(summary.evidence_coverage), + evidence_class: "fixture_backed".to_string(), + }, + } +} + +fn scoreboard_answer_safety_metrics(summary: &ReportSummary) -> ScoreboardAnswerSafetyMetrics { + ScoreboardAnswerSafetyMetrics { + unsupported_claim_rate: Some(scoreboard::ratio( + summary.unsupported_claim_count, + summary.job_count, + )), + unsupported_claim_count: summary.unsupported_claim_count, + stale_answer_rate: Some(scoreboard::ratio(summary.stale_answer_count, summary.job_count)), + stale_answer_count: summary.stale_answer_count, + hallucinated_evidence_rate: Some(summary.irrelevant_context_ratio), + redaction_leak_count: summary.redaction_leak_count, + irrelevant_context_ratio: Some(summary.irrelevant_context_ratio), + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/narrative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/narrative.rs new file mode 100644 index 00000000..316c7b25 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/narrative.rs @@ -0,0 +1,24 @@ +use crate::scoreboard::ReportSummary; + +pub(in crate::scoreboard::elf) fn elf_scoreboard_strengths(summary: &ReportSummary) -> Vec { + let mut strengths = Vec::new(); + + if summary.expected_evidence_recall >= 1.0 { + strengths.push("Expected evidence recall is complete for encoded jobs.".to_string()); + } + if summary.source_ref_coverage >= 1.0 { + strengths + .push("Source-ref coverage is complete for encoded required evidence.".to_string()); + } + if summary.stale_answer_count == 0 && summary.stale_retrieval_count == 0 { + strengths.push("Encoded stale-answer and stale-retrieval counters are zero.".to_string()); + } + if summary.redaction_leak_count == 0 { + strengths.push("Encoded redaction leak count is zero.".to_string()); + } + if summary.work_continuity.is_some() { + strengths.push("Work Continuity readback metrics are encoded in the report.".to_string()); + } + + strengths +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/operations.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/operations.rs new file mode 100644 index 00000000..f7aa8cec --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/operations.rs @@ -0,0 +1,34 @@ +use crate::scoreboard::{ + JobReport, RealWorldJob, ReportSummary, ScoreboardOperationalMetrics, TypedStatus, common, +}; + +pub(in crate::scoreboard::elf) fn scoreboard_operational_metrics( + raw_jobs: &[RealWorldJob], + job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardOperationalMetrics { + let resource_envelope_job_count = raw_jobs + .iter() + .filter(|job| common::scoreboard_has_any_tag(job, &["resource_envelope"])) + .count(); + let resource_envelope_pass_count = raw_jobs + .iter() + .zip(job_reports.iter()) + .filter(|(job, report)| { + common::scoreboard_has_any_tag(job, &["resource_envelope"]) + && report.status == TypedStatus::Pass + }) + .count(); + + ScoreboardOperationalMetrics { + mean_latency_ms: summary.mean_latency_ms, + total_cost: summary.total_cost.clone(), + resource_envelope_status: if resource_envelope_job_count == resource_envelope_pass_count { + "pass".to_string() + } else { + "typed_non_pass_present".to_string() + }, + resource_envelope_job_count, + resource_envelope_pass_count, + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/retrieval.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/retrieval.rs new file mode 100644 index 00000000..e2db44eb --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/elf/retrieval.rs @@ -0,0 +1,89 @@ +use crate::scoreboard::{ + self, BTreeSet, JobReport, ReportSummary, SCOREBOARD_RETRIEVAL_K, ScoreboardRankedMetrics, + ScoreboardRetrievalMetrics, common, +}; + +pub(in crate::scoreboard::elf) fn scoreboard_retrieval_metrics( + job_reports: &[JobReport], + summary: &ReportSummary, +) -> ScoreboardRetrievalMetrics { + let produced_evidence_total = + job_reports.iter().map(|job| job.retrieval_quality.produced_evidence_total).sum(); + let mut relevant_at_k = 0; + let mut precision_denominator_at_k = 0; + let mut reciprocal_rank_sum = 0.0; + let mut ndcg_sum = 0.0; + let mut ranked_job_count = 0; + + for job in job_reports { + let expected = job + .expected_evidence + .iter() + .map(|evidence| evidence.evidence_id.as_str()) + .collect::>(); + let ranked = scoreboard_ranked_metrics_for_job(job, &expected); + + relevant_at_k += ranked.relevant_at_k; + precision_denominator_at_k += ranked.precision_denominator_at_k; + reciprocal_rank_sum += ranked.reciprocal_rank; + ndcg_sum += ranked.ndcg; + ranked_job_count += 1; + } + + ScoreboardRetrievalMetrics { + k: SCOREBOARD_RETRIEVAL_K, + metric_basis: "produced_evidence_order".to_string(), + recall_at_k: Some(scoreboard::ratio_or( + relevant_at_k, + summary.expected_evidence_total, + 1.0, + )), + precision_at_k: Some(scoreboard::ratio_or(relevant_at_k, precision_denominator_at_k, 1.0)), + mrr: Some(common::scoreboard_mean_metric(reciprocal_rank_sum, ranked_job_count)), + ndcg: Some(common::scoreboard_mean_metric(ndcg_sum, ranked_job_count)), + expected_evidence_recall: Some(summary.expected_evidence_recall), + citation_source_ref_coverage: Some(summary.source_ref_coverage), + expected_evidence_matched: summary.expected_evidence_matched, + expected_evidence_total: summary.expected_evidence_total, + produced_evidence_total, + } +} + +fn scoreboard_ranked_metrics_for_job( + job: &JobReport, + expected: &BTreeSet<&str>, +) -> ScoreboardRankedMetrics { + let precision_denominator_at_k = SCOREBOARD_RETRIEVAL_K; + let relevant_at_k = job + .produced_evidence + .iter() + .take(SCOREBOARD_RETRIEVAL_K) + .filter(|evidence_id| expected.contains(evidence_id.as_str())) + .count(); + let reciprocal_rank = job + .produced_evidence + .iter() + .position(|evidence_id| expected.contains(evidence_id.as_str())) + .map_or_else(|| f64::from(expected.is_empty()), |index| 1.0 / (index + 1) as f64); + let ndcg = scoreboard_ndcg(job.produced_evidence.as_slice(), expected); + + ScoreboardRankedMetrics { relevant_at_k, precision_denominator_at_k, reciprocal_rank, ndcg } +} + +fn scoreboard_ndcg(produced_evidence: &[String], expected: &BTreeSet<&str>) -> f64 { + if expected.is_empty() { + return 1.0; + } + + let dcg = produced_evidence + .iter() + .take(SCOREBOARD_RETRIEVAL_K) + .enumerate() + .filter(|(_, evidence_id)| expected.contains(evidence_id.as_str())) + .map(|(index, _)| 1.0 / ((index + 2) as f64).log2()) + .sum::(); + let ideal_hits = expected.len().min(SCOREBOARD_RETRIEVAL_K); + let idcg = (0..ideal_hits).map(|index| 1.0 / ((index + 2) as f64).log2()).sum::(); + + if idcg > 0.0 { dcg / idcg } else { 0.0 } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external.rs index f30e40e5..13a1753c 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external.rs @@ -1,7 +1,11 @@ +mod narrative; +mod signals; +mod status; + use crate::scoreboard::{ - self, AdapterCoverageStatus, BTreeMap, BTreeSet, ExternalAdapterReport, SCOREBOARD_RETRIEVAL_K, - ScenarioComparisonOutcome, ScoreboardCoverageMetrics, ScoreboardMetrics, - ScoreboardRetrievalMetrics, ScoreboardRow, common, + AdapterCoverageStatus, BTreeMap, ExternalAdapterReport, SCOREBOARD_RETRIEVAL_K, + ScoreboardCoverageMetrics, ScoreboardMetrics, ScoreboardRetrievalMetrics, ScoreboardRow, + common, }; pub(super) fn external_project_scoreboard_rows( @@ -23,16 +27,16 @@ fn external_project_scoreboard_row( project: String, adapters: &[&ExternalAdapterReport], ) -> ScoreboardRow { - let evidence_class = strongest_scoreboard_evidence_class(adapters); - let result_state = external_project_result_state(adapters); - let source_id_mapped = external_project_source_id_mapped(adapters); - let same_corpus = external_project_same_corpus(adapters); + let evidence_class = status::strongest_scoreboard_evidence_class(adapters); + let result_state = status::external_project_result_state(adapters); + let source_id_mapped = signals::external_project_source_id_mapped(adapters); + let same_corpus = signals::external_project_same_corpus(adapters); let product_runtime = adapters.iter().any(|adapter| adapter.evidence_class == "live_real_world"); let container_digest_identified = - adapters.iter().any(|adapter| adapter_has_container_digest(adapter)); + adapters.iter().any(|adapter| signals::adapter_has_container_digest(adapter)); let typed_non_pass_count = - adapters.iter().map(|adapter| adapter_typed_non_pass_count(adapter)).sum(); + adapters.iter().map(|adapter| status::adapter_typed_non_pass_count(adapter)).sum(); let mut row = ScoreboardRow { product_id: scoreboard_project_id(project.as_str()), product_name: project, @@ -51,10 +55,10 @@ fn external_project_scoreboard_row( evidence_class.as_str(), typed_non_pass_count, ), - strengths: external_project_strengths(adapters), - weaknesses: external_project_weaknesses(adapters), + strengths: narrative::external_project_strengths(adapters), + weaknesses: narrative::external_project_weaknesses(adapters), next_evidence: Vec::new(), - source_provenance: external_project_source_provenance(adapters), + source_provenance: narrative::external_project_source_provenance(adapters), }; common::scoreboard_apply_comparability_gaps(&mut row); @@ -93,289 +97,6 @@ fn external_project_scoreboard_metrics( } } -fn strongest_scoreboard_evidence_class(adapters: &[&ExternalAdapterReport]) -> String { - for evidence_class in ["live_real_world", "live_baseline", "fixture_backed", "research_gate"] { - if adapters.iter().any(|adapter| { - common::scoreboard_evidence_class(adapter.evidence_class.as_str()) == evidence_class - }) { - return evidence_class.to_string(); - } - } - - "research_gate".to_string() -} - -fn external_project_result_state(adapters: &[&ExternalAdapterReport]) -> String { - for status in [ - AdapterCoverageStatus::WrongResult, - AdapterCoverageStatus::Blocked, - AdapterCoverageStatus::Incomplete, - AdapterCoverageStatus::LifecycleFail, - AdapterCoverageStatus::NotEncoded, - AdapterCoverageStatus::Unsupported, - ] { - if adapters.iter().any(|adapter| adapter_has_status(adapter, status)) { - return adapter_status_to_scoreboard_state(status).to_string(); - } - } - - "not_comparable".to_string() -} - -fn adapter_has_status(adapter: &ExternalAdapterReport, status: AdapterCoverageStatus) -> bool { - adapter.overall_status == status - || adapter.setup.status == status - || adapter.run.status == status - || adapter.result.status == status - || adapter.capabilities.iter().any(|capability| capability.status == status) - || adapter.suites.iter().any(|suite| suite.status == status) - || adapter.scenarios.iter().any(|scenario| scenario.status == status) -} - -fn external_project_same_corpus(adapters: &[&ExternalAdapterReport]) -> bool { - let needles = &["same-corpus", "same corpus", "same_corpus", "shared corpus"]; - - adapters.iter().any(|adapter| { - text_mentions_any(adapter.adapter_kind.as_str(), needles) - || adapter_has_reported_same_corpus_text(adapter, needles) - }) -} - -fn external_project_source_id_mapped(adapters: &[&ExternalAdapterReport]) -> bool { - let needles = &[ - "source-id mapped", - "source ids mapped", - "maps to source ids", - "mapped to source ids", - "maps back to source ids", - "map to generated evidence ids", - "mapped to generated evidence ids", - "evidence ids match", - ]; - - adapters.iter().any(|adapter| adapter_has_passing_text(adapter, needles)) -} - -fn adapter_has_passing_text(adapter: &ExternalAdapterReport, needles: &[&str]) -> bool { - adapter_status_mentions_any(adapter.setup.status, adapter.setup.evidence.as_str(), needles) - || adapter_status_mentions_any(adapter.run.status, adapter.run.evidence.as_str(), needles) - || adapter_status_mentions_any( - adapter.result.status, - adapter.result.evidence.as_str(), - needles, - ) || adapter.capabilities.iter().any(|capability| { - adapter_status_mentions_any(capability.status, capability.capability.as_str(), needles) - || adapter_status_mentions_any(capability.status, capability.evidence.as_str(), needles) - }) || adapter.suites.iter().any(|suite| { - adapter_status_mentions_any(suite.status, suite.suite_id.as_str(), needles) - || adapter_status_mentions_any(suite.status, suite.evidence.as_str(), needles) - }) || adapter.scenarios.iter().any(|scenario| { - adapter_status_mentions_any(scenario.status, scenario.scenario_id.as_str(), needles) - || adapter_status_mentions_any(scenario.status, scenario.evidence.as_str(), needles) - }) -} - -fn adapter_has_reported_same_corpus_text( - adapter: &ExternalAdapterReport, - needles: &[&str], -) -> bool { - adapter_status_reports_same_corpus( - adapter.setup.status, - adapter.setup.evidence.as_str(), - needles, - ) || adapter_status_reports_same_corpus( - adapter.run.status, - adapter.run.evidence.as_str(), - needles, - ) || adapter_status_reports_same_corpus( - adapter.result.status, - adapter.result.evidence.as_str(), - needles, - ) || adapter.capabilities.iter().any(|capability| { - adapter_status_reports_same_corpus( - capability.status, - capability.capability.as_str(), - needles, - ) || adapter_status_reports_same_corpus( - capability.status, - capability.evidence.as_str(), - needles, - ) - }) || adapter.suites.iter().any(|suite| { - adapter_status_reports_same_corpus(suite.status, suite.suite_id.as_str(), needles) - || adapter_status_reports_same_corpus(suite.status, suite.evidence.as_str(), needles) - }) || adapter.scenarios.iter().any(|scenario| { - adapter_status_reports_same_corpus(scenario.status, scenario.scenario_id.as_str(), needles) - || adapter_status_reports_same_corpus( - scenario.status, - scenario.evidence.as_str(), - needles, - ) - }) -} - -fn adapter_status_reports_same_corpus( - status: AdapterCoverageStatus, - text: &str, - needles: &[&str], -) -> bool { - matches!( - status, - AdapterCoverageStatus::Pass - | AdapterCoverageStatus::Real - | AdapterCoverageStatus::WrongResult - | AdapterCoverageStatus::LifecycleFail - ) && text_mentions_any(text, needles) -} - -fn adapter_status_mentions_any( - status: AdapterCoverageStatus, - text: &str, - needles: &[&str], -) -> bool { - matches!(status, AdapterCoverageStatus::Pass | AdapterCoverageStatus::Real) - && text_mentions_any(text, needles) -} - -fn text_mentions_any(text: &str, needles: &[&str]) -> bool { - let text = text.to_ascii_lowercase(); - - needles.iter().any(|needle| text.contains(&needle.to_ascii_lowercase())) -} - -fn adapter_status_to_scoreboard_state(status: AdapterCoverageStatus) -> &'static str { - match status { - AdapterCoverageStatus::WrongResult | AdapterCoverageStatus::LifecycleFail => "wrong_result", - AdapterCoverageStatus::Blocked => "blocked", - AdapterCoverageStatus::Incomplete => "incomplete", - AdapterCoverageStatus::NotEncoded | AdapterCoverageStatus::Unsupported => "not_encoded", - AdapterCoverageStatus::Real - | AdapterCoverageStatus::Mocked - | AdapterCoverageStatus::Pass => "not_comparable", - } -} - -fn adapter_typed_non_pass_count(adapter: &ExternalAdapterReport) -> usize { - let direct_statuses = - [adapter.overall_status, adapter.setup.status, adapter.run.status, adapter.result.status]; - let direct = direct_statuses - .into_iter() - .filter(|status| adapter_status_is_typed_non_pass(*status)) - .count(); - let capability = adapter - .capabilities - .iter() - .filter(|capability| adapter_status_is_typed_non_pass(capability.status)) - .count(); - let suites = adapter - .suites - .iter() - .filter(|suite| adapter_status_is_typed_non_pass(suite.status)) - .count(); - let scenarios = adapter - .scenarios - .iter() - .filter(|scenario| adapter_status_is_typed_non_pass(scenario.status)) - .count(); - - direct + capability + suites + scenarios -} - -fn adapter_status_is_typed_non_pass(status: AdapterCoverageStatus) -> bool { - matches!( - status, - AdapterCoverageStatus::Unsupported - | AdapterCoverageStatus::Blocked - | AdapterCoverageStatus::Incomplete - | AdapterCoverageStatus::WrongResult - | AdapterCoverageStatus::LifecycleFail - | AdapterCoverageStatus::NotEncoded - ) -} - -fn adapter_has_container_digest(adapter: &ExternalAdapterReport) -> bool { - adapter.setup.evidence.contains("sha256:") - || adapter.run.evidence.contains("sha256:") - || adapter.result.evidence.contains("sha256:") - || adapter.evidence.iter().any(|evidence| { - evidence.reference.contains("sha256:") || evidence.reference.contains("digest") - }) -} - -fn external_project_strengths(adapters: &[&ExternalAdapterReport]) -> Vec { - let mut strengths = BTreeSet::new(); - - for adapter in adapters { - for capability in &adapter.capabilities { - if matches!( - capability.status, - AdapterCoverageStatus::Pass | AdapterCoverageStatus::Real - ) { - strengths.insert(format!( - "{} capability is {}.", - capability.capability, - scoreboard::adapter_status_str(capability.status) - )); - } - } - for scenario in &adapter.scenarios { - if scoreboard::scenario_comparison_outcome(scenario) == ScenarioComparisonOutcome::Loss - { - strengths.insert(format!( - "Scenario {} is recorded as a competitor strength.", - scenario.scenario_id - )); - } - } - } - - strengths.into_iter().take(6).collect() -} - -fn external_project_weaknesses(adapters: &[&ExternalAdapterReport]) -> Vec { - let mut weaknesses = BTreeSet::new(); - - for adapter in adapters { - if adapter.overall_status != AdapterCoverageStatus::Pass { - weaknesses.insert(format!( - "Adapter {} overall status is {}.", - adapter.adapter_id, - scoreboard::adapter_status_str(adapter.overall_status) - )); - } - - for suite in &adapter.suites { - if adapter_status_is_typed_non_pass(suite.status) { - weaknesses.insert(format!( - "Suite {} is {}.", - suite.suite_id, - scoreboard::adapter_status_str(suite.status) - )); - } - } - } - - weaknesses.into_iter().take(8).collect() -} - -fn external_project_source_provenance(adapters: &[&ExternalAdapterReport]) -> Vec { - let mut provenance = BTreeSet::new(); - - for adapter in adapters { - for evidence in &adapter.evidence { - provenance.insert(evidence.reference.clone()); - } - for artifact in [&adapter.setup.artifact, &adapter.run.artifact, &adapter.result.artifact] - .into_iter() - .flatten() - { - provenance.insert(artifact.clone()); - } - } - - provenance.into_iter().take(12).collect() -} - fn scoreboard_project_id(project: &str) -> String { project .chars() diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/narrative.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/narrative.rs new file mode 100644 index 00000000..45fa5748 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/narrative.rs @@ -0,0 +1,80 @@ +use crate::scoreboard::{ + self, AdapterCoverageStatus, BTreeSet, ExternalAdapterReport, ScenarioComparisonOutcome, + external::status, +}; + +pub(super) fn external_project_strengths(adapters: &[&ExternalAdapterReport]) -> Vec { + let mut strengths = BTreeSet::new(); + + for adapter in adapters { + for capability in &adapter.capabilities { + if matches!( + capability.status, + AdapterCoverageStatus::Pass | AdapterCoverageStatus::Real + ) { + strengths.insert(format!( + "{} capability is {}.", + capability.capability, + scoreboard::adapter_status_str(capability.status) + )); + } + } + for scenario in &adapter.scenarios { + if scoreboard::scenario_comparison_outcome(scenario) == ScenarioComparisonOutcome::Loss + { + strengths.insert(format!( + "Scenario {} is recorded as a competitor strength.", + scenario.scenario_id + )); + } + } + } + + strengths.into_iter().take(6).collect() +} + +pub(super) fn external_project_weaknesses(adapters: &[&ExternalAdapterReport]) -> Vec { + let mut weaknesses = BTreeSet::new(); + + for adapter in adapters { + if adapter.overall_status != AdapterCoverageStatus::Pass { + weaknesses.insert(format!( + "Adapter {} overall status is {}.", + adapter.adapter_id, + scoreboard::adapter_status_str(adapter.overall_status) + )); + } + + for suite in &adapter.suites { + if status::adapter_status_is_typed_non_pass(suite.status) { + weaknesses.insert(format!( + "Suite {} is {}.", + suite.suite_id, + scoreboard::adapter_status_str(suite.status) + )); + } + } + } + + weaknesses.into_iter().take(8).collect() +} + +pub(super) fn external_project_source_provenance( + adapters: &[&ExternalAdapterReport], +) -> Vec { + let mut provenance = BTreeSet::new(); + + for adapter in adapters { + for evidence in &adapter.evidence { + provenance.insert(evidence.reference.clone()); + } + for artifact in [&adapter.setup.artifact, &adapter.run.artifact, &adapter.result.artifact] + .into_iter() + .flatten() + { + provenance.insert(artifact.clone()); + } + } + + provenance.into_iter().take(12).collect() +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/signals.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/signals.rs new file mode 100644 index 00000000..d08cc642 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/signals.rs @@ -0,0 +1,121 @@ +use crate::scoreboard::{AdapterCoverageStatus, ExternalAdapterReport}; + +pub(super) fn external_project_same_corpus(adapters: &[&ExternalAdapterReport]) -> bool { + let needles = &["same-corpus", "same corpus", "same_corpus", "shared corpus"]; + + adapters.iter().any(|adapter| { + text_mentions_any(adapter.adapter_kind.as_str(), needles) + || adapter_has_reported_same_corpus_text(adapter, needles) + }) +} + +pub(super) fn external_project_source_id_mapped(adapters: &[&ExternalAdapterReport]) -> bool { + let needles = &[ + "source-id mapped", + "source ids mapped", + "maps to source ids", + "mapped to source ids", + "maps back to source ids", + "map to generated evidence ids", + "mapped to generated evidence ids", + "evidence ids match", + ]; + + adapters.iter().any(|adapter| adapter_has_passing_text(adapter, needles)) +} + +pub(super) fn adapter_has_container_digest(adapter: &ExternalAdapterReport) -> bool { + adapter.setup.evidence.contains("sha256:") + || adapter.run.evidence.contains("sha256:") + || adapter.result.evidence.contains("sha256:") + || adapter.evidence.iter().any(|evidence| { + evidence.reference.contains("sha256:") || evidence.reference.contains("digest") + }) +} + +fn adapter_has_passing_text(adapter: &ExternalAdapterReport, needles: &[&str]) -> bool { + adapter_status_mentions_any(adapter.setup.status, adapter.setup.evidence.as_str(), needles) + || adapter_status_mentions_any(adapter.run.status, adapter.run.evidence.as_str(), needles) + || adapter_status_mentions_any( + adapter.result.status, + adapter.result.evidence.as_str(), + needles, + ) || adapter.capabilities.iter().any(|capability| { + adapter_status_mentions_any(capability.status, capability.capability.as_str(), needles) + || adapter_status_mentions_any(capability.status, capability.evidence.as_str(), needles) + }) || adapter.suites.iter().any(|suite| { + adapter_status_mentions_any(suite.status, suite.suite_id.as_str(), needles) + || adapter_status_mentions_any(suite.status, suite.evidence.as_str(), needles) + }) || adapter.scenarios.iter().any(|scenario| { + adapter_status_mentions_any(scenario.status, scenario.scenario_id.as_str(), needles) + || adapter_status_mentions_any(scenario.status, scenario.evidence.as_str(), needles) + }) +} + +fn adapter_has_reported_same_corpus_text( + adapter: &ExternalAdapterReport, + needles: &[&str], +) -> bool { + adapter_status_reports_same_corpus( + adapter.setup.status, + adapter.setup.evidence.as_str(), + needles, + ) || adapter_status_reports_same_corpus( + adapter.run.status, + adapter.run.evidence.as_str(), + needles, + ) || adapter_status_reports_same_corpus( + adapter.result.status, + adapter.result.evidence.as_str(), + needles, + ) || adapter.capabilities.iter().any(|capability| { + adapter_status_reports_same_corpus( + capability.status, + capability.capability.as_str(), + needles, + ) || adapter_status_reports_same_corpus( + capability.status, + capability.evidence.as_str(), + needles, + ) + }) || adapter.suites.iter().any(|suite| { + adapter_status_reports_same_corpus(suite.status, suite.suite_id.as_str(), needles) + || adapter_status_reports_same_corpus(suite.status, suite.evidence.as_str(), needles) + }) || adapter.scenarios.iter().any(|scenario| { + adapter_status_reports_same_corpus(scenario.status, scenario.scenario_id.as_str(), needles) + || adapter_status_reports_same_corpus( + scenario.status, + scenario.evidence.as_str(), + needles, + ) + }) +} + +fn adapter_status_reports_same_corpus( + status: AdapterCoverageStatus, + text: &str, + needles: &[&str], +) -> bool { + matches!( + status, + AdapterCoverageStatus::Pass + | AdapterCoverageStatus::Real + | AdapterCoverageStatus::WrongResult + | AdapterCoverageStatus::LifecycleFail + ) && text_mentions_any(text, needles) +} + +fn adapter_status_mentions_any( + status: AdapterCoverageStatus, + text: &str, + needles: &[&str], +) -> bool { + matches!(status, AdapterCoverageStatus::Pass | AdapterCoverageStatus::Real) + && text_mentions_any(text, needles) +} + +fn text_mentions_any(text: &str, needles: &[&str]) -> bool { + let text = text.to_ascii_lowercase(); + + needles.iter().any(|needle| text.contains(&needle.to_ascii_lowercase())) +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/status.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/status.rs new file mode 100644 index 00000000..d177d183 --- /dev/null +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoreboard/external/status.rs @@ -0,0 +1,90 @@ +use crate::scoreboard::{AdapterCoverageStatus, ExternalAdapterReport, common}; + +pub(super) fn strongest_scoreboard_evidence_class(adapters: &[&ExternalAdapterReport]) -> String { + for evidence_class in ["live_real_world", "live_baseline", "fixture_backed", "research_gate"] { + if adapters.iter().any(|adapter| { + common::scoreboard_evidence_class(adapter.evidence_class.as_str()) == evidence_class + }) { + return evidence_class.to_string(); + } + } + + "research_gate".to_string() +} + +pub(super) fn external_project_result_state(adapters: &[&ExternalAdapterReport]) -> String { + for status in [ + AdapterCoverageStatus::WrongResult, + AdapterCoverageStatus::Blocked, + AdapterCoverageStatus::Incomplete, + AdapterCoverageStatus::LifecycleFail, + AdapterCoverageStatus::NotEncoded, + AdapterCoverageStatus::Unsupported, + ] { + if adapters.iter().any(|adapter| adapter_has_status(adapter, status)) { + return adapter_status_to_scoreboard_state(status).to_string(); + } + } + + "not_comparable".to_string() +} + +pub(super) fn adapter_typed_non_pass_count(adapter: &ExternalAdapterReport) -> usize { + let direct_statuses = + [adapter.overall_status, adapter.setup.status, adapter.run.status, adapter.result.status]; + let direct = direct_statuses + .into_iter() + .filter(|status| adapter_status_is_typed_non_pass(*status)) + .count(); + let capability = adapter + .capabilities + .iter() + .filter(|capability| adapter_status_is_typed_non_pass(capability.status)) + .count(); + let suites = adapter + .suites + .iter() + .filter(|suite| adapter_status_is_typed_non_pass(suite.status)) + .count(); + let scenarios = adapter + .scenarios + .iter() + .filter(|scenario| adapter_status_is_typed_non_pass(scenario.status)) + .count(); + + direct + capability + suites + scenarios +} + +pub(super) fn adapter_status_is_typed_non_pass(status: AdapterCoverageStatus) -> bool { + matches!( + status, + AdapterCoverageStatus::Unsupported + | AdapterCoverageStatus::Blocked + | AdapterCoverageStatus::Incomplete + | AdapterCoverageStatus::WrongResult + | AdapterCoverageStatus::LifecycleFail + | AdapterCoverageStatus::NotEncoded + ) +} + +fn adapter_has_status(adapter: &ExternalAdapterReport, status: AdapterCoverageStatus) -> bool { + adapter.overall_status == status + || adapter.setup.status == status + || adapter.run.status == status + || adapter.result.status == status + || adapter.capabilities.iter().any(|capability| capability.status == status) + || adapter.suites.iter().any(|suite| suite.status == status) + || adapter.scenarios.iter().any(|scenario| scenario.status == status) +} + +fn adapter_status_to_scoreboard_state(status: AdapterCoverageStatus) -> &'static str { + match status { + AdapterCoverageStatus::WrongResult | AdapterCoverageStatus::LifecycleFail => "wrong_result", + AdapterCoverageStatus::Blocked => "blocked", + AdapterCoverageStatus::Incomplete => "incomplete", + AdapterCoverageStatus::NotEncoded | AdapterCoverageStatus::Unsupported => "not_encoded", + AdapterCoverageStatus::Real + | AdapterCoverageStatus::Mocked + | AdapterCoverageStatus::Pass => "not_comparable", + } +} diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs index ca945834..088a8842 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/scoring.rs @@ -1,10 +1,10 @@ -#[path = "scoring/answers.rs"] mod answers; -#[path = "scoring/claims.rs"] mod claims; -#[path = "scoring/consolidation.rs"] mod consolidation; -#[path = "scoring/counts.rs"] mod counts; -#[path = "scoring/dimensions.rs"] mod dimensions; -#[path = "scoring/evolution.rs"] mod evolution; -#[path = "scoring/reports.rs"] mod reports; +mod answers; +mod claims; +mod consolidation; +mod counts; +mod dimensions; +mod evolution; +mod reports; use self::{counts::wrong_result_signal_count, evolution::update_rationale_missing_count}; use crate::{ diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary.rs index 3e9e0e31..64d2b03f 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/summary.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/summary.rs @@ -1,12 +1,12 @@ -#[path = "summary/consolidation.rs"] mod consolidation; -#[path = "summary/knowledge.rs"] mod knowledge; -#[path = "summary/memory.rs"] mod memory; -#[path = "summary/metrics.rs"] mod metrics; -#[path = "summary/proactive.rs"] mod proactive; -#[path = "summary/report.rs"] mod report; -#[path = "summary/scheduled.rs"] mod scheduled; -#[path = "summary/suites.rs"] mod suites; -#[path = "summary/work.rs"] mod work; +mod consolidation; +mod knowledge; +mod memory; +mod metrics; +mod proactive; +mod report; +mod scheduled; +mod suites; +mod work; use crate::{ ConsolidationSummaryReport, CostReport, EvolutionSummary, FollowUpReport, JobReport, diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports/summary_report_core.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_core.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports/summary_report_core.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_domain.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports/summary_report_domain.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_domain.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports/summary_report_domain.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_suite.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports/summary_report_suite.rs similarity index 100% rename from apps/elf-eval/src/bin/real_world_job_benchmark/summary_report_suite.rs rename to apps/elf-eval/src/bin/real_world_job_benchmark/summary_reports/summary_report_suite.rs diff --git a/apps/elf-eval/src/bin/real_world_job_benchmark/validation.rs b/apps/elf-eval/src/bin/real_world_job_benchmark/validation.rs index 1fc03da8..a725058b 100644 --- a/apps/elf-eval/src/bin/real_world_job_benchmark/validation.rs +++ b/apps/elf-eval/src/bin/real_world_job_benchmark/validation.rs @@ -1,16 +1,16 @@ -#[path = "validation/adapter.rs"] mod adapter; -#[path = "validation/basics.rs"] mod basics; -#[path = "validation/common.rs"] mod common; -#[path = "validation/consolidation.rs"] mod consolidation; -#[path = "validation/expectations.rs"] mod expectations; -#[path = "validation/job_rules.rs"] mod job_rules; -#[path = "validation/memory_summary.rs"] mod memory_summary; -#[path = "validation/page.rs"] mod page; -#[path = "validation/proactive.rs"] mod proactive; -#[path = "validation/recovery_artifact.rs"] mod recovery_artifact; -#[path = "validation/scheduled.rs"] mod scheduled; -#[path = "validation/trace.rs"] mod trace; -#[path = "validation/work_journal.rs"] mod work_journal; +mod adapter; +mod basics; +mod common; +mod consolidation; +mod expectations; +mod job_rules; +mod memory_summary; +mod page; +mod proactive; +mod recovery_artifact; +mod scheduled; +mod trace; +mod work_journal; use self::{ common::{ diff --git a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge.rs b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge.rs index 19a07a21..bc5684ab 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/consolidation_knowledge.rs @@ -18,8 +18,9 @@ fn real_world_live_adapter_sources(workspace: &Path) -> Result { } fn real_world_job_benchmark_sources(workspace: &Path) -> Result { - let mut source = - fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark.rs"))?; + let mut source = fs::read_to_string( + workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark/main.rs"), + )?; append_rust_sources( workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark").as_path(), diff --git a/apps/elf-eval/tests/real_world_job_benchmark/live_adapter_tasks.rs b/apps/elf-eval/tests/real_world_job_benchmark/live_adapter_tasks.rs index 69edc0e0..364d03d0 100644 --- a/apps/elf-eval/tests/real_world_job_benchmark/live_adapter_tasks.rs +++ b/apps/elf-eval/tests/real_world_job_benchmark/live_adapter_tasks.rs @@ -18,8 +18,9 @@ fn real_world_live_adapter_sources(workspace: &Path) -> Result { } fn real_world_job_benchmark_sources(workspace: &Path) -> Result { - let mut source = - fs::read_to_string(workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark.rs"))?; + let mut source = fs::read_to_string( + workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark/main.rs"), + )?; append_rust_sources( workspace.join("apps/elf-eval/src/bin/real_world_job_benchmark").as_path(), diff --git a/docs/runbook/benchmarking/real_world_memory_evolution.md b/docs/runbook/benchmarking/real_world_memory_evolution.md index d8c21d22..ea1f5c6c 100644 --- a/docs/runbook/benchmarking/real_world_memory_evolution.md +++ b/docs/runbook/benchmarking/real_world_memory_evolution.md @@ -18,7 +18,7 @@ Goal: Run and interpret the checked-in memory evolution real-world job fixtures. Read this when: You need to test current facts, historical facts, stale facts, conflicts, corrected memories, and temporal relation validity. Inputs: `apps/elf-eval/fixtures/real_world_memory/evolution/`, -`apps/elf-eval/src/bin/real_world_job_benchmark.rs`, and `Makefile.toml`. +`apps/elf-eval/src/bin/real_world_job_benchmark/main.rs`, and `Makefile.toml`. Depends on: `docs/spec/real_world_agent_memory_benchmark_v1.md`, `docs/runbook/benchmarking/real_world_agent_memory_benchmark.md`, and `docs/evidence/external_memory/comparison_external_projects.md`. diff --git a/docs/spec/agent_memory_quantitative_benchmark_v1.md b/docs/spec/agent_memory_quantitative_benchmark_v1.md index 07023ca1..5974e4bf 100644 --- a/docs/spec/agent_memory_quantitative_benchmark_v1.md +++ b/docs/spec/agent_memory_quantitative_benchmark_v1.md @@ -16,7 +16,7 @@ source_refs: - XY-1098 - XY-1120 code_refs: - - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - apps/elf-eval/tests/real_world_job_benchmark.rs related: - docs/spec/real_world_agent_memory_benchmark_v1.md @@ -24,7 +24,7 @@ related: drift_watch: - docs/spec/agent_memory_quantitative_benchmark_v1.md - docs/spec/real_world_agent_memory_benchmark_v1.md - - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - apps/elf-eval/fixtures/report_snapshots/2026-06-27-public-quantitative-competitor-scoreboard-report.json --- # Agent Memory Quantitative Benchmark v1 diff --git a/docs/spec/real_world_agent_memory_benchmark_v1.md b/docs/spec/real_world_agent_memory_benchmark_v1.md index 68e745a0..c2295b1c 100644 --- a/docs/spec/real_world_agent_memory_benchmark_v1.md +++ b/docs/spec/real_world_agent_memory_benchmark_v1.md @@ -13,13 +13,13 @@ tags: source_refs: [] code_refs: - Makefile.toml - - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - apps/elf-eval/fixtures/real_world_memory/production_ops/authority_plane_recovery_drill.json related: - docs/spec/agent_memory_quantitative_benchmark_v1.md drift_watch: - docs/spec/real_world_agent_memory_benchmark_v1.md - - apps/elf-eval/src/bin/real_world_job_benchmark.rs + - apps/elf-eval/src/bin/real_world_job_benchmark/main.rs - apps/elf-eval/fixtures/real_world_memory/ --- # Real-World Agent Memory Benchmark v1 diff --git a/docs/spec/system_version_registry.md b/docs/spec/system_version_registry.md index 3a82e5e5..8b361533 100644 --- a/docs/spec/system_version_registry.md +++ b/docs/spec/system_version_registry.md @@ -44,7 +44,7 @@ This document is normative. When a new versioned identifier is introduced, it mu - Identifier: `elf.quality_scoreboard/v1`. - Type: Public quantitative competitor scoreboard report and row schema. - Defined in: `docs/spec/agent_memory_quantitative_benchmark_v1.md`. -- Consumers: `apps/elf-eval/src/bin/real_world_job_benchmark.rs`, checked-in +- Consumers: `apps/elf-eval/src/bin/real_world_job_benchmark/main.rs`, checked-in benchmark report snapshots, public benchmarking evidence reports, and agents deciding whether a product row is comparable or only a typed blocker. - Bump rule: Introduce a new identifier only when row states, comparability gates,