Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions apps/elf-eval/src/bin/real_world_job_benchmark/artifacts.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#[path = "artifacts/answer.rs"] mod answer;
#[path = "artifacts/consolidation.rs"] mod consolidation;
#[path = "artifacts/cost.rs"] mod cost;
#[path = "artifacts/knowledge.rs"] mod knowledge;
#[path = "artifacts/memory.rs"] mod memory;
#[path = "artifacts/proactive.rs"] mod proactive;
#[path = "artifacts/recovery.rs"] mod recovery;
#[path = "artifacts/scheduled.rs"] mod scheduled;
#[path = "artifacts/work.rs"] mod work;
mod answer;
mod consolidation;
mod cost;
mod knowledge;
mod memory;
mod proactive;
mod recovery;
mod scheduled;
mod work;

pub(super) use self::{
answer::{ProducedAnswer, ProducedClaim},
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#[path = "external_adapters/manifest.rs"] mod manifest;
#[path = "external_adapters/outcome.rs"] mod outcome;
#[path = "external_adapters/summary.rs"] mod summary;
#[path = "external_adapters/validation.rs"] mod validation;
mod manifest;
mod outcome;
mod summary;
mod validation;

pub(super) use outcome::scenario_comparison_outcome;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
mod basics;
mod metadata;
mod scenarios;

use std::{collections::BTreeSet, path::Path};

use color_eyre::{Result, eyre};

use crate::{
AdapterCoverageStatus, AdapterScenarioJudgment, EXTERNAL_ADAPTER_MANIFEST_SCHEMA,
ElfScenarioPosition, ExternalAdapterManifest, ExternalAdapterReport, ExternalDockerIsolation,
SUITES, ScenarioComparisonOutcome, external_adapters::outcome, formatting,
EXTERNAL_ADAPTER_MANIFEST_SCHEMA, ExternalAdapterManifest, ExternalAdapterReport,
ExternalDockerIsolation,
};

pub(super) fn validate_external_adapter_manifest(
Expand Down Expand Up @@ -100,12 +103,12 @@ fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Re
));
}

validate_adapter_execution(path, adapter)?;
validate_adapter_capabilities(path, adapter)?;
validate_adapter_suites(path, adapter)?;
validate_adapter_scenarios(path, adapter)?;
validate_adapter_evidence(path, adapter)?;
validate_adapter_execution_metadata(path, adapter)?;
basics::validate_adapter_execution(path, adapter)?;
basics::validate_adapter_capabilities(path, adapter)?;
basics::validate_adapter_suites(path, adapter)?;
scenarios::validate_adapter_scenarios(path, adapter)?;
basics::validate_adapter_evidence(path, adapter)?;
metadata::validate_adapter_execution_metadata(path, adapter)?;

if let Some(follow_up) = &adapter.follow_up
&& (follow_up.title.trim().is_empty() || follow_up.reason.trim().is_empty())
Expand All @@ -119,225 +122,3 @@ fn validate_external_adapter(path: &Path, adapter: &ExternalAdapterReport) -> Re

Ok(())
}

fn validate_adapter_execution(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> {
for evidence in [&adapter.setup, &adapter.run, &adapter.result] {
if evidence.evidence.trim().is_empty()
|| evidence.command.as_deref().is_some_and(str::is_empty)
|| evidence.artifact.as_deref().is_some_and(str::is_empty)
{
return Err(eyre::eyre!(
"{} adapter {} has incomplete setup/run/result evidence.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}

fn validate_adapter_capabilities(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> {
for capability in &adapter.capabilities {
if capability.capability.trim().is_empty() || capability.evidence.trim().is_empty() {
return Err(eyre::eyre!(
"{} adapter {} has incomplete capability coverage.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}

fn validate_adapter_suites(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> {
for suite in &adapter.suites {
if !SUITES.contains(&suite.suite_id.as_str()) {
return Err(eyre::eyre!(
"{} adapter {} references unknown suite {}.",
path.display(),
adapter.adapter_id,
suite.suite_id
));
}
if suite.evidence.trim().is_empty() {
return Err(eyre::eyre!(
"{} adapter {} has suite {} without evidence.",
path.display(),
adapter.adapter_id,
suite.suite_id
));
}
}

Ok(())
}

fn validate_adapter_scenarios(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> {
for scenario in &adapter.scenarios {
if scenario.scenario_id.trim().is_empty()
|| scenario.evidence.trim().is_empty()
|| scenario.command.as_deref().is_some_and(str::is_empty)
|| scenario.artifact.as_deref().is_some_and(str::is_empty)
{
return Err(eyre::eyre!(
"{} adapter {} has incomplete scenario judgment.",
path.display(),
adapter.adapter_id
));
}

if let Some(suite_id) = &scenario.suite_id
&& !SUITES.contains(&suite_id.as_str())
{
return Err(eyre::eyre!(
"{} adapter {} scenario {} references unknown suite {}.",
path.display(),
adapter.adapter_id,
scenario.scenario_id,
suite_id
));
}

let outcome = outcome::scenario_comparison_outcome(scenario);

if blocked_status_missing_blocked_outcome(scenario.status, scenario.comparison_outcome) {
return Err(eyre::eyre!(
"{} adapter {} scenario {} uses blocked status without blocked comparison outcome.",
path.display(),
adapter.adapter_id,
scenario.scenario_id
));
}
if unmeasured_status_has_measured_outcome(scenario.status, outcome) {
return Err(eyre::eyre!(
"{} adapter {} scenario {} uses {} status with {} outcome.",
path.display(),
adapter.adapter_id,
scenario.scenario_id,
formatting::adapter_status_str(scenario.status),
formatting::scenario_comparison_outcome_str(outcome)
));
}
if unmeasured_status_has_measured_position(scenario.status, scenario.elf_position) {
return Err(eyre::eyre!(
"{} adapter {} scenario {} uses {} status with {} position.",
path.display(),
adapter.adapter_id,
scenario.scenario_id,
formatting::adapter_status_str(scenario.status),
formatting::scenario_position_str(scenario.elf_position)
));
}
if explicit_outcome_conflicts_with_position(scenario) {
return Err(eyre::eyre!(
"{} adapter {} scenario {} uses {} position with {} outcome.",
path.display(),
adapter.adapter_id,
scenario.scenario_id,
formatting::scenario_position_str(scenario.elf_position),
formatting::scenario_comparison_outcome_str(outcome)
));
}
}

Ok(())
}

fn blocked_status_missing_blocked_outcome(
status: AdapterCoverageStatus,
outcome: Option<ScenarioComparisonOutcome>,
) -> bool {
status == AdapterCoverageStatus::Blocked && outcome != Some(ScenarioComparisonOutcome::Blocked)
}

fn unmeasured_status_has_measured_outcome(
status: AdapterCoverageStatus,
outcome: ScenarioComparisonOutcome,
) -> bool {
matches!(
status,
AdapterCoverageStatus::Blocked
| AdapterCoverageStatus::Incomplete
| AdapterCoverageStatus::NotEncoded
| AdapterCoverageStatus::Unsupported
) && matches!(
outcome,
ScenarioComparisonOutcome::Win
| ScenarioComparisonOutcome::Tie
| ScenarioComparisonOutcome::Loss
)
}

fn unmeasured_status_has_measured_position(
status: AdapterCoverageStatus,
position: ElfScenarioPosition,
) -> bool {
matches!(
status,
AdapterCoverageStatus::Blocked
| AdapterCoverageStatus::Incomplete
| AdapterCoverageStatus::NotEncoded
| AdapterCoverageStatus::Unsupported
) && matches!(
position,
ElfScenarioPosition::Wins | ElfScenarioPosition::Ties | ElfScenarioPosition::Loses
)
}

fn explicit_outcome_conflicts_with_position(scenario: &AdapterScenarioJudgment) -> bool {
let Some(outcome) = scenario.comparison_outcome else {
return false;
};

!outcome::position_supports_outcome(scenario.elf_position, outcome)
}

fn validate_adapter_evidence(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> {
for evidence in &adapter.evidence {
if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() {
return Err(eyre::eyre!(
"{} adapter {} has incomplete evidence pointers.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}

fn validate_adapter_execution_metadata(path: &Path, adapter: &ExternalAdapterReport) -> Result<()> {
let Some(metadata) = &adapter.execution_metadata else {
return Ok(());
};

if metadata.setup_path.trim().is_empty()
|| metadata.runtime_boundary.trim().is_empty()
|| metadata.resource_expectation.trim().is_empty()
|| metadata.retry_guidance.iter().any(|guidance| guidance.trim().is_empty())
|| metadata.sources.is_empty()
{
return Err(eyre::eyre!(
"{} adapter {} has incomplete execution metadata.",
path.display(),
adapter.adapter_id
));
}

for source in &metadata.sources {
if source.label.trim().is_empty()
|| source.url.trim().is_empty()
|| source.evidence.trim().is_empty()
{
return Err(eyre::eyre!(
"{} adapter {} has incomplete source metadata.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
use std::path::Path;

use color_eyre::{Result, eyre};

use crate::{ExternalAdapterReport, SUITES};

pub(in crate::external_adapters::validation) fn validate_adapter_execution(
path: &Path,
adapter: &ExternalAdapterReport,
) -> Result<()> {
for evidence in [&adapter.setup, &adapter.run, &adapter.result] {
if evidence.evidence.trim().is_empty()
|| evidence.command.as_deref().is_some_and(str::is_empty)
|| evidence.artifact.as_deref().is_some_and(str::is_empty)
{
return Err(eyre::eyre!(
"{} adapter {} has incomplete setup/run/result evidence.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}

pub(in crate::external_adapters::validation) fn validate_adapter_capabilities(
path: &Path,
adapter: &ExternalAdapterReport,
) -> Result<()> {
for capability in &adapter.capabilities {
if capability.capability.trim().is_empty() || capability.evidence.trim().is_empty() {
return Err(eyre::eyre!(
"{} adapter {} has incomplete capability coverage.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}

pub(in crate::external_adapters::validation) fn validate_adapter_suites(
path: &Path,
adapter: &ExternalAdapterReport,
) -> Result<()> {
for suite in &adapter.suites {
if !SUITES.contains(&suite.suite_id.as_str()) {
return Err(eyre::eyre!(
"{} adapter {} references unknown suite {}.",
path.display(),
adapter.adapter_id,
suite.suite_id
));
}
if suite.evidence.trim().is_empty() {
return Err(eyre::eyre!(
"{} adapter {} has suite {} without evidence.",
path.display(),
adapter.adapter_id,
suite.suite_id
));
}
}

Ok(())
}

pub(in crate::external_adapters::validation) fn validate_adapter_evidence(
path: &Path,
adapter: &ExternalAdapterReport,
) -> Result<()> {
for evidence in &adapter.evidence {
if evidence.kind.trim().is_empty() || evidence.reference.trim().is_empty() {
return Err(eyre::eyre!(
"{} adapter {} has incomplete evidence pointers.",
path.display(),
adapter.adapter_id
));
}
}

Ok(())
}
Loading